Hey Hail team, I have a question about the `_n_partitions` argument for `read_table`. I tried running the following code:
```python
import hail as hl

hl.init(log='/no_end_ht.log')
ht = hl.read_table('gs://regional_missense_constraint/temp/simul_break_prep_no_end.ht', _n_partitions=5000)
ht.describe()
ht.show()
```
and it crashed with the following traceback:
```
Traceback (most recent call last):
  File "/tmp/ede99857bc0a4f32aef119292bc4ddd4/no_end_ht.py", line 5, in <module>
    ht = hl.read_table('gs://regional_missense_constraint/temp/simul_break_prep_no_end.ht', _n_partitions=5000)
  File "<decorator-gen-1443>", line 2, in read_table
  File "/opt/conda/default/lib/python3.8/site-packages/hail/typecheck/check.py", line 577, in wrapper
    return __original_func(*args_, **kwargs_)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/methods/impex.py", line 2466, in read_table
    intervals = ht._calculate_new_partitions(_n_partitions)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/table.py", line 3523, in _calculate_new_partitions
    return Env.backend().execute(ir.TableToValueApply(
  File "/opt/conda/default/lib/python3.8/site-packages/hail/backend/py4j_backend.py", line 75, in execute
    value = ir.typ._from_json(result['value'])
  File "/opt/conda/default/lib/python3.8/site-packages/hail/expr/types.py", line 254, in _from_json
    return self._convert_from_json_na(x)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/expr/types.py", line 260, in _convert_from_json_na
    return self._convert_from_json(x)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/expr/types.py", line 757, in _convert_from_json
    return [self.element_type._convert_from_json_na(elt) for elt in x]
  File "/opt/conda/default/lib/python3.8/site-packages/hail/expr/types.py", line 757, in <listcomp>
    return [self.element_type._convert_from_json_na(elt) for elt in x]
  File "/opt/conda/default/lib/python3.8/site-packages/hail/expr/types.py", line 260, in _convert_from_json_na
    return self._convert_from_json(x)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/expr/types.py", line 1627, in _convert_from_json
    return Interval(self.point_type._convert_from_json_na(x['start']),
  File "<decorator-gen-18>", line 2, in __init__
  File "/opt/conda/default/lib/python3.8/site-packages/hail/typecheck/check.py", line 577, in wrapper
    return __original_func(*args_, **kwargs_)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/utils/interval.py", line 45, in __init__
    end_type = impute_type(end)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/expr/expressions/base_expression.py", line 153, in impute_type
    return tstruct(**{k: impute_type(x[k]) for k in x})
  File "/opt/conda/default/lib/python3.8/site-packages/hail/expr/expressions/base_expression.py", line 153, in <dictcomp>
    return tstruct(**{k: impute_type(x[k]) for k in x})
  File "/opt/conda/default/lib/python3.8/site-packages/hail/expr/expressions/base_expression.py", line 194, in impute_type
    raise ExpressionException("Hail cannot impute the type of 'None'")
hail.expr.expressions.base_expression.ExpressionException: Hail cannot impute the type of 'None'
ERROR: (gcloud.dataproc.jobs.submit.pyspark) Job [ede99857bc0a4f32aef119292bc4ddd4] failed with error:
Google Cloud Dataproc Agent reports job failure. If logs are available, they can be found at:
https://console.cloud.google.com/dataproc/jobs/ede99857bc0a4f32aef119292bc4ddd4?project=broad-mpg-gnomad&region=us-central1
gcloud dataproc jobs wait 'ede99857bc0a4f32aef119292bc4ddd4' --region 'us-central1' --project 'broad-mpg-gnomad'
https://console.cloud.google.com/storage/browser/dataproc-faa46220-ec08-4f5b-92bd-9722e1963047-us-central1/google-cloud-dataproc-metainfo/4b102115-ddcb-48b2-9805-beeb9fa53418/jobs/ede99857bc0a4f32aef119292bc4ddd4/
gs://dataproc-faa46220-ec08-4f5b-92bd-9722e1963047-us-central1/google-cloud-dataproc-metainfo/4b102115-ddcb-48b2-9805-beeb9fa53418/jobs/ede99857bc0a4f32aef119292bc4ddd4/driveroutput
Traceback (most recent call last):
  File "/Users/kchao/anaconda3/envs/hail/bin/hailctl", line 8, in <module>
    sys.exit(main())
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/site-packages/hailtop/hailctl/__main__.py", line 100, in main
    cli.main(args)
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/site-packages/hailtop/hailctl/dataproc/cli.py", line 122, in main
    jmp[args.module].main(args, pass_through_args)
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/site-packages/hailtop/hailctl/dataproc/submit.py", line 78, in main
    gcloud.run(cmd)
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/site-packages/hailtop/hailctl/dataproc/gcloud.py", line 9, in run
    return subprocess.check_call(["gcloud"] + command)
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/subprocess.py", line 328, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['gcloud', 'dataproc', 'jobs', 'submit', 'pyspark', '/Users/kchao/Desktop/no_end_ht.py', '--cluster=kc', '--files=', '--py-files=/var/folders/xq/8jnhrt2s2h58ts2v0br5g8gm0000gp/T/pyscripts_d58scrd5.zip', '--properties=']' returned non-zero exit status 1.
```
Am I using the argument incorrectly? Is there a reason why this table can't be repartitioned? From the traceback, it looks like the end point of one of the computed partition intervals is `None` (hence the "Hail cannot impute the type of 'None'" error). The table has 22,750,697 rows but was written to a single partition. I'd appreciate any insight!
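In case it's useful, here's the fallback I'm planning to try in the meantime (a minimal sketch, untested on this table, assuming a full shuffle is acceptable):

```python
import hail as hl

hl.init(log='/no_end_ht.log')

# Read without _n_partitions, then repartition with a shuffle.
ht = hl.read_table('gs://regional_missense_constraint/temp/simul_break_prep_no_end.ht')
print(ht.n_partitions())  # expect 1: the table was written to a single partition

# My assumption is that a shuffle-based repartition avoids the
# _calculate_new_partitions interval path that fails in the traceback above.
ht = ht.repartition(5000, shuffle=True)
ht.describe()
ht.show()
```

My understanding is that `_n_partitions` exists precisely to avoid that extra shuffle at read time, which is why I'd prefer to get it working.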
Log: no_end_ht.log (36.0 KB)