Hi,
I was trying to import a large (200 GB) vcf.gz file into Hail and convert it to a matrix table for analysis. However, an error occurred in the first step.
Here’s my code:
import hail as hl
from hail.plot import output_notebook, show
import bokeh.plotting
# Start Hail, giving the Spark driver a 256 GB heap.
hl.init(spark_conf={'spark.driver.memory': '256g'})

#region import merged vcf file
# Map bare contig names ('1'..'22', 'X', 'Y') to their 'chr'-prefixed
# equivalents so they match the GRCh38 reference genome's contig names.
contig_names = [str(number) for number in range(1, 23)] + ['X', 'Y']
chr_prefixed = {name: 'chr' + name for name in contig_names}

# Import the VCF and persist it as a native matrix table.
# force_bgz=True makes Hail read the .gz file as block-gzipped.
vcf_mt = hl.import_vcf(
    'exome.gatk.vcf.gz',
    reference_genome='GRCh38',
    contig_recoding=chr_prefixed,
    force_bgz=True,
    array_elements_required=False,
)
vcf_mt.write('exome.gatk.mt', overwrite=True)

# Reload from the written matrix table and report (rows, columns).
mt = hl.read_matrix_table('exome.gatk.mt')
mt.count()
Here’s the error message:
2022-04-08 09:46:10 Hail: INFO: Coerced prefix-sorted dataset>(1223 + 1) / 1224]
---------------------------------------------------------------------------
FatalError Traceback (most recent call last)
<ipython-input-2-3ecbeadf63e8> in <module>
3 '11': 'chr11', '12': 'chr12', '13': 'chr13', '14': 'chr14', '15': 'chr15', '16': 'chr16', '17': 'chr17', '18': 'chr18', '19': 'chr19', '20': 'chr20', \
4 '21': 'chr21', '22': 'chr22', 'X': 'chrX', 'Y': 'chrY'}, \
----> 5 force_bgz=True, array_elements_required=False).write('exome.gatk.mt', overwrite=True)
<decorator-gen-1258> in write(self, output, overwrite, stage_locally, _codec_spec, _partitions, _checkpoint_file)
/gpfs/home/qwu24/wes/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
575 def wrapper(__original_func, *args, **kwargs):
576 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577 return __original_func(*args_, **kwargs_)
578
579 return wrapper
/gpfs/home/qwu24/wes/lib/python3.7/site-packages/hail/matrixtable.py in write(self, output, overwrite, stage_locally, _codec_spec, _partitions, _checkpoint_file)
2527
2528 writer = ir.MatrixNativeWriter(output, overwrite, stage_locally, _codec_spec, _partitions, _partitions_type, _checkpoint_file)
-> 2529 Env.backend().execute(ir.MatrixWrite(self._mir, writer))
2530
2531 class _Show:
/gpfs/home/qwu24/wes/lib/python3.7/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
96 raise HailUserError(message_and_trace) from None
97
---> 98 raise e
/gpfs/home/qwu24/wes/lib/python3.7/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
72 # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
73 try:
---> 74 result = json.loads(self._jhc.backend().executeJSON(jir))
75 value = ir.typ._from_json(result['value'])
76 timings = result['timings']
/gpfs/home/qwu24/wes/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/gpfs/home/qwu24/wes/lib/python3.7/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
30 raise FatalError('%s\n\nJava stack trace:\n%s\n'
31 'Hail version: %s\n'
---> 32 'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
33 except pyspark.sql.utils.CapturedException as e:
34 raise FatalError('%s\n\nJava stack trace:\n%s\n'
FatalError: IllegalArgumentException: requirement failed
Java stack trace:
java.lang.IllegalArgumentException: requirement failed
at scala.Predef$.require(Predef.scala:268)
at is.hail.rvd.RVDPartitioner.<init>(RVDPartitioner.scala:52)
at is.hail.rvd.RVDPartitioner.extendKeySamePartitions(RVDPartitioner.scala:141)
at is.hail.expr.ir.LoweredTableReader$$anon$2.coerce(TableIR.scala:387)
at is.hail.expr.ir.GenericTableValue.toTableStage(GenericTableValue.scala:159)
at is.hail.io.vcf.MatrixVCFReader.lower(LoadVCF.scala:1791)
at is.hail.expr.ir.lowering.LowerTableIR$.lower$1(LowerTableIR.scala:402)
at is.hail.expr.ir.lowering.LowerTableIR$.apply(LowerTableIR.scala:1327)
at is.hail.expr.ir.lowering.LowerToCDA$.lower(LowerToCDA.scala:68)
at is.hail.expr.ir.lowering.LowerToCDA$.apply(LowerToCDA.scala:17)
at is.hail.expr.ir.lowering.LowerToDistributedArrayPass.transform(LoweringPass.scala:76)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:26)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:66)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:71)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:68)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:15)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:15)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:13)
at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:12)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:63)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:14)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:12)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:12)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:46)
at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:381)
at is.hail.backend.spark.SparkBackend.$anonfun$execute$1(SparkBackend.scala:365)
at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:47)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:47)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:46)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:362)
at is.hail.backend.spark.SparkBackend.$anonfun$executeJSON$1(SparkBackend.scala:406)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeJSON(SparkBackend.scala:404)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:745)
Hail version: 0.2.77-684f32d73643
Error summary: IllegalArgumentException: requirement failed
Any idea how this happened?
This file is a merged file from 15995 individuals.
Any advice will be appreciated.
Thank you.