Hi!
I am running into a java.lang.ArrayIndexOutOfBoundsException error when trying to call hail’s gvcf combiner (hail.experimental.run_combiner). I’ve attached the full error output below. Do you have any suggestions or ideas on how to diagnose what might be causing the error? My input files appear to be properly formatted as far as I can tell.
Here is a rough copy of my code:
import hail as hl
# ....................#
## Parse user inputs ##
# ....................#
# NOTE(review): gvcf_list, samples_list, and args are produced by the elided
# argument-parsing section above — presumably a list of GVCF paths, a parallel
# list of sample names, and an argparse.Namespace; confirm against the full script.
hl.experimental.run_combiner(
    gvcf_list,                                # paths to the input GVCFs to merge
    sample_names=samples_list,                # must align 1:1 with gvcf_list
    header=args.gvcf_header_file,             # shared header file applied to all inputs
    out_file=args.output_cloud_path,          # destination for the merged sparse MatrixTable
    tmp_path=args.tmp_bucket,                 # scratch bucket for intermediate phase outputs
    key_by_locus_and_alleles=True,            # key the output by (locus, alleles), not locus only
    overwrite=args.overwrite_existing,        # whether to clobber an existing out_file
    reference_genome='GRCh38',
    use_exome_default_intervals=True,         # partition import with the built-in exome intervals
    target_records=10000                      # target records per partition in the output
)
Error Message:
Welcome to
__ __ <>__
/ /_/ /__ __/ /
/ __ / _ `/ / /
/_/ /_/\_,_/_/_/ version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/hail/combiner.log
2021-06-21 18:37:37 Hail: INFO: Using 65 intervals with default exome size 60000000 as partitioning for GVCF import
2021-06-21 18:37:37 Hail: INFO: GVCF combiner plan:
Branch factor: 100
Batch size: 100
Combining 5 input files in 1 phases with 1 total jobs.
Phase 1: 1 job corresponding to 1 final output file.
2021-06-21 18:37:37 Hail: INFO: Starting phase 1/1, merging 5 input GVCFs in 1 job.
2021-06-21 18:37:37 Hail: INFO: Starting phase 1/1, job 1/1 to create 1 merged file, corresponding to ~100.0% of total I/O.
[Stage 0:> (0 + 8) / 65]
[Stage 0:> (0 + 15) / 65]
[Stage 0:> (0 + 8) / 65]
[Stage 0:> (0 + 12) / 65]Traceback (most recent call last):
File "/tmp/a335abd27f1041da8eaffc174c60366b/test_combiner.py", line 38, in <module>
target_records=10000
File "/opt/conda/default/lib/python3.6/site-packages/hail/experimental/vcf_combiner/vcf_combiner.py", line 681, in run_combiner
final_mt.write(out_file, overwrite=overwrite)
File "<decorator-gen-1231>", line 2, in write
File "/opt/conda/default/lib/python3.6/site-packages/hail/typecheck/check.py", line 614, in wrapper
return __original_func(*args_, **kwargs_)
File "/opt/conda/default/lib/python3.6/site-packages/hail/matrixtable.py", line 2528, in write
Env.backend().execute(ir.MatrixWrite(self._mir, writer))
File "/opt/conda/default/lib/python3.6/site-packages/hail/backend/py4j_backend.py", line 98, in execute
raise e
File "/opt/conda/default/lib/python3.6/site-packages/hail/backend/py4j_backend.py", line 74, in execute
result = json.loads(self._jhc.backend().executeJSON(jir))
File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/opt/conda/default/lib/python3.6/site-packages/hail/backend/py4j_backend.py", line 32, in deco
'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
hail.utils.java.FatalError: ArrayIndexOutOfBoundsException: null
Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 0.0 failed 20 times, most recent failure: Lost task 2.19 in stage 0.0 (TID 145, test-w-1.c.strokeanderson-hail.internal, executor 2): java.lang.ArrayIndexOutOfBoundsException
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1892)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1880)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1879)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2113)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2062)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2051)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:166)
at is.hail.rvd.RVD.writeRowsSplit(RVD.scala:952)
at is.hail.expr.ir.MatrixValue.write(MatrixValue.scala:246)
at is.hail.expr.ir.MatrixNativeWriter.apply(MatrixWriter.scala:61)
at is.hail.expr.ir.WrappedMatrixWriter.apply(MatrixWriter.scala:40)
at is.hail.expr.ir.Interpret$.run(Interpret.scala:825)
at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:53)
at is.hail.expr.ir.InterpretNonCompilable$.interpretAndCoerce$1(InterpretNonCompilable.scala:16)
at is.hail.expr.ir.InterpretNonCompilable$.is$hail$expr$ir$InterpretNonCompilable$$rewrite$1(InterpretNonCompilable.scala:53)
at is.hail.expr.ir.InterpretNonCompilable$.apply(InterpretNonCompilable.scala:58)
at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.transform(LoweringPass.scala:67)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:15)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:15)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:15)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:13)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass$class.apply(LoweringPass.scala:13)
at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.apply(LoweringPass.scala:62)
at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:14)
at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:12)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:12)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:28)
at is.hail.backend.spark.SparkBackend.is$hail$backend$spark$SparkBackend$$_execute(SparkBackend.scala:354)
at is.hail.backend.spark.SparkBackend$$anonfun$execute$1.apply(SparkBackend.scala:338)
at is.hail.backend.spark.SparkBackend$$anonfun$execute$1.apply(SparkBackend.scala:335)
at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:25)
at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:23)
at is.hail.utils.package$.using(package.scala:618)
at is.hail.annotations.Region$.scoped(Region.scala:18)
at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:23)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:247)
at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:335)
at is.hail.backend.spark.SparkBackend$$anonfun$7.apply(SparkBackend.scala:379)
at is.hail.backend.spark.SparkBackend$$anonfun$7.apply(SparkBackend.scala:377)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeJSON(SparkBackend.scala:377)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
java.lang.ArrayIndexOutOfBoundsException: null