Hi! I am trying to use import_vcf
to import some VCF files into Hail. My VCF files are stored in Amazon S3. I am using the s3a://
URI scheme, as you can see below. The minimal full example that fails for me is:
>>> indel_files = ['s3a://some/path/fileI1.vcf.gz', 's3a://some/path/fileI2.vcf.gz']
>>> mt1 = hl.import_vcf(indel_files, array_elements_required=False, force_bgz=True)
Hail version: 0.2.19-c6ec8b76eb26
Error summary: ConnectionClosedException: Premature end of Content-Length delimited message body (expected: 419466954; received: 98304)
I have included the complete stack trace from the Hail log file at the end of this post. What should I do?
FatalError: ConnectionClosedException: Premature end of Content-Length delimited message body (expected: 1728230; received: 131072)
Java stack trace:
java.lang.reflect.InvocationTargetException: null
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.json4s.reflect.Executable.invoke(Executable.scala:52)
at org.json4s.Extraction$ClassInstanceBuilder.instantiate(Extraction.scala:554)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:597)
at org.json4s.Extraction$$anonfun$extract$6.apply(Extraction.scala:400)
at org.json4s.Extraction$$anonfun$extract$6.apply(Extraction.scala:392)
at org.json4s.Extraction$.customOrElse(Extraction.scala:606)
at org.json4s.Extraction$.extract(Extraction.scala:392)
at org.json4s.Extraction$ClassInstanceBuilder.mkWithTypeHint(Extraction.scala:587)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:596)
at org.json4s.Extraction$$anonfun$extract$6.apply(Extraction.scala:400)
at org.json4s.Extraction$$anonfun$extract$6.apply(Extraction.scala:392)
at org.json4s.Extraction$.customOrElse(Extraction.scala:606)
at org.json4s.Extraction$.extract(Extraction.scala:392)
at org.json4s.Extraction$.extract(Extraction.scala:39)
at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:21)
at org.json4s.jackson.Serialization$.read(Serialization.scala:50)
at org.json4s.Serialization$class.read(Serialization.scala:30)
at org.json4s.jackson.Serialization$.read(Serialization.scala:17)
at is.hail.expr.ir.IRParser$.deserialize(Parser.scala:146)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1260)
at is.hail.expr.ir.IRParser$.matrix_ir(Parser.scala:1197)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1437)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1437)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:1421)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1437)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1436)
at is.hail.expr.ir.IRParser.parse_matrix_ir(Parser.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, 192.168.226.22, executor 6): org.apache.http.ConnectionClosedException: Premature end of Content-Length delimited message body (expected: 1728230; received: 131072)
at org.apache.http.impl.io.ContentLengthInputStream.read(ContentLengthInputStream.java:178)
at org.apache.http.impl.io.ContentLengthInputStream.read(ContentLengthInputStream.java:198)
at org.apache.http.impl.io.ContentLengthInputStream.close(ContentLengthInputStream.java:101)
at org.apache.http.conn.BasicManagedEntity.streamClosed(BasicManagedEntity.java:166)
at org.apache.http.conn.EofSensorInputStream.checkClose(EofSensorInputStream.java:228)
at org.apache.http.conn.EofSensorInputStream.close(EofSensorInputStream.java:172)
at java.base/java.io.FilterInputStream.close(FilterInputStream.java:180)
at java.base/java.io.FilterInputStream.close(FilterInputStream.java:180)
at java.base/java.io.FilterInputStream.close(FilterInputStream.java:180)
at java.base/java.io.FilterInputStream.close(FilterInputStream.java:180)
at com.amazonaws.services.s3.model.S3ObjectInputStream.abort(S3ObjectInputStream.java:90)
at org.apache.hadoop.fs.s3a.S3AInputStream.close(S3AInputStream.java:199)
at java.base/java.io.FilterInputStream.close(FilterInputStream.java:180)
at org.apache.hadoop.io.compress.CompressionInputStream.close(CompressionInputStream.java:63)
at is.hail.utils.package$.using(package.scala:598)
at is.hail.io.fs.HadoopFS.readFile(HadoopFS.scala:401)
at is.hail.io.fs.HadoopFS.readLines(HadoopFS.scala:413)
at is.hail.io.vcf.LoadVCF$.getHeaderLines(LoadVCF.scala:1237)
at is.hail.io.vcf.MatrixVCFReader$$anonfun$13.apply(LoadVCF.scala:1489)
at is.hail.io.vcf.MatrixVCFReader$$anonfun$13.apply(LoadVCF.scala:1486)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$27.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$27.apply(RDD.scala:927)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:834)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:925)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:925)
at is.hail.io.vcf.MatrixVCFReader.<init>(LoadVCF.scala:1486)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.json4s.reflect.Executable.invoke(Executable.scala:52)
at org.json4s.Extraction$ClassInstanceBuilder.instantiate(Extraction.scala:554)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:597)
at org.json4s.Extraction$$anonfun$extract$6.apply(Extraction.scala:400)
at org.json4s.Extraction$$anonfun$extract$6.apply(Extraction.scala:392)
at org.json4s.Extraction$.customOrElse(Extraction.scala:606)
at org.json4s.Extraction$.extract(Extraction.scala:392)
at org.json4s.Extraction$ClassInstanceBuilder.mkWithTypeHint(Extraction.scala:587)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:596)
at org.json4s.Extraction$$anonfun$extract$6.apply(Extraction.scala:400)
at org.json4s.Extraction$$anonfun$extract$6.apply(Extraction.scala:392)
at org.json4s.Extraction$.customOrElse(Extraction.scala:606)
at org.json4s.Extraction$.extract(Extraction.scala:392)
at org.json4s.Extraction$.extract(Extraction.scala:39)
at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:21)
at org.json4s.jackson.Serialization$.read(Serialization.scala:50)
at org.json4s.Serialization$class.read(Serialization.scala:30)
at org.json4s.jackson.Serialization$.read(Serialization.scala:17)
at is.hail.expr.ir.IRParser$.deserialize(Parser.scala:146)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1260)
at is.hail.expr.ir.IRParser$.matrix_ir(Parser.scala:1197)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1437)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1437)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:1421)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1437)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1436)
at is.hail.expr.ir.IRParser.parse_matrix_ir(Parser.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
org.apache.http.ConnectionClosedException: Premature end of Content-Length delimited message body (expected: 1728230; received: 131072)
at org.apache.http.impl.io.ContentLengthInputStream.read(ContentLengthInputStream.java:178)
at org.apache.http.impl.io.ContentLengthInputStream.read(ContentLengthInputStream.java:198)
at org.apache.http.impl.io.ContentLengthInputStream.close(ContentLengthInputStream.java:101)
at org.apache.http.conn.BasicManagedEntity.streamClosed(BasicManagedEntity.java:166)
at org.apache.http.conn.EofSensorInputStream.checkClose(EofSensorInputStream.java:228)
at org.apache.http.conn.EofSensorInputStream.close(EofSensorInputStream.java:172)
at java.io.FilterInputStream.close(FilterInputStream.java:180)
at java.io.FilterInputStream.close(FilterInputStream.java:180)
at java.io.FilterInputStream.close(FilterInputStream.java:180)
at java.io.FilterInputStream.close(FilterInputStream.java:180)
at com.amazonaws.services.s3.model.S3ObjectInputStream.abort(S3ObjectInputStream.java:90)
at org.apache.hadoop.fs.s3a.S3AInputStream.close(S3AInputStream.java:199)
at java.io.FilterInputStream.close(FilterInputStream.java:180)
at org.apache.hadoop.io.compress.CompressionInputStream.close(CompressionInputStream.java:63)
at is.hail.utils.package$.using(package.scala:598)
at is.hail.io.fs.HadoopFS.readFile(HadoopFS.scala:401)
at is.hail.io.fs.HadoopFS.readLines(HadoopFS.scala:413)
at is.hail.io.vcf.LoadVCF$.getHeaderLines(LoadVCF.scala:1237)
at is.hail.io.vcf.MatrixVCFReader$$anonfun$13.apply(LoadVCF.scala:1489)
at is.hail.io.vcf.MatrixVCFReader$$anonfun$13.apply(LoadVCF.scala:1486)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$27.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$27.apply(RDD.scala:927)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.lang.Thread.run(Thread.java:834)