We are using Hail to speed up VCF merging in our workflows. I'm seeing a ClassTooLargeException when merging 692 VCFs, each with ~130K columns:
is.hail.relocated.org.objectweb.asm.ClassTooLargeException: Class too large: __C2Compiled
at is.hail.relocated.org.objectweb.asm.ClassWriter.toByteArray(ClassWriter.java:599)
at is.hail.lir.Emit$.apply(Emit.scala:217)
at is.hail.lir.Classx.$anonfun$asBytes$4(X.scala:110)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
at scala.collection.AbstractIterator.to(Iterator.scala:1431)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
at is.hail.lir.Classx.asBytes(X.scala:123)
at is.hail.asm4s.ClassBuilder.classBytes(ClassBuilder.scala:347)
at is.hail.asm4s.ModuleBuilder.$anonfun$classesBytes$1(ClassBuilder.scala:148)
at is.hail.asm4s.ModuleBuilder.$anonfun$classesBytes$1$adapted(ClassBuilder.scala:148)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
at scala.collection.AbstractIterator.to(Iterator.scala:1431)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
at is.hail.asm4s.ModuleBuilder.classesBytes(ClassBuilder.scala:149)
at is.hail.expr.ir.EmitClassBuilder.resultWithIndex(EmitClassBuilder.scala:631)
at is.hail.expr.ir.WrappedEmitClassBuilder.resultWithIndex(EmitClassBuilder.scala:148)
at is.hail.expr.ir.WrappedEmitClassBuilder.resultWithIndex$(EmitClassBuilder.scala:148)
at is.hail.expr.ir.EmitFunctionBuilder.resultWithIndex(EmitClassBuilder.scala:1056)
at is.hail.expr.ir.Compile$.apply(Compile.scala:78)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$1(CompileAndEvaluate.scala:35)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:35)
at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:381)
at is.hail.backend.spark.SparkBackend.$anonfun$execute$1(SparkBackend.scala:365)
at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:47)
at is.hail.utils.package$.using(package.scala:627)
at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:47)
at is.hail.utils.package$.using(package.scala:627)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:46)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:362)
at is.hail.backend.spark.SparkBackend.$anonfun$executeJSON$1(SparkBackend.scala:406)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeJSON(SparkBackend.scala:404)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Here is the command we used to install Hail:
RUN mkdir -p /usr/share/man/man1 && \
    apt-get update && apt-get install -y \
        openjdk-8-jre-headless \
    && rm -rf /var/lib/apt/lists/* && \
    pip3 --no-cache-dir install hail==0.2.71 && \
    pip3 --no-cache-dir install google-cloud-dataproc
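For reference, the pinned version can be confirmed inside the image with a one-liner (trivial, but it rules out a mismatched install):

# Sanity check: confirm the Hail version the image actually has.
import hail as hl
print(hl.version())  # expect something like 0.2.71-...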
And the Dataproc launch command (Python):
print(os.popen("hailctl dataproc start --region {} --project {} --num-master-local-ssds 1 --num-worker-local-ssds 1 --max-idle=60m --max-age=120m {}".format("~{region}", "~{project}", cluster_name)).read())
cluster_client = dataproc.ClusterControllerClient(
client_options={"api_endpoint": f"~{region}-dataproc.googleapis.com:443"}
)
for cluster in cluster_client.list_clusters(request={"project_id": "~{project}", "region": "~{region}"}):
if cluster.cluster_name == cluster_name:
cluster_staging_bucket = cluster.config.temp_bucket
os.popen("gcloud dataproc jobs submit pyspark {} --cluster={} --project {} --files=files.list --region={} --driver-log-levels root=WARN -- {} {}".format("~{hail_script}", cluster_name, "~{project}", "~{region}", cluster_staging_bucket, cluster_name)).read()
os.popen("gsutil cp -r gs://{}/{}/merged.vcf.bgz .".format(cluster_staging_bucket, cluster_name)).read()
break
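For context, the files variable in the script below is populated from the files.list shipped via --files; roughly (paraphrasing, the exact parsing isn't important):

with open("files.list") as fh:
    # one gs:// VCF path per line
    files = [line.strip() for line in fh if line.strip()]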
And the Hail script:
import hail as hl

ref = hl.ReferenceGenome(name="hg38", contigs=contigs, lengths=lengths, x_contigs="chrX", y_contigs="chrY")
all_datasets = [hl.import_vcf(f, reference_genome=ref, force_bgz=True) for f in files]
mt = hl.MatrixTable.union_rows(*all_datasets)
# Reset qual to missing because Hail by default populates it with -1.00e+01.
merged_reset_qual = mt.annotate_rows(qual=hl.missing('float64'))
hl.export_vcf(merged_reset_qual,
              "gs://{}/{}/merged.vcf.bgz".format(args.out_bucket, args.cluster_name),
              metadata=hl.get_vcf_metadata(files[0]))
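One thing we've been wondering, in case it's relevant: would unioning in smaller batches, tree-style, keep the generated class under the JVM size limit? A minimal sketch of what we mean (untested on our side; the batch size and checkpoint path are arbitrary guesses):

# Hypothetical workaround: union in batches, then union the partial results,
# instead of one flat 692-way union_rows.
batch_size = 50  # arbitrary; untested
partials = []
for i in range(0, len(all_datasets), batch_size):
    partial = hl.MatrixTable.union_rows(*all_datasets[i:i + batch_size])
    # possibly checkpoint each partial so each compile stays small (hypothetical path):
    # partial = partial.checkpoint("gs://{}/partials/part_{}.mt".format(args.out_bucket, i))
    partials.append(partial)
mt = hl.MatrixTable.union_rows(*partials)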
Any help would be appreciated!