Hail version:
0.1-74bf1eb
What you did:
Export vcf to local file:// path
What went wrong (all error messages here, including the full java stack trace):
When exporting a VCF to a path that begins with 'file://', I get the error: ClassNotFoundException: Class org.apache.hadoop.mapred.DirectFileOutputCommitter not found. I am using Spark 2.2.1 (prebuilt with Hadoop 2.7) with AWS Hadoop 2.7.4. I have the following settings in my Spark config and am using a custom DirectParquetOutputCommitter. Standard Spark dataframe writes to 'file://' paths work without issue; a minimal sketch of such a write follows the settings below.
Thanks for any help!
spark.sql.parquet.output.committer.class org.apache.spark.sql.parquet.DirectParquetOutputCommitter
spark.hadoop.mapred.output.committer.class org.apache.hadoop.mapred.DirectFileOutputCommitter
spark.hadoop.mapreduce.use.directfileoutputcommitter true
spark.hadoop.spark.sql.parquet.output.committer.class org.apache.spark.sql.parquet.DirectParquetOutputCommitter
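For reference, this is the kind of plain Spark dataframe write to a 'file://' path that succeeds in the same session (a minimal sketch, assuming an existing SparkSession named 'spark'; the data and output path are illustrative, not from the failing test):

# A plain Spark dataframe write to a local 'file://' path, which works
# fine under the settings above (illustrative data and output path).
df = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'val'])
df.write.mode('overwrite').csv('file:///scratch/spark_write_test')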
Code and stack trace:
================================================================================================== FAILURES ===================================================================================================
__________________________________________________________________________________________ TestHAIL.test_export_vcf ___________________________________________________________________________________________
self = <test_hail.TestHAIL testMethod=test_export_vcf>
    def test_export_vcf(self):
        # define files
        bgen_file = os.path.join(self.testdir, 'example.10bits.bgen')
        sample_file = os.path.join(self.testdir, 'example.sample')
        # make index
        self.hc.index_bgen(bgen_file)
        # load to vds
        bgen_vds = self.hc.import_bgen(bgen_file, sample_file=sample_file)
        # export vcf
        out_path = 'file://' + os.path.join(self.tmpdir, 'test_vcf_export.vcf.bgz')
>       bgen_vds.export_vcf(out_path, export_pp=False, parallel=False)
tests/hail/test_hail.py:55:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
<decorator-gen-398>:2: in export_vcf
???
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
func = <function export_vcf at 0x7fa13c4d9938>, args = (<hail.dataset.VariantDataset object at 0x7fa13c3c9390>, 'file:///scratch/test_vcf_export.vcf.bgz', None, False, False), kwargs = {}
e = Py4JJavaError(u'An error occurred while calling o160.exportVCF.\n', JavaObject id=o162), tpl = JavaObject id=o210
deepest = 'ClassNotFoundException: Class org.apache.hadoop.mapred.DirectFileOutputCommitter not found'
full = 'java.lang.RuntimeException: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.map...mmand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
'
    @decorator
    def handle_py4j(func, *args, **kwargs):
        try:
            r = func(*args, **kwargs)
        except py4j.protocol.Py4JJavaError as e:
            tpl = Env.jutils().handleForPython(e.java_exception)
            deepest, full = tpl._1(), tpl._2()
            raise FatalError('%s\n\nJava stack trace:\n%s\n'
                             'Hail version: %s\n'
>                            'Error summary: %s' % (deepest, full, Env.hc().version, deepest))
E FatalError: ClassNotFoundException: Class org.apache.hadoop.mapred.DirectFileOutputCommitter not found
E
E Java stack trace:
E java.lang.RuntimeException: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.mapred.DirectFileOutputCommitter not found
E at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2227)
E at org.apache.hadoop.mapred.JobConf.getOutputCommitter(JobConf.java:726)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1051)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1035)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1035)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1035)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply$mcV$sp(PairRDDFunctions.scala:1016)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1016)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1016)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1015)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply$mcV$sp(PairRDDFunctions.scala:973)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:971)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:971)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:971)
E at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply$mcV$sp(RDD.scala:1507)
E at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1495)
E at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1495)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1495)
E at is.hail.utils.richUtils.RichRDD$.writeTable$extension1(RichRDD.scala:77)
E at is.hail.utils.richUtils.RichRDD$.writeTable$extension0(RichRDD.scala:38)
E at is.hail.io.vcf.ExportVCF$.apply(ExportVCF.scala:453)
E at is.hail.variant.VariantDatasetFunctions$.exportVCF$extension(VariantDataset.scala:425)
E at is.hail.variant.VariantDatasetFunctions.exportVCF(VariantDataset.scala:425)
E at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
E at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
E at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
E at java.lang.reflect.Method.invoke(Method.java:498)
E at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
E at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
E at py4j.Gateway.invoke(Gateway.java:280)
E at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
E at py4j.commands.CallCommand.execute(CallCommand.java:79)
E at py4j.GatewayConnection.run(GatewayConnection.java:214)
E at java.lang.Thread.run(Thread.java:748)
E java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.mapred.DirectFileOutputCommitter not found
E at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
E at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2219)
E at org.apache.hadoop.mapred.JobConf.getOutputCommitter(JobConf.java:726)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1051)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1035)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1035)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1035)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply$mcV$sp(PairRDDFunctions.scala:1016)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1016)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1016)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1015)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply$mcV$sp(PairRDDFunctions.scala:973)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:971)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:971)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:971)
E at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply$mcV$sp(RDD.scala:1507)
E at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1495)
E at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1495)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1495)
E at is.hail.utils.richUtils.RichRDD$.writeTable$extension1(RichRDD.scala:77)
E at is.hail.utils.richUtils.RichRDD$.writeTable$extension0(RichRDD.scala:38)
E at is.hail.io.vcf.ExportVCF$.apply(ExportVCF.scala:453)
E at is.hail.variant.VariantDatasetFunctions$.exportVCF$extension(VariantDataset.scala:425)
E at is.hail.variant.VariantDatasetFunctions.exportVCF(VariantDataset.scala:425)
E at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
E at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
E at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
E at java.lang.reflect.Method.invoke(Method.java:498)
E at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
E at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
E at py4j.Gateway.invoke(Gateway.java:280)
E at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
E at py4j.commands.CallCommand.execute(CallCommand.java:79)
E at py4j.GatewayConnection.run(GatewayConnection.java:214)
E at java.lang.Thread.run(Thread.java:748)
E java.lang.ClassNotFoundException: Class org.apache.hadoop.mapred.DirectFileOutputCommitter not found
E at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
E at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
E at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2219)
E at org.apache.hadoop.mapred.JobConf.getOutputCommitter(JobConf.java:726)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1051)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1035)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1035)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1035)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply$mcV$sp(PairRDDFunctions.scala:1016)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1016)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1016)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1015)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply$mcV$sp(PairRDDFunctions.scala:973)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:971)
E at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:971)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:971)
E at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply$mcV$sp(RDD.scala:1507)
E at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1495)
E at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1495)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
E at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1495)
E at is.hail.utils.richUtils.RichRDD$.writeTable$extension1(RichRDD.scala:77)
E at is.hail.utils.richUtils.RichRDD$.writeTable$extension0(RichRDD.scala:38)
E at is.hail.io.vcf.ExportVCF$.apply(ExportVCF.scala:453)
E at is.hail.variant.VariantDatasetFunctions$.exportVCF$extension(VariantDataset.scala:425)
E at is.hail.variant.VariantDatasetFunctions.exportVCF(VariantDataset.scala:425)
E at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
E at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
E at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
E at java.lang.reflect.Method.invoke(Method.java:498)
E at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
E at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
E at py4j.Gateway.invoke(Gateway.java:280)
E at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
E at py4j.commands.CallCommand.execute(CallCommand.java:79)
E at py4j.GatewayConnection.run(GatewayConnection.java:214)
E at java.lang.Thread.run(Thread.java:748)
E
E
E
E Hail version: 0.1-74bf1eb
E Error summary: ClassNotFoundException: Class org.apache.hadoop.mapred.DirectFileOutputCommitter not found
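For completeness, the failing export reduces to this minimal repro, using the same Hail 0.1 calls as the test above (a sketch; the file names and output path are illustrative):

from hail import HailContext

# Same sequence as the failing test: index the bgen, import it, export as VCF.
hc = HailContext()
hc.index_bgen('example.10bits.bgen')
vds = hc.import_bgen('example.10bits.bgen', sample_file='example.sample')
# Fails with the ClassNotFoundException above whenever the output path
# starts with 'file://'.
vds.export_vcf('file:///scratch/test_vcf_export.vcf.bgz', export_pp=False, parallel=False)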