It is an AWS EMR 5.10.0 cluster, with Spark 2.2.0 on top of that. [Hail 0.2.6]
Where is the VCF file? Can you put that file in s3
and try with an s3 path?
This error is happening because the file isn’t in the default directory of the default file scheme.
Tim,
Thanks for that. But, I am getting a new error.
[root@ip-172-31-91-41 hail-elasticsearch-pipelines]# ipython
Python 3.6.2 (default, Feb 19 2018, 21:55:54)
Type 'copyright', 'credits' or 'license' for more information
IPython 7.2.0 -- An enhanced Interactive Python. Type '?' for help.
In [1]: import hail as l
In [2]: l.init()
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 2.2.0
SparkUI available at http://ip-172-31-91-41.ec2.internal:4040
Welcome to
__ __ <>__
/ /_/ /__ __/ /
/ __ / _ `/ / /
/_/ /_/\_,_/_/_/ version 0.2.6-6c5e6a3d5047
LOGGING: writing to /home/hadoop/hail-elasticsearch-pipelines/hail-20181221-1752-0.2.6-6c5e6a3d5047.log
In [3]: a="s3://vds-test-sample/build-hail/SEQ187500194.vcf.bgz"
In [4]: l.import_vcf(a)
---------------------------------------------------------------------------
FatalError Traceback (most recent call last)
<ipython-input-4-ad3ce697bf42> in <module>
----> 1 l.import_vcf(a)
<decorator-gen-1093> in import_vcf(path, force, force_bgz, header_file, min_partitions, drop_samples, call_fields, reference_genome, contig_recoding, array_elements_required, skip_invalid_loci, _partitions)
/home/hadoop/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
558 def wrapper(__original_func, *args, **kwargs):
559 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 560 return __original_func(*args_, **kwargs_)
561
562 return wrapper
/home/hadoop/hail/hail/build/distributions/hail-python.zip/hail/methods/impex.py in import_vcf(path, force, force_bgz, header_file, min_partitions, drop_samples, call_fields, reference_genome, contig_recoding, array_elements_required, skip_invalid_loci, _partitions)
1886 reference_genome, contig_recoding, array_elements_required,
1887 skip_invalid_loci, force_bgz, force, _partitions)
-> 1888 return MatrixTable(MatrixRead(reader, drop_cols=drop_samples))
1889
1890 @typecheck(path=sequenceof(str),
/home/hadoop/hail/hail/build/distributions/hail-python.zip/hail/matrixtable.py in __init__(self, mir)
551 self._mir = mir
552 self._jmt = Env.hail().variant.MatrixTable(
--> 553 Env.hc()._jhc, Env.hc()._backend._to_java_ir(self._mir))
554
555 self._globals = None
/home/hadoop/hail/hail/build/distributions/hail-python.zip/hail/backend/backend.py in _to_java_ir(self, ir)
28 code = r(ir)
29 # FIXME parse should be static
---> 30 ir._jir = ir.parse(code, ir_map=r.jirs)
31 return ir._jir
32
/home/hadoop/hail/hail/build/distributions/hail-python.zip/hail/ir/base_ir.py in parse(self, code, ref_map, ir_map)
94
95 def parse(self, code, ref_map={}, ir_map={}):
---> 96 return Env.hail().expr.ir.IRParser.parse_matrix_ir(code, ref_map, ir_map)
/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/home/hadoop/hail/hail/build/distributions/hail-python.zip/hail/utils/java.py in deco(*args, **kwargs)
212 raise FatalError('%s\n\nJava stack trace:\n%s\n'
213 'Hail version: %s\n'
--> 214 'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
215 except pyspark.sql.utils.CapturedException as e:
216 raise FatalError('%s\n\nJava stack trace:\n%s\n'
FatalError: ClassNotFoundException: Class com.amazon.ws.emr.hadoop.fs.EmrFileSystem not found
Java stack trace:
org.json4s.package$MappingException: unknown error
at org.json4s.Extraction$.extract(Extraction.scala:46)
at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:21)
at org.json4s.jackson.Serialization$.read(Serialization.scala:50)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:967)
at is.hail.expr.ir.IRParser$.matrix_ir(Parser.scala:908)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1046)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1046)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:1030)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1046)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1045)
at is.hail.expr.ir.IRParser.parse_matrix_ir(Parser.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
java.lang.reflect.InvocationTargetException: null
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.json4s.Extraction$ClassInstanceBuilder.org$json4s$Extraction$ClassInstanceBuilder$$instantiate(Extraction.scala:490)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:515)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:512)
at org.json4s.Extraction$.org$json4s$Extraction$$customOrElse(Extraction.scala:524)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:512)
at org.json4s.Extraction$.extract(Extraction.scala:351)
at org.json4s.Extraction$ClassInstanceBuilder.org$json4s$Extraction$ClassInstanceBuilder$$mkWithTypeHint(Extraction.scala:507)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:514)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:512)
at org.json4s.Extraction$.org$json4s$Extraction$$customOrElse(Extraction.scala:524)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:512)
at org.json4s.Extraction$.extract(Extraction.scala:351)
at org.json4s.Extraction$.extract(Extraction.scala:42)
at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:21)
at org.json4s.jackson.Serialization$.read(Serialization.scala:50)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:967)
at is.hail.expr.ir.IRParser$.matrix_ir(Parser.scala:908)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1046)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1046)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:1030)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1046)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1045)
at is.hail.expr.ir.IRParser.parse_matrix_ir(Parser.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.amazon.ws.emr.hadoop.fs.EmrFileSystem not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2702)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2715)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:93)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2751)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2733)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:377)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at is.hail.utils.richUtils.RichHadoopConfiguration$.fileSystem$extension(RichHadoopConfiguration.scala:19)
at is.hail.utils.richUtils.RichHadoopConfiguration$.glob$extension(RichHadoopConfiguration.scala:126)
at is.hail.utils.richUtils.RichHadoopConfiguration$$anonfun$globAll$extension$1.apply(RichHadoopConfiguration.scala:108)
at is.hail.utils.richUtils.RichHadoopConfiguration$$anonfun$globAll$extension$1.apply(RichHadoopConfiguration.scala:107)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at scala.collection.AbstractIterator.to(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
at is.hail.utils.richUtils.RichHadoopConfiguration$.globAll$extension(RichHadoopConfiguration.scala:113)
at is.hail.io.vcf.MatrixVCFReader.<init>(LoadVCF.scala:981)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.json4s.Extraction$ClassInstanceBuilder.org$json4s$Extraction$ClassInstanceBuilder$$instantiate(Extraction.scala:490)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:515)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:512)
at org.json4s.Extraction$.org$json4s$Extraction$$customOrElse(Extraction.scala:524)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:512)
at org.json4s.Extraction$.extract(Extraction.scala:351)
at org.json4s.Extraction$ClassInstanceBuilder.org$json4s$Extraction$ClassInstanceBuilder$$mkWithTypeHint(Extraction.scala:507)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:514)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:512)
at org.json4s.Extraction$.org$json4s$Extraction$$customOrElse(Extraction.scala:524)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:512)
at org.json4s.Extraction$.extract(Extraction.scala:351)
at org.json4s.Extraction$.extract(Extraction.scala:42)
at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:21)
at org.json4s.jackson.Serialization$.read(Serialization.scala:50)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:967)
at is.hail.expr.ir.IRParser$.matrix_ir(Parser.scala:908)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1046)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1046)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:1030)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1046)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1045)
at is.hail.expr.ir.IRParser.parse_matrix_ir(Parser.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
java.lang.ClassNotFoundException: Class com.amazon.ws.emr.hadoop.fs.EmrFileSystem not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2702)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2715)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:93)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2751)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2733)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:377)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at is.hail.utils.richUtils.RichHadoopConfiguration$.fileSystem$extension(RichHadoopConfiguration.scala:19)
at is.hail.utils.richUtils.RichHadoopConfiguration$.glob$extension(RichHadoopConfiguration.scala:126)
at is.hail.utils.richUtils.RichHadoopConfiguration$$anonfun$globAll$extension$1.apply(RichHadoopConfiguration.scala:108)
at is.hail.utils.richUtils.RichHadoopConfiguration$$anonfun$globAll$extension$1.apply(RichHadoopConfiguration.scala:107)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at scala.collection.AbstractIterator.to(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
at is.hail.utils.richUtils.RichHadoopConfiguration$.globAll$extension(RichHadoopConfiguration.scala:113)
at is.hail.io.vcf.MatrixVCFReader.<init>(LoadVCF.scala:981)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.json4s.Extraction$ClassInstanceBuilder.org$json4s$Extraction$ClassInstanceBuilder$$instantiate(Extraction.scala:490)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:515)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:512)
at org.json4s.Extraction$.org$json4s$Extraction$$customOrElse(Extraction.scala:524)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:512)
at org.json4s.Extraction$.extract(Extraction.scala:351)
at org.json4s.Extraction$ClassInstanceBuilder.org$json4s$Extraction$ClassInstanceBuilder$$mkWithTypeHint(Extraction.scala:507)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:514)
at org.json4s.Extraction$ClassInstanceBuilder$$anonfun$result$6.apply(Extraction.scala:512)
at org.json4s.Extraction$.org$json4s$Extraction$$customOrElse(Extraction.scala:524)
at org.json4s.Extraction$ClassInstanceBuilder.result(Extraction.scala:512)
at org.json4s.Extraction$.extract(Extraction.scala:351)
at org.json4s.Extraction$.extract(Extraction.scala:42)
at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:21)
at org.json4s.jackson.Serialization$.read(Serialization.scala:50)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:967)
at is.hail.expr.ir.IRParser$.matrix_ir(Parser.scala:908)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1046)
at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$2.apply(Parser.scala:1046)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:1030)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1046)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1045)
at is.hail.expr.ir.IRParser.parse_matrix_ir(Parser.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Hail version: 0.2.6-6c5e6a3d5047
Error summary: ClassNotFoundException: Class com.amazon.ws.emr.hadoop.fs.EmrFileSystem not found
[root@ip-172-31-91-41 hail-elasticsearch-pipelines]# hadoop version
Hadoop 2.7.3-amzn-5
Subversion git@aws157git.com:/pkg/Aws157BigTop -r d73c901b4228f4e75d3a527ec2318ce7376036cb
Compiled by ec2-user on 2017-11-16T08:50Z
Compiled with protoc 2.5.0
From source with checksum 59c81a5c59fcd6429190a689ead8f6
This command was run using /usr/lib/hadoop/hadoop-common-2.7.3-amzn-5.jar
[root@ip-172-31-91-41 hail-elasticsearch-pipelines]#
How did you set up Hail to run on the EMR cluster? This repository has some good info:
Hi Nara,
This repo will save you time, as it sets up the EMR cluster with the latest Hail version and all the proper configurations, and it uses spot instances (cheaper clusters): https://github.com/hms-dbmi/hail-on-AWS-spot-instances. I don’t know how familiar you are with AWS, as you have to add some security features, but if you have admin
permissions in your account you should not have issues.
Due to the file system management in your EMR, it is more convenient to read your files from s3:
s3_path='s3://your-path-here/'
mt=hl.import_vcf(s3_path+'SEQ187500194.vcf.gz')
I hope this helps,
Carlos
Thanks for the link, Tim. I found that I was missing the EMRFS jar in my environment variables.
export SPARK_HOME=/usr/lib/spark
export HAIL_HOME=/opt/hail/hail
export SPARK_CLASSPATH=/opt/hail/hail/build/libs/hail-all-spark.jar
export PYTHONPATH=/opt/hail/hail/build/distributions/hail-python.zip:/usr/lib/spark/python:/usr/lib/spark/python/lib/py4j-0.10.4-src.zip:/opt/hail-elasticsearch-pipelines/hail_scripts.zip
export PYSPARK_SUBMIT_ARGS="--conf spark.driver.extraClassPath=/home/hadoop/hail-all-spark.jar:/usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.20.0.jar --conf spark.executor.extraClassPath=/home/hadoop/hail-all-spark.jar:/usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.20.0.jar pyspark-shell"
export PYSPARK_PYTHON=python3
Now I am getting a different error. I want to transfer the VCF data to Elasticsearch, and I am running into this:
>>> t = l.import_vcf('s3n://path/xyz.vcf.gz', reference_genome='GRCh38', force_bgz=True, min_partitions=10000, drop_samples=False).make_table('s3n://path/raw.t')
[Stage 19:=======================================================>(90 + 1) / 91]2019-01-07 21:18:02 Hail: INFO: Ordering unsorted dataset with network shuffle
>>> t=l.methods.read_table('s3n://path/raw.t/')
Traceback (most recent call last):
File “<stdin>”, line 1, in <module>
File “<decorator-gen-986>”, line 2, in read_table
File “/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py”, line 560, in wrapper
File “/opt/hail/hail/build/distributions/hail-python.zip/hail/methods/impex.py”, line 2036, in read_table
File “/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py”, line 1133, in call
File “/opt/hail/hail/build/distributions/hail-python.zip/hail/utils/java.py”, line 224, in deco
hail.utils.java.FatalError: HailException: file is a MatrixTable, not a Table: ‘s3n://path/raw.t/’
Java stack trace:
is.hail.utils.HailException: file is a MatrixTable, not a Table: ‘s3n://path/raw.t/’
at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
at is.hail.utils.package$.fatal(package.scala:26)
at is.hail.expr.ir.TableIR$.read(TableIR.scala:28)
at is.hail.table.Table$.read(Table.scala:62)
at is.hail.HailContext.readTable(HailContext.scala:532)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Hail version: 0.2.7-b3cde8ce9779
Error summary: HailException: file is a MatrixTable, not a Table: ‘s3n://path/raw.t/’
>>>
Hi Nara,
Please see the docs for make_table: https://hail.is/docs/0.2/hail.MatrixTable.html#hail.MatrixTable.make_table
The argument it expects is not a file path, but a field name separator. I think probably you wrote a matrix table to that path earlier.
Hi Tim,
Thanks! That worked fine. But now I am getting an error on the Elasticsearch side. Do let me know if anyone has an idea about it.
>>> mt=l.export_elasticsearch(ht,host='https://xxxxx.us-east-1.es.amazonaws.com',port=80,index='singlevcf',index_type='variant',block_size=1000,config=None,verbose=True)
Config Map(es.nodes -> https://xxxxxx.us-east-1.es.amazonaws.com, es.port -> 80, es.batch.size.entries -> 1000, es.index.auto.create -> true)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<decorator-gen-996>", line 2, in export_elasticsearch
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 560, in wrapper
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/methods/impex.py", line 2052, in export_elasticsearch
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/utils/java.py", line 224, in deco
hail.utils.java.FatalError: SSLException: Unrecognized SSL message, plaintext connection?
Java stack trace:
org.elasticsearch.hadoop.EsHadoopIllegalArgumentException: Cannot detect ES version - typically this happens if the network/Elasticsearch cluster is not accessible or when targeting a WAN/Cloud instance without the proper setting 'es.nodes.wan.only'
at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:327)
at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
org.elasticsearch.hadoop.rest.EsHadoopTransportException: javax.net.ssl.SSLException: Unrecognized SSL message, plaintext connection?
at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:124)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:380)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:344)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:348)
at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:158)
at org.elasticsearch.hadoop.rest.RestClient.remoteEsVersion(RestClient.java:574)
at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:320)
at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
javax.net.ssl.SSLException: Unrecognized SSL message, plaintext connection?
at sun.security.ssl.InputRecord.handleUnknownRecord(InputRecord.java:710)
at sun.security.ssl.InputRecord.read(InputRecord.java:527)
at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:983)
at sun.security.ssl.SSLSocketImpl.performInitialHandshake(SSLSocketImpl.java:1385)
at sun.security.ssl.SSLSocketImpl.writeRecord(SSLSocketImpl.java:757)
at sun.security.ssl.AppOutputStream.write(AppOutputStream.java:123)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140)
at org.apache.commons.httpclient.HttpConnection.flushRequestOutputStream(HttpConnection.java:828)
at org.apache.commons.httpclient.HttpMethodBase.writeRequest(HttpMethodBase.java:2116)
at org.apache.commons.httpclient.HttpMethodBase.execute(HttpMethodBase.java:1096)
at org.apache.commons.httpclient.HttpMethodDirector.executeWithRetry(HttpMethodDirector.java:398)
at org.apache.commons.httpclient.HttpMethodDirector.executeMethod(HttpMethodDirector.java:171)
at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:397)
at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:323)
at org.elasticsearch.hadoop.rest.commonshttp.CommonsHttpTransport.execute(CommonsHttpTransport.java:478)
at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:112)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:380)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:344)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:348)
at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:158)
at org.elasticsearch.hadoop.rest.RestClient.remoteEsVersion(RestClient.java:574)
at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:320)
at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Hail version: 0.2.7-2ec8f4d9d137
Error summary: SSLException: Unrecognized SSL message, plaintext connection?
>>>
Sorry, I don’t have any experience configuring Elasticsearch.
Thanks for the help so far, Tim. It helped me make a lot of progress.