Unable to export data to Elasticsearch


#1

Hi,

I am running Spark 2.2.0 and Hail 0.2. I have converted the VCF to a matrix table and then to a table. I am now trying to export that data to Elasticsearch and am getting the error below.
(Note: both EMR and Elasticsearch are hosted on AWS.)
>>> mt=l.export_elasticsearch(ht,host='https://xxxxxxx.us-east-1.es.amazonaws.com',port=80,index='singlevcf',index_type='variant',block_size=10000,config=None,verbose=True)
Config Map(es.nodes -> https://xxxxxxx.us-east-1.es.amazonaws.com, es.port -> 80, es.batch.size.entries -> 10000, es.index.auto.create -> true)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<decorator-gen-...>", line 2, in export_elasticsearch
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 560, in wrapper
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/methods/impex.py", line 2052, in export_elasticsearch
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/utils/java.py", line 224, in deco
hail.utils.java.FatalError: SSLException: Unrecognized SSL message, plaintext connection?

Java stack trace:
org.elasticsearch.hadoop.EsHadoopIllegalArgumentException: Cannot detect ES version - typically this happens if the network/Elasticsearch cluster is not accessible or when targeting a WAN/Cloud instance without the proper setting 'es.nodes.wan.only'
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:327)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

org.elasticsearch.hadoop.rest.EsHadoopTransportException: javax.net.ssl.SSLException: Unrecognized SSL message, plaintext connection?
	at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:124)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:380)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:344)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:348)
	at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:158)
	at org.elasticsearch.hadoop.rest.RestClient.remoteEsVersion(RestClient.java:574)
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:320)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

javax.net.ssl.SSLException: Unrecognized SSL message, plaintext connection?
	at sun.security.ssl.InputRecord.handleUnknownRecord(InputRecord.java:710)
	at sun.security.ssl.InputRecord.read(InputRecord.java:527)
	at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:983)
	at sun.security.ssl.SSLSocketImpl.performInitialHandshake(SSLSocketImpl.java:1385)
	at sun.security.ssl.SSLSocketImpl.writeRecord(SSLSocketImpl.java:757)
	at sun.security.ssl.AppOutputStream.write(AppOutputStream.java:123)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140)
	at org.apache.commons.httpclient.HttpConnection.flushRequestOutputStream(HttpConnection.java:828)
	at org.apache.commons.httpclient.HttpMethodBase.writeRequest(HttpMethodBase.java:2116)
	at org.apache.commons.httpclient.HttpMethodBase.execute(HttpMethodBase.java:1096)
	at org.apache.commons.httpclient.HttpMethodDirector.executeWithRetry(HttpMethodDirector.java:398)
	at org.apache.commons.httpclient.HttpMethodDirector.executeMethod(HttpMethodDirector.java:171)
	at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:397)
	at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:323)
	at org.elasticsearch.hadoop.rest.commonshttp.CommonsHttpTransport.execute(CommonsHttpTransport.java:478)
	at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:112)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:380)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:344)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:348)
	at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:158)
	at org.elasticsearch.hadoop.rest.RestClient.remoteEsVersion(RestClient.java:574)
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:320)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)





Hail version: 0.2.7-e08cc2a17c4a
Error summary: SSLException: Unrecognized SSL message, plaintext connection?

I am able to curl the ES URL from the cluster, though.


#2

Are you certain that your Elasticsearch server is running TLS/SSL on port 80? Port 80 is conventionally reserved for insecure HTTP, not TLS/SSL HTTPS. Can you try with port=443?
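For what it's worth, here is a quick way to probe from Python which port actually speaks TLS (the hostname below is a placeholder for your Amazon ES endpoint):

import socket
import ssl

host = 'xxxxxxx.us-east-1.es.amazonaws.com'  # placeholder endpoint

for port in (80, 443):
    try:
        # Open a TCP connection, then attempt a TLS handshake on top of it.
        with socket.create_connection((host, port), timeout=5) as sock:
            ctx = ssl.create_default_context()
            with ctx.wrap_socket(sock, server_hostname=host):
                print('port %d: speaks TLS' % port)
    except ssl.SSLError as e:
        print('port %d: not TLS (%s)' % (port, e))
    except OSError as e:
        print('port %d: connection failed (%s)' % (port, e))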


#3

Tried that and got a bad-request error (I had tried it previously too). Thank you!

>>> t=l.export_elasticsearch(ht,host='xxxxxxxxxxxxx.us-east-1.es.amazonaws.com',port=443,index='singlevcf',index_type='variant',block_size=1000,config=None,verbose=True)
Config Map(es.nodes -> xxxxxxxxxxxxx.us-east-1.es.amazonaws.com, es.port -> 443, es.batch.size.entries -> 1000, es.index.auto.create -> true)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<decorator-gen-1000>", line 2, in export_elasticsearch
  File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 560, in wrapper
  File "/opt/hail/hail/build/distributions/hail-python.zip/hail/methods/impex.py", line 2052, in export_elasticsearch
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
  File "/opt/hail/hail/build/distributions/hail-python.zip/hail/utils/java.py", line 224, in deco
hail.utils.java.FatalError: EsHadoopInvalidRequest: [GET] on [] failed; server[18.209.185.175:443] returned [400|Bad Request:]

Java stack trace:
org.elasticsearch.hadoop.EsHadoopIllegalArgumentException: Cannot detect ES version - typically this happens if the network/Elasticsearch cluster is not accessible or when targeting a WAN/Cloud instance without the proper setting 'es.nodes.wan.only'
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:327)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

org.elasticsearch.hadoop.rest.EsHadoopInvalidRequest: [GET] on [] failed; server[18.209.185.175:443] returned [400|Bad Request:]
	at org.elasticsearch.hadoop.rest.RestClient.checkResponse(RestClient.java:424)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:382)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:344)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:348)
	at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:158)
	at org.elasticsearch.hadoop.rest.RestClient.remoteEsVersion(RestClient.java:574)
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:320)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)




Hail version: 0.2.7-97fb2a5dd4a1
Error summary: EsHadoopInvalidRequest: [GET] on [] failed; server[18.209.185.175:443] returned [400|Bad Request:]
>>>

#4

According to the Elasticsearch error, you might try setting es.nodes.wan.only. You can pass extra configuration arguments to hl.export_elasticsearch with the config named argument.

If you SSH to an EMR worker node and try to connect to Elasticsearch directly, without Hail, does it work?
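For example (an untested sketch; the host is a placeholder and ht is your table — es.nodes.wan.only is an elasticsearch-hadoop setting, not a Hail one):

hl.export_elasticsearch(ht,
                        host='https://xxxxxxx.us-east-1.es.amazonaws.com',  # placeholder
                        port=443,
                        index='singlevcf',
                        index_type='variant',
                        block_size=1000,
                        config={'es.nodes.wan.only': 'true'},
                        verbose=True)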


#5

Could you please post the command? When I try to change the config parameter, I get this error:

>>> mt=l.export_elasticsearch(ht,host=‘https://xxxxxxxxxxxxx.us-east-1.es.amazonaws.com’,port=443,index=‘singlevcf’,index_type=‘variant’,block_size=1000,config=dict(‘es.nodes.wan.only’,‘true’),verbose=True)

Traceback (most recent call last):

File “<stdin>”, line 1, in <module>

TypeError: dict expected at most 1 arguments, got 2


#6

dict(‘es.nodes.wan.only’,‘true’) is a Python error. This should be:

{'es.nodes.wan.only': 'true'}

#7

I am getting this:

mt=l.export_elasticsearch(ht,host=‘https://xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com’,port=443,index=‘singlevcf’,index_type=‘variant’,block_size=1000,config={‘es.nodes.wan.only’: ‘true’},verbose=True)
File "<stdin>", line 1
mt=l.export_elasticsearch(ht,host=‘xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com’,port=443,index=‘singlevcf’,index_type=‘variant’,block_size=1000,config={‘es.nodes.wan.only’: ‘true’},verbose=True)
^
SyntaxError: invalid character in identifier


#8

What character is it pointing to?


#9

It is pointing at the s.


#10

Can you try retyping the script? Possibly it's a weird Unicode s copied and pasted from another font.


#11

@nara, Discourse, our forum software, converts matching quotes into curly quotes “like these” if the text is not formatted as code. When you copy source code and error messages into Discourse, you should wrap them in a pair of triple backticks (```), like this:

>>> mt=l.export_elasticsearch(ht,host='https://xxxxxxxxxxxxx.us-east-1.es.amazonaws.com',port=443,index='singlevcf',index_type='variant',block_size=1000,config=dict('es.nodes.wan.only','true'),verbose=True)

Traceback (most recent call last):

File "<stdin>", line 1, in <module>

TypeError: dict expected at most 1 arguments, got 2

That avoids the automatic conversion to curly quotes (note that the quote before https is a straight up-and-down quote ', not a curly quote ’). You can learn more about this formatting syntax here.


Tim copied your code and modified it without fixing the quotes. I’ve edited his post to use the non-curly quotes. Try this instead:

mt=l.export_elasticsearch(ht,
                          host='https://xxxxxxxxxxxxx.us-east-1.es.amazonaws.com',
                          port=443,
                          index='singlevcf',
                          index_type='variant',
                          block_size=1000,
                          config={'es.nodes.wan.only': 'true'},
                          verbose=True)

The Python syntax for a dictionary (a key-value mapping) is {key1: value1, key2: value2}. The dict function is used to convert things that are not dictionaries into dictionaries; for example, you can convert a list of pairs to a dictionary like this: dict([(key1, value1), (key2, value2)]). Concretely:
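# A dict literal: ':' separates each key from its value.
config = {'es.nodes.wan.only': 'true'}

# Equivalent, using the dict constructor on a list of pairs:
config = dict([('es.nodes.wan.only', 'true')])

# Careful: {'es.nodes.wan.only', 'true'} (comma, no colon) is a set,
# not a dict, and export_elasticsearch's typecheck will reject it.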


#12

Thanks for that, @danking and @tpoterba. But I am running into this; I have tried both syntaxes:

mt=l.export_elasticsearch(ht,
... host='https://xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com',
... port=443,
... index='singlevcf',
... index_type='variant',
... block_size=1000,
... config={'es.nodes.wan.only','true'},
... verbose=True)
Traceback (most recent call last):
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 487, in check_all
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 59, in check
hail.typecheck.check.TypecheckFailure

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "<stdin>", line 8, in <module>
File "<decorator-gen-...>", line 2, in export_elasticsearch
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 559, in wrapper
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 513, in check_all
TypeError: export_elasticsearch: parameter 'config': expected (None or dict[str, str]), found set: {'true', 'es.nodes.wan.only'}

mt=l.export_elasticsearch(ht,
... host='https://xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com',
... port=443,
... index='singlevcf',
... index_type='variant',
... block_size=1000,
... config={'true', 'es.nodes.wan.only'},
... verbose=True)
Traceback (most recent call last):
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 487, in check_all
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 59, in check
hail.typecheck.check.TypecheckFailure

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "<stdin>", line 8, in <module>
File "<decorator-gen-...>", line 2, in export_elasticsearch
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 559, in wrapper
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 513, in check_all
TypeError: export_elasticsearch: parameter 'config': expected (None or dict[str, str]), found set: {'true', 'es.nodes.wan.only'}


#13

Hey, thanks for the support, guys. This finally worked:

>>> mt=l.export_elasticsearch(ht,host='https://xxxxxxxxxxxx.us-east-1.es.amazonaws.com',port=443,index='singlevcf',index_type='variant',block_size=1000,config={'es.nodes.wan.only':'true'},verbose=True)

Config Map(es.nodes.wan.only -> true, es.batch.size.entries -> 1000, es.index.auto.create -> true, es.port -> 443, es.nodes -> https://xxxxxxxxxxxxxxx.us-east-1.es.amazonaws.com)

[Stage 3:========================================================>(90 + 1) / 91]>>>


#14

Ah, my bad. I fixed the bug that you found in my post.


#15

Hey, when I try to convert the matrix table to a table, I am losing fields, which are then missing in Elasticsearch as well.

>>> mt = hl.import_vcf('xxxxxxxxxxxxxxx/SEQ187522109.vcf.gz', reference_genome='GRCh38',force_bgz=True, min_partitions=10000, drop_samples=False).write('hdfs:///user/hadoop/build-hail/raw.mt', overwrite=True)
[Stage 1:=======================================================> (88 + 3) / 91]2019-01-25 16:56:08 Hail: INFO: Ordering unsorted dataset with network shuffle
[Stage 3:========================================================>(90 + 1) / 91]2019-01-25 16:57:16 Hail: INFO: wrote matrix table with 4586617 rows and 1 column in 91 partitions to hdfs:///user/hadoop/build-hail/raw.mt

>>> mt = hl.read_matrix_table('hdfs:///user/hadoop/build-hail/raw.mt')
>>> mt.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's':     str 
----------------------------------------
Row fields:
    'locus':     locus<GRCh38> 
    'alleles':     array<str> 
    'rsid':     str 
    'qual':     float64 
    'filters':     set<str> 
    'info':     struct {
        END: int32, 
        BLOCKAVG_min30p3a: bool, 
        SNVSB: float64, 
        SNVHPOL: int32, 
        CIGAR: array<str>, 
        RU: array<str>, 
        REFREP: array<int32>, 
        IDREP: array<int32>
    } 
----------------------------------------
Entry fields:
    'GQX':     int32 
    'GT':     call 
    'GQ':     int32 
    'DP':     int32 
    'DPF':     int32 
    'AD':     array<int32> 
    'DPI':     int32 
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
>>> t = mt.make_table(separator='.') # doctest: +SKIP
>>> t.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus':     locus<GRCh38> 
    'alleles':     array<str> 
    '187522109.GQX':     int32 
    '187522109.GT':     call 
    '187522109.GQ':     int32 
    '187522109.DP':     int32 
    '187522109.DPF':     int32 
    '187522109.AD':     array<int32> 
    '187522109.DPI':     int32 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------
>>>

#16

This is a bug:


#17

Thanks for that; I will check and let you know, @tpoterba.


#18

Hi guys,

While trying to import a VCF, Hail would not allow me to import hs37d5, which is a part of GRCh37:
https://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/reference_genomes.html
This is the error when I try to import:
mt = l.import_vcf('s3a://xxxxxxxxxxx/xxxxxxx001242_p95a28_xxxx.vcf',
... force_bgz=True,
... min_partitions=10000,
... drop_samples=False,
... skip_invalid_loci=False).write('s3a://xxxxxxxxxx/raw.mt', overwrite=True)
[Stage 4:==================================================> (1004 + 2) / 1059]Traceback (most recent call last):
File "<stdin>", line 5, in <module>
File "</root/anaconda3/envs/hail/lib/python3.6/site-packages/decorator.py:decorator-gen-824>", line 2, in write
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/typecheck/check.py", line 560, in wrapper
return __original_func(*args_, **kwargs_)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/matrixtable.py", line 2193, in write
Env.backend().execute(MatrixWrite(self._mir, writer))
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/backend/backend.py", line 44, in execute
self._to_java_ir(ir)))
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/utils/java.py", line 227, in deco
'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
hail.utils.java.FatalError: HailException: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.

    Java stack trace:
    org.apache.spark.SparkException: Job aborted due to stage failure: Task 1005 in stage 4.0 failed 1 times, most recent failure: Lost task 1005.0 in stage 4.0 (TID 2175, localhost, executor driver): is.hail.utils.HailException: xxxxxxx001242_p95a28_xxxx.vcf: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
      offending line: hs37d5	256	.	T	C	114.90	.	AC=2;AF=1.00;AN=2;DP=5;ExcessHet=3...
    	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
    	at is.hail.utils.package$.fatal(package.scala:26)
    	at is.hail.utils.Context.wrapException(Context.scala:19)
    	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:64)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:37)
    	at is.hail.utils.package$.using(package.scala:587)
    	at is.hail.rvd.RVDPartitionInfo$.apply(RVDPartitionInfo.scala:37)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1040)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    	at org.apache.spark.scheduler.Task.run(Task.scala:109)
    	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
    	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    	at java.lang.Thread.run(Thread.java:748)
    Caused by: is.hail.utils.HailException: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
    	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
    	at is.hail.utils.package$.fatal(package.scala:26)
    	at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
    	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
    	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
    	at scala.Option.foreach(Option.scala:257)
    	at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
    	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
    	... 34 more

    Driver stacktrace:
    	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1575)
    	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1563)
    	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1562)
    	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1562)
    	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
    	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
    	at scala.Option.foreach(Option.scala:257)
    	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:803)
    	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1790)
    	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1745)
    	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1734)
    	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:619)
    	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
    	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
    	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
    	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:944)
    	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    	at org.apache.spark.rdd.RDD.collect(RDD.scala:943)
    	at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:153)
    	at is.hail.rvd.RVD$.getKeyInfo(RVD.scala:1044)
    	at is.hail.rvd.RVD$.makeCoercer(RVD.scala:1108)
    	at is.hail.io.vcf.MatrixVCFReader.coercer$lzycompute(LoadVCF.scala:1097)
    	at is.hail.io.vcf.MatrixVCFReader.coercer(LoadVCF.scala:1097)
    	at is.hail.io.vcf.MatrixVCFReader.apply(LoadVCF.scala:1126)
    	at is.hail.expr.ir.MatrixRead.execute(MatrixIR.scala:359)
    	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:754)
    	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:93)
    	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:63)
    	at is.hail.expr.ir.Interpret$.interpretJSON(Interpret.scala:22)
    	at is.hail.expr.ir.Interpret.interpretJSON(Interpret.scala)
    	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    	at java.lang.reflect.Method.invoke(Method.java:498)
    	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    	at py4j.Gateway.invoke(Gateway.java:282)
    	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    	at py4j.commands.CallCommand.execute(CallCommand.java:79)
    	at py4j.GatewayConnection.run(GatewayConnection.java:238)
    	at java.lang.Thread.run(Thread.java:748)

    is.hail.utils.HailException: MCW2018-001242_p95a28_SMT4.vcf: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
      offending line: hs37d5	256	.	T	C	114.90	.	AC=2;AF=1.00;AN=2;DP=5;ExcessHet=3...
    	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
    	at is.hail.utils.package$.fatal(package.scala:26)
    	at is.hail.utils.Context.wrapException(Context.scala:19)
    	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:64)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:37)
    	at is.hail.utils.package$.using(package.scala:587)
    	at is.hail.rvd.RVDPartitionInfo$.apply(RVDPartitionInfo.scala:37)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1040)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    	at org.apache.spark.scheduler.Task.run(Task.scala:109)
    	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
    	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    	at java.lang.Thread.run(Thread.java:748)

    is.hail.utils.HailException: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
    	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
    	at is.hail.utils.package$.fatal(package.scala:26)
    	at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
    	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
    	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
    	at scala.Option.foreach(Option.scala:257)
    	at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
    	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:64)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:37)
    	at is.hail.utils.package$.using(package.scala:587)
    	at is.hail.rvd.RVDPartitionInfo$.apply(RVDPartitionInfo.scala:37)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1040)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    	at org.apache.spark.scheduler.Task.run(Task.scala:109)
    	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
    	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    	at java.lang.Thread.run(Thread.java:748)





    Hail version: 0.2.8-70304a52d33d
    Error summary: HailException: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.

#19

The link to Google Genomics describes hs37d5 as a separate reference genome, not a part of GRCh37. That’s consistent with my knowledge too.

You can add this as a reference by constructing the reference genome in Python with the appropriate information: https://hail.is/docs/0.2/genetics/hail.genetics.ReferenceGenome.html?highlight=referencegenome
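For example, something along these lines might work (an untested sketch; 'GRCh37_hs37d5' is just an illustrative name, and the hs37d5 length below should be verified against your .fai index):

import hail as hl

grch37 = hl.get_reference('GRCh37')

# Copy GRCh37's contigs and lengths, then append the hs37d5 decoy contig.
lengths = dict(grch37.lengths)
lengths['hs37d5'] = 35477943  # assumed length; verify against hs37d5.fa.gz.fai

rg = hl.ReferenceGenome(name='GRCh37_hs37d5',
                        contigs=grch37.contigs + ['hs37d5'],
                        lengths=lengths,
                        x_contigs=grch37.x_contigs,
                        y_contigs=grch37.y_contigs,
                        mt_contigs=grch37.mt_contigs)

mt = hl.import_vcf('s3a://path/to/your.vcf', reference_genome=rg, force_bgz=True)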


#20

Hi @tpoterba,

Thanks for helping me this far. I get an error even after loading the hs37d5 sequence. Can you please check this and let me know?
>>> rg = l.get_reference('GRCh37') # doctest: +SKIP
>>> rg.add_sequence('hs37d5.fa.gz','hs37d5.fa.gz.fai') # doctest: +SKIP
>>> rgts=rg.add_sequence('hs37d5.fa.gz','hs37d5.fa.gz.fai') # doctest: +SKIP
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "</root/anaconda3/envs/hail/lib/python3.6/site-packages/decorator.py:decorator-gen-34>", line 2, in add_sequence
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/typecheck/check.py", line 560, in wrapper
return __original_func(*args_, **kwargs_)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/genetics/reference_genome.py", line 338, in add_sequence
self._jrep.addSequence(Env.hc()._jhc, fasta_file, index_file)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/utils/java.py", line 227, in deco
'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
hail.utils.java.FatalError: HailException: FASTA sequence has already been loaded for reference genome `GRCh37'.

Java stack trace:
is.hail.utils.HailException: FASTA sequence has already been loaded for reference genome `GRCh37'.
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.variant.ReferenceGenome.addSequence(ReferenceGenome.scala:314)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



Hail version: 0.2.8-70304a52d33d
Error summary: HailException: FASTA sequence has already been loaded for reference genome `GRCh37'.
>>> mt = l.import_vcf('s3a://xxxxxxxxxxxxx/xxxxxxx-001242_p95a28_xxxxx.vcf', 
...                 force_bgz=True, 
...                 min_partitions=10000, 
...                 drop_samples=False,
...                 reference_genome='GRCh37',
...                 skip_invalid_loci=False).write('s3a://xxxxxxxxxxxxx/raw.mt', overwrite=True)
[Stage 1:==================================================>  (1005 + 2) / 1059]Traceback (most recent call last):
  File "<stdin>", line 6, in <module>
  File "</root/anaconda3/envs/hail/lib/python3.6/site-packages/decorator.py:decorator-gen-824>", line 2, in write
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/typecheck/check.py", line 560, in wrapper
    return __original_func(*args_, **kwargs_)
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/matrixtable.py", line 2193, in write
    Env.backend().execute(MatrixWrite(self._mir, writer))
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/backend/backend.py", line 44, in execute
    self._to_java_ir(ir)))
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/utils/java.py", line 227, in deco
    'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
hail.utils.java.FatalError: HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1006 in stage 1.0 failed 1 times, most recent failure: Lost task 1006.0 in stage 1.0 (TID 1007, localhost, executor driver): is.hail.utils.HailException: xxxxxxxxx-001242_p95a28_xxxxxxxxx.vcf: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
  offending line: hs37d5	243520	.	A	G	74.77	.	AC=1;AF=0.500;AN=2;BaseQRankSum=...
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.utils.Context.wrapException(Context.scala:19)
	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1039)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: is.hail.utils.HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
	at scala.Option.foreach(Option.scala:257)
	at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
	... 30 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1575)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1563)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1562)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1562)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:803)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1790)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1745)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1734)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:619)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:944)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:943)
	at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:153)
	at is.hail.rvd.RVD$.getKeyInfo(RVD.scala:1044)
	at is.hail.rvd.RVD$.makeCoercer(RVD.scala:1108)
	at is.hail.io.vcf.MatrixVCFReader.coercer$lzycompute(LoadVCF.scala:1097)
	at is.hail.io.vcf.MatrixVCFReader.coercer(LoadVCF.scala:1097)
	at is.hail.io.vcf.MatrixVCFReader.apply(LoadVCF.scala:1126)
	at is.hail.expr.ir.MatrixRead.execute(MatrixIR.scala:359)
	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:754)
	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:93)
	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:63)
	at is.hail.expr.ir.Interpret$.interpretJSON(Interpret.scala:22)
	at is.hail.expr.ir.Interpret.interpretJSON(Interpret.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

is.hail.utils.HailException: MCW2018-001242_p95a28_SMT4.vcf: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
  offending line: hs37d5	243520	.	A	G	74.77	.	AC=1;AF=0.500;AN=2;BaseQRankSum=...
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.utils.Context.wrapException(Context.scala:19)
	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1039)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

is.hail.utils.HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
	at scala.Option.foreach(Option.scala:257)
	at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1039)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)





Hail version: 0.2.8-70304a52d33d
Error summary: HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.