Not able to export the data to Elasticsearch

Hi,

I am running Spark 2.2.0 and Hail 0.2. I have converted the VCF to a matrix table and then to a table. I am now trying to export that data to Elasticsearch, but I am getting the error below.
(Note: both EMR and Elasticsearch are hosted in AWS.)
>>> mt=l.export_elasticsearch(ht,host='https://xxxxxxx.us-east-1.es.amazonaws.com',port=80,index='singlevcf',index_type='variant',block_size=10000,config=None,verbose=True)
Config Map(es.nodes -> https://xxxxxxx.us-east-1.es.amazonaws.com, es.port -> 80, es.batch.size.entries -> 10000, es.index.auto.create -> true)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "", line 2, in export_elasticsearch
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 560, in wrapper
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/methods/impex.py", line 2052, in export_elasticsearch
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/utils/java.py", line 224, in deco
hail.utils.java.FatalError: SSLException: Unrecognized SSL message, plaintext connection?

Java stack trace:
org.elasticsearch.hadoop.EsHadoopIllegalArgumentException: Cannot detect ES version - typically this happens if the network/Elasticsearch cluster is not accessible or when targeting a WAN/Cloud instance without the proper setting 'es.nodes.wan.only'
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:327)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

org.elasticsearch.hadoop.rest.EsHadoopTransportException: javax.net.ssl.SSLException: Unrecognized SSL message, plaintext connection?
	at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:124)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:380)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:344)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:348)
	at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:158)
	at org.elasticsearch.hadoop.rest.RestClient.remoteEsVersion(RestClient.java:574)
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:320)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

javax.net.ssl.SSLException: Unrecognized SSL message, plaintext connection?
	at sun.security.ssl.InputRecord.handleUnknownRecord(InputRecord.java:710)
	at sun.security.ssl.InputRecord.read(InputRecord.java:527)
	at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:983)
	at sun.security.ssl.SSLSocketImpl.performInitialHandshake(SSLSocketImpl.java:1385)
	at sun.security.ssl.SSLSocketImpl.writeRecord(SSLSocketImpl.java:757)
	at sun.security.ssl.AppOutputStream.write(AppOutputStream.java:123)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140)
	at org.apache.commons.httpclient.HttpConnection.flushRequestOutputStream(HttpConnection.java:828)
	at org.apache.commons.httpclient.HttpMethodBase.writeRequest(HttpMethodBase.java:2116)
	at org.apache.commons.httpclient.HttpMethodBase.execute(HttpMethodBase.java:1096)
	at org.apache.commons.httpclient.HttpMethodDirector.executeWithRetry(HttpMethodDirector.java:398)
	at org.apache.commons.httpclient.HttpMethodDirector.executeMethod(HttpMethodDirector.java:171)
	at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:397)
	at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:323)
	at org.elasticsearch.hadoop.rest.commonshttp.CommonsHttpTransport.execute(CommonsHttpTransport.java:478)
	at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:112)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:380)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:344)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:348)
	at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:158)
	at org.elasticsearch.hadoop.rest.RestClient.remoteEsVersion(RestClient.java:574)
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:320)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)





Hail version: 0.2.7-e08cc2a17c4a
Error summary: SSLException: Unrecognized SSL message, plaintext connection?

I am able to curl the ES URL from the cluster, though.

Are you certain that your Elasticsearch server is running TLS/SSL on port 80? Usually port 80 is reserved for insecure HTTP, not TLS/SSL-secured HTTPS. Can you try with port=443?

Tried that, and I am getting a Bad Request error (I had tried it previously too). Thank you!

>>> t=l.export_elasticsearch(ht,host='xxxxxxxxxxxxx.us-east-1.es.amazonaws.com',port=443,index='singlevcf',index_type='variant',block_size=1000,config=None,verbose=True)
Config Map(es.nodes -> xxxxxxxxxxxxx.us-east-1.es.amazonaws.com, es.port -> 443, es.batch.size.entries -> 1000, es.index.auto.create -> true)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<decorator-gen-1000>", line 2, in export_elasticsearch
  File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 560, in wrapper
  File "/opt/hail/hail/build/distributions/hail-python.zip/hail/methods/impex.py", line 2052, in export_elasticsearch
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
  File "/opt/hail/hail/build/distributions/hail-python.zip/hail/utils/java.py", line 224, in deco
hail.utils.java.FatalError: EsHadoopInvalidRequest: [GET] on [] failed; server[18.209.185.175:443] returned [400|Bad Request:]

Java stack trace:
org.elasticsearch.hadoop.EsHadoopIllegalArgumentException: Cannot detect ES version - typically this happens if the network/Elasticsearch cluster is not accessible or when targeting a WAN/Cloud instance without the proper setting 'es.nodes.wan.only'
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:327)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

org.elasticsearch.hadoop.rest.EsHadoopInvalidRequest: [GET] on [] failed; server[18.209.185.175:443] returned [400|Bad Request:]
	at org.elasticsearch.hadoop.rest.RestClient.checkResponse(RestClient.java:424)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:382)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:344)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:348)
	at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:158)
	at org.elasticsearch.hadoop.rest.RestClient.remoteEsVersion(RestClient.java:574)
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:320)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:97)
	at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:83)
	at org.elasticsearch.spark.sql.package$SparkDataFrameFunctions.saveToEs(package.scala:49)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:47)
	at is.hail.io.ElasticsearchConnector$.export(ElasticsearchConnector.scala:21)
	at is.hail.io.ElasticsearchConnector.export(ElasticsearchConnector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)




Hail version: 0.2.7-97fb2a5dd4a1
Error summary: EsHadoopInvalidRequest: [GET] on [] failed; server[18.209.185.175:443] returned [400|Bad Request:]
>>>

According to the Elasticsearch error, you might try setting es.nodes.wan.only. You can pass extra configuration arguments to hl.export_elasticsearch with the config named argument.

If you SSH to an EMR worker node and try to connect to Elasticsearch directly, without Hail, does it work?
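For example, from the worker node, something like curl -v https://xxxxxxx.us-east-1.es.amazonaws.com:443 (with your actual endpoint substituted) should show whether the TLS handshake and a plain GET succeed outside of Hail.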

Could you please post the command? When I try to change the config parameter, I am getting this error:

>>> mt=l.export_elasticsearch(ht,host=‘https://xxxxxxxxxxxxx.us-east-1.es.amazonaws.com’,port=443,index=‘singlevcf’,index_type=‘variant’,block_size=1000,config=dict(‘es.nodes.wan.only’,‘true’),verbose=True)

Traceback (most recent call last):

File “<stdin>”, line 1, in <module>

TypeError: dict expected at most 1 arguments, got 2

dict(‘es.nodes.wan.only’,‘true’) is a Python error. This should be:

{'es.nodes.wan.only': 'true'}

I am getting this

mt=l.export_elasticsearch(ht,host=‘https://xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com’,port=443,index=‘singlevcf’,index_type=‘variant’,block_size=1000,config={‘es.nodes.wan.only’: ‘true’},verbose=True)
File “”, line 1
mt=l.export_elasticsearch(ht,host=‘xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com’,port=443,index=‘singlevcf’,index_type=‘variant’,block_size=1000,config={‘es.nodes.wan.only’: ‘true’},verbose=True)
^
SyntaxError: invalid character in identifier

What character is it pointing to?

It is pointing at the s.

Can you try retyping the script? Possibly it's a weird Unicode s copy/pasted from another font.


@nara, Discourse, our forum software, converts matching quotes into curly quotes “like these” if the text is not code. When you copy source code and error messages into Discourse, you should wrap them in a pair of triple-backtick fences (```), like this:

>>> mt=l.export_elasticsearch(ht,host='https://xxxxxxxxxxxxx.us-east-1.es.amazonaws.com',port=443,index='singlevcf',index_type='variant',block_size=1000,config=dict('es.nodes.wan.only','true'),verbose=True)

Traceback (most recent call last):

File “<stdin>”, line 1, in <module>

TypeError: dict expected at most 1 arguments, got 2

That avoids the automatic conversion to curly quotes (note that the quote before https is a straight up-and-down quote ', not a curly quote ‘). You can learn more about this format syntax here.


Tim copied your code and modified it without fixing the quotes. I’ve edited his post to use the non-curly quotes. Try this instead:

mt=l.export_elasticsearch(ht,
                          host='https://xxxxxxxxxxxxx.us-east-1.es.amazonaws.com',
                          port=443,
                          index='singlevcf',
                          index_type='variant',
                          block_size=1000,
                          config={'es.nodes.wan.only': 'true'},
                          verbose=True)

The Python syntax for a dictionary / key-value mapping is {key1: value1, key2: value2}. The dict function is used to convert things that are not dictionaries into dictionaries; for example, you can convert a list of pairs to a dictionary like this: dict([(key1, value1), (key2, value2)]).
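To make the difference concrete, here is a small illustrative snippet (the key and value are just the Elasticsearch setting from above):

>>> config = {'es.nodes.wan.only': 'true'}           # dict literal: key, colon, value
>>> dict([('es.nodes.wan.only', 'true')]) == config  # dict() over a list of pairs
True
>>> type({'es.nodes.wan.only', 'true'})              # braces with a comma build a set
<class 'set'>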


Thanks for that, @danking and @tpoterba. But I am running into this; I have tried both syntaxes:

mt=l.export_elasticsearch(ht,
... host='https://xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com',
... port=443,
... index='singlevcf',
... index_type='variant',
... block_size=1000,
... config={'es.nodes.wan.only','true'},
... verbose=True)
Traceback (most recent call last):
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 487, in check_all
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 59, in check
hail.typecheck.check.TypecheckFailure

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "<stdin>", line 8, in <module>
File "", line 2, in export_elasticsearch
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 559, in wrapper
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 513, in check_all
TypeError: export_elasticsearch: parameter 'config': expected (None or dict[str, str]), found set: {'true', 'es.nodes.wan.only'}

mt=l.export_elasticsearch(ht,
... host='https://xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com',
... port=443,
... index='singlevcf',
... index_type='variant',
... block_size=1000,
... config={'true', 'es.nodes.wan.only'},
... verbose=True)
Traceback (most recent call last):
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 487, in check_all
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 59, in check
hail.typecheck.check.TypecheckFailure

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "<stdin>", line 8, in <module>
File "", line 2, in export_elasticsearch
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 559, in wrapper
File "/opt/hail/hail/build/distributions/hail-python.zip/hail/typecheck/check.py", line 513, in check_all
TypeError: export_elasticsearch: parameter 'config': expected (None or dict[str, str]), found set: {'true', 'es.nodes.wan.only'}

Hey, thanks for the support, guys. This has finally worked:

>>> mt=l.export_elasticsearch(ht,host='https://xxxxxxxxxxxx.us-east-1.es.amazonaws.com',port=443,index='singlevcf',index_type='variant',block_size=1000,config={'es.nodes.wan.only':'true'},verbose=True)

Config Map(es.nodes.wan.only -> true, es.batch.size.entries -> 1000, es.index.auto.create -> true, es.port -> 443, es.nodes -> https://xxxxxxxxxxxxxxx.us-east-1.es.amazonaws.com)

[Stage 3:========================================================>(90 + 1) / 91]>>>


Ah, my bad. I fixed the bug that you found in my post.

Hey, when I am trying to convert the Hail matrix table to a table, I am losing fields, which I then do not see in Elasticsearch either.

>>> mt = hl.import_vcf('xxxxxxxxxxxxxxx/SEQ187522109.vcf.gz', reference_genome='GRCh38',force_bgz=True, min_partitions=10000, drop_samples=False).write('hdfs:///user/hadoop/build-hail/raw.mt', overwrite=True)
[Stage 1:=======================================================> (88 + 3) / 91]2019-01-25 16:56:08 Hail: INFO: Ordering unsorted dataset with network shuffle
[Stage 3:========================================================>(90 + 1) / 91]2019-01-25 16:57:16 Hail: INFO: wrote matrix table with 4586617 rows and 1 column in 91 partitions to hdfs:///user/hadoop/build-hail/raw.mt

>>> mt = hl.read_matrix_table('hdfs:///user/hadoop/build-hail/raw.mt')
>>> mt.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's':     str 
----------------------------------------
Row fields:
    'locus':     locus<GRCh38> 
    'alleles':     array<str> 
    'rsid':     str 
    'qual':     float64 
    'filters':     set<str> 
    'info':     struct {
        END: int32, 
        BLOCKAVG_min30p3a: bool, 
        SNVSB: float64, 
        SNVHPOL: int32, 
        CIGAR: array<str>, 
        RU: array<str>, 
        REFREP: array<int32>, 
        IDREP: array<int32>
    } 
----------------------------------------
Entry fields:
    'GQX':     int32 
    'GT':     call 
    'GQ':     int32 
    'DP':     int32 
    'DPF':     int32 
    'AD':     array<int32> 
    'DPI':     int32 
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
>>> t = mt.make_table(separator='.') # doctest: +SKIP
>>> t.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus':     locus<GRCh38> 
    'alleles':     array<str> 
    '187522109.GQX':     int32 
    '187522109.GT':     call 
    '187522109.GQ':     int32 
    '187522109.DP':     int32 
    '187522109.DPF':     int32 
    '187522109.AD':     array<int32> 
    '187522109.DPI':     int32 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------
>>>

This is a bug: make_table is dropping the other row fields (rsid, qual, filters, info).
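In the meantime, a possible workaround (a sketch, not tested against your data) is to join the dropped row fields back onto the result of make_table, since it keeps the ['locus', 'alleles'] key:

>>> rows = mt.rows()                            # row fields, keyed by ['locus', 'alleles']
>>> t = mt.make_table(separator='.')
>>> t = t.annotate(**rows[t.locus, t.alleles])  # re-attach rsid, qual, filters, info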


Thanks for that, will check and let you know @tpoterba

Hi guys,

While trying to import the VCF, it would not allow me to import hs37d5, which is a part of GRCh37.
https://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/reference_genomes.html
This is the error when I am trying to import:
mt = l.import_vcf('s3a://xxxxxxxxxxx/xxxxxxx001242_p95a28_xxxx.vcf',
... force_bgz=True,
... min_partitions=10000,
... drop_samples=False,
... skip_invalid_loci=False).write('s3a://xxxxxxxxxx/raw.mt', overwrite=True)
[Stage 4:==================================================> (1004 + 2) / 1059]Traceback (most recent call last):
File "<stdin>", line 5, in <module>
File "</root/anaconda3/envs/hail/lib/python3.6/site-packages/decorator.py:decorator-gen-824>", line 2, in write
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/typecheck/check.py", line 560, in wrapper
return __original_func(*args_, **kwargs_)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/matrixtable.py", line 2193, in write
Env.backend().execute(MatrixWrite(self._mir, writer))
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/backend/backend.py", line 44, in execute
self._to_java_ir(ir)))
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/utils/java.py", line 227, in deco
'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
hail.utils.java.FatalError: HailException: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.

    Java stack trace:
    org.apache.spark.SparkException: Job aborted due to stage failure: Task 1005 in stage 4.0 failed 1 times, most recent failure: Lost task 1005.0 in stage 4.0 (TID 2175, localhost, executor driver): is.hail.utils.HailException: xxxxxxx001242_p95a28_xxxx.vcf: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
      offending line: hs37d5	256	.	T	C	114.90	.	AC=2;AF=1.00;AN=2;DP=5;ExcessHet=3...
    	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
    	at is.hail.utils.package$.fatal(package.scala:26)
    	at is.hail.utils.Context.wrapException(Context.scala:19)
    	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:64)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:37)
    	at is.hail.utils.package$.using(package.scala:587)
    	at is.hail.rvd.RVDPartitionInfo$.apply(RVDPartitionInfo.scala:37)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1040)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    	at org.apache.spark.scheduler.Task.run(Task.scala:109)
    	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
    	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    	at java.lang.Thread.run(Thread.java:748)
    Caused by: is.hail.utils.HailException: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
    	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
    	at is.hail.utils.package$.fatal(package.scala:26)
    	at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
    	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
    	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
    	at scala.Option.foreach(Option.scala:257)
    	at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
    	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
    	... 34 more

    Driver stacktrace:
    	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1575)
    	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1563)
    	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1562)
    	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1562)
    	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
    	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
    	at scala.Option.foreach(Option.scala:257)
    	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:803)
    	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1790)
    	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1745)
    	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1734)
    	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:619)
    	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
    	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
    	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
    	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:944)
    	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    	at org.apache.spark.rdd.RDD.collect(RDD.scala:943)
    	at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:153)
    	at is.hail.rvd.RVD$.getKeyInfo(RVD.scala:1044)
    	at is.hail.rvd.RVD$.makeCoercer(RVD.scala:1108)
    	at is.hail.io.vcf.MatrixVCFReader.coercer$lzycompute(LoadVCF.scala:1097)
    	at is.hail.io.vcf.MatrixVCFReader.coercer(LoadVCF.scala:1097)
    	at is.hail.io.vcf.MatrixVCFReader.apply(LoadVCF.scala:1126)
    	at is.hail.expr.ir.MatrixRead.execute(MatrixIR.scala:359)
    	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:754)
    	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:93)
    	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:63)
    	at is.hail.expr.ir.Interpret$.interpretJSON(Interpret.scala:22)
    	at is.hail.expr.ir.Interpret.interpretJSON(Interpret.scala)
    	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    	at java.lang.reflect.Method.invoke(Method.java:498)
    	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    	at py4j.Gateway.invoke(Gateway.java:282)
    	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    	at py4j.commands.CallCommand.execute(CallCommand.java:79)
    	at py4j.GatewayConnection.run(GatewayConnection.java:238)
    	at java.lang.Thread.run(Thread.java:748)

    is.hail.utils.HailException: MCW2018-001242_p95a28_SMT4.vcf: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
      offending line: hs37d5	256	.	T	C	114.90	.	AC=2;AF=1.00;AN=2;DP=5;ExcessHet=3...
    	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
    	at is.hail.utils.package$.fatal(package.scala:26)
    	at is.hail.utils.Context.wrapException(Context.scala:19)
    	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:64)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:37)
    	at is.hail.utils.package$.using(package.scala:587)
    	at is.hail.rvd.RVDPartitionInfo$.apply(RVDPartitionInfo.scala:37)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1040)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    	at org.apache.spark.scheduler.Task.run(Task.scala:109)
    	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
    	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    	at java.lang.Thread.run(Thread.java:748)

    is.hail.utils.HailException: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
    	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
    	at is.hail.utils.package$.fatal(package.scala:26)
    	at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
    	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
    	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
    	at scala.Option.foreach(Option.scala:257)
    	at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
    	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:64)
    	at is.hail.rvd.RVDPartitionInfo$$anonfun$apply$1.apply(RVDPartitionInfo.scala:37)
    	at is.hail.utils.package$.using(package.scala:587)
    	at is.hail.rvd.RVDPartitionInfo$.apply(RVDPartitionInfo.scala:37)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1040)
    	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
    	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
    	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    	at org.apache.spark.scheduler.Task.run(Task.scala:109)
    	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
    	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    	at java.lang.Thread.run(Thread.java:748)





    Hail version: 0.2.8-70304a52d33d
    Error summary: HailException: Invalid locus `hs37d5:256' found. Contig `hs37d5' is not in the reference genome `GRCh37'.

The link to Google Genomics describes hs37d5 as a separate reference genome, not a part of GRCh37. That’s consistent with my knowledge too.

You should be able to add this as a reference by constructing the reference genome in Python with the appropriate information: https://hail.is/docs/0.2/genetics/hail.genetics.ReferenceGenome.html?highlight=referencegenome
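For example, something along these lines (a sketch: the FASTA and index paths are placeholders, and from_fasta_file, documented at the link above, derives the contig names and lengths from the .fai index):

>>> hs37d5 = l.ReferenceGenome.from_fasta_file(
...     'hs37d5',            # name for the new reference genome
...     'hs37d5.fa.gz',      # FASTA that includes the hs37d5 decoy contig
...     'hs37d5.fa.gz.fai')  # FASTA index
>>> mt = l.import_vcf('s3a://your-bucket/your.vcf',
...                   reference_genome='hs37d5',
...                   force_bgz=True)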

Hi @tpoterba,

Thanks for helping me this far. I still get an error even after loading hs37d5. Can you please check this and let me know?
>>> rg = l.get_reference('GRCh37') # doctest: +SKIP
>>> rg.add_sequence('hs37d5.fa.gz','hs37d5.fa.gz.fai') # doctest: +SKIP
>>> rgts=rg.add_sequence('hs37d5.fa.gz','hs37d5.fa.gz.fai') # doctest: +SKIP
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "</root/anaconda3/envs/hail/lib/python3.6/site-packages/decorator.py:decorator-gen-34>", line 2, in add_sequence
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/typecheck/check.py", line 560, in wrapper
return __original_func(*args_, **kwargs_)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/genetics/reference_genome.py", line 338, in add_sequence
self._jrep.addSequence(Env.hc()._jhc, fasta_file, index_file)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/utils/java.py", line 227, in deco
'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
hail.utils.java.FatalError: HailException: FASTA sequence has already been loaded for reference genome `GRCh37'.

Java stack trace:
is.hail.utils.HailException: FASTA sequence has already been loaded for reference genome `GRCh37'.
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.variant.ReferenceGenome.addSequence(ReferenceGenome.scala:314)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



Hail version: 0.2.8-70304a52d33d
Error summary: HailException: FASTA sequence has already been loaded for reference genome `GRCh37'.
>>> mt = l.import_vcf('s3a://xxxxxxxxxxxxx/xxxxxxx-001242_p95a28_xxxxx.vcf', 
...                 force_bgz=True, 
...                 min_partitions=10000, 
...                 drop_samples=False,
...                 reference_genome='GRCh37',
...                 skip_invalid_loci=False).write('s3a://xxxxxxxxxxxxx/raw.mt', overwrite=True)
[Stage 1:==================================================>  (1005 + 2) / 1059]Traceback (most recent call last):
  File "<stdin>", line 6, in <module>
  File "</root/anaconda3/envs/hail/lib/python3.6/site-packages/decorator.py:decorator-gen-824>", line 2, in write
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/typecheck/check.py", line 560, in wrapper
    return __original_func(*args_, **kwargs_)
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/matrixtable.py", line 2193, in write
    Env.backend().execute(MatrixWrite(self._mir, writer))
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/backend/backend.py", line 44, in execute
    self._to_java_ir(ir)))
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/utils/java.py", line 227, in deco
    'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
hail.utils.java.FatalError: HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1006 in stage 1.0 failed 1 times, most recent failure: Lost task 1006.0 in stage 1.0 (TID 1007, localhost, executor driver): is.hail.utils.HailException: xxxxxxxxx-001242_p95a28_xxxxxxxxx.vcf: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
  offending line: hs37d5	243520	.	A	G	74.77	.	AC=1;AF=0.500;AN=2;BaseQRankSum=...
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.utils.Context.wrapException(Context.scala:19)
	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1039)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: is.hail.utils.HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
	at scala.Option.foreach(Option.scala:257)
	at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
	... 30 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1575)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1563)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1562)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1562)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:803)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1790)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1745)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1734)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:619)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:944)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:943)
	at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:153)
	at is.hail.rvd.RVD$.getKeyInfo(RVD.scala:1044)
	at is.hail.rvd.RVD$.makeCoercer(RVD.scala:1108)
	at is.hail.io.vcf.MatrixVCFReader.coercer$lzycompute(LoadVCF.scala:1097)
	at is.hail.io.vcf.MatrixVCFReader.coercer(LoadVCF.scala:1097)
	at is.hail.io.vcf.MatrixVCFReader.apply(LoadVCF.scala:1126)
	at is.hail.expr.ir.MatrixRead.execute(MatrixIR.scala:359)
	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:754)
	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:93)
	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:63)
	at is.hail.expr.ir.Interpret$.interpretJSON(Interpret.scala:22)
	at is.hail.expr.ir.Interpret.interpretJSON(Interpret.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

is.hail.utils.HailException: MCW2018-001242_p95a28_SMT4.vcf: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
  offending line: hs37d5	243520	.	A	G	74.77	.	AC=1;AF=0.500;AN=2;BaseQRankSum=...
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.utils.Context.wrapException(Context.scala:19)
	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1039)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

is.hail.utils.HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
	at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
	at scala.Option.foreach(Option.scala:257)
	at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
	at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1039)
	at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)





Hail version: 0.2.8-70304a52d33d
Error summary: HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.