Cannot import the UKB 200K WGS VCFs in Hail due to an empty line after the header lines

Hi all,

I would like to process the UK Biobank 200K WGS data with Hail.
Unfortunately, importing the VCF fails with the error attached below.
How can these VCF files be imported? I would appreciate any information about this.

Thank you so much.


import hail as hl

vcf = 'file:///mnt/project/Bulk/Whole genome sequences/Population level WGS variants, pVCF format - interim 200k release/ukb24304_c2_b1_v1.vcf.gz'
mt = hl.import_vcf(vcf, force_bgz=True, reference_genome='GRCh38', find_replace=('nul', '.'), array_elements_required=False)

FatalError                                Traceback (most recent call last)
<ipython-input-3-35fc70fb23bd> in <module>
      6        find_replace = ('nul', '.'),
----> 7        array_elements_required = False
      8      )

<decorator-gen-1475> in import_vcf(path, force, force_bgz, header_file, min_partitions, drop_samples, call_fields, reference_genome, contig_recoding, array_elements_required, skip_invalid_loci, entry_float_type, filter, find_replace, n_partitions, block_size, _partitions)

/opt/conda/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    575     def wrapper(__original_func, *args, **kwargs):
    576         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577         return __original_func(*args_, **kwargs_)
    578 
    579     return wrapper

/opt/conda/lib/python3.6/site-packages/hail/methods/impex.py in import_vcf(path, force, force_bgz, header_file, min_partitions, drop_samples, call_fields, reference_genome, contig_recoding, array_elements_required, skip_invalid_loci, entry_float_type, filter, find_replace, n_partitions, block_size, _partitions)
   2370                                 skip_invalid_loci, force_bgz, force, filter, find_replace,
   2371                                 _partitions)
-> 2372     return MatrixTable(ir.MatrixRead(reader, drop_cols=drop_samples))
   2373 
   2374 

/opt/conda/lib/python3.6/site-packages/hail/matrixtable.py in __init__(self, mir)
    556         self._entry_indices = Indices(self, {self._row_axis, self._col_axis})
    557 
--> 558         self._type = self._mir.typ
    559 
    560         self._global_type = self._type.global_type

/opt/conda/lib/python3.6/site-packages/hail/ir/base_ir.py in typ(self)
    359     def typ(self):
    360         if self._type is None:
--> 361             self._compute_type()
    362             assert self._type is not None, self
    363         return self._type

/opt/conda/lib/python3.6/site-packages/hail/ir/matrix_ir.py in _compute_type(self)
     59 
     60     def _compute_type(self):
---> 61         self._type = Env.backend().matrix_type(self)
     62 
     63 

/opt/conda/lib/python3.6/site-packages/hail/backend/spark_backend.py in matrix_type(self, mir)
    284 
    285     def matrix_type(self, mir):
--> 286         jir = self._to_java_matrix_ir(mir)
    287         return tmatrix._from_java(jir.typ())
    288 

/opt/conda/lib/python3.6/site-packages/hail/backend/spark_backend.py in _to_java_matrix_ir(self, ir)
    270 
    271     def _to_java_matrix_ir(self, ir):
--> 272         return self._to_java_ir(ir, self._parse_matrix_ir)
    273 
    274     def _to_java_blockmatrix_ir(self, ir):

/opt/conda/lib/python3.6/site-packages/hail/backend/spark_backend.py in _to_java_ir(self, ir, parse)
    260             r = CSERenderer(stop_at_jir=True)
    261             # FIXME parse should be static
--> 262             ir._jir = parse(r(ir), ir_map=r.jirs)
    263         return ir._jir
    264 

/opt/conda/lib/python3.6/site-packages/hail/backend/spark_backend.py in _parse_matrix_ir(self, code, ref_map, ir_map)
    238 
    239     def _parse_matrix_ir(self, code, ref_map={}, ir_map={}):
--> 240         return self._jbackend.parse_matrix_ir(code, ref_map, ir_map)
    241 
    242     def _parse_blockmatrix_ir(self, code, ref_map={}, ir_map={}):

/cluster/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
     29                 raise FatalError('%s\n\nJava stack trace:\n%s\n'
     30                                  'Hail version: %s\n'
---> 31                                  'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
     32         except pyspark.sql.utils.CapturedException as e:
     33             raise FatalError('%s\n\nJava stack trace:\n%s\n'

FatalError: StringIndexOutOfBoundsException: String index out of range: 0

Java stack trace:
java.lang.StringIndexOutOfBoundsException: String index out of range: 0
	at java.lang.String.charAt(String.java:658)
	at scala.collection.immutable.StringOps$.apply$extension(StringOps.scala:37)
	at is.hail.io.vcf.LoadVCF$$anonfun$getHeaderLines$1$$anonfun$apply$5.apply(LoadVCF.scala:1293)
	at is.hail.io.vcf.LoadVCF$$anonfun$getHeaderLines$1$$anonfun$apply$5.apply(LoadVCF.scala:1293)
	at scala.collection.Iterator$$anon$16.hasNext(Iterator.scala:600)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1334)
	at is.hail.io.vcf.LoadVCF$$anonfun$getHeaderLines$1.apply(LoadVCF.scala:1295)
	at is.hail.io.vcf.LoadVCF$$anonfun$getHeaderLines$1.apply(LoadVCF.scala:1291)
	at is.hail.io.fs.FS$$anonfun$readLines$1.apply(FS.scala:221)
	at is.hail.io.fs.FS$$anonfun$readLines$1.apply(FS.scala:212)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.io.fs.FS$class.readLines(FS.scala:211)
	at is.hail.io.fs.HadoopFS.readLines(HadoopFS.scala:70)
	at is.hail.io.vcf.LoadVCF$.getHeaderLines(LoadVCF.scala:1291)
	at is.hail.io.vcf.MatrixVCFReader$.apply(LoadVCF.scala:1573)
	at is.hail.io.vcf.MatrixVCFReader$.fromJValue(LoadVCF.scala:1665)
	at is.hail.expr.ir.MatrixReader$.fromJson(MatrixIR.scala:90)
	at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1720)
	at is.hail.expr.ir.IRParser$$anonfun$matrix_ir$1.apply(Parser.scala:1646)
	at is.hail.expr.ir.IRParser$$anonfun$matrix_ir$1.apply(Parser.scala:1646)
	at is.hail.utils.StackSafe$More.advance(StackSafe.scala:64)
	at is.hail.utils.StackSafe$.run(StackSafe.scala:16)
	at is.hail.utils.StackSafe$StackFrame.run(StackSafe.scala:32)
	at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$1.apply(Parser.scala:1986)
	at is.hail.expr.ir.IRParser$$anonfun$parse_matrix_ir$1.apply(Parser.scala:1986)
	at is.hail.expr.ir.IRParser$.parse(Parser.scala:1973)
	at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1986)
	at is.hail.backend.spark.SparkBackend$$anonfun$parse_matrix_ir$1$$anonfun$apply$22.apply(SparkBackend.scala:660)
	at is.hail.backend.spark.SparkBackend$$anonfun$parse_matrix_ir$1$$anonfun$apply$22.apply(SparkBackend.scala:659)
	at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
	at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:47)
	at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:46)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
	at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:46)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
	at is.hail.backend.spark.SparkBackend$$anonfun$parse_matrix_ir$1.apply(SparkBackend.scala:659)
	at is.hail.backend.spark.SparkBackend$$anonfun$parse_matrix_ir$1.apply(SparkBackend.scala:658)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:59)
	at is.hail.backend.spark.SparkBackend.parse_matrix_ir(SparkBackend.scala:658)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



Hail version: 0.2.78-b17627756568
Error summary: StringIndexOutOfBoundsException: String index out of range: 0

Does that VCF have empty lines? I’m pretty sure that’s a violation of the VCF spec.
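
If you want to check quickly, here is a minimal sketch (plain Python, outside of Hail) that scans the header region of the compressed file for blank lines. It assumes the file is readable at the local /mnt/project path from your call above; BGZF is gzip-compatible, so Python's gzip module can read it directly.

import gzip
import itertools

# Local mount of the file passed to import_vcf above; adjust to your environment.
vcf_path = ('/mnt/project/Bulk/Whole genome sequences/'
            'Population level WGS variants, pVCF format - interim 200k release/'
            'ukb24304_c2_b1_v1.vcf.gz')

# BGZF is a valid gzip stream, so gzip.open can read it without Hail or Spark.
with gzip.open(vcf_path, 'rt') as fh:
    for line_number, line in enumerate(itertools.islice(fh, 20000), start=1):
        stripped = line.rstrip('\n')
        if stripped == '':
            print(f'blank line at line {line_number}')
        elif not stripped.startswith('#'):
            # First non-blank, non-header line: we are past the header region.
            break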

Thanks! I found some discussion about this.
It looks like an issue with the VCF file itself.
Thank you for the suggestion!
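
For anyone who runs into the same error: one workaround that may be worth trying is the filter argument of hl.import_vcf, which removes any line matching a regex before it is parsed. Whether that filtering is also applied while the header lines are being read in Hail 0.2.78 is an assumption I have not verified against these files, so treat the following as a sketch rather than a confirmed fix.

import hail as hl

vcf = ('file:///mnt/project/Bulk/Whole genome sequences/'
       'Population level WGS variants, pVCF format - interim 200k release/'
       'ukb24304_c2_b1_v1.vcf.gz')

# filter='^$' drops lines that match the regex, i.e. blank lines, before parsing.
# Assumption: the filter is also applied when the header is scanned.
mt = hl.import_vcf(
    vcf,
    force_bgz=True,
    reference_genome='GRCh38',
    filter='^$',
    array_elements_required=False,
)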
