Import_table error

I am trying to import a .csv file with annotations for the 1000 Genomes Project. However, when I run this code:
table = (hl.import_table('gs://1k-genome/1000-genomes/other/sample_info/sample_info.csv', impute = True, delimiter=',', missing=''))
I get the following error:
FatalError Traceback (most recent call last)
in ()
4 #table = hl.import_table('gs://1k-genome/1000-genomes/other/sample_info/sample_info.csv', key_by=('Sample'))
5
----> 6 table = (hl.import_table('gs://1k-genome/1000-genomes/other/sample_info/sample_info.csv', impute = True, delimiter=',', missing=''))

/home/hail/hail.zip/hail/typecheck/check.py in wrapper(*args, **kwargs)
    545         def wrapper(*args, **kwargs):
    546             args_, kwargs_ = check_all(f, args, kwargs, checkers, is_method=is_method)
--> 547             return f(*args_, **kwargs_)
    548 
    549         update_wrapper(wrapper, f)

/home/hail/hail.zip/hail/methods/impex.py in import_table(paths, key, min_partitions, impute, no_header, comment, delimiter, missing, types, quote, skip_blank_lines, force_bgz)
   1222     jt = Env.hc()._jhc.importTable(paths, key, min_partitions, jtypes, comment,
   1223                                    delimiter, missing, no_header, impute, quote,
-> 1224                                    skip_blank_lines, force_bgz)
   1225     return Table(jt)
   1226 

/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1131         answer = self.gateway_client.send_command(command)
   1132         return_value = get_return_value(
-> 1133             answer, self.gateway_client, self.target_id, self.name)
   1134 
   1135         for temp_arg in temp_args:

/home/hail/hail.zip/hail/utils/java.py in deco(*args, **kwargs)
    194             raise FatalError('%s\n\nJava stack trace:\n%s\n'
    195                              'Hail version: %s\n'
--> 196                              'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
    197         except pyspark.sql.utils.CapturedException as e:
    198             raise FatalError('%s\n\nJava stack trace:\n%s\n'

FatalError: HailException: expected 62 fields, but found 63

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 11.0 failed 20 times, most recent failure: Lost task 0.19 in stage 11.0 (TID 182, nneka-w-0.c.avl-hail-ines.internal, executor 2): is.hail.utils.HailException: sample_info.csv: expected 62 fields, but found 63
  offending line: "HG00403","SH001","CHS","Southern Han Chinese, China","male"...
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:17)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.utils.Context.wrapException(Context.scala:19)
	at is.hail.utils.WithContext.foreach(Context.scala:43)
	at is.hail.utils.TextTableReader$$anonfun$5.apply(TextTableReader.scala:116)
	at is.hail.utils.TextTableReader$$anonfun$5.apply(TextTableReader.scala:115)
	at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
	at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1136)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1136)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1137)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1137)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: is.hail.utils.HailException: expected 62 fields, but found 63
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:6)
	at is.hail.utils.package$.fatal(package.scala:26)
	at is.hail.utils.TextTableReader$$anonfun$5$$anonfun$apply$2.apply(TextTableReader.scala:119)
	at is.hail.utils.TextTableReader$$anonfun$5$$anonfun$apply$2.apply(TextTableReader.scala:116)
	at is.hail.utils.WithContext.foreach(Context.scala:41)
	... 25 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1026)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1008)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1128)
	at is.hail.utils.TextTableReader$.imputeTypes(TextTableReader.scala:115)
	at is.hail.utils.TextTableReader$.read(TextTableReader.scala:217)
	at is.hail.HailContext$$anonfun$importTables$3.apply(HailContext.scala:466)
	at is.hail.HailContext$$anonfun$importTables$3.apply(HailContext.scala:466)
	at is.hail.HailContext.forceBGZip(HailContext.scala:568)
	at is.hail.HailContext.importTables(HailContext.scala:465)
	at is.hail.HailContext.importTable(HailContext.scala:427)
	at sun.reflect.GeneratedMethodAccessor88.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

Hail version: devel-aa83f2a1d041
Error summary: HailException: expected 62 fields, but found 63

It looks like one of the fields contains a comma (for example, "Southern Han Chinese, China" in the offending line), so the parser sees an extra delimiter and splits that row into 63 fields instead of the expected 62. The fix is to pass quote='"' to import_table, which tells it to read double-quoted fields properly.
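A minimal sketch of the corrected call, assuming the same bucket path and an already-initialized Hail context:

import hail as hl

table = hl.import_table(
    'gs://1k-genome/1000-genomes/other/sample_info/sample_info.csv',
    delimiter=',',
    missing='',
    quote='"',   # treat commas inside double-quoted fields as data, not delimiters
    impute=True)

With quote='"', a field like "Southern Han Chinese, China" parses as a single value, so every row comes out with the expected 62 fields and type imputation can proceed.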
