Can't convert MatrixTable (mt) into a pandas DataFrame

I am facing the following error while converting my MatrixTable into a pandas DataFrame:

FatalError Traceback (most recent call last)
in
121 n_hets=ds_result7.n_het,
122 homs=ds_result7.homs,
–> 123 n_homs=ds_result7.n_hom
124 ).to_pandas()
125

in to_pandas(self, flatten)

/opt/conda/default/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
612 def wrapper(original_func, *args, **kwargs):
613 args
, kwargs
= check_all(__original_func, args, kwargs, checkers, is_method=is_method)
–> 614 return original_func(*args, **kwargs)
615
616 return wrapper

/opt/conda/default/lib/python3.6/site-packages/hail/table.py in to_pandas(self, flatten)
3234
3235 """
-> 3236 return Env.spark_backend('to_pandas').to_pandas(self, flatten)
3237
3238 @staticmethod

/opt/conda/default/lib/python3.6/site-packages/hail/backend/spark_backend.py in to_pandas(self, t, flatten)
339
340 def to_pandas(self, t, flatten):
–> 341 return self.to_spark(t, flatten).toPandas()
342
343 def from_pandas(self, df, key):

/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py in toPandas(self)
2148
2149 # Below is toPandas without Arrow optimization.
-> 2150 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
2151
2152 dtype = {}

/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py in collect(self)
532 β€œβ€"
533 with SCCallSiteSync(self._sc) as css:
–> 534 sock_info = self._jdf.collectToPython()
535 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
536

/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in call(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:

/opt/conda/default/lib/python3.6/site-packages/hail/backend/spark_backend.py in deco(*args, **kwargs)
40 raise FatalError('%s\n\nJava stack trace:\n%s\n'
41 'Hail version: %s\n'
---> 42 'Error summary: %s' % (deepest, full, hail.version, deepest)) from None
43 except pyspark.sql.utils.CapturedException as e:
44 raise FatalError('%s\n\nJava stack trace:\n%s\n'

FatalError: HailException: array index out of bounds: index=0, length=0

Python traceback:
File β€œβ€, line 119, in
gene=ds_result7.vep.transcript_consequences[0].gene_symbol,

File β€œ/opt/conda/default/lib/python3.6/site-packages/hail/expr/expressions/typed_expressions.py”, line 776, in getitem
return super().getitem(item)

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 11 in stage 2344.0 failed 20 times, most recent failure: Lost task 11.19 in stage 2344.0 (TID 236455, hailpy600-sw-j76p.us-central1-c.c.cncd-cncd.internal, executor 3441): is.hail.utils.HailException: array index out of bounds: index=0, length=0

Python traceback:
File β€œβ€, line 119, in
gene=ds_result7.vep.transcript_consequences[0].gene_symbol,

File β€œ/opt/conda/default/lib/python3.6/site-packages/hail/expr/expressions/typed_expressions.py”, line 776, in getitem
return super().getitem(item)

at __C33025Compiled.applyregion0_13(Unknown Source)
at __C33025Compiled.apply(Unknown Source)
at is.hail.expr.ir.TableMapRows$$anonfun$70$$anonfun$apply$3.apply$mcJJ$sp(TableIR.scala:1529)
at is.hail.expr.ir.TableMapRows$$anonfun$70$$anonfun$apply$3.apply(TableIR.scala:1528)
at is.hail.expr.ir.TableMapRows$$anonfun$70$$anonfun$apply$3.apply(TableIR.scala:1528)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$12.next(Iterator.scala:445)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$12.next(Iterator.scala:445)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$12.next(Iterator.scala:445)
at is.hail.utils.richUtils.RichContextRDD$$anonfun$cleanupRegions$1$$anon$1.next(RichContextRDD.scala:74)
at scala.collection.Iterator$$anon$12.next(Iterator.scala:445)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:256)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1892)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1880)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1879)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2113)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2062)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2051)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
at sun.reflect.GeneratedMethodAccessor79.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)

is.hail.utils.HailException: array index out of bounds: index=0, length=0

Python traceback:
File β€œβ€, line 119, in
gene=ds_result7.vep.transcript_consequences[0].gene_symbol,

File β€œ/opt/conda/default/lib/python3.6/site-packages/hail/expr/expressions/typed_expressions.py”, line 776, in getitem
return super().getitem(item)

at __C33025Compiled.applyregion0_13(Unknown Source)
at __C33025Compiled.apply(Unknown Source)
at is.hail.expr.ir.TableMapRows$$anonfun$70$$anonfun$apply$3.apply$mcJJ$sp(TableIR.scala:1529)
at is.hail.expr.ir.TableMapRows$$anonfun$70$$anonfun$apply$3.apply(TableIR.scala:1528)
at is.hail.expr.ir.TableMapRows$$anonfun$70$$anonfun$apply$3.apply(TableIR.scala:1528)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$12.next(Iterator.scala:445)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$12.next(Iterator.scala:445)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$12.next(Iterator.scala:445)
at is.hail.utils.richUtils.RichContextRDD$$anonfun$cleanupRegions$1$$anon$1.next(RichContextRDD.scala:74)
at scala.collection.Iterator$$anon$12.next(Iterator.scala:445)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:256)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)

Hail version: 0.2.57-582b2e31b8bd
Error summary: HailException: array index out of bounds: index=0, length=0

Python traceback:
File β€œβ€, line 119, in
gene=ds_result7.vep.transcript_consequences[0].gene_symbol,

File β€œ/opt/conda/default/lib/python3.6/site-packages/hail/expr/expressions/typed_expressions.py”, line 776, in getitem
return super().getitem(item)

My piece of code;

ds_result7 = table_miss.rows()
ds_result7 = ds_result7.key_by()
df = ds_result7.select(
locus=ds_result7.locus,
alleles=ds_result7.alleles, gene=ds_result7.vep.transcript_consequences[0].gene_symbol,
hets=ds_result7.hets,
n_hets=ds_result7.n_het,
homs=ds_result7.homs,
n_homs=ds_result7.n_hom
).to_pandas()

The error points to this line as the cause of the index-out-of-bounds exception: `transcript_consequences` was an empty array in at least one row of the table.

Can you suggest any workarounds?

Using `ds_result7.vep.transcript_consequences.head()` may give you what you want — this returns a missing value if the array has length 0.

The array indeed had empty/missing values. Actually, I'm producing an Excel file by selecting the specified fields, so what I'm doing now is exporting the whole `vep` field and processing it in pandas. Can you suggest something?

Why not stick with

ds_result7 = table_miss.rows()
ds_result7 = ds_result7.key_by()
df = ds_result7.select(
  locus=ds_result7.locus,
  alleles=ds_result7.alleles,
  gene=ds_result7.vep.transcript_consequences.head().gene_symbol,
  hets=ds_result7.hets,
  n_hets=ds_result7.n_het,
  homs=ds_result7.homs,
  n_homs=ds_result7.n_hom
).to_pandas()

, as suggested?
You may also wish to do something like

gene=hl.delimit(
  ds_result7.vep.transcript_consequences\
  .map(lambda transcript: transcript.gene_symbol),
  ","
)
1 Like

Thanks @olszewskip — I didn't understand earlier how to use it.

1 Like