Hi Hail team,
I was wondering if you have a tweak to solve the following:
I pass a big file to the function to_pandas()
and I get the following error:
---------------------------------------------------------------------------
FatalError Traceback (most recent call last)
<ipython-input-23-e634dc0fa600> in <module>()
----> 1 pdRows=common_mt.rows().to_pandas(flatten=True)
<decorator-gen-832> in to_pandas(self, flatten)
~/hail-python.zip/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
558 def wrapper(__original_func, *args, **kwargs):
559 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 560 return __original_func(*args_, **kwargs_)
561
562 return wrapper
~/hail-python.zip/hail/table.py in to_pandas(self, flatten)
2517
2518 """
-> 2519 return self.to_spark(flatten).toPandas()
2520
2521 @staticmethod
/usr/lib/spark/python/pyspark/sql/dataframe.py in toPandas(self)
1964 raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
1965 else:
-> 1966 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
1967
1968 dtype = {}
/usr/lib/spark/python/pyspark/sql/dataframe.py in collect(self)
464 """
465 with SCCallSiteSync(self._sc) as css:
--> 466 port = self._jdf.collectToPython()
467 return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
468
/usr/lib/spark/python/lib/py4j-src.zip/py4j/java_gateway.py in __call__(self, *args)
1158 answer = self.gateway_client.send_command(command)
1159 return_value = get_return_value(
-> 1160 answer, self.gateway_client, self.target_id, self.name)
1161
1162 for temp_arg in temp_args:
~/hail-python.zip/hail/utils/java.py in deco(*args, **kwargs)
208 raise FatalError('%s\n\nJava stack trace:\n%s\n'
209 'Hail version: %s\n'
--> 210 'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
211 except pyspark.sql.utils.CapturedException as e:
212 raise FatalError('%s\n\nJava stack trace:\n%s\n'
FatalError: SparkException: Job aborted due to stage failure: Total size of serialized results of 53 tasks (1029.6 MB) is bigger than spark.driver.maxResultSize (1024.0 MB)
I modified the file /hail/hail/create_config_file.py with the properties below and recompiled Hail, but I still get the same error:
'properties': {
'spark:spark.driver.memory': '{driver_memory}g',
'spark:spark.driver.maxResultSize': '0',
'spark:spark.task.maxFailures': '40',
'spark:spark.kryoserializer.buffer.max': '25g',
'spark:spark.driver.extraJavaOptions': '-Xss8M',
'spark:spark.executor.extraJavaOptions': '-Xss8M',
'hdfs:dfs.replication': '1'
}
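Would overriding the limit at runtime, before Hail starts, be the right approach instead? A rough sketch of what I mean (I'm assuming hl.init can be handed an existing SparkContext):

import hail as hl
from pyspark import SparkConf, SparkContext

# Lift the driver result-size cap before Spark/Hail start.
# '0' means unlimited; a bounded value such as '4g' may be safer.
conf = SparkConf().set('spark.driver.maxResultSize', '0')
sc = SparkContext(conf=conf)
hl.init(sc=sc)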
While performing this particular operation, my instances hit ~90% CPU usage and use 15 of 120 GB of RAM.
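Alternatively, would exporting the table to a file and reading it back with pandas sidestep the driver-side collect altogether? A rough sketch (the path is a placeholder):

import pandas as pd

ht = common_mt.rows()
# Placeholder path; on a cluster this would be a gs:// or hdfs:// path.
ht.export('file:///tmp/rows.tsv')
pdRows = pd.read_csv('/tmp/rows.tsv', sep='\t')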
Any help would be appreciated.
Thanks,
Carlos