Hi,
I’m using Hail 0.2.83 and am having a problem creating a Table from a pandas DataFrame.
I’m attempting to run gnomAD’s assign_population_pcs (gnomad.sample_qc.ancestry — gnomad master documentation).
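For reference, here is the call I’m making, copied from the traceback below (the inline comments are my own annotations):

from gnomad.sample_qc.ancestry import assign_population_pcs

pop_ht, pop_clf = assign_population_pcs(
    pca_scores,          # Hail Table of PCA scores
    pca_scores.scores,   # the PC score columns
    known_col="cohort",  # column holding the known population labels
    n_estimators=100,
    prop_train=0.8,
    min_prob=0.5,
)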
The error I get is:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [68], in <module>
1 from gnomad.sample_qc.ancestry import assign_population_pcs
----> 2 pop_ht, pop_clf = assign_population_pcs(pca_scores, pca_scores.scores, known_col="cohort", n_estimators=100, prop_train=0.8, min_prob=0.5)
File ~/venv/lib/python3.8/site-packages/gnomad/sample_qc/ancestry.py:232, in assign_population_pcs(pop_pca_scores, pc_cols, known_col, fit, seed, prop_train, n_estimators, min_prob, output_col, missing_label)
224 logger.info(
225 "Found the following sample count after population assignment: %s",
226 ", ".join(
227 f"{pop}: {count}" for pop, count in Counter(pop_pc_pd[output_col]).items()
228 ),
229 )
231 if hail_input:
--> 232 pops_ht = hl.Table.from_pandas(pop_pc_pd, key=list(pop_pca_scores.key))
233 pops_ht.annotate_globals(
234 assign_pops_from_pc_params=hl.struct(min_assignment_prob=min_prob)
235 )
236 return pops_ht, pop_clf
File <decorator-gen-1085>:2, in from_pandas(df, key)
File ~/venv/lib/python3.8/site-packages/hail/typecheck/check.py:577, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
574 @decorator
575 def wrapper(__original_func, *args, **kwargs):
576 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577 return __original_func(*args_, **kwargs_)
File ~/venv/lib/python3.8/site-packages/hail/table.py:3293, in Table.from_pandas(df, key)
3271 @staticmethod
3272 @typecheck(df=pandas.DataFrame,
3273 key=oneof(str, sequenceof(str)))
3274 def from_pandas(df, key=[]) -> 'Table':
3275 """Create table from Pandas DataFrame
3276
3277 Examples
(...)
3291 :class:`.Table`
3292 """
-> 3293 return Env.spark_backend('from_pandas').from_pandas(df, key)
File ~/venv/lib/python3.8/site-packages/hail/backend/spark_backend.py:325, in SparkBackend.from_pandas(self, df, key)
324 def from_pandas(self, df, key):
--> 325 return Table.from_spark(Env.spark_session().createDataFrame(df), key)
File ~/venv/lib/python3.8/site-packages/pyspark/sql/session.py:673, in SparkSession.createDataFrame(self, data, schema, samplingRatio, verifySchema)
670 has_pandas = False
671 if has_pandas and isinstance(data, pandas.DataFrame):
672 # Create a DataFrame from pandas DataFrame.
--> 673 return super(SparkSession, self).createDataFrame(
674 data, schema, samplingRatio, verifySchema)
675 return self._create_dataframe(data, schema, samplingRatio, verifySchema)
File ~/venv/lib/python3.8/site-packages/pyspark/sql/pandas/conversion.py:300, in SparkConversionMixin.createDataFrame(self, data, schema, samplingRatio, verifySchema)
298 raise
299 data = self._convert_from_pandas(data, schema, timezone)
--> 300 return self._create_dataframe(data, schema, samplingRatio, verifySchema)
File ~/venv/lib/python3.8/site-packages/pyspark/sql/session.py:700, in SparkSession._create_dataframe(self, data, schema, samplingRatio, verifySchema)
698 rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
699 else:
--> 700 rdd, schema = self._createFromLocal(map(prepare, data), schema)
701 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
702 jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
File ~/venv/lib/python3.8/site-packages/pyspark/sql/session.py:512, in SparkSession._createFromLocal(self, data, schema)
509 data = list(data)
511 if schema is None or isinstance(schema, (list, tuple)):
--> 512 struct = self._inferSchemaFromList(data, names=schema)
513 converter = _create_converter(struct)
514 data = map(converter, data)
File ~/venv/lib/python3.8/site-packages/pyspark/sql/session.py:439, in SparkSession._inferSchemaFromList(self, data, names)
437 if not data:
438 raise ValueError("can not infer schema from empty dataset")
--> 439 schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
440 if _has_nulltype(schema):
441 raise ValueError("Some of types cannot be determined after inferring")
File ~/venv/lib/python3.8/site-packages/pyspark/sql/types.py:1107, in _merge_type(a, b, name)
1105 if isinstance(a, StructType):
1106 nfs = dict((f.name, f.dataType) for f in b.fields)
-> 1107 fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()),
1108 name=new_name(f.name)))
1109 for f in a.fields]
1110 names = set([f.name for f in fields])
1111 for n in nfs:
File ~/venv/lib/python3.8/site-packages/pyspark/sql/types.py:1107, in <listcomp>(.0)
1105 if isinstance(a, StructType):
1106 nfs = dict((f.name, f.dataType) for f in b.fields)
-> 1107 fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()),
1108 name=new_name(f.name)))
1109 for f in a.fields]
1110 names = set([f.name for f in fields])
1111 for n in nfs:
File ~/venv/lib/python3.8/site-packages/pyspark/sql/types.py:1102, in _merge_type(a, b, name)
1099 return a
1100 elif type(a) is not type(b):
1101 # TODO: type cast (such as int -> long)
-> 1102 raise TypeError(new_msg("Can not merge type %s and %s" % (type(a), type(b))))
1104 # same type
1105 if isinstance(a, StructType):
TypeError: field cohort: Can not merge type <class 'pyspark.sql.types.StructType'> and <class 'pyspark.sql.types.StringType'>
I’ve stepped through the function’s source code and found that the failing step is very near the end, where the pandas DataFrame is converted to a Hail Table:
pops_ht = hl.Table.from_pandas(pop_pc_pd, key=list(pop_pca_scores.key))
The error is the same as the one above, apart from the field name:
TypeError: field known_pop: Can not merge type <class 'pyspark.sql.types.StructType'> and <class 'pyspark.sql.types.StringType'>
The dtypes of pop_pc_pd (as reported by pop_pc_pd.dtypes) are:
s string
known_pop string
pca_scores object
pop object
prob_AFR float64
prob_AMR float64
prob_EAS float64
prob_EUR float64
prob_SAS float64
dtype: object
list(pop_pca_scores.key) = ['s']
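From the traceback, my understanding is that hl.Table.from_pandas hands the DataFrame to Spark’s createDataFrame, which infers a schema for each row and then merges the per-row schemas; the merge fails when the same field infers to different Spark types in different rows. As a contrived, standalone sketch (hypothetical values, not my actual data), the same error appears when one row of a column holds a struct-like value and another holds a plain string:

import pandas as pd
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# One row's 'cohort' value is struct-like (a Row), the other a plain string,
# so Spark infers StructType for one row and StringType for the other.
df = pd.DataFrame({"s": ["s1", "s2"],
                   "cohort": [Row(label="EUR"), "EUR"]})

# Raises: TypeError: field cohort: Can not merge type
# <class 'pyspark.sql.types.StructType'> and <class 'pyspark.sql.types.StringType'>
spark.createDataFrame(df)

What I can’t see is where a struct-like value would be coming from in my case, given that known_pop reports as a string dtype above.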
Can you suggest a fix for this?
It worked with a previous version (0.2.62, running on Python 3.6), but I had to upgrade due to the recent Log4j vulnerabilities.
Thank you!