How to fix the error of 'MatrixTable.union_rows' expects all datasets to have the same columns

Hi Hail team,

I’m trying to merge all of my per-chromosome MTs into one MT so that I can run sample QC.
The MTs contain exactly the same samples, but I cannot successfully run union_rows(), as shown below.

The code I ran:

# Paths of the two per-chromosome MatrixTables to combine.
mt_2 = "chr2.mt"
mt_20 = "chr20.mt"
# Read each table and print the result of its count().
mt2 = hl.read_matrix_table(mt_2)
print("mt2.count() = {}".format(mt2.count()))
mt20 = hl.read_matrix_table(mt_20)
print("mt20.count() = {}".format(mt20.count()))

# Union Rows
# NOTE: union_rows raises ValueError when the datasets do not have the same
# columns in the same order.
all_mt = mt2.union_rows(mt20)
all_mt.count()

The error message:

mt2.count() = (4796512, 366)
mt20.count() = (1512641, 366)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/738332.tmpdir/ipykernel_33/2717554317.py in <module>
     10 print("mt20.count() = {}".format(mt20.count()))
     11 
---> 12 all_mt = mt2.union_rows(mt20)
     13 all_mt.count()

<decorator-gen-1304> in union_rows(_check_cols, *datasets)

/opt/conda/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    575     def wrapper(__original_func, *args, **kwargs):
    576         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577         return __original_func(*args_, **kwargs_)
    578 
    579     return wrapper

/opt/conda/lib/python3.7/site-packages/hail/matrixtable.py in union_rows(_check_cols, *datasets)
   3621                     .find(lambda x: ~(x[1] == first_keys))[0])))
   3622                 if wrong_keys is not None:
-> 3623                     raise ValueError(f"'MatrixTable.union_rows' expects all datasets to have the same columns. "
   3624                                      f"Datasets 0 and {wrong_keys + 1} have different columns (or possibly different order).")
   3625             return MatrixTable(ir.MatrixUnionRows(*[d._mir for d in datasets]))

ValueError: 'MatrixTable.union_rows' expects all datasets to have the same columns. Datasets 0 and 1 have different columns (or possibly different order).

I took a look at the source code (Hail | hail.matrixtable), and it seems that my columns are in a different order.

Does anyone know how to fix this problem? I’m thinking of re-ordering my columns, but I’m not sure how to do it.

// chr2:
mt2_col_list = mt2.col_key.collect()
mt2_col_list[:10]

[Struct(s='TWHJ-PNRR-10145'),
 Struct(s='TWHJ-PNRR-10826-10826'),
 Struct(s='TWHJ-PNRR-10245'),
 Struct(s='TWHJ-PNRR-10703'),
 Struct(s='TWHJ-PNRR-10867-10867'),
 Struct(s='TWHJ-PNRR-10787'),
 Struct(s='TWHJ-PNRR-10833-10833'),
 Struct(s='TWHJ-PNRR-10859-10859'),
 Struct(s='TWHJ-PNRR-10716-10716'),
 Struct(s='TWHJ-PNRR-10823-10823')]

// chr20:
mt20_col_list = mt20.col_key.collect()
mt20_col_list[:10]
[Struct(s='TWHJ-PNRR-10800-10800'),
 Struct(s='TWHJ-PNRR-10577'),
 Struct(s='TWHJ-PNRR-10332-10332'),
 Struct(s='TWHJ-PNRR-10257'),
 Struct(s='TWHJ-PNRR-10388'),
 Struct(s='TWHJ-PNRR-10951'),
 Struct(s='TWHJ-PNRR-10954'),
 Struct(s='TWHJ-PNRR-10105-10105'),
 Struct(s='TWHJ-PNRR-10188'),
 Struct(s='TWHJ-PNRR-10453')]

Thanks for helping and happy new year!

Best,
Po-Ying

Here’s a function you can use:


def align_mt2_cols_to_mt1(mt1, mt2):
    """Return ``mt2`` with its columns reordered to follow the column order of ``mt1``.

    Args:
        mt1: MatrixTable whose column order is the target order.
        mt2: MatrixTable to reorder.

    Returns:
        ``mt2`` (with an added ``col_idx`` column field) whose columns are
        chosen in the order in which their keys appear in ``mt1``.

    NOTE(review): assumes both tables share the same column key and that
    every column key of ``mt1`` is present in ``mt2`` -- confirm before use.
    """
    # Record each mt2 column's current position in the ``col_idx`` field.
    mt2 = mt2.add_col_index()
    # Walk mt1's columns in order and look up, by column key, the position
    # that the matching column currently has in mt2.
    new_col_order = mt2.index_cols(mt1.col_key).col_idx.collect()
    return mt2.choose_cols(new_col_order)

# Pass both tables: mt1 supplies the target column order, mt2 is reordered.
mt2 = align_mt2_cols_to_mt1(mt1, mt2)
1 Like

Hi @tpoterba

I really appreciate your reply, but I still ran into an issue, as shown below:

# Paths of the two per-chromosome MatrixTables.
mt_2 = "/storage1/fs1/jin810/Active/Neuropathy_WGS_2021May/hail_runCombiner_resultneuropathy_batch1_hail_biallels_denseMT_chr2.mt"
mt_20 = "/storage1/fs1/jin810/Active/Neuropathy_WGS_2021May/hail_runCombiner_resultneuropathy_batch1_hail_biallels_denseMT_chr20.mt"
# Read each table and print the result of its count().
mt2 = hl.read_matrix_table(mt_2)
print("mt2.count() = {}".format(mt2.count()))
mt20 = hl.read_matrix_table(mt_20)
print("mt20.count() = {}".format(mt20.count()))

# def align_mt2_cols_to_mt1(mt1, mt2):
#     mt1 = mt1.add_col_index()
#     mt2 = mt2.add_col_index()
#     new_col_order = mt2.index_cols(mt1.col_idx).col_idx.collect()
#     return mt2.choose_cols(new_col_order)

# mt2 = align_mt2_cols_to_mt1(mt20, mt2)

# NOTE: the index_cols call below passes mt20.col_idx (int64), while mt2's
# column key is a str, so it fails with a key type mismatch. Indexing with
# mt20.col_key instead matches the key type.
### test:
mt2 = mt2.add_col_index()
# mt2.describe()
mt20 = mt20.add_col_index()
new_col_order = mt2.index_cols(mt20.col_idx).col_idx.collect()
print("new_col_order = {}".format(new_col_order))

# mt2 = mt2.choose_cols(new_col_order)

# # Union Rows
# all_mt = mt2.union_rows(mt20)
# all_mt.count()

The error:

mt2.count() = (4796512, 366)
mt20.count() = (1512641, 366)
---------------------------------------------------------------------------
TableIndexKeyError                        Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/hail/matrixtable.py in index_cols(self, all_matches, *exprs)
   2858         try:
-> 2859             return self.cols()._index(*exprs, all_matches=all_matches)
   2860         except TableIndexKeyError as err:

/opt/conda/lib/python3.7/site-packages/hail/table.py in _index(self, all_matches, *exprs)
   1656             if not is_interval:
-> 1657                 raise TableIndexKeyError(self.key.dtype, exprs)
   1658 

TableIndexKeyError: 

During handling of the above exception, another exception occurred:

ExpressionException                       Traceback (most recent call last)
/tmp/738332.tmpdir/ipykernel_33/1838263181.py in <module>
     22 # mt2.describe()
     23 mt20 = mt20.add_col_index()
---> 24 new_col_order = mt2.index_cols(mt20.col_idx).col_idx.collect()
     25 print("new_col_order = {}".format(new_col_order))
     26 

/opt/conda/lib/python3.7/site-packages/hail/matrixtable.py in index_cols(self, all_matches, *exprs)
   2860         except TableIndexKeyError as err:
   2861             raise ExpressionException(
-> 2862                 f"Key type mismatch: cannot index matrix table with given expressions:\n"
   2863                 f"  MatrixTable col key: {', '.join(str(t) for t in err.key_type.values()) or '<<<empty key>>>'}\n"
   2864                 f"  Index expressions:   {', '.join(str(e.dtype) for e in err.index_expressions)}")

ExpressionException: Key type mismatch: cannot index matrix table with given expressions:
  MatrixTable col key: str
  Index expressions:   int64

Hi Hail team,

Thanks for pointing a way out. I modified the function a little bit, and it works now.
Here is the modified solution:

def align_mt2_cols_to_mt1(mt1, mt2):
    """Reorder the columns of ``mt2`` so that they follow the column order of ``mt1``.

    Args:
        mt1: MatrixTable providing the target column order.
        mt2: MatrixTable whose columns are rearranged.

    Returns:
        ``mt2`` (carrying an added ``col_idx`` column field) with its columns
        chosen in the order of ``mt1``'s column keys.
    """
    mt1 = mt1.add_col_index()
    indexed_mt2 = mt2.add_col_index()
    # Look up mt2's columns by mt1's column key, in mt1's column order.
    matched_cols = indexed_mt2.index_cols(mt1.col_key)
    # Positions in mt2 of the columns matching each mt1 column.
    positions = matched_cols.col_idx.collect()
    return indexed_mt2.choose_cols(positions)

# Rebind mt2 to the reordered table; mt1 supplies the target column order.
mt2 = align_mt2_cols_to_mt1(mt1, mt2)

My testing sets:

# Paths of the two per-chromosome MatrixTables used for the test.
mt_2 = "chr2.mt"
mt_20 = "chr20.mt"
# Read chromosome 2 and print the result of its count().
mt2 = hl.read_matrix_table(mt_2)
print("mt2.count() = {}".format(mt2.count()))

# Read chromosome 20 and print the result of its count().
mt20 = hl.read_matrix_table(mt_20)
print("mt20.count() = {}".format(mt20.count()))

def align_mt2_cols_to_mt1(mt1, mt2):
    """Reorder the columns of ``mt2`` so that they follow the column order of ``mt1``.

    Args:
        mt1: MatrixTable providing the target column order.
        mt2: MatrixTable whose columns are rearranged.

    Returns:
        ``mt2`` (carrying an added ``col_idx`` column field) with its columns
        chosen in the order of ``mt1``'s column keys.
    """
    mt1 = mt1.add_col_index()
    indexed_mt2 = mt2.add_col_index()
    # Look up mt2's columns by mt1's column key, in mt1's column order.
    matched_cols = indexed_mt2.index_cols(mt1.col_key)
    # Positions in mt2 of the columns matching each mt1 column.
    positions = matched_cols.col_idx.collect()
    return indexed_mt2.choose_cols(positions)

# Reorder mt2's columns to follow mt20's column order before the union.
mt2 = align_mt2_cols_to_mt1(mt20, mt2)

# Union Rows
# Combine the rows of both tables; the columns now line up.
all_mt = mt2.union_rows(mt20)
all_mt.count()

///////////////
// Result:
///////////////
mt2.count() = (4796512, 366)
mt20.count() = (1512641, 366)
[53]: (6309153, 366)

Thanks again!

Best,
Po-Ying