Strange KeyError (KeyError: 'g')

Hey Hail team, I’m running into a weird bug. I’m running the following code (Hail version 0.2.28-61941242c15d):

data_source = 'broad'
freeze = 5

hardcalls = get_ukbb_data(data_source, freeze, raw=False, split=True, adj=False)

sample_map_ht = hl.read_table(array_sample_map_ht(data_source, freeze))
sample_map = hl.import_table(array_sample_map(freeze), delimiter=',', quote='"')
sample_map = sample_map.key_by(s=sample_map.eid_26041)

print(hardcalls.count())
chr20 = hl.filter_intervals(hardcalls, [hl.parse_locus_interval('chr20', reference_genome='GRCh38')])
print(chr20.count())

chr20 = chr20.select_rows('a_index', 'was_split')

chr20 = chr20.annotate_cols(**sample_map_ht[chr20.s])
chr20 = chr20.annotate_cols(**sample_map[chr20.ukbb_app_26041_id])

chr20 = chr20.select_cols('batch', 'batch.c')
chr20 = chr20.transmute_cols(batch_num=chr20['batch'],
                             batch=chr20['batch.c'])
chr20.describe()

chr20 = chr20.annotate_rows(
    n_not_called_50K=chr20.aggregate_cols(
        hl.agg.filter(chr20.batch == '150K', 
        hl.agg.count_where(hl.is_missing(chr20.GT)))),
    n_not_called_100K=chr20.aggregate_cols(
        hl.agg.filter(chr20.batch == '100K', 
        hl.agg.count_where(hl.is_missing(chr20.GT)))),
    n_not_called_200K=chr20.aggregate_cols(
        hl.agg.filter(chr20.batch == '200K', 
        hl.agg.count_where(hl.is_missing(chr20.GT))))
)

and getting this error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-5-10337294da16> in <module>
      2     n_not_called_50K=chr20.aggregate_cols(
      3         hl.agg.filter(chr20.batch == '150K', 
----> 4         hl.agg.count_where(hl.is_missing(chr20.GT)))),
      5     n_not_called_100K=chr20.aggregate_cols(
      6         hl.agg.filter(chr20.batch == '100K', 

</opt/conda/default/lib/python3.6/site-packages/decorator.py:decorator-gen-1157> in aggregate_cols(self, expr, _localize)

/opt/conda/default/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    583     def wrapper(__original_func, *args, **kwargs):
    584         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 585         return __original_func(*args_, **kwargs_)
    586 
    587     return wrapper

/opt/conda/default/lib/python3.6/site-packages/hail/matrixtable.py in aggregate_cols(self, expr, _localize)
   2037         agg_ir = TableAggregate(MatrixColsTable(base._mir), subst_query)
   2038         if _localize:
-> 2039             return Env.backend().execute(agg_ir)
   2040         else:
   2041             return construct_expr(agg_ir, expr.dtype)

/opt/conda/default/lib/python3.6/site-packages/hail/backend/backend.py in execute(self, ir, timed)
    107 
    108     def execute(self, ir, timed=False):
--> 109         result = json.loads(Env.hc()._jhc.backend().executeJSON(self._to_java_ir(ir)))
    110         value = ir.typ._from_json(result['value'])
    111         timings = result['timings']

/opt/conda/default/lib/python3.6/site-packages/hail/backend/backend.py in _to_java_ir(self, ir)
    103             r = CSERenderer(stop_at_jir=True)
    104             # FIXME parse should be static
--> 105             ir._jir = ir.parse(r(ir), ir_map=r.jirs)
    106         return ir._jir
    107 

/opt/conda/default/lib/python3.6/site-packages/hail/ir/renderer.py in __call__(self, root)
    181 
    182     def __call__(self, root: 'ir.BaseIR') -> str:
--> 183         binding_sites = CSEAnalysisPass(self)(root)
    184         return CSEPrintPass(self)(root, binding_sites)
    185 

/opt/conda/default/lib/python3.6/site-packages/hail/ir/renderer.py in __call__(self, root)
    251 
    252             if isinstance(child, ir.IR):
--> 253                 bind_depth = child_frame.bind_depth()
    254                 lets = None
    255                 if bind_depth < len(stack):

/opt/conda/default/lib/python3.6/site-packages/hail/ir/renderer.py in bind_depth(self)
    345                 bind_depth = max(bind_depth, max(self.context[0][var] for var in self.node.free_vars))
    346             if len(self.node.free_agg_vars) > 0:
--> 347                 bind_depth = max(bind_depth, max(self.context[1][var] for var in self.node.free_agg_vars))
    348             if len(self.node.free_scan_vars) > 0:
    349                 bind_depth = max(bind_depth, max(self.context[2][var] for var in self.node.free_scan_vars))

/opt/conda/default/lib/python3.6/site-packages/hail/ir/renderer.py in <genexpr>(.0)
    345                 bind_depth = max(bind_depth, max(self.context[0][var] for var in self.node.free_vars))
    346             if len(self.node.free_agg_vars) > 0:
--> 347                 bind_depth = max(bind_depth, max(self.context[1][var] for var in self.node.free_agg_vars))
    348             if len(self.node.free_scan_vars) > 0:
    349                 bind_depth = max(bind_depth, max(self.context[2][var] for var in self.node.free_scan_vars))

KeyError: 'g'

I’d appreciate any insight into this error!

This is one for @patrick-schultz

See the conversation at “Java errors in notebook”.

Can you try updating to 0.2.29? I think this might be fixed already.

I just tried the following:

import hail as hl
from ukbb_qc.resources import *

hl.init(default_reference='GRCh38', log='/test.log')

data_source = 'broad'
freeze = 5

#mt = get_ukbb_data(data_source, freeze, raw=True, split=False)
hardcalls = get_ukbb_data(data_source, freeze, raw=False, split=True, adj=False)

sample_map_ht = hl.read_table(array_sample_map_ht(data_source, freeze))
sample_map = hl.import_table(array_sample_map(freeze), delimiter=',', quote='"')
sample_map = sample_map.key_by(s=sample_map.eid_26041)

print(hardcalls.count())
chr20 = hl.filter_intervals(hardcalls, [hl.parse_locus_interval('chr20', reference_genome='GRCh38')])
print(chr20.count())

chr20 = chr20.select_rows('a_index', 'was_split')

chr20 = chr20.annotate_cols(**sample_map_ht[chr20.s])
chr20 = chr20.annotate_cols(**sample_map[chr20.ukbb_app_26041_id])

chr20 = chr20.select_cols('batch', 'batch.c')
chr20 = chr20.transmute_cols(batch_num=chr20['batch'],
                             batch=chr20['batch.c'])
chr20.describe()

chr20 = chr20.filter_cols(hl.is_defined(chr20.batch))

chr20 = chr20.annotate_rows(
    n_not_called_50K=
        hl.agg.filter(chr20.batch == '150K',
        hl.agg.count_where(hl.is_missing(chr20.GT))),
    n_not_called_100K=
        hl.agg.filter(chr20.batch == '100K',
        hl.agg.count_where(hl.is_missing(chr20.GT))),
    n_not_called_200K=
        hl.agg.filter(chr20.batch == '200K',
        hl.agg.count_where(hl.is_missing(chr20.GT)))
)

chr20 = chr20.rows()
chr20.show(5)

on 0.2.29, and it crashed – test.log (1.4 MB)

I can replicate this locally. This is totally a “bad error message” bug; the error you should have gotten is:

ExpressionException: scope violation: 'MatrixTable.aggregate_cols' supports aggregation over indices ['column']
    Found indices ['column', 'row'], with unexpected indices ['row']. Invalid fields:
        'GT' (indices ['column', 'row'])
1 Like

Fixed here:

1 Like