Group_by with scans

Hi hail team,

I’m trying to use hl.scan.sum in combination with group_by to get the cumulative number of variants per exon. I’m not super familiar with scans and was wondering how to get a show() working?

I’m running:

ht_result = ht.group_by(ht.transcript, ht.exon).aggregate(
    test_sum=hl.scan.sum(ht.variant_count))
ht_result.show()

and getting this error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/anaconda3/envs/hail/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

~/anaconda3/envs/hail/lib/python3.7/site-packages/IPython/lib/pretty.py in pretty(self, obj)
    392                         if cls is not object \
    393                                 and callable(cls.__dict__.get('__repr__')):
--> 394                             return _repr_pprint(obj, self, cycle)
    395 
    396             return _default_pprint(obj, self, cycle)

~/anaconda3/envs/hail/lib/python3.7/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    698     """A pprint that just redirects to the normal repr function."""
    699     # Find newlines and replace them with p.break_()
--> 700     output = repr(obj)
    701     lines = output.splitlines()
    702     with p.group():

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in __repr__(self)
   1267 
   1268         def __repr__(self):
-> 1269             return self.__str__()
   1270 
   1271         def data(self):

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in __str__(self)
   1264 
   1265         def __str__(self):
-> 1266             return self._ascii_str()
   1267 
   1268         def __repr__(self):

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in _ascii_str(self)
   1290                 return s
   1291 
-> 1292             rows, has_more, dtype = self.data()
   1293             fields = list(dtype)
   1294             trunc_fields = [trunc(f) for f in fields]

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in data(self)
   1274                 row_dtype = t.row.dtype
   1275                 t = t.select(**{k: hl._showstr(v) for (k, v) in t.row.items()})
-> 1276                 rows, has_more = t._take_n(self.n)
   1277                 self._data = (rows, has_more, row_dtype)
   1278             return self._data

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in _take_n(self, n)
   1421             has_more = False
   1422         else:
-> 1423             rows = self.take(n + 1)
   1424             has_more = len(rows) > n
   1425             rows = rows[:n]

<decorator-gen-1103> in take(self, n, _localize)

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    612     def wrapper(__original_func, *args, **kwargs):
    613         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614         return __original_func(*args_, **kwargs_)
    615 
    616     return wrapper

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in take(self, n, _localize)
   2085         """
   2086 
-> 2087         return self.head(n).collect(_localize)
   2088 
   2089     @typecheck_method(n=int)

<decorator-gen-1097> in collect(self, _localize)

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    612     def wrapper(__original_func, *args, **kwargs):
    613         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614         return __original_func(*args_, **kwargs_)
    615 
    616     return wrapper

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in collect(self, _localize)
   1884         e = construct_expr(rows_ir, hl.tarray(t.row.dtype))
   1885         if _localize:
-> 1886             return Env.backend().execute(e._ir)
   1887         else:
   1888             return e

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/backend/spark_backend.py in execute(self, ir, timed)
    292 
    293     def execute(self, ir, timed=False):
--> 294         jir = self._to_java_value_ir(ir)
    295         # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
    296         result = json.loads(self._jhc.backend().executeJSON(jir))

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/backend/spark_backend.py in _to_java_value_ir(self, ir)
    280 
    281     def _to_java_value_ir(self, ir):
--> 282         return self._to_java_ir(ir, self._parse_value_ir)
    283 
    284     def _to_java_table_ir(self, ir):

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/backend/spark_backend.py in _to_java_ir(self, ir, parse)
    276             r = CSERenderer(stop_at_jir=True)
    277             # FIXME parse should be static
--> 278             ir._jir = parse(r(ir), ir_map=r.jirs)
    279         return ir._jir
    280 

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/ir/renderer.py in __call__(self, root)
    181 
    182     def __call__(self, root: 'ir.BaseIR') -> str:
--> 183         binding_sites = CSEAnalysisPass(self)(root)
    184         return CSEPrintPass(self)(root, binding_sites)
    185 

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/ir/renderer.py in __call__(self, root)
    251 
    252             if isinstance(child, ir.IR):
--> 253                 bind_depth = child_frame.bind_depth()
    254                 lets = None
    255                 if bind_depth < len(stack):

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/ir/renderer.py in bind_depth(self)
    349                 bind_depth = max(bind_depth, max(self.context[1][var] for var in self.node.free_agg_vars))
    350             if len(self.node.free_scan_vars) > 0:
--> 351                 bind_depth = max(bind_depth, max(self.context[2][var] for var in self.node.free_scan_vars))
    352             return bind_depth
    353 

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/ir/renderer.py in <genexpr>(.0)
    349                 bind_depth = max(bind_depth, max(self.context[1][var] for var in self.node.free_agg_vars))
    350             if len(self.node.free_scan_vars) > 0:
--> 351                 bind_depth = max(bind_depth, max(self.context[2][var] for var in self.node.free_scan_vars))
    352             return bind_depth
    353 

KeyError: 'row'

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/anaconda3/envs/hail/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
    343             method = get_real_method(obj, self.print_method)
    344             if method is not None:
--> 345                 return method()
    346             return None
    347         else:

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in _repr_html_(self)
   1279 
   1280         def _repr_html_(self):
-> 1281             return self._html_str()
   1282 
   1283         def _ascii_str(self):

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in _html_str(self)
   1369             types = self.types
   1370 
-> 1371             rows, has_more, dtype = self.data()
   1372             fields = list(dtype)
   1373 

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in data(self)
   1274                 row_dtype = t.row.dtype
   1275                 t = t.select(**{k: hl._showstr(v) for (k, v) in t.row.items()})
-> 1276                 rows, has_more = t._take_n(self.n)
   1277                 self._data = (rows, has_more, row_dtype)
   1278             return self._data

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in _take_n(self, n)
   1421             has_more = False
   1422         else:
-> 1423             rows = self.take(n + 1)
   1424             has_more = len(rows) > n
   1425             rows = rows[:n]

<decorator-gen-1103> in take(self, n, _localize)

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    612     def wrapper(__original_func, *args, **kwargs):
    613         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614         return __original_func(*args_, **kwargs_)
    615 
    616     return wrapper

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in take(self, n, _localize)
   2085         """
   2086 
-> 2087         return self.head(n).collect(_localize)
   2088 
   2089     @typecheck_method(n=int)

<decorator-gen-1097> in collect(self, _localize)

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    612     def wrapper(__original_func, *args, **kwargs):
    613         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614         return __original_func(*args_, **kwargs_)
    615 
    616     return wrapper

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/table.py in collect(self, _localize)
   1884         e = construct_expr(rows_ir, hl.tarray(t.row.dtype))
   1885         if _localize:
-> 1886             return Env.backend().execute(e._ir)
   1887         else:
   1888             return e

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/backend/spark_backend.py in execute(self, ir, timed)
    292 
    293     def execute(self, ir, timed=False):
--> 294         jir = self._to_java_value_ir(ir)
    295         # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
    296         result = json.loads(self._jhc.backend().executeJSON(jir))

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/backend/spark_backend.py in _to_java_value_ir(self, ir)
    280 
    281     def _to_java_value_ir(self, ir):
--> 282         return self._to_java_ir(ir, self._parse_value_ir)
    283 
    284     def _to_java_table_ir(self, ir):

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/backend/spark_backend.py in _to_java_ir(self, ir, parse)
    276             r = CSERenderer(stop_at_jir=True)
    277             # FIXME parse should be static
--> 278             ir._jir = parse(r(ir), ir_map=r.jirs)
    279         return ir._jir
    280 

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/ir/renderer.py in __call__(self, root)
    181 
    182     def __call__(self, root: 'ir.BaseIR') -> str:
--> 183         binding_sites = CSEAnalysisPass(self)(root)
    184         return CSEPrintPass(self)(root, binding_sites)
    185 

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/ir/renderer.py in __call__(self, root)
    251 
    252             if isinstance(child, ir.IR):
--> 253                 bind_depth = child_frame.bind_depth()
    254                 lets = None
    255                 if bind_depth < len(stack):

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/ir/renderer.py in bind_depth(self)
    349                 bind_depth = max(bind_depth, max(self.context[1][var] for var in self.node.free_agg_vars))
    350             if len(self.node.free_scan_vars) > 0:
--> 351                 bind_depth = max(bind_depth, max(self.context[2][var] for var in self.node.free_scan_vars))
    352             return bind_depth
    353 

~/anaconda3/envs/hail/lib/python3.7/site-packages/hail/ir/renderer.py in <genexpr>(.0)
    349                 bind_depth = max(bind_depth, max(self.context[1][var] for var in self.node.free_agg_vars))
    350             if len(self.node.free_scan_vars) > 0:
--> 351                 bind_depth = max(bind_depth, max(self.context[2][var] for var in self.node.free_scan_vars))
    352             return bind_depth
    353 

KeyError: 'row'

I’d appreciate any tips!

The error message here is obviously a bug, but the core issue is that grouped scans like this aren’t supported (this is really tricky to implement on the backend when you’re running in parallel). We have some functionality coming soon which might make this kind of thing possible, but probably not easy.

Can you explain what in particular you’re trying to do here? I’m having trouble piecing together what the scan should produce.

Thanks for responding! I’m looking for a way to get cumulative variant counts across exons, given an input Table that has locus, transcript, exon, and variant count information. For example, with input that looks something like this:

chr  pos    transcript      exon      variant_count
1    100    transcript_x    exon_1    3
1    150    transcript_x    exon_1    1
1    175    transcript_x    exon_1    4
1    200    transcript_x    exon_2    2
1    250    transcript_x    exon_2    3
2    100    transcript_y    exon_1    1
2    150    transcript_y    exon_1    1

I’d like to be able to produce an output like this:

chr  pos    transcript      exon      variant_count         cumulative_count
1    100    transcript_x    exon_1    3                     3
1    150    transcript_x    exon_1    1                     4
1    174    transcript_x    exon_1    4                     8
1    200    transcript_x    exon_2    2                     2
1    250    transcript_x    exon_2    3                     5
2    100    transcript_y    exon_1    1                     1
2    150    transcript_y    exon_1    1                     2

I’m trying to figure out how to tell a scan where exon boundaries are (when to start and stop the cumulative sums).

Ah, I see. I think it’s hard to see how the original approach would have done this anyway – I think the order on chr/pos would have been lost as soon as you group by transcript/exon.

I think there’s a strategy here which will work, and that’s to do group_by('transcript', 'exon').aggregate(variants = hl.agg.collect(hl.struct(chr=ht.chr, pos=ht.pos))

You can almost certainly fit all the positions in a transcript/exon into memory, letting you do this scan using hl.scan on the array locally inside a ht.annotate.

1 Like

Thanks! Yeah, my original group by wouldn’t work…oops. Basic question, would this group by and collect method work on the full gnomAD sites Table?

It should be fine, yeah.

1 Like