I’d like to subset a table (or matrix table) containing individual-level data to a list of a couple of hundred variants. Say the list of variants is something like `variants = [["10", 123, "G", "C"], ["10", 456, "T", "A"], ...]`. I have tried:
- `hl.filter_intervals()` with one interval of length one for each variant, but that takes forever, and also doesn’t check whether the alleles are equal,
- the following, which works nicely and reasonably fast for, say, 50 variants, but crashes with a StackOverflow error for 300 variants:
```
from functools import reduce
from operator import or_
match_exprs = [(mt.locus.contig == contig) & (mt.locus.position == pos) & (mt.alleles == [ref, alt]) for contig, pos, ref, alt in variants]
mt_subset = mt.filter_rows(reduce(or_, match_exprs))
```
- the following, along the lines of which ChatGPT pointed me:
```
mt_subset = mt.filter_rows(hl.any(lambda x: (mt.locus.contig == hl.literal(x[0])) & (mt.locus.position == hl.literal(int(x[1]))) & (mt.alleles == hl.literal(x[2:])), variants))
```
but that crashes with something like the following error (the traceback is copied from the actual failure, so the code it shows is the actual code rather than my slight rewording of it above):
Traceback (most recent call last):
[…]
, in _subset_matrix_table_to_variants
return mt.filter_rows(hl.any(lambda x: (mt.locus.contig == hl.literal(x[0])) & (mt.locus.position == hl.literal(int(x[1]))) & (mt.alleles == hl.literal(x[2:])), variants))
File “/app/.venv/lib/python3.10/site-packages/hail/expr/functions.py”, line 3531, in any
return collection.any(f)
File “”, line 2, in any
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 577, in wrapper
return original_func(*args, **kwargs)
File “/app/.venv/lib/python3.10/site-packages/hail/expr/expressions/typed_expressions.py”, line 68, in any
return hl.array(self).fold(lambda accum, elt: accum | f(elt), False)
File “”, line 2, in fold
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 577, in wrapper
return original_func(*args, **kwargs)
File “/app/.venv/lib/python3.10/site-packages/hail/expr/expressions/typed_expressions.py”, line 221, in fold
return collection._to_stream().fold(lambda x, y: f(x, y), zero)
File “”, line 2, in fold
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 577, in wrapper
return original_func(*args, **kwargs)
File “/app/.venv/lib/python3.10/site-packages/hail/expr/expressions/typed_expressions.py”, line 4522, in fold
body = to_expr(f(accum_ref, elt_ref))
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 364, in f
ret = x(*args)
File “/app/.venv/lib/python3.10/site-packages/hail/expr/expressions/typed_expressions.py”, line 221, in
return collection._to_stream().fold(lambda x, y: f(x, y), zero)
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 364, in f
ret = x(*args)
File “/app/.venv/lib/python3.10/site-packages/hail/expr/expressions/typed_expressions.py”, line 68, in
return hl.array(self).fold(lambda accum, elt: accum | f(elt), False)
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 364, in f
ret = x(*args)
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 364, in f
ret = x(*args)
File “/app/linkage_disequilibrium/ld_hail/pearson_correlations.py”, line 209, in
return mt.filter_rows(hl.any(lambda x: (mt.locus.contig == hl.literal(x[0])) & (mt.locus.position == hl.literal(x[1])) & (mt.alleles == hl.literal(x[2:])), ids))
File “”, line 2, in literal
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 577, in wrapper
return original_func(*args, **kwargs)
File “/app/.venv/lib/python3.10/site-packages/hail/expr/functions.py”, line 261, in literal
return literal(hl.eval(to_expr(x, dtype)), dtype)
File “”, line 2, in eval
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 577, in wrapper
return original_func(*args, **kwargs)
File “/app/.venv/lib/python3.10/site-packages/hail/expr/expressions/expression_utils.py”, line 223, in eval
return eval_timed(expression)[0]
File “”, line 2, in eval_timed
File “/app/.venv/lib/python3.10/site-packages/hail/typecheck/check.py”, line 577, in wrapper
return original_func(*args, **kwargs)
File “/app/.venv/lib/python3.10/site-packages/hail/expr/expressions/expression_utils.py”, line 189, in eval_timed
return _eval_many(expression, timed=True, name=‘eval_timed’)[0]
File “/app/.venv/lib/python3.10/site-packages/hail/expr/expressions/expression_utils.py”, line 150, in _eval_many
return Env.backend().execute_many(*irs, timed=timed)
File “/app/.venv/lib/python3.10/site-packages/hail/backend/backend.py”, line 38, in execute_many
return [self.execute(MakeTuple([ir]), timed=timed)[0] for ir in irs]
File “/app/.venv/lib/python3.10/site-packages/hail/backend/backend.py”, line 38, in
return [self.execute(MakeTuple([ir]), timed=timed)[0] for ir in irs]
File “/app/.venv/lib/python3.10/site-packages/hail/backend/py4j_backend.py”, line 94, in execute
jir = self._to_java_value_ir(ir)
File “/app/.venv/lib/python3.10/site-packages/hail/backend/spark_backend.py”, line 280, in _to_java_value_ir
return self._to_java_ir(ir, self._parse_value_ir)
File “/app/.venv/lib/python3.10/site-packages/hail/backend/spark_backend.py”, line 276, in _to_java_ir
ir._jir = parse(r(finalize_randomness(ir)), ir_map=r.jirs)
File “/app/.venv/lib/python3.10/site-packages/hail/backend/spark_backend.py”, line 245, in _parse_value_ir
return self._jbackend.parse_value_ir(
File “/app/.venv/lib/python3.10/site-packages/py4j/java_gateway.py”, line 1304, in call
return_value = get_return_value(
File “/app/.venv/lib/python3.10/site-packages/hail/backend/py4j_backend.py”, line 21, in deco
return f(*args, **kwargs)
File “/app/.venv/lib/python3.10/site-packages/py4j/protocol.py”, line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o46.parse_value_ir.
: java.util.NoSuchElementException: key not found: __uid_4
at scala.collection.immutable.Map$Map1.apply(Map.scala:114)
at is.hail.expr.ir.Env.apply(Env.scala:128)
at is.hail.expr.ir.IRParser$.ir_value_expr_1(Parser.scala:890)
at is.hail.expr.ir.IRParser$.$anonfun$ir_value_expr$1(Parser.scala:820)
at is.hail.utils.StackSafe$More.advance(StackSafe.scala:64)
at is.hail.utils.StackSafe$.run(StackSafe.scala:16)
at is.hail.utils.StackSafe$StackFrame.run(StackSafe.scala:32)
at is.hail.expr.ir.IRParser$.$anonfun$parse_value_ir$1(Parser.scala:2072)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:2068)
at is.hail.expr.ir.IRParser$.parse_value_ir(Parser.scala:2072)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_value_ir$2(SparkBackend.scala:710)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:70)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:70)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:59)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:339)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_value_ir$1(SparkBackend.scala:709)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:59)
at is.hail.backend.spark.SparkBackend.parse_value_ir(SparkBackend.scala:708)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:829)
I am also aware of `hl.import_locus_intervals`, but would like to avoid that, because it seems I would first need to either write out a file with the variants or create a table in memory akin to the result of `import_locus_intervals`. Also, this again doesn’t check for matching alleles.
Any help or ideas would be appreciated.