Hi, I’ve run the code below multiple times, but I get slightly different results on each run (the final counts and the samples that are filtered vary). Do you know why this may be the case?
# Load the matrix table that the per-sample (column) annotations are added to.
input_mt = hl.read_matrix_table(input_mt_path)

# Entry-level predicate shared by all four aggregations below: HL of at least
# 0.85, FT equal to ["PASS"], flagged as a haplogroup-defining variant, and the
# stringified `filters` field not containing "artifact-prone-site".
over_85_expr = (
    (input_mt.HL >= 0.85)
    & (input_mt.FT == ["PASS"])
    & input_mt.hap_defining_variant
    & ~hl.str(input_mt.filters).contains("artifact-prone-site")
)
# Same predicate, additionally capped at HL <= 0.998.
bt_85_and_998_expr = over_85_expr & (input_mt.HL <= .998)
hl_is_defined = hl.is_defined(input_mt.HL)

# Per-sample mean HL and entry count under each of the two predicates.
# NOTE(review): the means are floating-point aggregates that are later compared
# against hard thresholds; if the aggregation order is not fixed between runs,
# borderline samples could flip -- unconfirmed, verify against the Hail docs.
per_sample_aggs = {
    "over_85_mean": hl.agg.filter(over_85_expr, hl.agg.mean(input_mt.HL)),
    "over_85_count": hl.agg.filter(over_85_expr, hl.agg.count_where(hl_is_defined)),
    "bt_85_and_99_mean": hl.agg.filter(bt_85_and_998_expr, hl.agg.mean(input_mt.HL)),
    "bt_85_and_99_count": hl.agg.filter(bt_85_and_998_expr, hl.agg.count_where(hl_is_defined)),
}
input_mt = input_mt.annotate_cols(**per_sample_aggs)

# Use the capped mean when at least 3 entries contributed to it; otherwise
# fall back to the uncapped mean. The estimate is 1 minus the chosen mean.
has_enough_capped = input_mt.bt_85_and_99_count >= 3
input_mt = input_mt.annotate_cols(
    contam_high_het=hl.if_else(
        has_enough_capped,
        1 - input_mt.bt_85_and_99_mean,
        1 - input_mt.over_85_mean,
    )
)

# Replace a NaN estimate with 0 so that such samples are not filtered out
# (per the original author, this is more common for haplogroups closer to the
# reference haplogroup).
estimate_is_nan = hl.is_nan(input_mt.contam_high_het)
input_mt = input_mt.annotate_cols(
    contam_high_het=hl.if_else(estimate_is_nan, 0, input_mt.contam_high_het)
)

# Flag samples whose three contamination measures are all below 2%
# (`freemix_percentage` is compared against 2, the other two against .02).
keep_expr = (
    (input_mt.contamination < .02)
    & (input_mt.freemix_percentage < 2)
    & (input_mt.contam_high_het < .02)
)
input_mt = input_mt.annotate_cols(keep=keep_expr)

# Tabulate how many samples fall under each value of `keep`.
input_ht = input_mt.cols()
keep_counts = input_ht.group_by(input_ht.keep).aggregate(n=hl.agg.count())
keep_counts.show()