I’m having some more issues (again) that I could use some help with.
I have a very large MT and its structure is as follows:
----------------------------------------
Global fields:
'gencodeVersion': str
'sourceFilePath': str
'genomeVersion': str
'sampleType': str
'hail_version': str
----------------------------------------
Column fields:
's': str
'cmg_data': struct {
family_id: int32,
project_id: str,
pmid_id: str,
trio: str,
dbgap_submission: str,
dbgap_study_id: str,
dbgap_subject_id: str,
multiple_datasets: str,
paternal_id: str,
maternal_id: str,
proband_relationship: str,
sex: str,
ancestry: str,
phenotype_group: str,
disease_id: str,
disease_description: str,
affected_status: str,
congenital_status: str,
hpo_present: str,
hpo_absent: str,
phenotype_description: str,
solve_state: str,
MME: str,
sample_id: str,
dbgap_sample_id: str,
sample_provider: str,
data_type: str,
date_data_generation: str,
consanguinity: str,
family_history: str,
`Gene.1`: str,
`Gene_Class.1`: str,
`novel_mendelian_gene.1`: str,
`phenotype_class.1`: str,
`inheritance_description.1`: str,
`Zygosity.1`: str,
`Chrom.1`: int32,
`Pos.1`: int32,
`Ref.1`: str,
`Alt.1`: str,
`hgvsc.1`: str,
`hgvsp.1`: str,
`Transcript.1`: str,
`sv_name.1`: str,
`sv_type.1`: str,
`discovery_notes.1`: str,
`Gene.2`: str,
`Gene_Class.2`: str,
`novel_mendelian_gene.2`: str,
`phenotype_class.2`: str,
`inheritance_description.2`: str,
`Zygosity.2`: str,
`Chrom.2`: str,
`Pos.2`: str,
`Ref.2`: str,
`Alt.2`: str,
`hgvsc.2`: str,
`hgvsp.2`: str,
`Transcript.2`: str,
`sv_name.2`: str,
`sv_type.2`: str,
`discovery_notes.2`: str,
`Gene.3`: str,
`Gene_Class.3`: str,
`novel_mendelian_gene.3`: str,
`phenotype_class.3`: str,
`inheritance_description.3`: str,
`Zygosity.3`: str,
`Chrom.3`: str,
`Pos.3`: str,
`Ref.3`: str,
`Alt.3`: str,
`hgvsc.3`: str,
`hgvsp.3`: str,
`Transcript.3`: str,
`sv_name.3`: str,
`sv_type.3`: str,
`discovery_notes.3`: str,
`Gene.4`: str,
`Gene_Class.4`: str,
`novel_mendelian_gene.4`: str,
`phenotype_class.4`: str,
`inheritance_description.4`: str,
`Zygosity.4`: str,
`Chrom.4`: str,
`Pos.4`: str,
`Ref.4`: str,
`Alt.4`: str,
`hgvsc.4`: str,
`hgvsp.4`: str,
`Transcript.4`: str,
`sv_name.4`: str,
`sv_type.4`: str,
`discovery_notes.4`: str,
`Gene.5`: str,
`Gene_Class.5`: str,
`novel_mendelian_gene.5`: str,
`phenotype_class.5`: str,
`inheritance_description.5`: str,
`Zygosity.5`: str,
`Chrom.5`: str,
`Pos.5`: str,
`Ref.5`: str,
`Alt.5`: str,
`hgvsc.5`: str,
`hgvsp.5`: str,
`Transcript.5`: str,
`sv_name.5`: str,
`sv_type.5`: str,
`discovery_notes.5`: str,
X: str
}
----------------------------------------
Row fields:
'locus': locus<GRCh38>
'alleles': array<str>
'aIndex': int32
'AC': int32
'AF': float64
'alt': str
'AN': int32
'cadd': struct {
PHRED: float32
}
'clinvar': struct {
allele_id: int32,
clinical_significance: str,
gold_stars: int32
}
'codingGeneIds': set<str>
'contig': str
'dbnsfp': struct {
SIFT_pred: str,
Polyphen2_HVAR_pred: str,
MutationTaster_pred: str,
FATHMM_pred: str,
MetaSVM_pred: str,
REVEL_score: str,
GERP_RS: str,
phastCons100way_vertebrate: str
}
'docId': str
'domains': set<str>
'eigen': struct {
Eigen_phred: float64
}
'end': int32
'exac': struct {
AF_POPMAX: float64,
AF: float64,
AC_Adj: int32,
AC_Het: int32,
AC_Hom: int32,
AC_Hemi: int32,
AN_Adj: int32
}
'filters': set<str>
'g1k': struct {
AC: int32,
AF: float64,
AN: int32,
POPMAX_AF: float64
}
'geneIds': set<str>
'geno2mp': struct {
HPO_Count: int32
}
'genotypes': array<struct {
num_alt: int32,
gq: int32,
ab: float64,
dp: float64,
sample_id: str
}>
'gnomad_exome_coverage': float64
'gnomad_exomes': struct {
AF: float64,
AN: int32,
AC: int32,
FAF_AF: float64,
AF_POPMAX_OR_GLOBAL: float64,
Hom: int32,
Hemi: int32
}
'gnomad_genome_coverage': float64
'gnomad_genomes': struct {
AF: float64,
AN: int32,
AC: int32,
FAF_AF: float64,
AF_POPMAX_OR_GLOBAL: float64,
Hom: int32,
Hemi: int32
}
'mainTranscript': struct {
biotype: str,
canonical: int32,
category: str,
cdna_start: int32,
cdna_end: int32,
codons: str,
gene_id: str,
gene_symbol: str,
hgvs: str,
hgvsc: str,
major_consequence: str,
major_consequence_rank: int32,
transcript_id: str,
amino_acids: str,
domains: str,
hgvsp: str,
lof: str,
lof_flags: str,
lof_filter: str,
lof_info: str,
polyphen_prediction: str,
protein_id: str,
sift_prediction: str
}
'mpc': struct {
MPC: str
}
'originalAltAlleles': array<str>
'pos': int32
'primate_ai': struct {
score: float64
}
'ref': str
'rg37_locus': locus<GRCh37>
'rsid': str
'samples_ab': struct {
0_to_5: set<str>,
5_to_10: set<str>,
10_to_15: set<str>,
15_to_20: set<str>,
20_to_25: set<str>,
25_to_30: set<str>,
30_to_35: set<str>,
35_to_40: set<str>,
40_to_45: set<str>
}
'samples_gq': struct {
0_to_5: set<str>,
5_to_10: set<str>,
10_to_15: set<str>,
15_to_20: set<str>,
20_to_25: set<str>,
25_to_30: set<str>,
30_to_35: set<str>,
35_to_40: set<str>,
40_to_45: set<str>,
45_to_50: set<str>,
50_to_55: set<str>,
55_to_60: set<str>,
60_to_65: set<str>,
65_to_70: set<str>,
70_to_75: set<str>,
75_to_80: set<str>,
80_to_85: set<str>,
85_to_90: set<str>,
90_to_95: set<str>
}
'samples_no_call': set<str>
'samples_num_alt': struct {
1: set<str>,
2: set<str>
}
'sortedTranscriptConsequences': array<struct {
biotype: str,
canonical: int32,
cdna_start: int32,
cdna_end: int32,
codons: str,
gene_id: str,
gene_symbol: str,
hgvsc: str,
hgvsp: str,
transcript_id: str,
amino_acids: str,
lof: str,
lof_filter: str,
lof_flags: str,
lof_info: str,
polyphen_prediction: str,
protein_id: str,
protein_start: int32,
sift_prediction: str,
consequence_terms: array<str>,
domains: array<str>,
major_consequence: str,
category: str,
hgvs: str,
major_consequence_rank: int32,
transcript_rank: int32
}>
'splice_ai': struct {
delta_score: float32,
splice_consequence: str
}
'start': int32
'topmed': struct {
AC: int32,
AF: float64,
AN: int32,
Hom: int32,
Het: int32
}
'transcriptConsequenceTerms': set<str>
'transcriptIds': set<str>
'variantId': str
'vep': struct {
assembly_name: str,
allele_string: str,
ancestral: str,
colocated_variants: array<struct {
aa_allele: str,
aa_maf: float64,
afr_allele: str,
afr_maf: float64,
allele_string: str,
amr_allele: str,
amr_maf: float64,
clin_sig: array<str>,
end: int32,
eas_allele: str,
eas_maf: float64,
ea_allele: str,
ea_maf: float64,
eur_allele: str,
eur_maf: float64,
exac_adj_allele: str,
exac_adj_maf: float64,
exac_allele: str,
exac_afr_allele: str,
exac_afr_maf: float64,
exac_amr_allele: str,
exac_amr_maf: float64,
exac_eas_allele: str,
exac_eas_maf: float64,
exac_fin_allele: str,
exac_fin_maf: float64,
exac_maf: float64,
exac_nfe_allele: str,
exac_nfe_maf: float64,
exac_oth_allele: str,
exac_oth_maf: float64,
exac_sas_allele: str,
exac_sas_maf: float64,
id: str,
minor_allele: str,
minor_allele_freq: float64,
phenotype_or_disease: int32,
pubmed: array<int32>,
sas_allele: str,
sas_maf: float64,
somatic: int32,
start: int32,
strand: int32
}>,
context: str,
end: int32,
id: str,
input: str,
intergenic_consequences: array<struct {
allele_num: int32,
consequence_terms: array<str>,
impact: str,
minimised: int32,
variant_allele: str
}>,
most_severe_consequence: str,
motif_feature_consequences: array<struct {
allele_num: int32,
consequence_terms: array<str>,
high_inf_pos: str,
impact: str,
minimised: int32,
motif_feature_id: str,
motif_name: str,
motif_pos: int32,
motif_score_change: float64,
strand: int32,
variant_allele: str
}>,
regulatory_feature_consequences: array<struct {
allele_num: int32,
biotype: str,
consequence_terms: array<str>,
impact: str,
minimised: int32,
regulatory_feature_id: str,
variant_allele: str
}>,
seq_region_name: str,
start: int32,
strand: int32,
transcript_consequences: array<struct {
allele_num: int32,
amino_acids: str,
appris: str,
biotype: str,
canonical: int32,
ccds: str,
cdna_start: int32,
cdna_end: int32,
cds_end: int32,
cds_start: int32,
codons: str,
consequence_terms: array<str>,
distance: int32,
domains: array<struct {
db: str,
name: str
}>,
exon: str,
gene_id: str,
gene_pheno: int32,
gene_symbol: str,
gene_symbol_source: str,
hgnc_id: str,
hgvsc: str,
hgvsp: str,
hgvs_offset: int32,
impact: str,
intron: str,
lof: str,
lof_flags: str,
lof_filter: str,
lof_info: str,
minimised: int32,
polyphen_prediction: str,
polyphen_score: float64,
protein_end: int32,
protein_start: int32,
protein_id: str,
sift_prediction: str,
sift_score: float64,
strand: int32,
swissprot: str,
transcript_id: str,
trembl: str,
tsl: int32,
uniparc: str,
variant_allele: str
}>,
variant_class: str
}
'wasSplit': bool
'xpos': int64
'xstart': int64
'xstop': int64
----------------------------------------
Entry fields:
'AD': array<int32>
'DP': int32
'GQ': int32
'GT': call
'MIN_DP': int32
'PGT': call
'PID': str
'PL': array<int32>
'PS': int32
'RGQ': int32
'SB': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
I’ve joined some sample info to the MT which looks like this:
mt.select_cols(
mt.cmg_data.family_id,
mt.cmg_data.trio,
mt.cmg_data.paternal_id,
mt.cmg_data.maternal_id,
mt.cmg_data.affected_status).col.show(6)
s | family_id | trio | paternal_id | maternal_id | affected_status |
str | int32 | str | str | str | str |
“FM15-20” | 4 | “yes” | “FM15-32” | “FM15-89” | “Affected” |
“FM15-32” | 4 | “yes” | NA | NA | “Unaffected” |
“FM15-89” | 4 | “yes” | NA | NA | “Unaffected” |
“NGS00035” | 30 | “yes” | “NGS00036” | “NGS00037” | “Affected” |
“NGS00036” | 30 | “yes” | NA | NA | “Unaffected” |
“NGS00037” | 30 | “yes” | NA | NA | “Unaffected” |
…and here’s the genotype info:
mt.GT.show(5)
‘FM15-20’ | ‘FM15-32’ | ‘FM15-89’ | ‘NGS00035’ | ||
locus | alleles | GT | GT | GT | GT |
locus | array | call | call | call | call |
chr1:13649 | [“G”,“C”] | NA | NA | NA | NA |
chr1:17406 | [“C”,“T”] | NA | NA | NA | NA |
chr1:138817 | [“T”,“C”] | NA | NA | NA | NA |
chr1:935872 | [“C”,“T”] | 0/1 | 0/0 | 0/0 | 0/0 |
chr1:942388 | [“G”,“C”] | NA | 0/0 | 1 | 1 |
I want to group by family_id, and then call de novo variants. In essence I want to:
- group by family_id
- filter GT (mt.GT.is_non_ref()) in affected individuals and (mt.GT.is_hom_ref()) in unaffected individuals
- save as denovo callset
I have tried two different ways and both fail miserably!
Try 1:
denovos = hl.agg.group_by(mt.cmg_data.family_id,
hl.agg.filter(
(mt.GT.is_non_ref() & mt.cmg_data.affected_status=='Affected') &
(mt.GT.is_hom_ref() & mt.cmg_data.affected_status=='Unaffected'))))
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/var/folders/z5/m5b9qp5s7c7gtdv29ynkt8fjwphnd6/T/ipykernel_57498/772465573.py in <module>
21 # (mt.GT.is_hom_ref() & mt.cmg_data.affected_status=='Unaffected')))
22
---> 23 mt = mt.annotate(
24 denovos = hl.agg.group_by(mt.cmg_data.family_id,
25 hl.agg.filter(
~/opt/miniconda3/envs/hailenv/lib/python3.7/site-packages/hail/table.py in __getattr__(self, item)
119 return self.__dict__[item]
120
--> 121 raise AttributeError(get_nice_attr_error(self, item))
122
123 def _copy_fields_from(self, other: 'ExprContainer'):
AttributeError: MatrixTable instance has no field, method, or property 'annotate'
Did you mean:
MatrixTable methods: 'annotate_rows', 'annotate_cols', 'annotate_globals', 'annotate_entries'
…and try 2:
mt.group_cols_by(mt.cmg_data.family_id).aggregate_entries(hl.agg.filter(
(mt.GT.is_non_ref() & mt.cmg_data.affected_status=='Affected') &
(mt.GT.is_hom_ref() & mt.cmg_data.affected_status=='Unaffected')))
---------------------------------------------------------------------------
ExpressionException Traceback (most recent call last)
~/opt/miniconda3/envs/hailenv/lib/python3.7/site-packages/hail/expr/expressions/expression_typecheck.py in check(self, x, caller, param)
76 try:
---> 77 return self.coerce(to_expr(x))
78 except ExpressionException as e:
~/opt/miniconda3/envs/hailenv/lib/python3.7/site-packages/hail/expr/expressions/expression_typecheck.py in coerce(self, x)
61 if not self.can_coerce(x.dtype):
---> 62 raise ExpressionException(f"cannot coerce type '{x.dtype}' to type '{self.str_t}'")
63 if self._requires_conversion(x.dtype):
ExpressionException: cannot coerce type 'str' to type 'bool'
The above exception was the direct cause of the following exception:
TypecheckFailure Traceback (most recent call last)
~/opt/miniconda3/envs/hailenv/lib/python3.7/site-packages/hail/typecheck/check.py in arg_check(arg, function_name, arg_name, checker)
583 try:
--> 584 return checker.check(arg, function_name, arg_name)
585 except TypecheckFailure as e:
~/opt/miniconda3/envs/hailenv/lib/python3.7/site-packages/hail/expr/expressions/expression_typecheck.py in check(self, x, caller, param)
78 except ExpressionException as e:
---> 79 raise TypecheckFailure from e
80
TypecheckFailure:
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
/var/folders/z5/m5b9qp5s7c7gtdv29ynkt8fjwphnd6/T/ipykernel_57498/237928755.py in <module>
18
19 mt.group_cols_by(mt.cmg_data.family_id).aggregate_entries(hl.agg.filter(
---> 20 (mt.GT.is_non_ref() & mt.cmg_data.affected_status=='Affected') &
21 (mt.GT.is_hom_ref() & mt.cmg_data.affected_status=='Unaffected')))
22
<decorator-gen-684> in __and__(self, other)
~/opt/miniconda3/envs/hailenv/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
574 @decorator
575 def wrapper(__original_func, *args, **kwargs):
--> 576 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
577 return __original_func(*args_, **kwargs_)
578
~/opt/miniconda3/envs/hailenv/lib/python3.7/site-packages/hail/typecheck/check.py in check_all(f, args, kwargs, checks, is_method)
541 raise TypeError(
542 f'Expected {n_pos_args} positional arguments, found {len(args)}')
--> 543 args_.append(arg_check(args[i], name, arg_name, checker))
544 elif param.kind in (param.KEYWORD_ONLY, param.POSITIONAL_OR_KEYWORD):
545 arg = kwargs.pop(arg_name, param.default)
~/opt/miniconda3/envs/hailenv/lib/python3.7/site-packages/hail/typecheck/check.py in arg_check(arg, function_name, arg_name, checker)
590 expected=checker.expects(),
591 found=checker.format(arg)
--> 592 )) from e
593
594
TypeError: __and__: parameter 'other': expected expression of type bool, found <StringExpression of type str>
I’m at a bit of a loss… and would appreciate some advice as to where I’m going wrong!
E