I am trying to work with my Nirvana annotations, which Nirvana outputs as JSON. Following thondeboer’s very neat notebook, I’m doing:
# Load the bgzipped Nirvana output as a header-less text table. With
# no_header=True the columns are auto-named, which is why the line text is
# read from `ht.f0` below (assumes a line contains no tab delimiter -- confirm).
ht = hl.import_table("Nirvana.json.gz", force_bgz=True, no_header=True)
# Hail type string describing one Nirvana position record, for use as the
# `dtype` of hl.parse_json.
#
# The type parser requires every field inside a struct to be separated by a
# comma and every `array<` to be closed with `>`. Three separators were
# missing: after the `mitomap` array (which also lacked its closing `>`),
# after the `variants` array, and after the `omim` array. With any of them
# missing the `struct` rule cannot match, so the parser falls back to matching
# just `str` and fails with "IncompleteParseError ... 'uct{'".
nirvana_schema = """
struct{
chromosome: str,
refAllele: str,
position: int32,
altAlleles: array<str>,
cytogeneticBand: str,
quality: float64,
filters: array<str>,
jointSomaticNormalQuality: int32,
copyNumber: int32,
strandBias: float64,
recalibratedQuality: float64,
variants: array<struct{
altAllele: str,
refAllele: str,
chromosome: str,
begin: int32,
end: int32,
phylopScore: float64,
isReferenceMinor: bool,
variantType: str,
vid: str,
hgvsg: str,
isRecomposedVariant: bool,
isDecomposedVariant: bool,
regulatoryRegions: array<struct{
id: str,
type: str,
consequence: set<str>
}>,
clinvar: array<struct{
id: str,
reviewStatus: str,
isAlleleSpecific: bool,
alleleOrigins: array<str>,
refAllele: str,
altAllele: str,
phenotypes: array<str>,
medGenIds: array<str>,
omimIds: array<str>,
orphanetIds: array<str>,
significance: str,
lastUpdatedDate: str,
pubMedIds: array<str>
}>,
cosmic: array<struct{
id: str,
isAlleleSpecific: bool,
refAllele: str,
altAllele: str,
gene: str,
sampleCount: int32,
studies: array<struct{
id: int32,
histology: str,
primarySite: str
}>
}>,
dbsnp: struct{
ids: array<str>
},
globalAllele: struct{
globalMinorAllele: str,
globalMinorAlleleFrequency: float64
},
gnomad: struct{
coverage: str,
allAf: float64,
allAc: int32,
allAn: int32,
allHc: int32,
afrAf: float64,
afrAc: int32,
afrAn: int32,
afrHc: int32,
amrAf: float64,
amrAc: int32,
amrAn: int32,
amrHc: int32,
easAf: float64,
easAc: int32,
easAn: int32,
easHc: int32,
finAf: float64,
finAc: int32,
finAn: int32,
finHc: int32,
nfeAf: float64,
nfeAc: int32,
nfeAn: int32,
nfeHc: int32,
othAf: float64,
othAc: int32,
othAn: int32,
othHc: int32,
asjAf: float64,
asjAc: int32,
asjAn: int32,
asjHc: int32,
failedFilter: bool
},
gnomadExome: struct{
coverage: str,
allAf: float64,
allAc: int32,
allAn: int32,
allHc: int32,
afrAf: float64,
afrAc: int32,
afrAn: int32,
afrHc: int32,
amrAf: float64,
amrAc: int32,
amrAn: int32,
amrHc: int32,
easAf: float64,
easAc: int32,
easAn: int32,
easHc: int32,
finAf: float64,
finAc: int32,
finAn: int32,
finHc: int32,
nfeAf: float64,
nfeAc: int32,
nfeAn: int32,
nfeHc: int32,
othAf: float64,
othAc: int32,
othAn: int32,
othHc: int32,
asjAf: float64,
asjAc: int32,
asjAn: int32,
asjHc: int32,
sasAf: float64,
sasAc: int32,
sasAn: int32,
sasHc: int32,
failedFilter: bool
},
topmed: struct{
failedFilter: bool,
allAc: int32,
allAn: int32,
allAf: float64,
allHc: int32
},
oneKg: struct{
ancestralAllele: str,
allAf: float64,
allAc: int32,
allAn: int32,
afrAf: float64,
afrAc: int32,
afrAn: int32,
amrAf: float64,
amrAc: int32,
amrAn: int32,
easAf: float64,
easAc: int32,
easAn: int32,
eurAf: float64,
eurAc: int32,
eurAn: int32,
sasAf: float64,
sasAc: int32,
sasAn: int32
},
mitomap: array<struct{
refAllele: str,
altAllele: str,
diseases : array<str>,
hasHomoplasmy: bool,
hasHeteroplasmy: bool,
status: str,
clinicalSignificance: str,
scorePercentile: float64,
isAlleleSpecific: bool,
chromosome: str,
begin: int32,
end: int32,
variantType: str
}>,
transcripts: struct{
refSeq: array<struct{
transcript: str,
bioType: str,
aminoAcids: str,
cdnaPos: str,
codons: str,
cdsPos: str,
exons: str,
introns: str,
geneId: str,
hgnc: str,
consequence: array<str>,
hgvsc: str,
hgvsp: str,
isCanonical: bool,
polyPhenScore: float64,
polyPhenPrediction: str,
proteinId: str,
proteinPos: str,
siftScore: float64,
siftPrediction: str
}>,
ensembl: array<struct{
transcript: str,
bioType: str,
aminoAcids: str,
cdnaPos: str,
codons: str,
cdsPos: str,
exons: str,
introns: str,
geneId: str,
hgnc: str,
consequence: array<str>,
hgvsc: str,
hgvsp: str,
isCanonical: bool,
polyPhenScore: float64,
polyPhenPrediction: str,
proteinId: str,
proteinPos: str,
siftScore: float64,
siftPrediction: str
}>
},
overlappingGenes: array<str>
}>,
genes: array<struct{
name: str,
omim: array<struct{
mimNumber: int32,
hgnc: str,
description: str,
phenotypes: array<struct{
mimNumber: int32,
phenotype: str,
mapping: str,
inheritance: array<str>,
comments: str
}>
}>,
exac: struct{
pLi: float64,
pRec: float64,
pNull: float64
}
}>
}
"""
# Parse the text held in field `f0` of each row into a typed expression
# following `nirvana_schema`, then print the resulting expression's type.
json_expr = hl.parse_json(ht["f0"], dtype=nirvana_schema)
json_expr.describe()
Here `nirvana_schema` is taken from the documentation of `hl.nirvana`. This gives:
IncompleteParseError: Rule ‘type’ matched in its entirety, but it didn’t consume all the text. The non-matching portion of the text begins with ‘uct{
chromosome:’ (line 2, column 4).
Is there a good way to achieve this? I’m a bit in over my head here but thanks in advance!