Hi all
I combined GVCF 5K files to one VDS by using Hail library
And I get vds (variant dataset) directory
So I want to read vds data and convert MT (matrix table including VCF data in Hail)
But I can not convert MT to VCF format files with INFO field
Below code is my code that combine multiple GVCF to one vds and convert it to vcf)
import glob
import hail as hl
hl.init()
from gnomad.utils.annotations import (
get_adj_expr,
get_lowqual_expr,
)
from gnomad.utils.sparse_mt import (
get_as_info_expr,
get_site_info_expr,
INFO_INT32_SUM_AGG_FIELDS,
INFO_SUM_AGG_FIELDS,
split_info_annotation,
split_lowqual_annotation,
)
# combine multiple GVCF to VDS
gvcf_list = glob.glob("*.gvcf.gz")
combiner = hl.vds.new_combiner(
gvcf_paths = gvcf_list,
output_path = output.vds,
use_genome_default_intervals=True,
reference_genome = "GRCh38"
)
combiner.run()
# read vds data and make mt
vds = hl.vds.read_vds(output_vds)
mt = hl.vds.to_dense_mt(vds)
mt = mt.filter_rows((hl.len(mt.alleles) > 1))
mt = mt.transmute_entries(**mt.gvcf_info)
mt = mt.annotate_rows(alt_alleles_range_array=hl.range(1, hl.len(mt.alleles)))
## make INFO expression
info_expr = get_site_info_expr(
mt,
sum_agg_fields=[],
int32_sum_agg_fields=[ ],
array_sum_agg_fields=['SB']
)
info_expr = info_expr.annotate(
**get_as_info_expr(
mt,
sum_agg_fields=[],
int32_sum_agg_fields=[],
array_sum_agg_fields=['SB']
)
)
## Add AC and AC_raw
## First compute ACs for each non-ref allele, grouped by adj
grp_ac_expr = hl.agg.array_agg(
lambda ai: hl.agg.filter(
mt.LA.contains(ai),
hl.agg.group_by(
get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
hl.agg.sum(
mt.LGT.one_hot_alleles(mt.LA.map(hl.str))[mt.LA.index(ai)]
),
),
),
mt.alt_alleles_range_array,
)
## Then, for each non-ref allele, compute
## AC as the adj group
## AC_raw as the sum of adj and non-adj groups
info_expr = info_expr.annotate(
AC_raw=grp_ac_expr.map(
lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))
),
AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))),
)
## Annotating raw MT with pab max
info_expr = info_expr.annotate(
AS_pab_max=hl.agg.array_agg(
lambda ai: hl.agg.filter(
mt.LA.contains(ai) & mt.LGT.is_het(),
hl.agg.max(
hl.binom_test(
mt.LAD[mt.LA.index(ai)], hl.sum(mt.LAD), 0.5, 'two-sided'
)
),
),
mt.alt_alleles_range_array,
)
)
info_ht = mt.select_rows(info=info_expr).rows()
mt = mt.annotate_rows(info=info_ht[mt.row_key].info)
mt = mt.annotate_entries(
GT = hl.vds.lgt_to_gt(mt.LGT, mt.LA)
)
drop_to_field = ['BaseQRankSum',
'ExcessHet',
'InbreedingCoeff',
'MLEAC',
'MLEAF',
'MQRankSum',
'RAW_MQandDP',
'ReadPosRankSum',
"LGT"]
mt = mt.drop(*drop_to_field)
hl.export_vcf(mt, output.vcf.bgz)
I got error message
“Error summary: HailException: VCF does not support SIndexablePointer(PCArray[PInt32])”
What is my fault?
Thank you.