How to keep missing GT in n_alt_alleles()?

Hi, Hail! I want to calculate alternative allele counts for each sample in my dataset
My dataset size is (2, 90).
Now I apply n_alt_alleles():

mt = mt.annotate_entries(n_alt_alleles = mt.GT.n_alt_alleles())

The final table has fewer rows than it should 152 vs 180.

mt.entries().select('n_alt_alleles').count()

Because of the empty genotypes, information is lost about samples that have at least 1 missing GT.
How am I supposed to take this into account?

Hi @Rost, that’s strange. Are you using the most up-to-date version of Hail? You can update to the latest version by running pip install -U hail. And do you have any more information about the MatrixTable you are working with? Like the results from running mt.count() and the schema from mt.describe()?

I created a 2x2 MatrixTable with a missing GT field for each sample and ran what you describe and the missing genotypes shouldn’t affect the count of the entries Table.

In [1]: import hail as hl; hl.init()
2022-02-09 13:48:46 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.1.2
SparkUI available at http://wmbfd-7b8.home:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.83-b3151b4c4271
LOGGING: writing to /Users/pcumming/hail-20220209-1348-0.2.83-b3151b4c4271.log

In [2]: gs = hl.literal(
   ...:     [[hl.missing('tcall'), hl.Call([1, 0])],
   ...:      [hl.Call([1, 1]), hl.missing('tcall')]]
   ...:     )
   ...: mt = hl.utils.range_matrix_table(n_rows=2, n_cols=2)
   ...: mt = mt.annotate_entries(GT=gs[mt.col_idx][mt.row_idx])
   ...: mt = mt.annotate_entries(n_alt_alleles = mt.GT.n_alt_alleles())
   ...: mt.show()
+---------+------+-----------------+------+-----------------+
| row_idx | 0.GT | 0.n_alt_alleles | 1.GT | 1.n_alt_alleles |
+---------+------+-----------------+------+-----------------+
|   int32 | call |           int32 | call |           int32 |
+---------+------+-----------------+------+-----------------+
|       0 | NA   |              NA | 1/1  |               2 |
|       1 | 0/1  |               1 | NA   |              NA |
+---------+------+-----------------+------+-----------------+

In [3]: mt.entries().select('n_alt_alleles').count()
2022-02-09 13:49:22 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'
Out[3]: 4

@pwc2, that’s my current hail’s version.

import hail as hl
hl.init()
Running on Apache Spark version 3.1.2
SparkUI available at http://sphinx:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.82-2ab242915c2c
LOGGING: writing to /mnt/tank/scratch/rskitchenko/projects/haloplex_notch/hail-20220209-2224-0.2.82-2ab242915c2c.log

My data info

mt.count()
2022-02-09 22:28:57 Hail: INFO: reading 2 of 4 data partitions
(2, 90)

mt.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'sample_qc': struct {
        dp_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        gq_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        call_rate: float64, 
        n_called: int64, 
        n_not_called: int64, 
        n_filtered: int64, 
        n_hom_ref: int64, 
        n_het: int64, 
        n_hom_var: int64, 
        n_non_ref: int64, 
        n_singleton: int64, 
        n_snp: int64, 
        n_insertion: int64, 
        n_deletion: int64, 
        n_transition: int64, 
        n_transversion: int64, 
        n_star: int64, 
        r_ti_tv: float64, 
        r_het_hom_var: float64, 
        r_insertion_deletion: float64
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        DP: int32, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
        RAW_MQandDP: array<int32>, 
        ReadPosRankSum: float64, 
        SOR: float64, 
        CSQ: array<str>
    }
    'a_index': int32
    'was_split': bool
    'variant_qc': struct {
        dp_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        gq_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        homozygote_count: array<int32>, 
        call_rate: float64, 
        n_called: int64, 
        n_not_called: int64, 
        n_filtered: int64, 
        n_het: int64, 
        n_non_ref: int64, 
        het_freq_hwe: float64, 
        p_value_hwe: float64, 
        p_value_excess_het: float64
    }
    'kept_transcripts': array<str>
    'hom_ref': int64
    'hom_var': int64
    'het': int64
    'gene': str
    'consequence': str
----------------------------------------
Entry fields:
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'GT': call
    'MIN_DP': int32
    'PGT': call
    'PID': str
    'PL': array<int32>
    'PS': int32
    'RGQ': int32
    'SB': array<int32>
    'n_alt_alleles': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------

mt.entries().select('n_alt_alleles').count()

2022-02-09 22:35:19 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'
2022-02-09 22:35:24 Hail: INFO: reading 2 of 4 data partitions
152

@pwc2, your toy example works correctly for me, so I think the version problem can be ruled out.

Thanks @Rost, can you share your script? Or at least the relevant portion of code where the MatrixTable is loaded/created up to where mt.entries().select('n_alt_alleles').count() is called?

@pwc2, yeah, sure.

# data import and preprocessing
import hail as hl
from bokeh.models import Span, BoxAnnotation
from hail.plot import show

hl.init()
hl.plot.output_notebook()

vcf = '/nfs/home/rskitchenko/projects/haloplex_notch/results/haloplex/germline/vcf/result.annotated.vcf.gz'
mt_path = '/nfs/home/rskitchenko/projects/haloplex_notch/results/haloplex/germline/vcf/result.annotated.mt'

hl.import_vcf(vcf, array_elements_required=False,
			force_bgz=True, reference_genome='GRCh38',
			min_partitions=4) \
	.write(mt_path, overwrite=True)

mt = hl.read_matrix_table(mt_path)
mt = hl.split_multi_hts(mt)

# some QC steps
mt = mt.filter_rows(hl.len(mt.filters)==0)
mt = mt.filter_entries(
	((mt.DP >= 10) & (mt.GQ >= 20)) |
	((mt.DP < 10) & (mt.GQ < 20) & (mt.info.InbreedingCoeff > -0.2 ))
)

ab = mt.AD[1] / hl.sum(mt.AD)
filter_condition_ab = (
	hl.case()
	.when(mt.GT.is_hom_ref(), ab <= 0.1)
	.when(mt.GT.is_het(), (ab >= 0.25) & (ab <= 0.75))
	.default(ab >= 0.9) # hom-var
)
mt = mt.filter_entries(filter_condition_ab)

mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.call_rate>=0.8)

# parse CSQ field 
consequence = 1
gene = 3
canonical = 23
split_csq = mt.info.CSQ
split_transcripts = split_csq.map(lambda x: x.split('\|'))
kept_transcripts = split_transcripts.filter(
    lambda tx: tx[canonical] == 'YES')
mt = mt.annotate_rows(kept_transcripts = kept_transcripts)
mt = mt.annotate_rows(kept_transcripts = mt.kept_transcripts.head())

mt = mt.annotate_rows(hom_ref = hl.agg.count_where(mt.GT.is_hom_ref()),
					hom_var = hl.agg.count_where(mt.GT.is_hom_var()),
					het = hl.agg.count_where(mt.GT.is_het()),
					gene = mt.kept_transcripts[gene],
					consequence = mt.kept_transcripts[consequence]
					)

# 2 loci subset
loci = hl.literal([hl.locus('chr12', 113073873, 'GRCh38'),
					hl.locus('chr9', 136508456, 'GRCh38')])
mt = mt.filter_rows(loci.contains(mt.locus))
mt = mt.annotate_entries(n_alt_alleles = mt.GT.n_alt_alleles())

# problem line
mt.entries().select('n_alt_alleles').count()

Ah okay, this is probably due to upstream filter_entries calls that reference the GT field. If the expression used for filtering evaluates to missing, that entry is removed regardless of whether keep=True or keep=False. There’s more information on the docs for filter_entries, but this can be addressed by using unfilter_entries to re-densify the sparse, filtered MatrixTable.

Try running mt.unfilter_entries().entries().select('n_alt_alleles').count(), and you should get the correct count you were expecting on the entries Table.

1 Like

@pwc2, that works, thanks!