Skip to content

gametic heterozygosity_observed #279

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
4 changes: 2 additions & 2 deletions allel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
plot_pairwise_distance, condensed_coords, condensed_coords_between, \
condensed_coords_within

from .stats.hw import heterozygosity_observed, heterozygosity_expected, \
inbreeding_coefficient
from .stats.hw import heterozygosity_individual, heterozygosity_observed, \
heterozygosity_expected, inbreeding_coefficient

from .stats.ld import rogers_huff_r, rogers_huff_r_between, \
locate_unlinked, plot_pairwise_ld, windowed_r_squared
Expand Down
133 changes: 122 additions & 11 deletions allel/stats/hw.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,86 @@
from allel.util import ignore_invalid, asarray_ndim


def heterozygosity_observed(g, fill=np.nan):
def heterozygosity_individual(g, fill=np.nan, corrected=True, ploidy=None):
"""Calculate the individual heterozygosity of each sample for
each variant following Hardy (2016).

Parameters
----------
g : array_like, int, shape (n_variants, n_samples, ploidy)
Genotype array.
fill : float, optional
Use this value for variants with invalid inputs.
corrected : bool, optional
If True, values are corrected for ploidy level.
ploidy : array_like, int, (n_variants, n_samples), optional
Specify ploidy of each genotype call.

Returns
-------
hi : ndarray, float, shape (n_variants, n_samples)
Observed individual heterozygosity

Notes
-----
Individual heterozygosity is calculated assuming polysomic inheritance
for polyploid genotype arrays following Hardy (2016).

If the ploidy argument is used then the ploidy specified for each
genotype call must be equal to the number of alleles called for that
genotype or else the genotype will be treated as missing.

Examples
--------

>>> import allel
>>> g = allel.GenotypeArray([[[0, 0, -1, -1], [0, 0, 0, 0]],
... [[0, 1, -1, -1], [0, 0, 0, 1]],
... [[1, 1, -1, -1], [0, 1, 1, 2]],
... [[0, -1, -1, -1], [0, 1, 2, 3]],
... [[-1, -1, -1, -1], [0, 1, -1, -1]]])
>>> ploidy = [[2, 4],
... [2, 4],
... [2, 4],
... [2, 4],
... [2, 4]]
>>> allel.heterozygosity_individual(g, ploidy=ploidy)
array([[0. , 0. ],
[1. , 0.5 ],
[0. , 0.83333333],
[ nan, 1. ],
[ nan, nan]])

"""
# check inputs
if not hasattr(g, 'ploidy') or not hasattr(g, 'to_allele_counts'):
g = GenotypeArray(g, copy=False)

# use array ploidy if none provided
ploidy = g.ploidy if ploidy is None else np.array(ploidy)

# convert to genotype allele counts, remove those not matching ploidy
gac = g.to_allele_counts()
is_called = gac.values.sum(axis=-1) == ploidy
gac[~is_called] = 0

# genotype allele frequencies with partial genotypes removed
gaf = gac.to_frequencies()

# correction for ploidy level
with ignore_invalid():
correction = ploidy / (ploidy - 1) if corrected else 1

# matrix of observed heterozygosity per sample
hi = (1 - np.sum(np.power(gaf, 2), axis=-1)) * correction

if fill is not np.nan:
hi[np.isnan(hi)] = fill

return hi


def heterozygosity_observed(g, fill=np.nan, corrected=True, ploidy=None):
"""Calculate the rate of observed heterozygosity for each variant.

Parameters
Expand All @@ -16,16 +95,38 @@ def heterozygosity_observed(g, fill=np.nan):
Genotype array.
fill : float, optional
Use this value for variants where all calls are missing.
corrected : bool, optional
If True, values are corrected for ploidy level.
ploidy : array_like, int, (n_variants, n_samples), optional
Specify ploidy of each genotype call.

Returns
-------

ho : ndarray, float, shape (n_variants,)
Observed heterozygosity

Notes
-----

Observed heterozygosity is calculated assuming polysomic inheritance
for polyploid genotype arrays following Hardy (2016) (also see Meirmans
and Liu 2018).

By default, observed heterozygosity is corrected for ploidy by
*Ho = Hu * (ploidy / (ploidy - 1))* where *Ho* is the corrected observed
heterozygosity and *Hu* is the uncorrected observed heterozygosity
described by Meirmans and Liu (2018).

If the ploidy argument is used then the ploidy specified for each
genotype call must be equal to the number of called alleles for that
genotype or else the genotype will be treated as missing.

Examples
--------

Diploid genotype array

>>> import allel
>>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
... [[0, 0], [0, 1], [1, 1]],
Expand All @@ -34,20 +135,30 @@ def heterozygosity_observed(g, fill=np.nan):
>>> allel.heterozygosity_observed(g)
array([0. , 0.33333333, 0. , 0.5 ])

"""
Tetraploid genotype array

# check inputs
if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
g = GenotypeArray(g, copy=False)
>>> import allel
>>> g = allel.GenotypeArray([[[0, 0, 0, 0], [0, 0, 0, 0]],
... [[0, 0, 0, 0], [0, 0, 0, 1]],
... [[0, 0, 1, 1], [0, 0, 1, 2]],
... [[0, 0, 1, 1], [0, 0, -1, -1]],
... [[0, 1, 2, 3], [-1, -1, -1, -1]]])
>>> allel.heterozygosity_observed(g)
array([0. , 0.25 , 0.75 , 0.66666667, 1. ])

"""
hi = heterozygosity_individual(
g,
fill=np.nan,
corrected=corrected,
ploidy=ploidy
)
sum_het = np.nansum(hi, axis=-1)

# count hets
n_het = np.asarray(g.count_het(axis=1))
n_called = np.asarray(g.count_called(axis=1))
n_called = np.nansum(~np.isnan(hi), axis=-1)

# calculate rate of observed heterozygosity, accounting for variants
# where all calls are missing
with ignore_invalid():
ho = np.where(n_called > 0, n_het / n_called, fill)
ho = np.where(n_called > 0, sum_het / n_called, fill)

return ho

Expand Down
86 changes: 83 additions & 3 deletions allel/test/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,70 @@ def test_windowed_tajima_d(self):

class TestHardyWeinberg(unittest.TestCase):

def test_heterozygosity_individual(self):

# diploid should always be 0 (hom) or 1 (het)
g = GenotypeArray([[[0, 0], [0, 0]],
[[1, 1], [1, 1]],
[[1, 1], [2, 2]],
[[0, 0], [0, 1]],
[[0, 0], [0, 2]],
[[1, 1], [1, 2]],
[[0, 1], [0, 1]],
[[0, 1], [1, 2]],
[[0, 0], [-1, -1]],
[[0, 1], [-1, -1]],
[[-1, -1], [-1, -1]]], dtype='i1')

hi = allel.heterozygosity_individual(g)
assert np.all(hi[g.is_hom()] == 0)
assert np.all(hi[g.is_het()] == 1)
assert np.all(np.isnan(hi)[g.is_missing()])

# mixed ploidy 2x, 4x, 6x
g = GenotypeArray([
[[0, 0, -1, -1, -1, -1], [0, 0, 0, 0, -1, -1], [0, 0, 0, 0, 0, 0]],
[[1, 1, -1, -1, -1, -1], [0, 0, 0, 1, -1, -1], [0, 0, 0, 0, 0, 1]],
[[0, 1, -1, -1, -1, -1], [0, 0, 1, 1, -1, -1], [0, 0, 0, 0, 1, 1]],
[[1, 2, -1, -1, -1, -1], [0, 0, 1, 2, -1, -1], [0, 0, 0, 1, 1, 1]],
[[3, 6, -1, -1, -1, -1], [0, 1, 2, 3, -1, -1], [0, 0, 0, 0, 1, 2]],
[[0, 0, -1, -1, -1, -1], [0, 2, 2, 5, -1, -1], [0, 0, 0, 1, 1, 2]],
[[0, -1, -1, -1, -1, -1], [0, 1, 1, -1, -1, -1], [0, 0, 1, 1, 2, 2]],
[[0, -1, -1, -1, -1, -1], [0, -1, -1, -1, -1, -1], [0, 0, 1, 2, 3, 4]],
[[-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, 5]]
])
ploidy = np.array([
[2, 4, 6],
[2, 4, 6],
[2, 4, 6],
[2, 4, 6],
[2, 4, 6],
[2, 4, 6],
[2, 4, 6],
[2, 4, 6],
[2, 4, 6],
])
# 4x and 6x values from Hardy (2016)
expect = np.array([
[0, 0/6, 0/15],
[0, 3/6, 5/15],
[1, 4/6, 8/15],
[1, 5/6, 9/15],
[1, 6/6, 9/15],
[0, 5/6, 11/15],
[-1, -1, 12/15],
[-1, -1, 14/15],
[-1, -1, 15/15],
])
actual = allel.heterozygosity_individual(g, fill=-1, ploidy=ploidy)
assert_array_almost_equal(expect, actual)

# mixed ploidy uncorrected
expect /= (ploidy/(ploidy-1))
expect[expect < 0] = -1 # correct the filled values
actual = allel.heterozygosity_individual(g, fill=-1, ploidy=ploidy, corrected=False)
assert_array_almost_equal(expect, actual)

def test_heterozygosity_observed(self):

# diploid
Expand All @@ -403,7 +467,17 @@ def test_heterozygosity_observed(self):
actual = allel.heterozygosity_observed(g, fill=-1)
aeq(expect, actual)

# polyploid
# diploid uncorrected
# corrected is uncorrected * (2/1)
expect = [i / 2 if i >= 0 else i for i in expect]
actual = allel.heterozygosity_observed(g, fill=-1, corrected=False)
assert_array_almost_equal(expect, actual)

# triploid
# individual heterozygosity (Hi)
# dosage of 3:0:0 = Hi of 0/3
# dosage of 2:1:0 = Hi of 2/3
# dosage of 1:1:1 = Hi of 3/3
g = GenotypeArray([[[0, 0, 0], [0, 0, 0]],
[[1, 1, 1], [1, 1, 1]],
[[1, 1, 1], [2, 2, 2]],
Expand All @@ -415,9 +489,15 @@ def test_heterozygosity_observed(self):
[[0, 0, 0], [-1, -1, -1]],
[[0, 0, 1], [-1, -1, -1]],
[[-1, -1, -1], [-1, -1, -1]]], dtype='i1')
expect = [0, 0, 0, .5, .5, .5, 1, 1, 0, 1, -1]
expect = [0, 0, 0, 1/3, 1/3, 3/6, 4/6, 5/6, 0, 2/3, -1]
actual = allel.heterozygosity_observed(g, fill=-1)
aeq(expect, actual)
assert_array_almost_equal(expect, actual)

# triploid uncorrected
# corrected is uncorrected * (3/2)
expect = [i / (3/2) if i >= 0 else i for i in expect]
actual = allel.heterozygosity_observed(g, fill=-1, corrected=False)
assert_array_almost_equal(expect, actual)

def test_heterozygosity_expected(self):

Expand Down