cggh · timothymillar · Jul 18, 2019 · Jul 18, 2019 · Jul 18, 2019 · Jul 18, 2019
diff --git a/allel/__init__.py b/allel/__init__.py
@@ -31,8 +31,8 @@
     plot_pairwise_distance, condensed_coords, condensed_coords_between, \
     condensed_coords_within
 
-from .stats.hw import heterozygosity_observed, heterozygosity_expected, \
-    inbreeding_coefficient
+from .stats.hw import heterozygosity_individual, heterozygosity_observed, \
+    heterozygosity_expected, inbreeding_coefficient
 
 from .stats.ld import rogers_huff_r, rogers_huff_r_between, \
     locate_unlinked, plot_pairwise_ld, windowed_r_squared

diff --git a/allel/stats/hw.py b/allel/stats/hw.py
@@ -6,7 +6,86 @@
 from allel.util import ignore_invalid, asarray_ndim
 
 
-def heterozygosity_observed(g, fill=np.nan):
+def heterozygosity_individual(g, fill=np.nan, corrected=True, ploidy=None):
+    """Calculate the individual heterozygosity of each sample for
+    each variant following Hardy (2016).
+
+    Parameters
+    ----------
+    g : array_like, int, shape (n_variants, n_samples, ploidy)
+        Genotype array.
+    fill : float, optional
+        Use this value for variants with invalid inputs.
+    corrected : bool, optional
+        If True, values are corrected for ploidy level.
+    ploidy : array_like, int, (n_variants, n_samples), optional
+        Specify ploidy of each genotype call.
+
+    Returns
+    -------
+    hi : ndarray, float, shape (n_variants, n_samples)
+        Observed individual heterozygosity
+
+    Notes
+    -----
+    Individual heterozygosity is calculated assuming polysomic inheritance
+    for polyploid genotype arrays following Hardy (2016).
+
+    If the ploidy argument is used then the ploidy specified for each
+    genotype call must be equal to the number of alleles called for that
+    genotype or else the genotype will be treated as missing.
+
+    Examples
+    --------
+
+    >>> import allel
+    >>> g = allel.GenotypeArray([[[0, 0, -1, -1], [0, 0, 0, 0]],
+    ...                          [[0, 1, -1, -1], [0, 0, 0, 1]],
+    ...                          [[1, 1, -1, -1], [0, 1, 1, 2]],
+    ...                          [[0, -1, -1, -1], [0, 1, 2, 3]],
+    ...                          [[-1, -1, -1, -1], [0, 1, -1, -1]]])
+    >>> ploidy = [[2, 4],
+    ...           [2, 4],
+    ...           [2, 4],
+    ...           [2, 4],
+    ...           [2, 4]]
+    >>> allel.heterozygosity_individual(g, ploidy=ploidy)
+    array([[0.        , 0.        ],
+           [1.        , 0.5       ],
+           [0.        , 0.83333333],
+           [       nan, 1.        ],
+           [       nan,        nan]])
+
+    """
+    # check inputs
+    if not hasattr(g, 'ploidy') or not hasattr(g, 'to_allele_counts'):
+        g = GenotypeArray(g, copy=False)
+
+    # use array ploidy if none provided
+    ploidy = g.ploidy if ploidy is None else np.array(ploidy)
+
+    # convert to genotype allele counts, remove those not matching ploidy
+    gac = g.to_allele_counts()
+    is_called = gac.values.sum(axis=-1) == ploidy
+    gac[~is_called] = 0
+
+    # genotype allele frequencies with partial genotypes removed
+    gaf = gac.to_frequencies()
+
+    # correction for ploidy level
+    with ignore_invalid():
+        correction = ploidy / (ploidy - 1) if corrected else 1
+
+    # matrix of observed heterozygosity per sample
+    hi = (1 - np.sum(np.power(gaf, 2), axis=-1)) * correction
+
+    if fill is not np.nan:
+        hi[np.isnan(hi)] = fill
+
+    return hi
+
+
+def heterozygosity_observed(g, fill=np.nan, corrected=True, ploidy=None):
     """Calculate the rate of observed heterozygosity for each variant.
 
     Parameters
@@ -16,16 +95,38 @@ def heterozygosity_observed(g, fill=np.nan):
         Genotype array.
     fill : float, optional
         Use this value for variants where all calls are missing.
+    corrected : bool, optional
+        If True, values are corrected for ploidy level.
+    ploidy : array_like, int, (n_variants, n_samples), optional
+        Specify ploidy of each genotype call.
 
     Returns
     -------
 
     ho : ndarray, float, shape (n_variants,)
         Observed heterozygosity
 
+    Notes
+    -----
+
+    Observed heterozygosity is calculated assuming polysomic inheritance
+    for polyploid genotype arrays following Hardy (2016) (also see Meirmans
+    and Liu 2018).
+
+    By default, observed heterozygosity is corrected for ploidy by
+    *Ho = Hu * (ploidy / (ploidy - 1))* where *Ho* is the corrected observed
+    heterozygosity and *Hu* is the uncorrected observed heterozygosity
+    described by Meirmans and Liu (2018).
+
+    If the ploidy argument is used then the ploidy specified for each
+    genotype call must be equal to the number of called alleles for that
+    genotype or else the genotype will be treated as missing.
+
     Examples
     --------
 
+    Diploid genotype array
+
     >>> import allel
     >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
     ...                          [[0, 0], [0, 1], [1, 1]],
@@ -34,20 +135,30 @@ def heterozygosity_observed(g, fill=np.nan):
     >>> allel.heterozygosity_observed(g)
     array([0.        , 0.33333333, 0.        , 0.5       ])
 
-    """
+    Tetraploid genotype array
 
-    # check inputs
-    if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
-        g = GenotypeArray(g, copy=False)
+    >>> import allel
+    >>> g = allel.GenotypeArray([[[0, 0, 0, 0], [0, 0, 0, 0]],
+    ...                          [[0, 0, 0, 0], [0, 0, 0, 1]],
+    ...                          [[0, 0, 1, 1], [0, 0, 1, 2]],
+    ...                          [[0, 0, 1, 1], [0, 0, -1, -1]],
+    ...                          [[0, 1, 2, 3], [-1, -1, -1, -1]]])
+    >>> allel.heterozygosity_observed(g)
+    array([0.        , 0.25      , 0.75      , 0.66666667, 1.        ])
+
+    """
+    hi = heterozygosity_individual(
+        g,
+        fill=np.nan,
+        corrected=corrected,
+        ploidy=ploidy
+    )
+    sum_het = np.nansum(hi, axis=-1)
 
-    # count hets
-    n_het = np.asarray(g.count_het(axis=1))
-    n_called = np.asarray(g.count_called(axis=1))
+    n_called = np.nansum(~np.isnan(hi), axis=-1)
 
-    # calculate rate of observed heterozygosity, accounting for variants
-    # where all calls are missing
     with ignore_invalid():
-        ho = np.where(n_called > 0, n_het / n_called, fill)
+        ho = np.where(n_called > 0, sum_het / n_called, fill)
 
     return ho
 

diff --git a/allel/test/test_stats.py b/allel/test/test_stats.py
@@ -385,6 +385,70 @@ def test_windowed_tajima_d(self):
 
 class TestHardyWeinberg(unittest.TestCase):
 
+    def test_heterozygosity_individual(self):
+
+        # diploid should always be 0 (hom) or 1 (het)
+        g = GenotypeArray([[[0, 0], [0, 0]],
+                           [[1, 1], [1, 1]],
+                           [[1, 1], [2, 2]],
+                           [[0, 0], [0, 1]],
+                           [[0, 0], [0, 2]],
+                           [[1, 1], [1, 2]],
+                           [[0, 1], [0, 1]],
+                           [[0, 1], [1, 2]],
+                           [[0, 0], [-1, -1]],
+                           [[0, 1], [-1, -1]],
+                           [[-1, -1], [-1, -1]]], dtype='i1')
+
+        hi = allel.heterozygosity_individual(g)
+        assert np.all(hi[g.is_hom()] == 0)
+        assert np.all(hi[g.is_het()] == 1)
+        assert np.all(np.isnan(hi)[g.is_missing()])
+
+        # mixed ploidy 2x, 4x, 6x
+        g = GenotypeArray([
+            [[0, 0, -1, -1, -1, -1], [0, 0, 0, 0, -1, -1], [0, 0, 0, 0, 0, 0]],
+            [[1, 1, -1, -1, -1, -1], [0, 0, 0, 1, -1, -1], [0, 0, 0, 0, 0, 1]],
+            [[0, 1, -1, -1, -1, -1], [0, 0, 1, 1, -1, -1], [0, 0, 0, 0, 1, 1]],
+            [[1, 2, -1, -1, -1, -1], [0, 0, 1, 2, -1, -1], [0, 0, 0, 1, 1, 1]],
+            [[3, 6, -1, -1, -1, -1], [0, 1, 2, 3, -1, -1], [0, 0, 0, 0, 1, 2]],
+            [[0, 0, -1, -1, -1, -1], [0, 2, 2, 5, -1, -1], [0, 0, 0, 1, 1, 2]],
+            [[0, -1, -1, -1, -1, -1], [0, 1, 1, -1, -1, -1], [0, 0, 1, 1, 2, 2]],
+            [[0, -1, -1, -1, -1, -1], [0, -1, -1, -1, -1, -1], [0, 0, 1, 2, 3, 4]],
+            [[-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, 5]]
+        ])
+        ploidy = np.array([
+            [2, 4, 6],
+            [2, 4, 6],
+            [2, 4, 6],
+            [2, 4, 6],
+            [2, 4, 6],
+            [2, 4, 6],
+            [2, 4, 6],
+            [2, 4, 6],
+            [2, 4, 6],
+        ])
+        # 4x and 6x values from Hardy (2016)
+        expect = np.array([
+            [0, 0/6, 0/15],
+            [0, 3/6, 5/15],
+            [1, 4/6, 8/15],
+            [1, 5/6, 9/15],
+            [1, 6/6, 9/15],
+            [0, 5/6, 11/15],
+            [-1, -1, 12/15],
+            [-1, -1, 14/15],
+            [-1, -1, 15/15],
+        ])
+        actual = allel.heterozygosity_individual(g, fill=-1, ploidy=ploidy)
+        assert_array_almost_equal(expect, actual)
+
+        # mixed ploidy uncorrected
+        expect /= (ploidy/(ploidy-1))
+        expect[expect < 0] = -1  # correct the filled values
+        actual = allel.heterozygosity_individual(g, fill=-1, ploidy=ploidy, corrected=False)
+        assert_array_almost_equal(expect, actual)
+
     def test_heterozygosity_observed(self):
 
         # diploid
@@ -403,7 +467,17 @@ def test_heterozygosity_observed(self):
         actual = allel.heterozygosity_observed(g, fill=-1)
         aeq(expect, actual)
 
-        # polyploid
+        # diploid uncorrected
+        # corrected is uncorrected * (2/1)
+        expect = [i / 2 if i >= 0 else i for i in expect]
+        actual = allel.heterozygosity_observed(g, fill=-1, corrected=False)
+        assert_array_almost_equal(expect, actual)
+
+        # triploid
+        # individual heterozygosity (Hi)
+        # dosage of 3:0:0 = Hi of 0/3
+        # dosage of 2:1:0 = Hi of 2/3
+        # dosage of 1:1:1 = Hi of 3/3
         g = GenotypeArray([[[0, 0, 0], [0, 0, 0]],
                            [[1, 1, 1], [1, 1, 1]],
                            [[1, 1, 1], [2, 2, 2]],
@@ -415,9 +489,15 @@ def test_heterozygosity_observed(self):
                            [[0, 0, 0], [-1, -1, -1]],
                            [[0, 0, 1], [-1, -1, -1]],
                            [[-1, -1, -1], [-1, -1, -1]]], dtype='i1')
-        expect = [0, 0, 0, .5, .5, .5, 1, 1, 0, 1, -1]
+        expect = [0, 0, 0, 1/3, 1/3, 3/6, 4/6, 5/6, 0, 2/3, -1]
         actual = allel.heterozygosity_observed(g, fill=-1)
-        aeq(expect, actual)
+        assert_array_almost_equal(expect, actual)
+
+        # triploid uncorrected
+        # corrected is uncorrected * (3/2)
+        expect = [i / (3/2) if i >= 0 else i for i in expect]
+        actual = allel.heterozygosity_observed(g, fill=-1, corrected=False)
+        assert_array_almost_equal(expect, actual)
 
     def test_heterozygosity_expected(self):