Add skipna option to genomic_relationship #1076

timothymillar · tomwhite · commit a7340c020808 · 2023-05-02T10:25:24.000+01:00
diff --git a/sgkit/stats/grm.py b/sgkit/stats/grm.py
@@ -10,13 +10,36 @@
 from sgkit.utils import conditional_merge_datasets, create_dataset
 
 
+def _grm_VanRaden(
+    call_dosage: ArrayLike,
+    ancestral_frequency: ArrayLike,
+    ploidy: int,
+    skipna: bool = False,
+):
+    ancestral_dosage = ancestral_frequency * ploidy
+    M = call_dosage - ancestral_dosage[:, None]
+    if skipna:
+        nans = da.isnan(M)
+        M0 = da.where(nans, 0, M)
+        numerator = M0.T @ M0
+        AD = ~nans * ancestral_dosage[:, None]
+        AFC = ~nans * (1 - ancestral_frequency[:, None])
+        denominator = AD.T @ AFC
+    else:
+        numerator = M.T @ M
+        denominator = (ancestral_dosage * (1 - ancestral_frequency)).sum()
+    G = numerator / denominator
+    return G
+
+
 def genomic_relationship(
     ds: Dataset,
     *,
     call_dosage: Hashable = variables.call_dosage,
     estimator: Optional[Literal["VanRaden"]] = None,
     ancestral_frequency: Optional[Hashable] = None,
     ploidy: Optional[int] = None,
+    skipna: bool = False,
     merge: bool = True,
 ) -> Dataset:
     """Compute a genomic relationship matrix (AKA the GRM or G-matrix).
@@ -44,6 +67,10 @@ def genomic_relationship(
         Ploidy level of all samples within the dataset.
         By default this is inferred from the size of the "ploidy" dimension
         of the dataset.
+    skipna
+        If True, missing (nan) values of 'call_dosage' will be skipped so
+        that the relationship between each pair of individuals is estimated
+        using only variants where both samples have non-nan values.
     merge
         If True (the default), merge the input dataset and the computed
         output variables into a single dataset, otherwise return only
@@ -134,11 +161,7 @@ def genomic_relationship(
         raise ValueError(
             "The ancestral_frequency variable must have one value per variant"
         )
-    ad = af * ploidy
-    M = cd - ad[:, None]
-    num = M.T @ M
-    denom = (ad * (1 - af)).sum()
-    G = num / denom
+    G = _grm_VanRaden(cd, af, ploidy=ploidy, skipna=skipna)
 
     new_ds = create_dataset(
         {
diff --git a/sgkit/tests/test_grm.py b/sgkit/tests/test_grm.py
@@ -118,6 +118,80 @@ def test_genomic_relationship__VanRaden_AGHmatrix_tetraploid(chunks):
     np.testing.assert_array_almost_equal(actual, expect)
 
 
+def test_genomic_relationship__VanRaden_skipna():
+    # Test that skipna option skips values in call_dosage
+    # such that the relationship between each pair of individuals
+    # is calculated using only the variants where neither sample
+    # has missing data.
+    # This should be equivalent to calculating the GRM using
+    # multiple subsets of the variants and using pairwise
+    # values from the larges subset of variants that doesn't
+    # result in a nan value.
+    nan = np.nan
+    dosage = np.array(
+        [
+            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0, 0.0],
+            [1.0, 1.0, 1.0, 2.0, nan, 1.0, 1.0, 0.0, 1.0, 2.0],
+            [2.0, 2.0, 0.0, 0.0, nan, 1.0, 1.0, 1.0, 0.0, 1.0],
+            [1.0, 0.0, 0.0, 0.0, nan, 1.0, 1.0, 1.0, 1.0, 0.0],
+            [1.0, 0.0, 1.0, 1.0, nan, 2.0, 0.0, 1.0, 0.0, 2.0],
+            [2.0, 1.0, 1.0, 1.0, nan, 1.0, 2.0, nan, 0.0, 1.0],
+            [2.0, 0.0, 1.0, 1.0, nan, 2.0, 1.0, nan, 1.0, 1.0],
+            [1.0, 1.0, 1.0, 2.0, nan, 1.0, 2.0, nan, 1.0, 0.0],
+            [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, nan, 1.0, 1.0],
+            [2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, nan, 2.0, 1.0],
+            [1.0, 2.0, 2.0, 1.0, 2.0, 0.0, 1.0, nan, 1.0, 2.0],
+            [0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 0.0, nan, 1.0, 2.0],
+            [1.0, 2.0, 1.0, 2.0, 2.0, 0.0, 1.0, nan, 1.0, 0.0],
+            [0.0, 2.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0],
+            [1.0, 1.0, 2.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0],
+            [2.0, 0.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0],
+            [1.0, 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0],
+            [2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0],
+            [1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 0.0, 2.0, 1.0, 2.0],
+            [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        ]
+    )
+    ds = xr.Dataset()
+    ds["call_dosage"] = ["variants", "samples"], dosage
+    ds["ancestral_frequency"] = "variants", np.ones(len(dosage)) / 2
+    # calculating without skipna will result in nans in the GRM
+    expect = sg.genomic_relationship(
+        ds,
+        call_dosage="call_dosage",
+        ancestral_frequency="ancestral_frequency",
+        estimator="VanRaden",
+        ploidy=2,
+        skipna=False,
+    ).stat_genomic_relationship.values
+    assert np.isnan(expect).sum() > 0
+    # fill nan values using maximum subsets without missing data
+    idx_0 = ~np.isnan(dosage[:, 4])
+    idx_1 = ~np.isnan(dosage[:, 7])
+    idx_2 = np.logical_and(idx_0, idx_1)
+    for idx in [idx_0, idx_1, idx_2]:
+        sub = ds.sel(dict(variants=idx))
+        sub_expect = sg.genomic_relationship(
+            sub,
+            call_dosage="call_dosage",
+            ancestral_frequency="ancestral_frequency",
+            estimator="VanRaden",
+            ploidy=2,
+            skipna=False,
+        ).stat_genomic_relationship.values
+        expect = np.where(np.isnan(expect), sub_expect, expect)
+    # calculate actual value using skipna=True
+    actual = sg.genomic_relationship(
+        ds,
+        call_dosage="call_dosage",
+        ancestral_frequency="ancestral_frequency",
+        estimator="VanRaden",
+        ploidy=2,
+        skipna=True,
+    ).stat_genomic_relationship.values
+    np.testing.assert_array_equal(actual, expect)
+
+
 @pytest.mark.parametrize("ploidy", [2, 4])
 def test_genomic_relationship__detect_ploidy(ploidy):
     ds = xr.Dataset()