Skip to content

Commit a7340c0

Browse files
timothymillar authored and tomwhite committed
Add skipna option to genomic_relationship #1076
1 parent e28c0f7 commit a7340c0

File tree

2 files changed

+102
-5
lines changed

2 files changed

+102
-5
lines changed

sgkit/stats/grm.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,36 @@
1010
from sgkit.utils import conditional_merge_datasets, create_dataset
1111

1212

13+
def _grm_VanRaden(
    call_dosage: ArrayLike,
    ancestral_frequency: ArrayLike,
    ploidy: int,
    skipna: bool = False,
) -> ArrayLike:
    """Compute the VanRaden genomic relationship matrix.

    Parameters
    ----------
    call_dosage
        Two-dimensional array of dosages whose first axis is aligned with
        ``ancestral_frequency`` (one row per variant) and whose second axis
        indexes the samples being compared.
    ancestral_frequency
        One-dimensional array holding one frequency per row of
        ``call_dosage``.
    ploidy
        Ploidy level used to scale ``ancestral_frequency`` into an expected
        dosage.
    skipna
        If True, nan entries of the centered dosage matrix are excluded so
        that each pairwise value is computed only from the rows where both
        samples are non-nan. If False (the default), nan values propagate
        into the result.

    Returns
    -------
    Square array with one row and one column per sample (second axis of
    ``call_dosage``).

    Notes
    -----
    With ``skipna=True`` a pair of samples that shares no non-nan row gets a
    zero denominator, so its entry is the result of ``0 / 0``. A nan in
    ``ancestral_frequency`` is not skipped: it still propagates through the
    masked denominator because ``False * nan`` is nan.
    """
    ancestral_dosage = ancestral_frequency * ploidy
    # Center each row (variant) on its expected dosage.
    M = call_dosage - ancestral_dosage[:, None]
    if skipna:
        nans = da.isnan(M)
        # Zero-filled nan entries contribute nothing to the cross-product.
        M0 = da.where(nans, 0, M)
        numerator = M0.T @ M0
        # Mask the two factors of the scaling term per sample so that the
        # matrix product sums ancestral_dosage * (1 - ancestral_frequency)
        # only over the rows where *both* samples of a pair are non-nan.
        AD = ~nans * ancestral_dosage[:, None]
        AFC = ~nans * (1 - ancestral_frequency[:, None])
        denominator = AD.T @ AFC
    else:
        numerator = M.T @ M
        # A single scalar scaling term shared by every pair of samples.
        denominator = (ancestral_dosage * (1 - ancestral_frequency)).sum()
    G = numerator / denominator
    return G
33+
34+
1335
def genomic_relationship(
1436
ds: Dataset,
1537
*,
1638
call_dosage: Hashable = variables.call_dosage,
1739
estimator: Optional[Literal["VanRaden"]] = None,
1840
ancestral_frequency: Optional[Hashable] = None,
1941
ploidy: Optional[int] = None,
42+
skipna: bool = False,
2043
merge: bool = True,
2144
) -> Dataset:
2245
"""Compute a genomic relationship matrix (AKA the GRM or G-matrix).
@@ -44,6 +67,10 @@ def genomic_relationship(
4467
Ploidy level of all samples within the dataset.
4568
By default this is inferred from the size of the "ploidy" dimension
4669
of the dataset.
70+
skipna
71+
If True, missing (nan) values of 'call_dosage' will be skipped so
72+
that the relationship between each pair of individuals is estimated
73+
using only variants where both samples have non-nan values.
4774
merge
4875
If True (the default), merge the input dataset and the computed
4976
output variables into a single dataset, otherwise return only
@@ -134,11 +161,7 @@ def genomic_relationship(
134161
raise ValueError(
135162
"The ancestral_frequency variable must have one value per variant"
136163
)
137-
ad = af * ploidy
138-
M = cd - ad[:, None]
139-
num = M.T @ M
140-
denom = (ad * (1 - af)).sum()
141-
G = num / denom
164+
G = _grm_VanRaden(cd, af, ploidy=ploidy, skipna=skipna)
142165

143166
new_ds = create_dataset(
144167
{

sgkit/tests/test_grm.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,80 @@ def test_genomic_relationship__VanRaden_AGHmatrix_tetraploid(chunks):
118118
np.testing.assert_array_almost_equal(actual, expect)
119119

120120

121+
def test_genomic_relationship__VanRaden_skipna():
    # Test that skipna option skips values in call_dosage
    # such that the relationship between each pair of individuals
    # is calculated using only the variants where neither sample
    # has missing data.
    # This should be equivalent to calculating the GRM using
    # multiple subsets of the variants and using pairwise
    # values from the largest subset of variants that doesn't
    # result in a nan value.
    nan = np.nan
    # Rows are variants, columns are samples; only samples 4 and 7
    # (columns 4 and 7) contain missing values.
    dosage = np.array(
        [
            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0, 0.0],
            [1.0, 1.0, 1.0, 2.0, nan, 1.0, 1.0, 0.0, 1.0, 2.0],
            [2.0, 2.0, 0.0, 0.0, nan, 1.0, 1.0, 1.0, 0.0, 1.0],
            [1.0, 0.0, 0.0, 0.0, nan, 1.0, 1.0, 1.0, 1.0, 0.0],
            [1.0, 0.0, 1.0, 1.0, nan, 2.0, 0.0, 1.0, 0.0, 2.0],
            [2.0, 1.0, 1.0, 1.0, nan, 1.0, 2.0, nan, 0.0, 1.0],
            [2.0, 0.0, 1.0, 1.0, nan, 2.0, 1.0, nan, 1.0, 1.0],
            [1.0, 1.0, 1.0, 2.0, nan, 1.0, 2.0, nan, 1.0, 0.0],
            [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, nan, 1.0, 1.0],
            [2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, nan, 2.0, 1.0],
            [1.0, 2.0, 2.0, 1.0, 2.0, 0.0, 1.0, nan, 1.0, 2.0],
            [0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 0.0, nan, 1.0, 2.0],
            [1.0, 2.0, 1.0, 2.0, 2.0, 0.0, 1.0, nan, 1.0, 0.0],
            [0.0, 2.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0],
            [1.0, 1.0, 2.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0],
            [2.0, 0.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0],
            [1.0, 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0],
            [2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0],
            [1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 0.0, 2.0, 1.0, 2.0],
            [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        ]
    )
    ds = xr.Dataset()
    ds["call_dosage"] = ["variants", "samples"], dosage
    ds["ancestral_frequency"] = "variants", np.ones(len(dosage)) / 2
    # calculating without skipna will result in nans in the GRM
    expect = sg.genomic_relationship(
        ds,
        call_dosage="call_dosage",
        ancestral_frequency="ancestral_frequency",
        estimator="VanRaden",
        ploidy=2,
        skipna=False,
    ).stat_genomic_relationship.values
    assert np.isnan(expect).sum() > 0
    # fill nan values using maximum subsets without missing data
    idx_0 = ~np.isnan(dosage[:, 4])
    idx_1 = ~np.isnan(dosage[:, 7])
    idx_2 = np.logical_and(idx_0, idx_1)
    # Order matters: each pass only fills entries that are still nan, and
    # the most restrictive subset (idx_2) is applied last.
    for idx in [idx_0, idx_1, idx_2]:
        sub = ds.sel(dict(variants=idx))
        sub_expect = sg.genomic_relationship(
            sub,
            call_dosage="call_dosage",
            ancestral_frequency="ancestral_frequency",
            estimator="VanRaden",
            ploidy=2,
            skipna=False,
        ).stat_genomic_relationship.values
        expect = np.where(np.isnan(expect), sub_expect, expect)
    # calculate actual value using skipna=True
    actual = sg.genomic_relationship(
        ds,
        call_dosage="call_dosage",
        ancestral_frequency="ancestral_frequency",
        estimator="VanRaden",
        ploidy=2,
        skipna=True,
    ).stat_genomic_relationship.values
    np.testing.assert_array_equal(actual, expect)
193+
194+
121195
@pytest.mark.parametrize("ploidy", [2, 4])
122196
def test_genomic_relationship__detect_ploidy(ploidy):
123197
ds = xr.Dataset()

0 commit comments

Comments
 (0)