Skip to content

Commit 9545283

Browse files
committed
Merge branch 'timothymillar-1076-grm-skipna'
2 parents a7340c0 + 01eb580 commit 9545283

File tree

2 files changed

+90
-7
lines changed

2 files changed

+90
-7
lines changed

docs/changelog.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ New Features
4242
- Add :func:`sgkit.io.vcf.zarr_array_sizes` for determining array sizes for storage in Zarr.
4343
(:user:`tomwhite`, :pr:`1073`, :issue:`734`)
4444

45+
- Add ``skipna`` option to :func:`genomic_relationship` function.
46+
(:user:`timothymillar`, :pr:`1078`, :issue:`1076`)
47+
4548
- Add ``additional_variant_fields`` to :func:`sgkit.simulate_genotype_call_dataset` function.
4649
(:user:`benjeffery`, :pr:`1056`)
4750

sgkit/stats/grm.py

Lines changed: 87 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -101,25 +101,105 @@ def genomic_relationship(
101101
Examples
102102
--------
103103
104+
Diploid dataset without missing data:
105+
104106
>>> import sgkit as sg
105107
>>> ds = sg.simulate_genotype_call_dataset(n_variant=6, n_sample=3, seed=0)
106108
>>> ds = sg.count_call_alleles(ds)
107109
>>> # use reference allele count as dosage
108110
>>> ds["call_dosage"] = ds.call_allele_count[:,:,0]
109111
>>> ds.call_dosage.values # doctest: +NORMALIZE_WHITESPACE
110112
array([[2, 1, 1],
111-
[1, 1, 1],
112-
[2, 1, 0],
113-
[2, 1, 1],
114-
[1, 0, 0],
115-
[1, 1, 2]], dtype=uint8)
113+
[1, 1, 1],
114+
[2, 1, 0],
115+
[2, 1, 1],
116+
[1, 0, 0],
117+
[1, 1, 2]], dtype=uint8)
116118
>>> # use sample population frequency as ancestral frequency
117119
>>> ds["sample_frequency"] = ds.call_dosage.mean(dim="samples") / ds.dims["ploidy"]
118120
>>> ds = sg.genomic_relationship(ds, ancestral_frequency="sample_frequency")
119121
>>> ds.stat_genomic_relationship.values # doctest: +NORMALIZE_WHITESPACE
120122
array([[ 0.93617021, -0.21276596, -0.72340426],
121-
[-0.21276596, 0.17021277, 0.04255319],
122-
[-0.72340426, 0.04255319, 0.68085106]])
123+
[-0.21276596, 0.17021277, 0.04255319],
124+
[-0.72340426, 0.04255319, 0.68085106]])
125+
126+
Skipping partial or missing genotype calls:
127+
128+
>>> import sgkit as sg
129+
>>> import xarray as xr
130+
>>> ds = sg.simulate_genotype_call_dataset(
131+
... n_variant=6,
132+
... n_sample=4,
133+
... missing_pct=0.05,
134+
... seed=0,
135+
... )
136+
>>> ds = sg.count_call_alleles(ds)
137+
>>> ds["call_dosage"] = xr.where(
138+
... ds.call_genotype_mask.any(dim="ploidy"),
139+
... np.nan,
140+
... ds.call_allele_count[:,:,1], # alternate allele
141+
... )
142+
>>> ds.call_dosage.values # doctest: +NORMALIZE_WHITESPACE
143+
array([[ 0., 1., 1., 1.],
144+
[ 1., nan, 0., 1.],
145+
[ 2., 0., 1., 1.],
146+
[ 1., 2., nan, 1.],
147+
[ 1., 0., 1., 2.],
148+
[ 2., 2., 0., 0.]])
149+
>>> ds["sample_frequency"] = ds.call_dosage.mean(
150+
... dim="samples", skipna=True
151+
... ) / ds.dims["ploidy"]
152+
>>> ds = sg.genomic_relationship(
153+
... ds, ancestral_frequency="sample_frequency", skipna=True
154+
... )
155+
>>> ds.stat_genomic_relationship.values # doctest: +NORMALIZE_WHITESPACE
156+
array([[ 0.9744836 , -0.16978417, -0.58417266, -0.33778858],
157+
[-0.16978417, 1.45323741, -0.47619048, -0.89496403],
158+
[-0.58417266, -0.47619048, 0.62446043, 0.34820144],
159+
[-0.33778858, -0.89496403, 0.34820144, 0.79951397]])
160+
161+
Using mean imputation to replace missing genotype calls:
162+
163+
>>> import sgkit as sg
164+
>>> import xarray as xr
165+
>>> ds = sg.simulate_genotype_call_dataset(
166+
... n_variant=6,
167+
... n_sample=4,
168+
... missing_pct=0.05,
169+
... seed=0,
170+
... )
171+
>>> ds = sg.count_call_alleles(ds)
172+
>>> ds["call_dosage"] = xr.where(
173+
... ds.call_genotype_mask.any(dim="ploidy"),
174+
... np.nan,
175+
... ds.call_allele_count[:,:,1], # alternate allele
176+
... )
177+
>>> # use mean imputation to replace missing dosage
178+
>>> ds["call_dosage_imputed"] = xr.where(
179+
... ds.call_genotype_mask.any(dim="ploidy"),
180+
... ds.call_dosage.mean(dim="samples", skipna=True),
181+
... ds.call_dosage,
182+
... )
183+
>>> ds.call_dosage_imputed.values # doctest: +NORMALIZE_WHITESPACE
184+
array([[0. , 1. , 1. , 1. ],
185+
[1. , 0.66666667, 0. , 1. ],
186+
[2. , 0. , 1. , 1. ],
187+
[1. , 2. , 1.33333333, 1. ],
188+
[1. , 0. , 1. , 2. ],
189+
[2. , 2. , 0. , 0. ]])
190+
>>> ds["sample_frequency"] = ds.call_dosage.mean(
191+
... dim="samples", skipna=True
192+
... ) / ds.dims["ploidy"]
193+
>>> ds = sg.genomic_relationship(
194+
... ds,
195+
... call_dosage="call_dosage_imputed",
196+
... ancestral_frequency="sample_frequency",
197+
... )
198+
>>> ds.stat_genomic_relationship.values # doctest: +NORMALIZE_WHITESPACE
199+
array([[ 0.9744836 , -0.14337789, -0.49331713, -0.33778858],
200+
[-0.14337789, 1.2272175 , -0.32806804, -0.75577157],
201+
[-0.49331713, -0.32806804, 0.527339 , 0.29404617],
202+
[-0.33778858, -0.75577157, 0.29404617, 0.79951397]])
123203
124204
References
125205
----------

0 commit comments

Comments
 (0)