From 7a8da0749d2f6b2cfd7d01cde51579f19d1c27e0 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Tue, 8 Oct 2024 11:38:10 +0100 Subject: [PATCH 1/4] add g123 --- allel/stats/selection.py | 112 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 1 deletion(-) diff --git a/allel/stats/selection.py b/allel/stats/selection.py index e2ca1392..6f2d92d7 100644 --- a/allel/stats/selection.py +++ b/allel/stats/selection.py @@ -7,7 +7,7 @@ from allel.compat import memoryview_safe -from allel.util import asarray_ndim, check_dim0_aligned, check_integer_dtype +from allel.util import asarray_ndim, check_dim0_aligned, check_integer_dtype, hash_columns from allel.model.ndarray import HaplotypeArray, AlleleCountsArray from allel.stats.window import moving_statistic, index_windows from allel.stats.diversity import moving_tajima_d @@ -839,6 +839,116 @@ def garud_h(h): return h1, h12, h123, h2_h1 +def diplotype_frequencies(gt): + """Compute diplotype frequencies, returning a dictionary that maps + diplotype hash values to frequencies.""" + from collections import Counter + + # Here are some optimisations to speed up the computation + # of diplotype hashes. First we combine the two int8 alleles + # in each genotype call into a single int16. + m = gt.shape[0] + n = gt.shape[1] + x = np.asarray(gt).view(np.int16).reshape((m, n)) + + # Now call optimised hashing function. + k = [hash(x.values[:, i].tobytes()) for i in range(x.shape[1])] + + # Now compute counts and frequencies of distinct haplotypes. + counts = Counter(k) + freqs = {key: count / n for key, count in counts.items()} + + return freqs + +def garud_g(g): + """Compute the G1, G12, G123 and G2/H1 statistics for detecting signatures + of soft sweeps, as defined in Garud et al. (2018). + + Parameters + ---------- + g : array_like, int, shape (n_variants, n_samples) + Diploid genotypes at biallelic variants, coded as the number of + alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt). + + Returns + ------- + g1 : float + g1 statistic (sum of squares of haplotype frequencies). + g12 : float + g12 statistic (sum of squares of haplotype frequencies, combining + the two most common haplotypes into a single frequency). + g123 : float + g123 statistic (sum of squares of haplotype frequencies, combining + the three most common haplotypes into a single frequency). + g2_g1 : float + H2/H1 statistic, indicating the "softness" of a sweep. + + """ + + # compute diplotype frequencies + frq_counter = diplotype_frequencies(g) + + # convert to array of sorted frequencies + f = np.sort(np.fromiter(frq_counter.values(), dtype=float))[::-1] + + # compute H1 + g1 = np.sum(f**2) + + # compute H12 + g12 = np.sum(f[:2])**2 + np.sum(f[2:]**2) + + # compute H123 + g123 = np.sum(f[:3])**2 + np.sum(f[3:]**2) + + # compute H2/H1 + g2 = g1 - f[0]**2 + g2_g1 = g2 / g1 + + return g1, g12, g123, g2_g1 + +def moving_garud_g(g, size, start=0, stop=None, step=None): + """Compute the G1, G12, G123 and G2/G1 statistics for detecting signatures + of soft sweeps, as defined in Garud et al. (2015), in moving windows, + + Parameters + ---------- + g : array_like, int, shape (n_variants, n_samples) + Diploid genotypes at biallelic variants, coded as the number of + alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt). + size : int + The window size (number of variants). + start : int, optional + The index at which to start. + stop : int, optional + The index at which to stop. + step : int, optional + The number of variants between start positions of windows. If not + given, defaults to the window size, i.e., non-overlapping windows. + + Returns + ------- + g1 : ndarray, float, shape (n_windows,) + G1 statistics (sum of squares of haplotype frequencies). + g212 : ndarray, float, shape (n_windows,) + G12 statistics (sum of squares of haplotype frequencies, combining + the two most common haplotypes into a single frequency). + g123 : ndarray, float, shape (n_windows,) + G123 statistics (sum of squares of haplotype frequencies, combining + the three most common haplotypes into a single frequency). + g2_g1 : ndarray, float, shape (n_windows,) + G2/G1 statistics, indicating the "softness" of a sweep. + + """ + + gg = moving_statistic(values=g, statistic=garud_g, size=size, start=start, + stop=stop, step=step) + + g1 = gg[:, 0] + g12 = gg[:, 1] + g123 = gg[:, 2] + g2_g1 = gg[:, 3] + + return g1, g12, g123, g2_g1 def moving_garud_h(h, size, start=0, stop=None, step=None): """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures From f70ff4236df87a89186efd618d3e32e8d169e6e4 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Tue, 8 Oct 2024 11:56:36 +0100 Subject: [PATCH 2/4] add g123 --- allel/stats/selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allel/stats/selection.py b/allel/stats/selection.py index 6f2d92d7..e3d71296 100644 --- a/allel/stats/selection.py +++ b/allel/stats/selection.py @@ -7,7 +7,7 @@ from allel.compat import memoryview_safe -from allel.util import asarray_ndim, check_dim0_aligned, check_integer_dtype, hash_columns +from allel.util import asarray_ndim, check_dim0_aligned, check_integer_dtype from allel.model.ndarray import HaplotypeArray, AlleleCountsArray from allel.stats.window import moving_statistic, index_windows from allel.stats.diversity import moving_tajima_d From b7c19d5612961185b38c7e59f2e2cf4df5ce71f8 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Tue, 8 Oct 2024 11:59:04 +0100 Subject: [PATCH 3/4] add g123 --- allel/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allel/__init__.py b/allel/__init__.py index afb55e51..6b5fb4e8 100644 --- a/allel/__init__.py +++ b/allel/__init__.py @@ -48,7 +48,7 @@ from .stats.selection import ehh_decay, voight_painting, xpehh, ihs, \ plot_voight_painting, fig_voight_painting, plot_haplotype_frequencies, \ plot_moving_haplotype_frequencies, haplotype_diversity, \ - moving_haplotype_diversity, garud_h, moving_garud_h, nsl, xpnsl, \ + moving_haplotype_diversity, garud_h, moving_garud_h, garud_g, moving_garud_g, nsl, xpnsl, \ standardize, standardize_by_allele_count, moving_delta_tajima_d, pbs from .stats.sf import sfs, sfs_folded, sfs_scaled, sfs_folded_scaled, \ From f7b0218fd14b85a25b7f07b0b9a4479426c0e894 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Thu, 10 Oct 2024 18:02:00 +0100 Subject: [PATCH 4/4] fix in diplotype_freqs() --- allel/stats/selection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allel/stats/selection.py b/allel/stats/selection.py index e3d71296..89c214fd 100644 --- a/allel/stats/selection.py +++ b/allel/stats/selection.py @@ -851,8 +851,8 @@ def diplotype_frequencies(gt): n = gt.shape[1] x = np.asarray(gt).view(np.int16).reshape((m, n)) - # Now call optimised hashing function. - k = [hash(x.values[:, i].tobytes()) for i in range(x.shape[1])] + # Now call hashing function. + k = [hash(x[:, i].tobytes()) for i in range(x.shape[1])] # Now compute counts and frequencies of distinct haplotypes. counts = Counter(k)