From 7a8da0749d2f6b2cfd7d01cde51579f19d1c27e0 Mon Sep 17 00:00:00 2001
From: Sanjay C Nagi <sanjay.c.nagi@gmail.com>
Date: Tue, 8 Oct 2024 11:38:10 +0100
Subject: [PATCH 1/4] add g123

---
 allel/stats/selection.py | 112 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 1 deletion(-)

diff --git a/allel/stats/selection.py b/allel/stats/selection.py
index e2ca1392..6f2d92d7 100644
--- a/allel/stats/selection.py
+++ b/allel/stats/selection.py
@@ -7,7 +7,7 @@
 
 
 from allel.compat import memoryview_safe
-from allel.util import asarray_ndim, check_dim0_aligned, check_integer_dtype
+from allel.util import asarray_ndim, check_dim0_aligned, check_integer_dtype, hash_columns
 from allel.model.ndarray import HaplotypeArray, AlleleCountsArray
 from allel.stats.window import moving_statistic, index_windows
 from allel.stats.diversity import moving_tajima_d
@@ -839,6 +839,116 @@ def garud_h(h):
 
     return h1, h12, h123, h2_h1
 
+def diplotype_frequencies(gt):
+    """Compute diplotype frequencies, returning a dictionary that maps
+    diplotype hash values to frequencies."""
+    from collections import Counter
+
+    # Here are some optimisations to speed up the computation
+    # of diplotype hashes. First we combine the two int8 alleles
+    # in each genotype call into a single int16.
+    m = gt.shape[0]
+    n = gt.shape[1]
+    x = np.asarray(gt).view(np.int16).reshape((m, n))
+
+    # Now call optimised hashing function.
+    k = [hash(x.values[:, i].tobytes()) for i in range(x.shape[1])]
+
+    # Now compute counts and frequencies of distinct haplotypes.
+    counts = Counter(k)
+    freqs = {key: count / n for key, count in counts.items()}
+
+    return freqs
+
+def garud_g(g):
+    """Compute the G1, G12, G123 and G2/H1 statistics for detecting signatures
+    of soft sweeps, as defined in Garud et al. (2018).
+
+    Parameters
+    ----------
+    g : array_like, int, shape (n_variants, n_samples)
+        Diploid genotypes at biallelic variants, coded as the number of
+        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
+
+    Returns
+    -------
+    g1 : float
+        g1 statistic (sum of squares of haplotype frequencies).
+    g12 : float
+        g12 statistic (sum of squares of haplotype frequencies, combining
+        the two most common haplotypes into a single frequency).
+    g123 : float
+        g123 statistic (sum of squares of haplotype frequencies, combining
+        the three most common haplotypes into a single frequency).
+    g2_g1 : float
+        H2/H1 statistic, indicating the "softness" of a sweep.
+
+    """
+
+    # compute diplotype frequencies
+    frq_counter = diplotype_frequencies(g)
+
+    # convert to array of sorted frequencies
+    f = np.sort(np.fromiter(frq_counter.values(), dtype=float))[::-1]
+   
+    # compute H1
+    g1 = np.sum(f**2)
+
+    # compute H12
+    g12 = np.sum(f[:2])**2 + np.sum(f[2:]**2)
+
+    # compute H123
+    g123 = np.sum(f[:3])**2 + np.sum(f[3:]**2)
+
+    # compute H2/H1
+    g2 = g1 - f[0]**2
+    g2_g1 = g2 / g1
+
+    return g1, g12, g123, g2_g1
+
+def moving_garud_g(g, size, start=0, stop=None, step=None):
+    """Compute the G1, G12, G123 and G2/G1 statistics for detecting signatures
+    of soft sweeps, as defined in Garud et al. (2015), in moving windows,
+
+    Parameters
+    ----------
+    g : array_like, int, shape (n_variants, n_samples)
+        Diploid genotypes at biallelic variants, coded as the number of
+        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
+    size : int
+        The window size (number of variants).
+    start : int, optional
+        The index at which to start.
+    stop : int, optional
+        The index at which to stop.
+    step : int, optional
+        The number of variants between start positions of windows. If not
+        given, defaults to the window size, i.e., non-overlapping windows.
+
+    Returns
+    -------
+    g1 : ndarray, float, shape (n_windows,)
+        G1 statistics (sum of squares of haplotype frequencies).
+    g212 : ndarray, float, shape (n_windows,)
+        G12 statistics (sum of squares of haplotype frequencies, combining
+        the two most common haplotypes into a single frequency).
+    g123 : ndarray, float, shape (n_windows,)
+        G123 statistics (sum of squares of haplotype frequencies, combining
+        the three most common haplotypes into a single frequency).
+    g2_g1 : ndarray, float, shape (n_windows,)
+        G2/G1 statistics, indicating the "softness" of a sweep.
+
+    """
+
+    gg = moving_statistic(values=g, statistic=garud_g, size=size, start=start,
+                          stop=stop, step=step)
+
+    g1 = gg[:, 0]
+    g12 = gg[:, 1]
+    g123 = gg[:, 2]
+    g2_g1 = gg[:, 3]
+
+    return g1, g12, g123, g2_g1
 
 def moving_garud_h(h, size, start=0, stop=None, step=None):
     """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures

From f70ff4236df87a89186efd618d3e32e8d169e6e4 Mon Sep 17 00:00:00 2001
From: Sanjay C Nagi <sanjay.c.nagi@gmail.com>
Date: Tue, 8 Oct 2024 11:56:36 +0100
Subject: [PATCH 2/4] add g123

---
 allel/stats/selection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/allel/stats/selection.py b/allel/stats/selection.py
index 6f2d92d7..e3d71296 100644
--- a/allel/stats/selection.py
+++ b/allel/stats/selection.py
@@ -7,7 +7,7 @@
 
 
 from allel.compat import memoryview_safe
-from allel.util import asarray_ndim, check_dim0_aligned, check_integer_dtype, hash_columns
+from allel.util import asarray_ndim, check_dim0_aligned, check_integer_dtype
 from allel.model.ndarray import HaplotypeArray, AlleleCountsArray
 from allel.stats.window import moving_statistic, index_windows
 from allel.stats.diversity import moving_tajima_d

From b7c19d5612961185b38c7e59f2e2cf4df5ce71f8 Mon Sep 17 00:00:00 2001
From: Sanjay C Nagi <sanjay.c.nagi@gmail.com>
Date: Tue, 8 Oct 2024 11:59:04 +0100
Subject: [PATCH 3/4] add g123

---
 allel/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/allel/__init__.py b/allel/__init__.py
index afb55e51..6b5fb4e8 100644
--- a/allel/__init__.py
+++ b/allel/__init__.py
@@ -48,7 +48,7 @@
 from .stats.selection import ehh_decay, voight_painting, xpehh, ihs, \
     plot_voight_painting, fig_voight_painting, plot_haplotype_frequencies, \
     plot_moving_haplotype_frequencies, haplotype_diversity, \
-    moving_haplotype_diversity, garud_h, moving_garud_h, nsl, xpnsl, \
+    moving_haplotype_diversity, garud_h, moving_garud_h, garud_g, moving_garud_g, nsl, xpnsl, \
     standardize, standardize_by_allele_count, moving_delta_tajima_d, pbs
 
 from .stats.sf import sfs, sfs_folded, sfs_scaled, sfs_folded_scaled, \

From f7b0218fd14b85a25b7f07b0b9a4479426c0e894 Mon Sep 17 00:00:00 2001
From: Sanjay C Nagi <sanjay.c.nagi@gmail.com>
Date: Thu, 10 Oct 2024 18:02:00 +0100
Subject: [PATCH 4/4] fix in diplotype_freqs()

---
 allel/stats/selection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/allel/stats/selection.py b/allel/stats/selection.py
index e3d71296..89c214fd 100644
--- a/allel/stats/selection.py
+++ b/allel/stats/selection.py
@@ -851,8 +851,8 @@ def diplotype_frequencies(gt):
     n = gt.shape[1]
     x = np.asarray(gt).view(np.int16).reshape((m, n))
 
-    # Now call optimised hashing function.
-    k = [hash(x.values[:, i].tobytes()) for i in range(x.shape[1])]
+    # Now call hashing function.
+    k = [hash(x[:, i].tobytes()) for i in range(x.shape[1])]
 
     # Now compute counts and frequencies of distinct haplotypes.
     counts = Counter(k)