From 67ca3e45f4a6f4d971711a44193c269bda7d7011 Mon Sep 17 00:00:00 2001
From: Liz Gehret <elizabeth.gehret@gmail.com>
Date: Tue, 22 Jul 2025 14:28:09 -0700
Subject: [PATCH 1/8] LANG: python 3.12 update & pin templating

---
 conda-recipe/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index b578b87..64b1bb8 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -17,7 +17,7 @@ requirements:
   - joblib
   - scikit-bio {{ scikit_bio }}
   - biom-format {{ biom_format }}
-  - blast >=2.13.0
+  - blast {{ blast }}
   - vsearch {{ vsearch }}
   - qiime2 >={{ qiime2 }}
   - q2-types >={{ q2_types }}

From 912b0a56c0486cc3ecc77cd76f2e9bc72ed76b37 Mon Sep 17 00:00:00 2001
From: Greg Caporaso <jgcap@fastmail.com>
Date: Thu, 14 Aug 2025 16:57:24 -0700
Subject: [PATCH 2/8] MAINT: update read extraction to use skbio 0.7.0 aligner

---
 q2_feature_classifier/_cutter.py | 105 +++++++++++++++----------------
 1 file changed, 52 insertions(+), 53 deletions(-)

diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py
index 6954601..b74fbf4 100644
--- a/q2_feature_classifier/_cutter.py
+++ b/q2_feature_classifier/_cutter.py
@@ -37,54 +37,6 @@ def _primers_to_regex(f_primer, r_primer):
                                _seq_to_regex(r_primer.reverse_complement()))
 
 
-def _local_aln(primer, sequence):
-    best_score = None
-    for one_primer in sorted([str(s) for s in primer.expand_degenerates()]):
-        # `sequence` may contain degenerates. These will usually be N
-        # characters, which SSW will score as zero. Although undocumented, SSW
-        # will treat other degenerate characters as a mismatch. We acknowledge
-        # that this approach is a heuristic to finding an optimal alignment and
-        # may be revisited in the future if there's an aligner that explicitly
-        # handles degenerates.
-        this_aln = \
-            skbio.alignment.local_pairwise_align_ssw(skbio.DNA(one_primer),
-                                                     sequence)
-        score = this_aln[1]
-        if best_score is None or score > best_score:
-            best_score = score
-            best_aln = this_aln
-    return best_aln
-
-
-def _semisemiglobal(primer, sequence, reverse=False):
-    if reverse:
-        primer = primer.reverse_complement()
-
-    # locally align the primer
-    (aln_prim, aln_seq), score, (prim_pos, seq_pos) = \
-        _local_aln(primer, sequence)
-    amplicon_pos = seq_pos[1]+len(primer)-prim_pos[1]
-
-    # naively extend the alignment to be semi-global
-    bits = [primer[:prim_pos[0]], aln_prim, primer[prim_pos[1]+1:]]
-    aln_prim = ''.join(map(str, bits))
-    bits = ['-'*(prim_pos[0]-seq_pos[0]),
-            sequence[max(seq_pos[0]-prim_pos[0], 0):seq_pos[0]],
-            aln_seq,
-            sequence[seq_pos[1]+1:amplicon_pos],
-            '-'*(amplicon_pos-len(sequence))]
-    aln_seq = ''.join(map(str, bits))
-
-    # count the matches
-    matches = sum(s in skbio.DNA.degenerate_map.get(p, {p})
-                  for p, s in zip(aln_prim, aln_seq))
-
-    if reverse:
-        amplicon_pos = max(seq_pos[0]-prim_pos[0], 0)
-
-    return amplicon_pos, matches, len(aln_prim)
-
-
 def _exact_match(seq, f_primer, r_primer):
     try:
         regex = _primers_to_regex(f_primer, r_primer)
@@ -95,12 +47,59 @@ def _exact_match(seq, f_primer, r_primer):
         return None
 
 
+def _primer_hit(primer, seq, reverse=False):
+    if reverse:
+        primer = primer.reverse_complement()
+    best_score = None
+    for p in sorted([str(s) for s in primer.expand_degenerates()]):
+        p = skbio.DNA(p)
+        try:
+            # perform pairwise semi-global alignment, such that gaps on the
+            # ends of primer aren't scored but gaps on the ends of seq are
+            # scored
+            aln = skbio.alignment.pair_align_nucl(
+                p, seq, mode='global',
+                free_ends=[True, True, False, False], trim_ends=True)
+            score = aln.score
+            if best_score is None or score > best_score:
+                best_score = score
+                best_aln = aln
+                best_primer = p
+        except IndexError:
+            # this is currently necessary as it seems that if all positions
+            # are "ends" then `skbio.alignment.pair_align_nucl` fails with
+            # an IndexError (e.g., rather than returning an alignment with a
+            # shape of (2, 0)).
+            # See https://github.com/scikit-bio/scikit-bio/issues/2279
+            # This should be removed when that issue is addressed as we are
+            # assuming an IndexError means a very poor alignment but we may be
+            # masking other IndexErrors.
+            best_score = 0.0
+    if best_score == 0.0:
+        return None, 0, len(primer)
+    msa = skbio.TabularMSA.from_path_seqs(best_aln.paths[0],
+                                          (best_primer, seq))
+
+    if reverse:
+        amplicon_pos = best_aln.paths[0].starts[1]
+    else:
+        amplicon_pos = best_aln.paths[0].stops[1]
+
+    n_matches = msa[0].match_frequency(msa[1])
+    aligned_length = msa.shape[1]
+
+    return amplicon_pos, n_matches, aligned_length
+
+
 def _approx_match(seq, f_primer, r_primer, identity):
-    beg, b_matches, b_length = _semisemiglobal(f_primer, seq)
-    end, e_matches, e_length = _semisemiglobal(r_primer, seq, reverse=True)
-    if (b_matches + e_matches) / (b_length + e_length) >= identity:
-        return seq[beg:end]
-    return None
+    amp_start, f_matches, f_length = _primer_hit(f_primer, seq)
+    amp_end, r_matches, r_length = _primer_hit(r_primer, seq, reverse=True)
+    if f_matches == 0 or f_matches == 0:
+        return None
+    elif f_matches / f_length >= identity and r_matches / r_length >= identity:
+        return seq[amp_start:amp_end]
+    else:
+        return None
 
 
 def _gen_reads(sequence, f_primer, r_primer, trim_right, trunc_len, trim_left,

From 9618a99d4da30aee4b5f18b359a9a88dac4ef712 Mon Sep 17 00:00:00 2001
From: Greg Caporaso <jgcap@fastmail.com>
Date: Fri, 15 Aug 2025 08:07:28 -0700
Subject: [PATCH 3/8] squash

---
 q2_feature_classifier/_cutter.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py
index b74fbf4..1c2cbff 100644
--- a/q2_feature_classifier/_cutter.py
+++ b/q2_feature_classifier/_cutter.py
@@ -47,7 +47,7 @@ def _exact_match(seq, f_primer, r_primer):
         return None
 
 
-def _primer_hit(primer, seq, reverse=False):
+def _align_primer(primer, seq, reverse=False):
     if reverse:
         primer = primer.reverse_complement()
     best_score = None
@@ -92,9 +92,9 @@ def _primer_hit(primer, seq, reverse=False):
 
 
 def _approx_match(seq, f_primer, r_primer, identity):
-    amp_start, f_matches, f_length = _primer_hit(f_primer, seq)
-    amp_end, r_matches, r_length = _primer_hit(r_primer, seq, reverse=True)
-    if f_matches == 0 or f_matches == 0:
+    amp_start, f_matches, f_length = _align_primer(f_primer, seq)
+    amp_end, r_matches, r_length = _align_primer(r_primer, seq, reverse=True)
+    if f_matches == 0 or r_matches == 0:
         return None
     elif f_matches / f_length >= identity and r_matches / r_length >= identity:
         return seq[amp_start:amp_end]

From 94d42f7d4a77ce3501d044cfcc5ceb0b8f508e6f Mon Sep 17 00:00:00 2001
From: Liz Gehret <elizabeth.gehret@gmail.com>
Date: Mon, 18 Aug 2025 12:53:34 -0700
Subject: [PATCH 4/8] try running tests in parallel to fix sigabrt 6 on GHA osx
 runner

---
 conda-recipe/meta.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 64b1bb8..d0fd6b7 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -36,11 +36,12 @@ test:
   - q2-taxa >={{ q2_taxa }}
   - q2-feature-table >={{ q2_feature_table }}
   - pytest
+  - pytest-xdist
   imports:
   - q2_feature_classifier
   - qiime2.plugins.feature_classifier
   commands:
-  - py.test --pyargs q2_feature_classifier
+  - py.test --pyargs q2_feature_classifier -n auto
 about:
   home: https://qiime2.org
   license: BSD-3-Clause

From bdfe769cf1f1fc001f184c7d2eeccac99e720a66 Mon Sep 17 00:00:00 2001
From: Greg Caporaso <jgcap@fastmail.com>
Date: Thu, 21 Aug 2025 09:24:44 -0700
Subject: [PATCH 5/8] squash

---
 q2_feature_classifier/_cutter.py | 54 ++++++++++++++------------------
 1 file changed, 24 insertions(+), 30 deletions(-)

diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py
index 1c2cbff..92fd232 100644
--- a/q2_feature_classifier/_cutter.py
+++ b/q2_feature_classifier/_cutter.py
@@ -50,40 +50,34 @@ def _exact_match(seq, f_primer, r_primer):
 def _align_primer(primer, seq, reverse=False):
     if reverse:
         primer = primer.reverse_complement()
-    best_score = None
-    for p in sorted([str(s) for s in primer.expand_degenerates()]):
-        p = skbio.DNA(p)
-        try:
-            # perform pairwise semi-global alignment, such that gaps on the
-            # ends of primer aren't scored but gaps on the ends of seq are
-            # scored
-            aln = skbio.alignment.pair_align_nucl(
-                p, seq, mode='global',
-                free_ends=[True, True, False, False], trim_ends=True)
-            score = aln.score
-            if best_score is None or score > best_score:
-                best_score = score
-                best_aln = aln
-                best_primer = p
-        except IndexError:
-            # this is currently necessary as it seems that if all positions
-            # are "ends" then `skbio.alignment.pair_align_nucl` fails with
-            # an IndexError (e.g., rather than returning an alignment with a
-            # shape of (2, 0)).
-            # See https://github.com/scikit-bio/scikit-bio/issues/2279
-            # This should be removed when that issue is addressed as we are
-            # assuming an IndexError means a very poor alignment but we may be
-            # masking other IndexErrors.
-            best_score = 0.0
-    if best_score == 0.0:
+
+    # perform pairwise semi-global alignment, such that gaps on the
+    # ends of primer aren't scored but gaps on the ends of seq are
+    # scored. NUC.4.4 is a substitution matrix that accounts for degenerate
+    # nucleotide characters
+    try:
+        aln = skbio.alignment.pair_align_nucl(
+            primer, seq, mode='global', sub_score='NUC.4.4',
+            free_ends=[True, True, False, False], trim_ends=True)
+        score = aln.score
+    except IndexError:
+        # this is currently necessary as it seems that if all positions
+        # are "ends" then `skbio.alignment.pair_align_nucl` fails with
+        # an IndexError (e.g., rather than returning an alignment with a
+        # shape of (2, 0)).
+        # See https://github.com/scikit-bio/scikit-bio/issues/2279
+        # This should be removed when that issue is addressed as we are
+        # assuming an IndexError means a very poor alignment but we may be
+        # masking other IndexErrors.
+        score = 0.0
+    if score == 0.0:
         return None, 0, len(primer)
-    msa = skbio.TabularMSA.from_path_seqs(best_aln.paths[0],
-                                          (best_primer, seq))
+    msa = skbio.TabularMSA.from_path_seqs(aln.paths[0], (primer, seq))
 
     if reverse:
-        amplicon_pos = best_aln.paths[0].starts[1]
+        amplicon_pos = aln.paths[0].starts[1]
     else:
-        amplicon_pos = best_aln.paths[0].stops[1]
+        amplicon_pos = aln.paths[0].stops[1]
 
     n_matches = msa[0].match_frequency(msa[1])
     aligned_length = msa.shape[1]

From 4486beaa25d9bc382f8c2b7be2bfe7af9814a3ab Mon Sep 17 00:00:00 2001
From: Greg Caporaso <jgcap@fastmail.com>
Date: Thu, 21 Aug 2025 09:41:43 -0700
Subject: [PATCH 6/8] squash

---
 q2_feature_classifier/_cutter.py | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py
index 92fd232..0838a9e 100644
--- a/q2_feature_classifier/_cutter.py
+++ b/q2_feature_classifier/_cutter.py
@@ -55,23 +55,10 @@ def _align_primer(primer, seq, reverse=False):
     # ends of primer aren't scored but gaps on the ends of seq are
     # scored. NUC.4.4 is a substitution matrix that accounts for degenerate
     # nucleotide characters
-    try:
-        aln = skbio.alignment.pair_align_nucl(
-            primer, seq, mode='global', sub_score='NUC.4.4',
-            free_ends=[True, True, False, False], trim_ends=True)
-        score = aln.score
-    except IndexError:
-        # this is currently necessary as it seems that if all positions
-        # are "ends" then `skbio.alignment.pair_align_nucl` fails with
-        # an IndexError (e.g., rather than returning an alignment with a
-        # shape of (2, 0)).
-        # See https://github.com/scikit-bio/scikit-bio/issues/2279
-        # This should be removed when that issue is addressed as we are
-        # assuming an IndexError means a very poor alignment but we may be
-        # masking other IndexErrors.
-        score = 0.0
-    if score == 0.0:
-        return None, 0, len(primer)
+    aln = skbio.alignment.pair_align_nucl(
+        primer, seq, mode='global', sub_score='NUC.4.4',
+        free_ends=[True, True, False, False], trim_ends=True)
+    score = aln.score
     msa = skbio.TabularMSA.from_path_seqs(aln.paths[0], (primer, seq))
 
     if reverse:
@@ -79,6 +66,13 @@ def _align_primer(primer, seq, reverse=False):
     else:
         amplicon_pos = aln.paths[0].stops[1]
 
+    # this computation of matches doesn't account for degenerate characters -
+    # they are scored as mismatches, unless the same degenerate character is
+    # present at the same position in both sequences. so, for example, this
+    # alignment would have 2 matches (the first two positions) and two
+    # mismatches. this seems less than ideal.
+    # CNNC
+    # CNCN
     n_matches = msa[0].match_frequency(msa[1])
     aligned_length = msa.shape[1]
 

From 1f4497835e4be8c40f33ee74eb3458c33efbac7c Mon Sep 17 00:00:00 2001
From: Greg Caporaso <jgcap@fastmail.com>
Date: Thu, 21 Aug 2025 14:49:39 -0700
Subject: [PATCH 7/8] adds asymmetric substitution matrix in extract-reads
 sequence search

squash me
---
 q2_feature_classifier/_cutter.py | 99 +++++++++++++++++++++++++-------
 1 file changed, 77 insertions(+), 22 deletions(-)

diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py
index 0838a9e..0595a86 100644
--- a/q2_feature_classifier/_cutter.py
+++ b/q2_feature_classifier/_cutter.py
@@ -8,6 +8,7 @@
 
 import skbio
 import os
+import numpy as np
 from joblib import Parallel, delayed, effective_n_jobs
 
 from qiime2.plugin import Int, Str, Float, Range, Choices
@@ -47,44 +48,98 @@ def _exact_match(seq, f_primer, r_primer):
         return None
 
 
-def _align_primer(primer, seq, reverse=False):
+def _create_asymmetric_primer_substitution_matrix(match=2, mismatch=-3):
+    """ Create an asymmetric substitution matrix for matching degenerate
+        primers to target sequences
+
+        This is asymmetic such that degenerate characters in primers will
+        score as matches when the target sequences contains a relevant
+        character. Degenerate characters in target sequences however always
+        score as a mismatch.
+
+        This is designed on the assumption that primers contain degenerate
+        characters because they represent a pool of sequences that will
+        actually be present in a PCR reaction, but degenerate characters in
+        target sequences represent error or uncertainty.
+
+        This is intended for use with `skbio.alignment.pair_align`, and the
+        primer should be passed as the first sequence and the target as the
+        second sequence.
+    """
+    definite_chars = sorted(skbio.DNA.definite_chars)
+    degenerate_chars = sorted(skbio.DNA.degenerate_chars)
+    chars = definite_chars + degenerate_chars
+
+    sm = np.zeros((len(chars), len(chars)))
+
+    for row, c1 in enumerate(chars):
+        for col, c2 in enumerate(chars):
+            if c1 in definite_chars:
+                if c2 in definite_chars:
+                    if c1 == c2:
+                        sm[(row, col)] = match
+                    else:
+                        sm[(row, col)] = mismatch
+                else:
+                    # degenerate char in query sequence is always a mismatch
+                    sm[(row, col)] = mismatch
+            else: # primer character is degenerate
+                if c2 in skbio.DNA.degenerate_map[c1]:
+                    sm[(row, col)] = match
+                else:
+                    sm[(row, col)] = mismatch
+    return skbio.SubstitutionMatrix(chars, sm)
+
+def _match_percent(primer, target):
+    """ Compute percent of matching positions in alignments, accounting for
+        primer degeneracies
+    """
+    matches = 0
+    for primer_c, target_c in zip(str(primer), str(target)):
+        if target_c in skbio.DNA.degenerate_chars:
+            continue
+        if primer_c == target_c:
+            matches += 1
+        elif (primer_c in skbio.DNA.degenerate_chars and
+              target_c in skbio.DNA.degenerate_map[primer_c]):
+            matches += 1
+        else:
+            continue
+    return matches / len(primer)
+
+
+def _align_primer(primer, target, substitution_matrix, reverse=False):
     if reverse:
         primer = primer.reverse_complement()
 
     # perform pairwise semi-global alignment, such that gaps on the
-    # ends of primer aren't scored but gaps on the ends of seq are
-    # scored. NUC.4.4 is a substitution matrix that accounts for degenerate
-    # nucleotide characters
+    # ends of primer aren't scored but gaps on the ends of target are
+    # scored.
+    # degenerate characters in primer match the characters they represent,
+    # but degenerate characters in target are always considered
+    # mismatches
     aln = skbio.alignment.pair_align_nucl(
-        primer, seq, mode='global', sub_score='NUC.4.4',
+        primer, target, mode='global', sub_score=substitution_matrix,
         free_ends=[True, True, False, False], trim_ends=True)
-    score = aln.score
-    msa = skbio.TabularMSA.from_path_seqs(aln.paths[0], (primer, seq))
+    msa = skbio.TabularMSA.from_path_seqs(aln.paths[0], (primer, target))
+    match_percent = _match_percent(msa[0], msa[1])
 
     if reverse:
         amplicon_pos = aln.paths[0].starts[1]
     else:
         amplicon_pos = aln.paths[0].stops[1]
 
-    # this computation of matches doesn't account for degenerate characters -
-    # they are scored as mismatches, unless the same degenerate character is
-    # present at the same position in both sequences. so, for example, this
-    # alignment would have 2 matches (the first two positions) and two
-    # mismatches. this seems less than ideal.
-    # CNNC
-    # CNCN
-    n_matches = msa[0].match_frequency(msa[1])
-    aligned_length = msa.shape[1]
 
-    return amplicon_pos, n_matches, aligned_length
+    return amplicon_pos, match_percent
 
 
 def _approx_match(seq, f_primer, r_primer, identity):
-    amp_start, f_matches, f_length = _align_primer(f_primer, seq)
-    amp_end, r_matches, r_length = _align_primer(r_primer, seq, reverse=True)
-    if f_matches == 0 or r_matches == 0:
-        return None
-    elif f_matches / f_length >= identity and r_matches / r_length >= identity:
+    substitution_matrix = _create_asymmetric_primer_substitution_matrix()
+    amp_start, f_match_percent = \
+        _align_primer(f_primer, seq, substitution_matrix)
+    amp_end, r_match_percent = \
+        _align_primer(r_primer, seq, substitution_matrix, reverse=True)
+    if f_match_percent >= identity and r_match_percent >= identity:
         return seq[amp_start:amp_end]
     else:
         return None

From e02fe9ef9c813a5432b384948b21cccf60e22171 Mon Sep 17 00:00:00 2001
From: Greg Caporaso <jgcap@fastmail.com>
Date: Thu, 21 Aug 2025 15:22:55 -0700
Subject: [PATCH 8/8] squash

---
 q2_feature_classifier/_cutter.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py
index 0595a86..bcc3cc3 100644
--- a/q2_feature_classifier/_cutter.py
+++ b/q2_feature_classifier/_cutter.py
@@ -80,16 +80,16 @@ def _create_asymmetric_primer_substitution_matrix(match=2, mismatch=-3):
                         sm[(row, col)] = match
                     else:
                         sm[(row, col)] = mismatch
-                else:
-                    # degenerate char in query sequence is always a mismatch
+                else:  # degenerate char in query sequence is always a mismatch
                     sm[(row, col)] = mismatch
-            else: # primer character is degenerate
+            else:  # primer character is degenerate
                 if c2 in skbio.DNA.degenerate_map[c1]:
                     sm[(row, col)] = match
                 else:
                     sm[(row, col)] = mismatch
     return skbio.SubstitutionMatrix(chars, sm)
 
+
 def _match_percent(primer, target):
     """ Compute percent of matching positions in alignments, accounting for
         primer degeneracies
@@ -129,7 +129,6 @@ def _align_primer(primer, target, substitution_matrix, reverse=False):
     else:
         amplicon_pos = aln.paths[0].stops[1]
 
-
     return amplicon_pos, match_percent