From 67ca3e45f4a6f4d971711a44193c269bda7d7011 Mon Sep 17 00:00:00 2001 From: Liz Gehret Date: Tue, 22 Jul 2025 14:28:09 -0700 Subject: [PATCH 1/8] LANG: python 3.12 update & pin templating --- conda-recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index b578b87..64b1bb8 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -17,7 +17,7 @@ requirements: - joblib - scikit-bio {{ scikit_bio }} - biom-format {{ biom_format }} - - blast >=2.13.0 + - blast {{ blast }} - vsearch {{ vsearch }} - qiime2 >={{ qiime2 }} - q2-types >={{ q2_types }} From 912b0a56c0486cc3ecc77cd76f2e9bc72ed76b37 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Thu, 14 Aug 2025 16:57:24 -0700 Subject: [PATCH 2/8] MAINT: update read extraction to use skbio 0.7.0 aligner --- q2_feature_classifier/_cutter.py | 105 +++++++++++++++---------------- 1 file changed, 52 insertions(+), 53 deletions(-) diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py index 6954601..b74fbf4 100644 --- a/q2_feature_classifier/_cutter.py +++ b/q2_feature_classifier/_cutter.py @@ -37,54 +37,6 @@ def _primers_to_regex(f_primer, r_primer): _seq_to_regex(r_primer.reverse_complement())) -def _local_aln(primer, sequence): - best_score = None - for one_primer in sorted([str(s) for s in primer.expand_degenerates()]): - # `sequence` may contain degenerates. These will usually be N - # characters, which SSW will score as zero. Although undocumented, SSW - # will treat other degenerate characters as a mismatch. We acknowledge - # that this approach is a heuristic to finding an optimal alignment and - # may be revisited in the future if there's an aligner that explicitly - # handles degenerates. - this_aln = \ - skbio.alignment.local_pairwise_align_ssw(skbio.DNA(one_primer), - sequence) - score = this_aln[1] - if best_score is None or score > best_score: - best_score = score - best_aln = this_aln - return best_aln - - -def _semisemiglobal(primer, sequence, reverse=False): - if reverse: - primer = primer.reverse_complement() - - # locally align the primer - (aln_prim, aln_seq), score, (prim_pos, seq_pos) = \ - _local_aln(primer, sequence) - amplicon_pos = seq_pos[1]+len(primer)-prim_pos[1] - - # naively extend the alignment to be semi-global - bits = [primer[:prim_pos[0]], aln_prim, primer[prim_pos[1]+1:]] - aln_prim = ''.join(map(str, bits)) - bits = ['-'*(prim_pos[0]-seq_pos[0]), - sequence[max(seq_pos[0]-prim_pos[0], 0):seq_pos[0]], - aln_seq, - sequence[seq_pos[1]+1:amplicon_pos], - '-'*(amplicon_pos-len(sequence))] - aln_seq = ''.join(map(str, bits)) - - # count the matches - matches = sum(s in skbio.DNA.degenerate_map.get(p, {p}) - for p, s in zip(aln_prim, aln_seq)) - - if reverse: - amplicon_pos = max(seq_pos[0]-prim_pos[0], 0) - - return amplicon_pos, matches, len(aln_prim) - - def _exact_match(seq, f_primer, r_primer): try: regex = _primers_to_regex(f_primer, r_primer) @@ -95,12 +47,59 @@ def _exact_match(seq, f_primer, r_primer): return None +def _primer_hit(primer, seq, reverse=False): + if reverse: + primer = primer.reverse_complement() + best_score = None + for p in sorted([str(s) for s in primer.expand_degenerates()]): + p = skbio.DNA(p) + try: + # perform pairwise semi-global alignment, such that gaps on the + # ends of primer aren't scored but gaps on the ends of seq are + # scored + aln = skbio.alignment.pair_align_nucl( + p, seq, mode='global', + free_ends=[True, True, False, False], trim_ends=True) + score = aln.score + if best_score is None or score > best_score: + best_score = score + best_aln = aln + best_primer = p + except IndexError: + # this is currently necessary as it seems that if all positions + # are "ends" then `skbio.alignment.pair_align_nucl` fails with + # an IndexError (e.g., rather than returning an alignment with a + # shape of (2, 0)). + # See https://github.com/scikit-bio/scikit-bio/issues/2279 + # This should be removed when that issue is addressed as we are + # assuming an IndexError means a very poor alignment but we may be + # masking other IndexErrors. + best_score = 0.0 + if best_score == 0.0: + return None, 0, len(primer) + msa = skbio.TabularMSA.from_path_seqs(best_aln.paths[0], + (best_primer, seq)) + + if reverse: + amplicon_pos = best_aln.paths[0].starts[1] + else: + amplicon_pos = best_aln.paths[0].stops[1] + + n_matches = msa[0].match_frequency(msa[1]) + aligned_length = msa.shape[1] + + return amplicon_pos, n_matches, aligned_length + + def _approx_match(seq, f_primer, r_primer, identity): - beg, b_matches, b_length = _semisemiglobal(f_primer, seq) - end, e_matches, e_length = _semisemiglobal(r_primer, seq, reverse=True) - if (b_matches + e_matches) / (b_length + e_length) >= identity: - return seq[beg:end] - return None + amp_start, f_matches, f_length = _primer_hit(f_primer, seq) + amp_end, r_matches, r_length = _primer_hit(r_primer, seq, reverse=True) + if f_matches == 0 or f_matches == 0: + return None + elif f_matches / f_length >= identity and r_matches / r_length >= identity: + return seq[amp_start:amp_end] + else: + return None def _gen_reads(sequence, f_primer, r_primer, trim_right, trunc_len, trim_left, From 9618a99d4da30aee4b5f18b359a9a88dac4ef712 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Fri, 15 Aug 2025 08:07:28 -0700 Subject: [PATCH 3/8] squash --- q2_feature_classifier/_cutter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py index b74fbf4..1c2cbff 100644 --- a/q2_feature_classifier/_cutter.py +++ b/q2_feature_classifier/_cutter.py @@ -47,7 +47,7 @@ def _exact_match(seq, f_primer, r_primer): return None -def _primer_hit(primer, seq, reverse=False): +def _align_primer(primer, seq, reverse=False): if reverse: primer = primer.reverse_complement() best_score = None @@ -92,9 +92,9 @@ def _primer_hit(primer, seq, reverse=False): def _approx_match(seq, f_primer, r_primer, identity): - amp_start, f_matches, f_length = _primer_hit(f_primer, seq) - amp_end, r_matches, r_length = _primer_hit(r_primer, seq, reverse=True) - if f_matches == 0 or f_matches == 0: + amp_start, f_matches, f_length = _align_primer(f_primer, seq) + amp_end, r_matches, r_length = _align_primer(r_primer, seq, reverse=True) + if f_matches == 0 or r_matches == 0: return None elif f_matches / f_length >= identity and r_matches / r_length >= identity: return seq[amp_start:amp_end] From 94d42f7d4a77ce3501d044cfcc5ceb0b8f508e6f Mon Sep 17 00:00:00 2001 From: Liz Gehret Date: Mon, 18 Aug 2025 12:53:34 -0700 Subject: [PATCH 4/8] try running tests in parallel to fix sigabrt 6 on GHA osx runner --- conda-recipe/meta.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 64b1bb8..d0fd6b7 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -36,11 +36,12 @@ test: - q2-taxa >={{ q2_taxa }} - q2-feature-table >={{ q2_feature_table }} - pytest + - pytest-xdist imports: - q2_feature_classifier - qiime2.plugins.feature_classifier commands: - - py.test --pyargs q2_feature_classifier + - py.test --pyargs q2_feature_classifier -n auto about: home: https://qiime2.org license: BSD-3-Clause From bdfe769cf1f1fc001f184c7d2eeccac99e720a66 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Thu, 21 Aug 2025 09:24:44 -0700 Subject: [PATCH 5/8] squash --- q2_feature_classifier/_cutter.py | 54 ++++++++++++++------------------ 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py index 1c2cbff..92fd232 100644 --- a/q2_feature_classifier/_cutter.py +++ b/q2_feature_classifier/_cutter.py @@ -50,40 +50,34 @@ def _exact_match(seq, f_primer, r_primer): def _align_primer(primer, seq, reverse=False): if reverse: primer = primer.reverse_complement() - best_score = None - for p in sorted([str(s) for s in primer.expand_degenerates()]): - p = skbio.DNA(p) - try: - # perform pairwise semi-global alignment, such that gaps on the - # ends of primer aren't scored but gaps on the ends of seq are - # scored - aln = skbio.alignment.pair_align_nucl( - p, seq, mode='global', - free_ends=[True, True, False, False], trim_ends=True) - score = aln.score - if best_score is None or score > best_score: - best_score = score - best_aln = aln - best_primer = p - except IndexError: - # this is currently necessary as it seems that if all positions - # are "ends" then `skbio.alignment.pair_align_nucl` fails with - # an IndexError (e.g., rather than returning an alignment with a - # shape of (2, 0)). - # See https://github.com/scikit-bio/scikit-bio/issues/2279 - # This should be removed when that issue is addressed as we are - # assuming an IndexError means a very poor alignment but we may be - # masking other IndexErrors. - best_score = 0.0 - if best_score == 0.0: + + # perform pairwise semi-global alignment, such that gaps on the + # ends of primer aren't scored but gaps on the ends of seq are + # scored. NUC.4.4 is a substitution matrix that accounts for degenerate + # nucleotide characters + try: + aln = skbio.alignment.pair_align_nucl( + primer, seq, mode='global', sub_score='NUC.4.4', + free_ends=[True, True, False, False], trim_ends=True) + score = aln.score + except IndexError: + # this is currently necessary as it seems that if all positions + # are "ends" then `skbio.alignment.pair_align_nucl` fails with + # an IndexError (e.g., rather than returning an alignment with a + # shape of (2, 0)). + # See https://github.com/scikit-bio/scikit-bio/issues/2279 + # This should be removed when that issue is addressed as we are + # assuming an IndexError means a very poor alignment but we may be + # masking other IndexErrors. + score = 0.0 + if score == 0.0: return None, 0, len(primer) - msa = skbio.TabularMSA.from_path_seqs(best_aln.paths[0], - (best_primer, seq)) + msa = skbio.TabularMSA.from_path_seqs(aln.paths[0], (primer, seq)) if reverse: - amplicon_pos = best_aln.paths[0].starts[1] + amplicon_pos = aln.paths[0].starts[1] else: - amplicon_pos = best_aln.paths[0].stops[1] + amplicon_pos = aln.paths[0].stops[1] n_matches = msa[0].match_frequency(msa[1]) aligned_length = msa.shape[1] From 4486beaa25d9bc382f8c2b7be2bfe7af9814a3ab Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Thu, 21 Aug 2025 09:41:43 -0700 Subject: [PATCH 6/8] squash --- q2_feature_classifier/_cutter.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py index 92fd232..0838a9e 100644 --- a/q2_feature_classifier/_cutter.py +++ b/q2_feature_classifier/_cutter.py @@ -55,23 +55,10 @@ def _align_primer(primer, seq, reverse=False): # ends of primer aren't scored but gaps on the ends of seq are # scored. NUC.4.4 is a substitution matrix that accounts for degenerate # nucleotide characters - try: - aln = skbio.alignment.pair_align_nucl( - primer, seq, mode='global', sub_score='NUC.4.4', - free_ends=[True, True, False, False], trim_ends=True) - score = aln.score - except IndexError: - # this is currently necessary as it seems that if all positions - # are "ends" then `skbio.alignment.pair_align_nucl` fails with - # an IndexError (e.g., rather than returning an alignment with a - # shape of (2, 0)). - # See https://github.com/scikit-bio/scikit-bio/issues/2279 - # This should be removed when that issue is addressed as we are - # assuming an IndexError means a very poor alignment but we may be - # masking other IndexErrors. - score = 0.0 - if score == 0.0: - return None, 0, len(primer) + aln = skbio.alignment.pair_align_nucl( + primer, seq, mode='global', sub_score='NUC.4.4', + free_ends=[True, True, False, False], trim_ends=True) + score = aln.score msa = skbio.TabularMSA.from_path_seqs(aln.paths[0], (primer, seq)) if reverse: @@ -79,6 +66,13 @@ def _align_primer(primer, seq, reverse=False): else: amplicon_pos = aln.paths[0].stops[1] + # this computation of matches doesn't account for degenerate characters - + # they are scored as mismatches, unless the same degenerate character is + # present at the same position in both sequences. so, for example, this + # alignment would have 2 matches (the first two positions) and two + # mismatches. this seems less than ideal. + # CNNC + # CNCN n_matches = msa[0].match_frequency(msa[1]) aligned_length = msa.shape[1] From 1f4497835e4be8c40f33ee74eb3458c33efbac7c Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Thu, 21 Aug 2025 14:49:39 -0700 Subject: [PATCH 7/8] adds asymmetric substitution matrix in extract-reads sequence search squash me --- q2_feature_classifier/_cutter.py | 99 +++++++++++++++++++++++++------- 1 file changed, 77 insertions(+), 22 deletions(-) diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py index 0838a9e..0595a86 100644 --- a/q2_feature_classifier/_cutter.py +++ b/q2_feature_classifier/_cutter.py @@ -8,6 +8,7 @@ import skbio import os +import numpy as np from joblib import Parallel, delayed, effective_n_jobs from qiime2.plugin import Int, Str, Float, Range, Choices @@ -47,44 +48,98 @@ def _exact_match(seq, f_primer, r_primer): return None -def _align_primer(primer, seq, reverse=False): +def _create_asymmetric_primer_substitution_matrix(match=2, mismatch=-3): + """ Create an asymmetric substitution matrix for matching degenerate + primers to target sequences + + This is asymmetic such that degenerate characters in primers will + score as matches when the target sequences contains a relevant + character. Degenerate characters in target sequences however always + score as a mismatch. + + This is designed on the assumption that primers contain degenerate + characters because they represent a pool of sequences that will + actually be present in a PCR reaction, but degenerate characters in + target sequences represent error or uncertainty. + + This is intended for use with `skbio.alignment.pair_align`, and the + primer should be passed as the first sequence and the target as the + second sequence. + """ + definite_chars = sorted(skbio.DNA.definite_chars) + degenerate_chars = sorted(skbio.DNA.degenerate_chars) + chars = definite_chars + degenerate_chars + + sm = np.zeros((len(chars), len(chars))) + + for row, c1 in enumerate(chars): + for col, c2 in enumerate(chars): + if c1 in definite_chars: + if c2 in definite_chars: + if c1 == c2: + sm[(row, col)] = match + else: + sm[(row, col)] = mismatch + else: + # degenerate char in query sequence is always a mismatch + sm[(row, col)] = mismatch + else: # primer character is degenerate + if c2 in skbio.DNA.degenerate_map[c1]: + sm[(row, col)] = match + else: + sm[(row, col)] = mismatch + return skbio.SubstitutionMatrix(chars, sm) + +def _match_percent(primer, target): + """ Compute percent of matching positions in alignments, accounting for + primer degeneracies + """ + matches = 0 + for primer_c, target_c in zip(str(primer), str(target)): + if target_c in skbio.DNA.degenerate_chars: + continue + if primer_c == target_c: + matches += 1 + elif (primer_c in skbio.DNA.degenerate_chars and + target_c in skbio.DNA.degenerate_map[primer_c]): + matches += 1 + else: + continue + return matches / len(primer) + + +def _align_primer(primer, target, substitution_matrix, reverse=False): if reverse: primer = primer.reverse_complement() # perform pairwise semi-global alignment, such that gaps on the - # ends of primer aren't scored but gaps on the ends of seq are - # scored. NUC.4.4 is a substitution matrix that accounts for degenerate - # nucleotide characters + # ends of primer aren't scored but gaps on the ends of target are + # scored. + # degenerate characters in primer match the characters they represent, + # but degenerate characters in target are always considered + # mismatches aln = skbio.alignment.pair_align_nucl( - primer, seq, mode='global', sub_score='NUC.4.4', + primer, target, mode='global', sub_score=substitution_matrix, free_ends=[True, True, False, False], trim_ends=True) - score = aln.score - msa = skbio.TabularMSA.from_path_seqs(aln.paths[0], (primer, seq)) + msa = skbio.TabularMSA.from_path_seqs(aln.paths[0], (primer, target)) + match_percent = _match_percent(msa[0], msa[1]) if reverse: amplicon_pos = aln.paths[0].starts[1] else: amplicon_pos = aln.paths[0].stops[1] - # this computation of matches doesn't account for degenerate characters - - # they are scored as mismatches, unless the same degenerate character is - # present at the same position in both sequences. so, for example, this - # alignment would have 2 matches (the first two positions) and two - # mismatches. this seems less than ideal. - # CNNC - # CNCN - n_matches = msa[0].match_frequency(msa[1]) - aligned_length = msa.shape[1] - return amplicon_pos, n_matches, aligned_length + return amplicon_pos, match_percent def _approx_match(seq, f_primer, r_primer, identity): - amp_start, f_matches, f_length = _align_primer(f_primer, seq) - amp_end, r_matches, r_length = _align_primer(r_primer, seq, reverse=True) - if f_matches == 0 or r_matches == 0: - return None - elif f_matches / f_length >= identity and r_matches / r_length >= identity: + substitution_matrix = _create_asymmetric_primer_substitution_matrix() + amp_start, f_match_percent = \ + _align_primer(f_primer, seq, substitution_matrix) + amp_end, r_match_percent = \ + _align_primer(r_primer, seq, substitution_matrix, reverse=True) + if f_match_percent >= identity and r_match_percent >= identity: return seq[amp_start:amp_end] else: return None From e02fe9ef9c813a5432b384948b21cccf60e22171 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Thu, 21 Aug 2025 15:22:55 -0700 Subject: [PATCH 8/8] squash --- q2_feature_classifier/_cutter.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py index 0595a86..bcc3cc3 100644 --- a/q2_feature_classifier/_cutter.py +++ b/q2_feature_classifier/_cutter.py @@ -80,16 +80,16 @@ def _create_asymmetric_primer_substitution_matrix(match=2, mismatch=-3): sm[(row, col)] = match else: sm[(row, col)] = mismatch - else: - # degenerate char in query sequence is always a mismatch + else: # degenerate char in query sequence is always a mismatch sm[(row, col)] = mismatch - else: # primer character is degenerate + else: # primer character is degenerate if c2 in skbio.DNA.degenerate_map[c1]: sm[(row, col)] = match else: sm[(row, col)] = mismatch return skbio.SubstitutionMatrix(chars, sm) + def _match_percent(primer, target): """ Compute percent of matching positions in alignments, accounting for primer degeneracies @@ -129,7 +129,6 @@ def _align_primer(primer, target, substitution_matrix, reverse=False): else: amplicon_pos = aln.paths[0].stops[1] - return amplicon_pos, match_percent