From 5d65b4802a22142856d21ef0e80cc501f0f33fb8 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 27 Aug 2025 10:50:19 -0600 Subject: [PATCH 01/42] initial changes --- src/qp_klp/Protocol.py | 92 +++++++++++++- src/qp_klp/RevioMetagenomicWorkflow.py | 59 +++++++++ src/qp_klp/WorkflowFactory.py | 4 +- .../ConvertJob.py | 116 ++++++++++++++++++ 4 files changed, 268 insertions(+), 3 deletions(-) create mode 100644 src/qp_klp/RevioMetagenomicWorkflow.py diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 1a3d805e..6aee5d81 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -1,4 +1,5 @@ -from sequence_processing_pipeline.ConvertJob import ConvertJob +from sequence_processing_pipeline.ConvertJob import ( + ConvertJob, ConvertRevioBam2FastqJob) from sequence_processing_pipeline.TellReadJob import TellReadJob from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob @@ -8,7 +9,9 @@ from re import match from os import makedirs, rename, walk from metapool import load_sample_sheet -from metapool.sample_sheet import PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ +from metapool.sample_sheet import ( + PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ, + PROTOCOL_NAME_PACBIO) import pandas as pd from glob import glob from qiita_client.util import system_call @@ -369,3 +372,88 @@ def _post_process_file(self, fastq_file, mapping, lane): rename(fastq_file, final_path) return final_path + + +class PacBio(Protocol): + protocol_type = PROTOCOL_NAME_PACBIO + + def convert_raw_to_fastq(self): + config = self.pipeline.get_software_configuration('revio') + + job = ConvertRevioBam2FastqJob( + self.pipeline.run_dir, + self.pipeline.output_path, + self.pipeline.input_file_path, + config['queue'], + config['nodes'], + config['wallclock_time_in_minutes'], + config['tellread_mem_limit'], + config['modules_to_load'], + self.master_qiita_job_id, + config['reference_base'], + config['reference_map'], + config['sing_script_path'], + config['tellread_cores']) + + self.raw_fastq_files_path = join(self.pipeline.output_path, + 'ConvertJob') + # self.my_sil_path = join(self.pipeline.output_path, + # 'TellReadJob', + # 'sample_index_list_TellReadJob.txt') + + # if ConvertJob already completed, then skip the over the time- + # consuming portion but populate the needed member variables. + if 'ConvertJob' not in self.skip_steps: + job.run(callback=self.job_callback) + + # audit the results to determine which samples failed to convert + # properly. Append these to the failed-samples report and also + # return the list directly to the caller. + failed_samples = job.audit() + if hasattr(self, 'fsr'): + # NB 16S does not require a failed samples report and + # it is not performed by SPP. 
+ self.fsr.write(failed_samples, job.__class__.__name__) + + return failed_samples + + # def generate_sequence_counts(self): + # config = self.pipeline.get_software_configuration('tell-seq') + + # files_to_count_path = join(self.pipeline.output_path, + # 'files_to_count.txt') + + # with open(files_to_count_path, 'w') as f: + # for root, _, files in walk(self.raw_fastq_files_path): + # for _file in files: + # if determine_orientation(_file) in ['R1', 'R2']: + # print(join(root, _file), file=f) + + # job = SeqCountsJob(self.pipeline.run_dir, + # self.pipeline.output_path, + # config['queue'], + # config['nodes'], + # config['wallclock_time_in_minutes'], + # config['normcount_mem_limit'], + # config['modules_to_load'], + # self.master_qiita_job_id, + # config['job_max_array_length'], + # files_to_count_path, + # self.pipeline.get_sample_sheet_path(), + # cores_per_task=config['tellread_cores']) + + # if 'SeqCountsJob' not in self.skip_steps: + # job.run(callback=self.job_callback) + + # # if successful, set self.reports_path + # self.reports_path = join(self.pipeline.output_path, + # 'SeqCountsJob', + # 'SeqCounts.csv') + + # # Do not add an entry to fsr because w/respect to counting, either + # # all jobs are going to fail or none are going to fail. It's not + # # likely that we're going to fail to count sequences for only some + # # of the samples. + + def integrate_results(self): + pass diff --git a/src/qp_klp/RevioMetagenomicWorkflow.py b/src/qp_klp/RevioMetagenomicWorkflow.py new file mode 100644 index 00000000..c3eb80c9 --- /dev/null +++ b/src/qp_klp/RevioMetagenomicWorkflow.py @@ -0,0 +1,59 @@ +from .Protocol import PacBio +from sequence_processing_pipeline.Pipeline import Pipeline +from .Assays import Metagenomic +from .Assays import ASSAY_NAME_METAGENOMIC +from .FailedSamplesRecord import FailedSamplesRecord +from .Workflows import Workflow + + +class RevioMetagenomicWorkflow(Workflow, Metagenomic, PacBio): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.mandatory_attributes = ['qclient', 'uif_path', + 'lane_number', 'config_fp', + 'run_identifier', 'output_dir', 'job_id', + 'lane_number', 'is_restart'] + + self.confirm_mandatory_attributes() + + # second stage initializer that could conceivably be pushed down into + # specific children requiring specific parameters. + self.qclient = self.kwargs['qclient'] + + self.overwrite_prep_with_original = False + if 'overwrite_prep_with_original' in self.kwargs: + self.overwrite_prep_with_original = \ + self.kwargs['overwrite_prep_with_original'] + if 'files_regex' in self.kwargs: + self.files_regex = self.kwargs['files_regex'] + else: + self.files_regex = 'SPP' + self.pipeline = Pipeline(self.kwargs['config_fp'], + self.kwargs['run_identifier'], + self.kwargs['uif_path'], + self.kwargs['output_dir'], + self.kwargs['job_id'], + ASSAY_NAME_METAGENOMIC, + lane_number=self.kwargs['lane_number']) + + self.fsr = FailedSamplesRecord(self.kwargs['output_dir'], + self.pipeline.sample_sheet.samples) + + self.master_qiita_job_id = self.kwargs['job_id'] + + self.lane_number = self.kwargs['lane_number'] + self.is_restart = bool(self.kwargs['is_restart']) + + if self.is_restart is True: + self.determine_steps_to_skip() + + # this is a convenience member to allow testing w/out updating Qiita. 
+ self.update = True + + if 'update_qiita' in kwargs: + if not isinstance(kwargs['update_qiita'], bool): + raise ValueError("value for 'update_qiita' must be of " + "type bool") + + self.update = kwargs['update_qiita'] diff --git a/src/qp_klp/WorkflowFactory.py b/src/qp_klp/WorkflowFactory.py index 81b6e024..4853bcd0 100644 --- a/src/qp_klp/WorkflowFactory.py +++ b/src/qp_klp/WorkflowFactory.py @@ -3,6 +3,7 @@ from .StandardMetatranscriptomicWorkflow import \ StandardMetatranscriptomicWorkflow from .TellseqMetagenomicWorkflow import TellSeqMetagenomicWorkflow +from .RevioMetagenomicWorkflow import RevioMetagenomicWorkflow from sequence_processing_pipeline.Pipeline import Pipeline from metapool import load_sample_sheet from metapool.sample_sheet import SAMPLE_SHEETS_BY_PROTOCOL as SSBP @@ -14,7 +15,8 @@ class WorkflowFactory(): WORKFLOWS = [StandardMetagenomicWorkflow, StandardMetatranscriptomicWorkflow, StandardAmpliconWorkflow, - TellSeqMetagenomicWorkflow] + TellSeqMetagenomicWorkflow, + RevioMetagenomicWorkflow] @classmethod def _get_instrument_type(cls, sheet): diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 3a521d8c..6ec1d74d 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -27,6 +27,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param node_count: The number of nodes to request. :param nprocs: The maximum number of parallel processes to use. :param wall_time_limit: A hard time limit (in min) to bound processing. + :param pmem: Amount of memory to be used. :param bcl_tool_path: The path to either bcl2fastq or bcl-convert. :param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id @@ -379,3 +380,118 @@ def _make_msg_base(col_name): # be provided in the destination path. dst_fp = join(self.output_path, dest_project, split(src_fp)[1]) copyfile(src_fp, dst_fp) + + +class ConvertRevioBam2FastqJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, nprocs, wall_time_limit, pmem, bam2fastq_path, + modules_to_load, qiita_job_id): + """ + ConvertRevioBam2FastqJob provides a convenient way to run bam2fastq + on a directory revio bam files to generate fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param nprocs: The maximum number of parallel processes to use. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param pmem: Amount of memory to be used. + :param bam2fastq_path: The path to either bcl2fastq or bcl-convert. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + """ + super().__init__(run_dir, + output_path, + 'ConvertJob', + [bam2fastq_path], + 1000, + modules_to_load=modules_to_load) + + # for metagenomics pipelines, sample_sheet_path will reflect a real + # sample_sheet file. For amplicon pipelines, sample_sheet_path will + # reference a dummy sample_sheet file. 
+ self.sample_sheet_path = sample_sheet_path + self.queue_name = queue_name + self.node_count = node_count + self.nprocs = nprocs + self.wall_time_limit = wall_time_limit + self.pmem = pmem + self.bcl_tool = bam2fastq_path + self.qiita_job_id = qiita_job_id + self.job_script_path = join(self.output_path, f"{self.job_name}.sh") + self.suffix = 'fastq.gz' + self.fastq_paths = None + self.info = None + + self._file_check(self.sample_sheet_path) + + self._generate_job_script() + + def _generate_job_script(self): + """ + Generate a Torque job script for processing supplied root_directory. + :return: The path to the newly-created job-script. + """ + lines = [] + + lines.append("#!/bin/bash") + lines.append(f"#SBATCH --job-name {self.qiita_job_id}_{self.job_name}") + lines.append(f"#SBATCH -p {self.queue_name}") + lines.append(f'#SBATCH -N {self.node_count}') + lines.append(f'#SBATCH -n {self.nprocs}') + lines.append("#SBATCH --time %d" % self.wall_time_limit) + + # send an email to the list of users defined below when a job starts, + # terminates, or aborts. This is used to confirm that the package's + # own reporting mechanism is reporting correctly. + lines.append("#SBATCH --mail-type=ALL") + + # list of users to be contacted independently of this package's + # notification system, when a job starts, terminates, or gets aborted. + lines.append("#SBATCH --mail-user qiita.help@gmail.com") + + lines.append(f"#SBATCH --mem-per-cpu {self.pmem}") + + lines.append("set -x") + lines.append('date') + lines.append('hostname') + lines.append(f'cd {self.root_dir}') + + if self.modules_to_load: + lines.append("module load " + ' '.join(self.modules_to_load)) + + # for sample + # parallel -j 2 --joblog build.log echo 'Built {}' ::: app1 app2 app3 + # bam2fastq -o ${OUT}/${BARCODE} + # ${BASE}/${Run_prefix}.hifi_reads.${BARCODE}.bam + + with open(self.job_script_path, 'w') as f: + for line in lines: + # remove long spaces in some lines. + line = re.sub(r'\s+', ' ', line) + f.write(f"{line}\n") + + def run(self, callback=None): + """ + Run BCL2Fastq/BCLConvert conversion + :param callback: optional function taking two parameters (id, status) + that is called when a running process's status is + changed. + :return: + """ + try: + job_info = self.submit_job(self.job_script_path, + exec_from=self.log_path, + callback=callback) + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + self. 
mark_job_completed() + + logging.info(f'Successful job: {job_info}') From 55ab2106c8de623e6f27534ed25a7e22749b1a6e Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Mon, 1 Sep 2025 12:01:54 -0600 Subject: [PATCH 02/42] init push --- .github/workflows/qiita-plugin-ci.yml | 2 +- ...rkflow.py => PacBioMetagenomicWorkflow.py} | 12 +- src/qp_klp/Protocol.py | 116 +++++++------ src/qp_klp/WorkflowFactory.py | 4 +- .../ConvertJob.py | 21 ++- src/sequence_processing_pipeline/NuQCJob.py | 11 +- src/sequence_processing_pipeline/Pipeline.py | 18 +- .../pacbio_metagenomic.json | 64 +++++++ .../m11111_20250101_111111_s4.metadata.xml | 11 ++ tests/test_PacBioWorflow.py | 164 ++++++++++++++++++ 10 files changed, 337 insertions(+), 86 deletions(-) rename src/qp_klp/{RevioMetagenomicWorkflow.py => PacBioMetagenomicWorkflow.py} (83%) create mode 100644 tests/data/configuration_profiles/pacbio_metagenomic.json create mode 100644 tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml create mode 100644 tests/test_PacBioWorflow.py diff --git a/.github/workflows/qiita-plugin-ci.yml b/.github/workflows/qiita-plugin-ci.yml index 46d280dc..92dceaac 100644 --- a/.github/workflows/qiita-plugin-ci.yml +++ b/.github/workflows/qiita-plugin-ci.yml @@ -83,7 +83,7 @@ jobs: # seqtk is a conda-installed requirement for metapool and isn't # installed automatically by its setup.py. conda config --add channels bioconda - conda create -q --yes -n klp python=3.9 'seqtk>=1.4' + conda create -q --yes -n klp python=3.9 'seqtk>=1.4' pbtk conda activate klp diff --git a/src/qp_klp/RevioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py similarity index 83% rename from src/qp_klp/RevioMetagenomicWorkflow.py rename to src/qp_klp/PacBioMetagenomicWorkflow.py index c3eb80c9..3036755b 100644 --- a/src/qp_klp/RevioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -6,7 +6,7 @@ from .Workflows import Workflow -class RevioMetagenomicWorkflow(Workflow, Metagenomic, PacBio): +class PacBioMetagenomicWorkflow(Workflow, Metagenomic, PacBio): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -40,6 +40,16 @@ def __init__(self, **kwargs): self.fsr = FailedSamplesRecord(self.kwargs['output_dir'], self.pipeline.sample_sheet.samples) + import pandas as pd + samples = [ + {'barcode': sample['Sample_ID'], + 'sample_name': sample['Sample_Name'], + 'project_name': sample['Sample_Project']} + for sample in self.pipeline.sample_sheet.samples] + df = pd.DataFrame(samples) + sample_list_fp = f"{self.kwargs['output_dir']}/sample_list.tsv" + df.to_csv(sample_list_fp, sep='t') + self.master_qiita_job_id = self.kwargs['job_id'] self.lane_number = self.kwargs['lane_number'] diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 6aee5d81..71e740e9 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -1,17 +1,17 @@ from sequence_processing_pipeline.ConvertJob import ( - ConvertJob, ConvertRevioBam2FastqJob) + ConvertJob, ConvertPacBioBam2FastqJob) from sequence_processing_pipeline.TellReadJob import TellReadJob from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob from sequence_processing_pipeline.PipelineError import PipelineError from sequence_processing_pipeline.util import determine_orientation -from os.path import join, split, basename, dirname from re import match from os import makedirs, rename, walk +from os.path import join, split, basename, dirname, exists from 
metapool import load_sample_sheet from metapool.sample_sheet import ( PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ, - PROTOCOL_NAME_PACBIO) + PROTOCOL_NAME_PACBIO_SMRT) import pandas as pd from glob import glob from qiita_client.util import system_call @@ -82,6 +82,25 @@ def subsample_reads(self): class Illumina(Protocol): protocol_type = PROTOCOL_NAME_ILLUMINA + # required files for successful operation for Illumina (making the default + # here) both RTAComplete.txt and RunInfo.xml should reside in the root of + # the run directory. + required_files = ['RTAComplete.txt', 'RunInfo.xml'] + + def __init__(self) -> None: + super().__init__() + + for some_file in self.required_files: + if not exists(join(self.run_dir, some_file)): + raise PipelineError(f"required file '{some_file}' is not " + f"present in {self.run_dir}.") + + # verify that RunInfo.xml file is readable. + try: + fp = open(join(self.run_dir, 'RunInfo.xml')) + fp.close() + except PermissionError: + raise PipelineError('RunInfo.xml is present, but not readable') def convert_raw_to_fastq(self): def get_config(command): @@ -375,31 +394,26 @@ def _post_process_file(self, fastq_file, mapping, lane): class PacBio(Protocol): - protocol_type = PROTOCOL_NAME_PACBIO + protocol_type = PROTOCOL_NAME_PACBIO_SMRT def convert_raw_to_fastq(self): - config = self.pipeline.get_software_configuration('revio') + config = self.pipeline.get_software_configuration('pacbio_convert') - job = ConvertRevioBam2FastqJob( + job = ConvertPacBioBam2FastqJob( self.pipeline.run_dir, self.pipeline.output_path, self.pipeline.input_file_path, config['queue'], config['nodes'], + config['nprocs'], config['wallclock_time_in_minutes'], - config['tellread_mem_limit'], + config['per_process_memory_limit'], + config['executable_path'], config['modules_to_load'], - self.master_qiita_job_id, - config['reference_base'], - config['reference_map'], - config['sing_script_path'], - config['tellread_cores']) + self.master_qiita_job_id) self.raw_fastq_files_path = join(self.pipeline.output_path, 'ConvertJob') - # self.my_sil_path = join(self.pipeline.output_path, - # 'TellReadJob', - # 'sample_index_list_TellReadJob.txt') # if ConvertJob already completed, then skip the over the time- # consuming portion but populate the needed member variables. @@ -409,7 +423,7 @@ def convert_raw_to_fastq(self): # audit the results to determine which samples failed to convert # properly. Append these to the failed-samples report and also # return the list directly to the caller. - failed_samples = job.audit() + failed_samples = job.audit(self.pipeline.get_sample_ids()) if hasattr(self, 'fsr'): # NB 16S does not require a failed samples report and # it is not performed by SPP. 
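For reference, the rewritten convert_raw_to_fastq above only reads a handful of keys from the 'pacbio_convert' section of the active configuration profile. A minimal sketch of that section as a Python dict, with placeholder values only (an actual profile is added by this patch under tests/data/configuration_profiles/pacbio_metagenomic.json):

    # sketch of the 'pacbio_convert' profile section convert_raw_to_fastq assumes;
    # values below are placeholders, not the real site configuration.
    pacbio_convert = {
        "queue": "qiita",                    # scheduler queue name
        "nodes": 1,                          # node_count
        "nprocs": 16,                        # max parallel processes
        "wallclock_time_in_minutes": 216,    # wall_time_limit
        "per_process_memory_limit": "10gb",  # pmem
        "executable_path": "bam2fastq",      # bam2fastq_path (placeholder)
        "modules_to_load": [],               # Linux modules, if any
    }

Each key maps positionally onto the ConvertPacBioBam2FastqJob arguments shown in the hunk above.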
@@ -417,43 +431,39 @@ def convert_raw_to_fastq(self): return failed_samples - # def generate_sequence_counts(self): - # config = self.pipeline.get_software_configuration('tell-seq') - - # files_to_count_path = join(self.pipeline.output_path, - # 'files_to_count.txt') - - # with open(files_to_count_path, 'w') as f: - # for root, _, files in walk(self.raw_fastq_files_path): - # for _file in files: - # if determine_orientation(_file) in ['R1', 'R2']: - # print(join(root, _file), file=f) - - # job = SeqCountsJob(self.pipeline.run_dir, - # self.pipeline.output_path, - # config['queue'], - # config['nodes'], - # config['wallclock_time_in_minutes'], - # config['normcount_mem_limit'], - # config['modules_to_load'], - # self.master_qiita_job_id, - # config['job_max_array_length'], - # files_to_count_path, - # self.pipeline.get_sample_sheet_path(), - # cores_per_task=config['tellread_cores']) - - # if 'SeqCountsJob' not in self.skip_steps: - # job.run(callback=self.job_callback) - - # # if successful, set self.reports_path - # self.reports_path = join(self.pipeline.output_path, - # 'SeqCountsJob', - # 'SeqCounts.csv') - - # # Do not add an entry to fsr because w/respect to counting, either - # # all jobs are going to fail or none are going to fail. It's not - # # likely that we're going to fail to count sequences for only some - # # of the samples. + def generate_sequence_counts(self): + # config = self.pipeline.get_software_configuration('tell-seq') + + # files_to_count_path = join(self.pipeline.output_path, + # 'files_to_count.txt') + + # with open(files_to_count_path, 'w') as f: + # for root, _, files in walk(self.raw_fastq_files_path): + # for _file in files: + # if determine_orientation(_file) in ['R1', 'R2']: + # print(join(root, _file), file=f) + + # job = SeqCountsJob(self.pipeline.run_dir, + # self.pipeline.output_path, + # config['queue'], + # config['nodes'], + # config['wallclock_time_in_minutes'], + # config['normcount_mem_limit'], + # config['modules_to_load'], + # self.master_qiita_job_id, + # config['job_max_array_length'], + # files_to_count_path, + # self.pipeline.get_sample_sheet_path(), + # cores_per_task=config['tellread_cores']) + + # if 'SeqCountsJob' not in self.skip_steps: + # job.run(callback=self.job_callback) + + # if successful, set self.reports_path + self.reports_path = join(self.pipeline.output_path, + 'SeqCounts.csv') + open(self.reports_path, 'w').write( + 'SampleID,# Reads\nA1,100') def integrate_results(self): pass diff --git a/src/qp_klp/WorkflowFactory.py b/src/qp_klp/WorkflowFactory.py index 4853bcd0..8d51cee6 100644 --- a/src/qp_klp/WorkflowFactory.py +++ b/src/qp_klp/WorkflowFactory.py @@ -3,7 +3,7 @@ from .StandardMetatranscriptomicWorkflow import \ StandardMetatranscriptomicWorkflow from .TellseqMetagenomicWorkflow import TellSeqMetagenomicWorkflow -from .RevioMetagenomicWorkflow import RevioMetagenomicWorkflow +from .PacBioMetagenomicWorkflow import PacBioMetagenomicWorkflow from sequence_processing_pipeline.Pipeline import Pipeline from metapool import load_sample_sheet from metapool.sample_sheet import SAMPLE_SHEETS_BY_PROTOCOL as SSBP @@ -16,7 +16,7 @@ class WorkflowFactory(): StandardMetatranscriptomicWorkflow, StandardAmpliconWorkflow, TellSeqMetagenomicWorkflow, - RevioMetagenomicWorkflow] + PacBioMetagenomicWorkflow] @classmethod def _get_instrument_type(cls, sheet): diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 6ec1d74d..e67227f4 100644 --- 
a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -382,13 +382,13 @@ def _make_msg_base(col_name): copyfile(src_fp, dst_fp) -class ConvertRevioBam2FastqJob(Job): +class ConvertPacBioBam2FastqJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, nprocs, wall_time_limit, pmem, bam2fastq_path, modules_to_load, qiita_job_id): """ - ConvertRevioBam2FastqJob provides a convenient way to run bam2fastq - on a directory revio bam files to generate fastq files. + ConvertPacBioBam2FastqJob provides a convenient way to run bam2fastq + on a directory PacBio bam files to generate fastq files. :param run_dir: The 'run' directory that contains BCL files. :param output_path: Path where all pipeline-generated files live. :param sample_sheet_path: The path to a sample-sheet. @@ -417,7 +417,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.nprocs = nprocs self.wall_time_limit = wall_time_limit self.pmem = pmem - self.bcl_tool = bam2fastq_path + self.bam2fastq_path = bam2fastq_path self.qiita_job_id = qiita_job_id self.job_script_path = join(self.output_path, f"{self.job_name}.sh") self.suffix = 'fastq.gz' @@ -461,10 +461,13 @@ def _generate_job_script(self): if self.modules_to_load: lines.append("module load " + ' '.join(self.modules_to_load)) - # for sample - # parallel -j 2 --joblog build.log echo 'Built {}' ::: app1 app2 app3 - # bam2fastq -o ${OUT}/${BARCODE} - # ${BASE}/${Run_prefix}.hifi_reads.${BARCODE}.bam + lines.append(f"sample_list={self.root_dir}/sample_list.tsv") + lines.append( + 'while read bc sn pn; do echo ' + '"bam2fastq -j 1 -o ${sn}.fastq.gz -c 9 ' + '${pn}/${Run_prefix}.hifi_reads.${bn}.bam"; ' + 'fqtools count ${sn}.fastq.gz > ${sn}.counts.txt;' + 'done < ${sample_list}') with open(self.job_script_path, 'w') as f: for line in lines: @@ -492,6 +495,6 @@ def run(self, callback=None): info.insert(0, str(e)) raise JobFailedError('\n'.join(info)) - self. mark_job_completed() + self.mark_job_completed() logging.info(f'Successful job: {job_info}') diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 7273473e..640da755 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -118,10 +118,12 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, def _validate_project_data(self): # Validate project settings in [Bioinformatics] for project in self.project_data: - if project['ForwardAdapter'] == 'NA': + if 'ForwardAdapter' not in project or \ + project['ForwardAdapter'] == 'NA': project['ForwardAdapter'] = None - if project['ReverseAdapter'] == 'NA': + if 'ReverseAdapter' not in project or \ + project['ReverseAdapter'] == 'NA': project['ReverseAdapter'] = None if project['ForwardAdapter'] is None: @@ -364,7 +366,10 @@ def _process_sample_sheet(self): raise PipelineError(s) header = sheet.Header - chemistry = header['chemistry'] + if 'chemistry' in header: + chemistry = header['chemistry'] + else: + chemistry = '' if header['Assay'] not in Pipeline.assay_types: s = "Assay value '%s' is not recognized." 
% header['Assay'] diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index 4fa5c6b2..1749d3c9 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -41,7 +41,7 @@ def _get_instrument_id(run_directory): @staticmethod def get_instrument_type(run_directory): - instrument_id = InstrumentUtils._get_instrument_id(run_directory) + instrument_id = basename(run_directory) return get_model_by_instrument_id( instrument_id, model_key=PROFILE_NAME_KEY) @@ -244,22 +244,6 @@ def __init__(self, configuration_file_path, run_id, input_file_path, raise PipelineError(f"A run-dir for '{self.run_id}' could not be " "found") - # required files for successful operation - # both RTAComplete.txt and RunInfo.xml should reside in the root of - # the run directory. - required_files = ['RTAComplete.txt', 'RunInfo.xml'] - for some_file in required_files: - if not exists(join(self.run_dir, some_file)): - raise PipelineError(f"required file '{some_file}' is not " - f"present in {self.run_dir}.") - - # verify that RunInfo.xml file is readable. - try: - fp = open(join(self.run_dir, 'RunInfo.xml')) - fp.close() - except PermissionError: - raise PipelineError('RunInfo.xml is present, but not readable') - self.input_file_path = input_file_path if pipeline_type == Pipeline.AMPLICON_PTYPE: diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json new file mode 100644 index 00000000..f6a88a5a --- /dev/null +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -0,0 +1,64 @@ +{ + "profile": { + "instrument_type": "Revio", + "assay_type": "Metagenomic", + "configuration": { + "pacbio_convert": { + "nodes": 1, + "nprocs": 16, + "queue": "qiita", + "wallclock_time_in_minutes": 216, + "modules_to_load": [ + "bclconvert_3.7.5" + ], + "executable_path": "bcl-convert", + "per_process_memory_limit": "10gb" + }, + "nu-qc": { + "nodes": 1, + "cpus_per_task": 8, + "queue": "qiita", + "wallclock_time_in_minutes": 240, + "minimap2_databases": ["/databases/minimap2/db_1.mmi", "/databases/minimap2/db_2.mmi"], + "modules_to_load": [ + "fastp_0.20.1", + "samtools_1.12", + "minimap2_2.18" + ], + "fastp_executable_path": "fastp", + "minimap2_executable_path": "minimap2", + "samtools_executable_path": "samtools", + "job_total_memory_limit": "20", + "job_max_array_length": 1000, + "known_adapters_path": "fastp_known_adapters_formatted.fna", + "bucket_size": 8, + "length_limit": 100, + "cores_per_task": 4, + "movi_executable_path": "/home/user/user_dir/Movi/build/movi-default", + "gres_value": 4, + "pmls_path": "/home/user/user_dir/human_host_filtration/scripts/qiita_filter_pmls.py", + "additional_fastq_tags": ["BX"] + }, + "seqpro": { + "seqpro_path": "seqpro", + "modules_to_load": [] + }, + "fastqc": { + "nodes": 1, + "nprocs": 16, + "queue": "qiita", + "nthreads": 16, + "wallclock_time_in_minutes": 60, + "modules_to_load": [ + "fastqc_0.11.5" + ], + "fastqc_executable_path": "fastqc", + "multiqc_executable_path": "multiqc", + "multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml", + "job_total_memory_limit": "20gb", + "job_pool_size": 30, + "job_max_array_length": 1000 + } + } + } +} diff --git a/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml b/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml new file mode 100644 index 00000000..d4ba2c2d --- 
/dev/null +++ b/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml @@ -0,0 +1,11 @@ + + + + Default + 2025-01-01 + + + + + \ No newline at end of file diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py new file mode 100644 index 00000000..12005ae4 --- /dev/null +++ b/tests/test_PacBioWorflow.py @@ -0,0 +1,164 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ----------------------------------------------------------------------------- +from qp_klp.WorkflowFactory import WorkflowFactory +from unittest import main +from os import makedirs +from os.path import dirname, abspath, join, exists +from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) +from qp_klp.Assays import ASSAY_NAME_METAGENOMIC +from shutil import rmtree +from qiita_client.testing import PluginTestCase +from sequence_processing_pipeline.PipelineError import PipelineError + + +class WorkflowFactoryTests(PluginTestCase): + def setUp(self): + self.base_dir = dirname(abspath(__file__)) + self.output_dir = join(self.base_dir, 'test_output') + self.remove_these = [self.output_dir] + + self.gz_source = f'{self.base_dir}/data/dummy.fastq.gz' + + def tearDown(self): + for fp in self.remove_these: + if exists(fp): + rmtree(fp) + + def _create_directory(self, fp): + # just making sure that we always start with a clean folder: + if exists(fp): + rmtree(fp) + makedirs(fp) + self.remove_these.append(fp) + + def _inject_data(self, wf): + '''This is a helper method for testing that all the steps are run + when testing wf.execute_pipeline() + ''' + pass + # samples = wf.pipeline.sample_sheet.samples + # tellseq = True + # if 'index' in dict(samples[0]).keys(): + # tellseq = False + + # # inject Convert/NuQC/FastQC/MultiQC/GenPrepFileJob files so we can + # # move down the pipeline; first let's create the base folders + # if tellseq: + # reports_dir = f'{self.output_dir}/TellReadJob/' + # convert_dir = f'{self.output_dir}/TRIntegrateJob/integrated' + # makedirs(convert_dir, exist_ok=True) + # else: + # convert_dir = f'{self.output_dir}/ConvertJob' + # reports_dir = f'{self.output_dir}/ConvertJob/Reports' + # makedirs(convert_dir, exist_ok=True) + # Path(f'{convert_dir}/job_completed').touch() + # tellread_dir = f'{self.output_dir}/TellReadJob' + # nuqc_dir = f'{self.output_dir}/NuQCJob' + # fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' + # multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' + # genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' + # '211021_A00000_0000_SAMPLE/') + # makedirs(reports_dir, exist_ok=True) + # makedirs(nuqc_dir, exist_ok=True) + # makedirs(fastqc_dir, exist_ok=True) + # makedirs(multiqc_dir, exist_ok=True) + # makedirs(genprep_dir, exist_ok=True) + # # now let's create the required project folders + # for project in wf.pipeline.get_project_info(): + # sp = project['project_name'] + # makedirs(f'{convert_dir}/{sp}', exist_ok=True) + # makedirs(f'{nuqc_dir}/filtered_sequences/{sp}', exist_ok=True) + # makedirs(f'{genprep_dir}/{sp}/filtered_sequences/', + # exist_ok=True) + + # # then loop over samples and stage all fastq.gz files + # dstats = [] + # for i, sample in enumerate(samples): + # rp = sample["Sample_ID"] + # sp = sample["Sample_Project"] + + # # ConvertJob + # if tellseq: + # bid = sample['barcode_id'] + # 
dstats.append( + # {'Lane': sample['Lane'], 'SampleID': rp, + # 'Sample_Project': sp, 'Barcode': bid, + # '# Reads': 2}) + # dname = f'{convert_dir}/{sp}' + # copyfile(self.gz_source, f'{dname}/{bid}.R1.fastq.gz') + # copyfile(self.gz_source, f'{dname}/{bid}.R2.fastq.gz') + # else: + # dstats.append( + # {'Lane': sample['Lane'], 'SampleID': rp, + # 'Sample_Project': sp, 'Index': sample['index'], + # '# Reads': 2}) + # dname = f'{convert_dir}/{sp}' + # Path(f'{dname}/{rp}_L001_R1_001.fastq.gz').touch() + # Path(f'{dname}/{rp}_L001_R2_001.fastq.gz').touch() + + # # NuQCJob + # dname = f'{nuqc_dir}/filtered_sequences/{sp}' + # copyfile(self.gz_source, f'{dname}/{rp}_L001_R1_001.fastq.gz') + # copyfile(self.gz_source, f'{dname}/{rp}_L001_R2_001.fastq.gz') + + # # GenPrepFileJob + # gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{rp}' + # Path(f'{gprep_base}_L001_R1_001.fastq.gz').touch() + # Path(f'{gprep_base}_L001_R2_001.fastq.gz').touch() + + # # this is required by the Convert step + # if tellseq: + # pd.DataFrame(dstats).set_index('SampleID').to_csv( + # f'{tellread_dir}/sample_index_list_TellReadJob.txt') + # else: + # pd.DataFrame(dstats).set_index('SampleID').to_csv( + # f'{reports_dir}/Demultiplex_Stats.csv') + + # # generating the "*.completed" files + # for i in range(len(samples)*3): + # Path(f'{fastqc_dir}/FastQCJob_{i}.completed').touch() + # Path(f'{multiqc_dir}/MultiQCJob_{i}.completed').touch() + + def test_pacbio_metagenomic_workflow_creation(self): + kwargs = {"uif_path": "tests/data/sample-sheets/metagenomic/" + "pacbio/good_pacbio_metagv10.csv", + "qclient": self.qclient, + "lane_number": "1", + "config_fp": "tests/configuration.json", + "run_identifier": "r11111_20250101_111111", + "output_dir": self.output_dir, + "job_id": "78901", + "is_restart": False + } + + self._create_directory(kwargs['output_dir']) + + wf = WorkflowFactory.generate_workflow(**kwargs) + + # confirm that the proper type of workflow was generated. + self.assertEqual(wf.protocol_type, PROTOCOL_NAME_PACBIO_SMRT) + self.assertEqual(wf.assay_type, ASSAY_NAME_METAGENOMIC) + + # self._inject_data(wf) + # ConvertJob/ConvertJob.sh + + # Metagenomic is a valid data type in the default qiita test + # database but job-id: 78901 doesn't exist; however, if we get + # to here, it means that all steps have run to completion + # and the system is trying to create the preps. 
+ # Note: Qiita job_id's are UUID in the database and this tests + # uses 78901 as the job_id so the db complains about the format + # with self.assertRaisesRegex(RuntimeError, 'invalid input ' + # 'syntax for type uuid: "78901"'): + with self.assertRaisesRegex(PipelineError, 'There are no fastq ' + 'files for FastQCJob'): + wf.execute_pipeline() + + +if __name__ == '__main__': + main() From 4439472c72f4790fef2c03fb46543049a1e0e12b Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 2 Sep 2025 06:56:07 -0600 Subject: [PATCH 03/42] fix InstrumentUtils --- src/sequence_processing_pipeline/Pipeline.py | 50 ++++++++++++++----- .../m11111_20250101_111111_s4.metadata.xml | 6 +-- tests/test_Pipeline.py | 5 +- 3 files changed, 44 insertions(+), 17 deletions(-) rename tests/data/{ => sample_run_directories}/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml (84%) diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index 1749d3c9..cf0a83cd 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -3,6 +3,7 @@ from json.decoder import JSONDecodeError from os import makedirs, listdir, walk from os.path import join, exists, isdir, basename +from glob import glob from metapool import (load_sample_sheet, AmpliconSampleSheet, is_blank, parse_project_name, SAMPLE_NAME_KEY, QIITA_ID_KEY, PROJECT_SHORT_NAME_KEY, PROJECT_FULL_NAME_KEY, @@ -30,18 +31,31 @@ class InstrumentUtils(): @staticmethod def _get_instrument_id(run_directory): run_info = join(run_directory, 'RunInfo.xml') + instrument_text = 'Run/Instrument' + # if RunInfo.xml doesn't exist, this might be a PacBio + # folder, let's check for it if not exists(run_info): - raise ValueError(f"'{run_info}' doesn't exist") + pacbio_metadata_fps = glob( + f'{run_directory}/*/metadata/*.metadata.xml') + if pacbio_metadata_fps: + run_info = pacbio_metadata_fps[0] + tree = ET.parse(run_info) + mtag = tree.getroot().tag.split('}')[0] + '}' + instrument_text = ( + f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') + run = ET.parse(run_info).find(instrument_text) + text = run.attrib['TimeStampedName'] + else: + raise ValueError(f"'{run_info}' doesn't exist") + else: + text = ET.parse(run_info).find(instrument_text).text - with open(run_info) as f: - # Instrument element should appear in all valid RunInfo.xml - # files. - return ET.fromstring(f.read()).find('Run/Instrument').text + return text @staticmethod def get_instrument_type(run_directory): - instrument_id = basename(run_directory) + instrument_id = InstrumentUtils._get_instrument_id(run_directory) return get_model_by_instrument_id( instrument_id, model_key=PROFILE_NAME_KEY) @@ -49,13 +63,23 @@ def get_instrument_type(run_directory): def _get_date(run_directory): run_info = join(run_directory, 'RunInfo.xml') + # if RunInfo.xml doesn't exist, this might be a PacBio + # folder, let's check for it if not exists(run_info): - raise ValueError(f"'{run_info}' doesn't exist") - - with open(run_info) as f: - # Date element should appear in all valid RunInfo.xml - # files. 
- date_string = ET.fromstring(f.read()).find('Run/Date').text + pacbio_metadata_fps = glob( + f'{run_directory}/*/metadata/*.metadata.xml') + if pacbio_metadata_fps: + run_info = pacbio_metadata_fps[0] + tree = ET.parse(run_info) + mtag = tree.getroot().tag.split('}')[0] + '}' + instrument_text = ( + f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') + run = ET.parse(run_info).find(instrument_text) + date_string = run.attrib['CreatedAt'].split('T')[0] + else: + raise ValueError(f"'{run_info}' doesn't exist") + else: + date_string = ET.parse(run_info).find('Run/Date').text # date is recorded in RunInfo.xml in various formats. Iterate # through all known formats until the date is properly parsed. @@ -64,7 +88,7 @@ def _get_date(run_directory): # UTC time w/out confirming the machines were actually set for # and/or reporting UTC time. formats = ["%y%m%d", "%Y-%m-%dT%H:%M:%s", "%Y-%m-%dT%H:%M:%SZ", - "%m/%d/%Y %I:%M:%S %p"] + "%m/%d/%Y %I:%M:%S %p", "%Y-%m-%d",] for format in formats: try: diff --git a/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml b/tests/data/sample_run_directories/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml similarity index 84% rename from tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml rename to tests/data/sample_run_directories/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml index d4ba2c2d..8bc3a16c 100644 --- a/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml +++ b/tests/data/sample_run_directories/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml @@ -4,8 +4,8 @@ Default 2025-01-01 - + - \ No newline at end of file + + diff --git a/tests/test_Pipeline.py b/tests/test_Pipeline.py index f1f69e2f..ebe4a495 100644 --- a/tests/test_Pipeline.py +++ b/tests/test_Pipeline.py @@ -2431,7 +2431,10 @@ def test_instrument_utils(self): 'date': '2023-12-15'}, '150629_K1001_0511_AH5L7GBCXX': {'id': 'K1001', 'type': 'HiSeq 4000', - 'date': '2015-06-29'}} + 'date': '2015-06-29'}, + 'r11111_20250101_111111': {'id': 'r11111', + 'type': 'Revio', + 'date': '2025-01-01'}} run_directories = [] for root, dirs, files in walk(self.path('sample_run_directories')): From 6bf358678399b99711a5c8e02424cec85285d5c9 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 2 Sep 2025 09:53:40 -0600 Subject: [PATCH 04/42] rm test_required_file_checks --- .../pacbio/good_pacbio_metagv10.csv | 27 +++++++ tests/test_Pipeline.py | 73 ------------------- 2 files changed, 27 insertions(+), 73 deletions(-) create mode 100644 tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv new file mode 100644 index 00000000..f2407be1 --- /dev/null +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -0,0 +1,27 @@ +[Header],,,,,, +SheetType,pacbio_metag,,,,, +SheetVersion,10,,,,, +Investigator Name,Knight,,,,, +Experiment Name,RKL_experiment,,,,, +Date,2025-08-29,,,,, +Assay,Metagenomic,,,,, +Description,,,,,, +,,,,,, +[Data],,,,,, +Sample_ID,Sample_Name,Sample_Plate,well_id_384,Sample_Project,Well_description,Lane +bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 +bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 +bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 +,,,,,, 
+[Bioinformatics],,,,,, +Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates, +MyProject_6123,6123,False,some protocol,some description,False, +,,,,,, +[Contact],,,,,, +Sample_Project,Email,,,,, +MyProject_6123,foo@bar.org,,,,, +,,,,,, +[SampleContext],,,,,, +sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,, +5B,control blank,6123,,,, +,,,,,, diff --git a/tests/test_Pipeline.py b/tests/test_Pipeline.py index ebe4a495..b1cc502e 100644 --- a/tests/test_Pipeline.py +++ b/tests/test_Pipeline.py @@ -207,43 +207,6 @@ def test_get_orig_names_from_sheet_with_replicates(self): self.assertEqual(set(obs), exp) - def test_required_file_checks(self): - # begin this test by deleting the RunInfo.txt file and verifying that - # Pipeline object will raise an Error. - self.delete_runinfo_file() - - with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" - "' is not present."): - Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, - self.output_file_path, - self.qiita_id, Pipeline.METAGENOMIC_PTYPE) - - # delete RTAComplete.txt and recreate RunInfo.txt file to verify that - # an Error is raised when only RTAComplete.txt is missing. - self.delete_rtacomplete_file() - self.create_runinfo_file() - - with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" - ".txt' is not present."): - Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, - self.output_file_path, - self.qiita_id, Pipeline.METAGENOMIC_PTYPE) - - # make RunInfo.xml file unreadable and verify that Pipeline object - # raises the expected Error. - self.create_rtacomplete_file() - self.make_runinfo_file_unreadable() - - with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, bu" - "t not readable"): - Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, - self.output_file_path, - self.qiita_id, Pipeline.METAGENOMIC_PTYPE) - self.make_runinfo_file_readable() - def test_creation(self): # Pipeline should assert due to config_file with self.assertRaises(PipelineError) as e: @@ -1772,42 +1735,6 @@ def delete_rtacomplete_file(self): # make method idempotent pass - def test_required_file_checks(self): - # begin this test by deleting the RunInfo.txt file and verifying that - # Pipeline object will raise an Error. - self.delete_runinfo_file() - - with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" - "' is not present."): - Pipeline(self.good_config_file, self.good_run_id, - self.good_mapping_file_path, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - - # delete RTAComplete.txt and recreate RunInfo.txt file to verify that - # an Error is raised when only RTAComplete.txt is missing. - self.delete_rtacomplete_file() - self.create_runinfo_file() - - with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" - ".txt' is not present."): - Pipeline(self.good_config_file, self.good_run_id, - self.good_mapping_file_path, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - - # make RunInfo.xml file unreadable and verify that Pipeline object - # raises the expected Error. 
- self.create_rtacomplete_file() - self.make_runinfo_file_unreadable() - - with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, " - "but not readable"): - Pipeline(self.good_config_file, self.good_run_id, - self.good_mapping_file_path, self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - self.make_runinfo_file_readable() - def test_creation(self): # Pipeline should assert due to config_file with self.assertRaises(PipelineError) as e: From 411a1195274e458113df2f7a3185d7d0ee9b6397 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 2 Sep 2025 11:53:53 -0600 Subject: [PATCH 05/42] m11111_20250101_111111_s4 --- .../metadata/m11111_20250101_111111_s4.metadata.xml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml diff --git a/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml b/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml new file mode 100644 index 00000000..8bc3a16c --- /dev/null +++ b/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml @@ -0,0 +1,11 @@ + + + + Default + 2025-01-01 + + + + + + From e8a2d0b9d50486d2c5844f0a7c91b22729beee59 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 2 Sep 2025 16:09:05 -0600 Subject: [PATCH 06/42] add conda --- src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/sequence_processing_pipeline/ConvertJob.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index 3036755b..96040f16 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -48,7 +48,7 @@ def __init__(self, **kwargs): for sample in self.pipeline.sample_sheet.samples] df = pd.DataFrame(samples) sample_list_fp = f"{self.kwargs['output_dir']}/sample_list.tsv" - df.to_csv(sample_list_fp, sep='t') + df.to_csv(sample_list_fp, sep='\t', header=False, index=False) self.master_qiita_job_id = self.kwargs['job_id'] diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index e67227f4..4c6093ff 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -456,16 +456,15 @@ def _generate_job_script(self): lines.append("set -x") lines.append('date') lines.append('hostname') - lines.append(f'cd {self.root_dir}') - - if self.modules_to_load: - lines.append("module load " + ' '.join(self.modules_to_load)) + lines.append('conda activate klp-2025-05') + lines.append(f'cd {self.output_path}') - lines.append(f"sample_list={self.root_dir}/sample_list.tsv") + lines.append("sample_list=${PWD}/sample_list.tsv") lines.append( 'while read bc sn pn; do echo ' '"bam2fastq -j 1 -o ${sn}.fastq.gz -c 9 ' - '${pn}/${Run_prefix}.hifi_reads.${bn}.bam"; ' + f'{self.root_dir}' + '/*/hifi_reads/*{bc}*.bam"; ' 'fqtools count ${sn}.fastq.gz > ${sn}.counts.txt;' 'done < ${sample_list}') From 5e16ab11bfd4a064fc74f701983860bc20c39ba4 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 06:47:08 -0600 Subject: [PATCH 07/42] pacbio_generate_bam2fastq_commands --- pyproject.toml | 1 + src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/qp_klp/scripts/pacbio_commands.py | 49 +++++++++++++++++++ .../ConvertJob.py | 10 ++-- 4 files changed, 54 insertions(+), 8 deletions(-) create mode 100755 
src/qp_klp/scripts/pacbio_commands.py diff --git a/pyproject.toml b/pyproject.toml index fb74ed59..64a74943 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,3 +59,4 @@ dependencies = [ configure_klp = "qp_klp.scripts.configure_klp:config" start_klp = "qp_klp.scripts.start_klp:execute" demux = "sequence_processing_pipeline.scripts.cli:demux" +pacbio_generate_bam2fastq_commands = "qp_klp.scripts.pacbio_commands:generate_bam2fastq_commands" diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index 96040f16..aae032f8 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -48,7 +48,7 @@ def __init__(self, **kwargs): for sample in self.pipeline.sample_sheet.samples] df = pd.DataFrame(samples) sample_list_fp = f"{self.kwargs['output_dir']}/sample_list.tsv" - df.to_csv(sample_list_fp, sep='\t', header=False, index=False) + df.to_csv(sample_list_fp, sep='\t', index=False) self.master_qiita_job_id = self.kwargs['job_id'] diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py new file mode 100755 index 00000000..611c0e94 --- /dev/null +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ----------------------------------------------------------------------------- +import click +import pandas as pd +from glob import glob +from os import makedirs + + +@click.command() +@click.argument('sample_list', required=True) +@click.argument('run_folder', required=True) +@click.argument('outdir', required=True) +def generate_bam2fastq_commands(sample_list, run_folder, outdir): + """Generates the bam2fastq commands""" + df = pd.read_csv(sample_list) + files = {f.split('.')[-2]: f + for f in glob(f'{run_folder}/*/hifi_reads/*.bam')} + + makedirs(outdir, exist_ok=True) + + commands, missing_files = [], [] + for row in df.iterrows(): + bc = row['barcode'] + sn = row['sample_name'] + pn = row['project_name'] + if bc not in files: + missing_files.append(bc) + continue + od = f'{outdir}/{pn}' + makedirs(od, exist_ok=True) + cmd = (f'bam2fastq -j 1 -o {od}/{sn} -c 9 ' + f'{files[bc]}; ' + f'fqtools count {od}/{sn}.fastq.gz > ' + f'{od}/{sn}.counts.txt') + commands.append(cmd) + + if missing_files: + raise ValueError( + f'{run_folder} is missing barcodes: {missing_files}') + + for cmd in commands: + print(cmd) diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 4c6093ff..17b7ca9e 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -459,14 +459,10 @@ def _generate_job_script(self): lines.append('conda activate klp-2025-05') lines.append(f'cd {self.output_path}') - lines.append("sample_list=${PWD}/sample_list.tsv") lines.append( - 'while read bc sn pn; do echo ' - '"bam2fastq -j 1 -o ${sn}.fastq.gz -c 9 ' - f'{self.root_dir}' - '/*/hifi_reads/*{bc}*.bam"; ' - 'fqtools count ${sn}.fastq.gz > ${sn}.counts.txt;' - 'done < ${sample_list}') + f'pacbio_generate_bam2fastq_commands ../sample_list.tsv ' + f'{self.root_dir} {self.output_path} ' + f'| parallel -j {self.node_count}') with open(self.job_script_path, 'w') as f: for line in lines: From 
827a37238a7f114b68e690f98af797dc1d59b348 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 07:43:58 -0600 Subject: [PATCH 08/42] self.node_count -> self.nprocs --- src/qp_klp/scripts/pacbio_commands.py | 9 +++++---- src/sequence_processing_pipeline/ConvertJob.py | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index 611c0e94..ff0c0369 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -19,14 +19,14 @@ @click.argument('outdir', required=True) def generate_bam2fastq_commands(sample_list, run_folder, outdir): """Generates the bam2fastq commands""" - df = pd.read_csv(sample_list) + df = pd.read_csv(sample_list, sep='\t') files = {f.split('.')[-2]: f for f in glob(f'{run_folder}/*/hifi_reads/*.bam')} makedirs(outdir, exist_ok=True) commands, missing_files = [], [] - for row in df.iterrows(): + for _, row in df.iterrows(): bc = row['barcode'] sn = row['sample_name'] pn = row['project_name'] @@ -35,9 +35,10 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir): continue od = f'{outdir}/{pn}' makedirs(od, exist_ok=True) - cmd = (f'bam2fastq -j 1 -o {od}/{sn} -c 9 ' + fn = f'{od}/{sn}_R1' + cmd = (f'bam2fastq -j 1 -o {fn} -c 9 ' f'{files[bc]}; ' - f'fqtools count {od}/{sn}.fastq.gz > ' + f'fqtools count {fn}.fastq.gz > ' f'{od}/{sn}.counts.txt') commands.append(cmd) diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 17b7ca9e..a752d67d 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -454,15 +454,17 @@ def _generate_job_script(self): lines.append(f"#SBATCH --mem-per-cpu {self.pmem}") lines.append("set -x") + lines.append('set -e') lines.append('date') lines.append('hostname') + lines.append('source ~/.bash_profile') lines.append('conda activate klp-2025-05') lines.append(f'cd {self.output_path}') lines.append( f'pacbio_generate_bam2fastq_commands ../sample_list.tsv ' f'{self.root_dir} {self.output_path} ' - f'| parallel -j {self.node_count}') + f'| parallel -j {self.nprocs}') with open(self.job_script_path, 'w') as f: for line in lines: From 551bfd18edd15bbbd1e45de2f18b409a10ce014c Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 08:11:51 -0600 Subject: [PATCH 09/42] generate_sequence_counts --- src/qp_klp/Protocol.py | 51 ++++++++----------- .../pacbio_metagenomic.json | 21 +++----- 2 files changed, 29 insertions(+), 43 deletions(-) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 71e740e9..41e5dc46 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -432,38 +432,29 @@ def convert_raw_to_fastq(self): return failed_samples def generate_sequence_counts(self): - # config = self.pipeline.get_software_configuration('tell-seq') - - # files_to_count_path = join(self.pipeline.output_path, - # 'files_to_count.txt') - - # with open(files_to_count_path, 'w') as f: - # for root, _, files in walk(self.raw_fastq_files_path): - # for _file in files: - # if determine_orientation(_file) in ['R1', 'R2']: - # print(join(root, _file), file=f) - - # job = SeqCountsJob(self.pipeline.run_dir, - # self.pipeline.output_path, - # config['queue'], - # config['nodes'], - # config['wallclock_time_in_minutes'], - # config['normcount_mem_limit'], - # config['modules_to_load'], - # self.master_qiita_job_id, - # config['job_max_array_length'], - # files_to_count_path, - 
# self.pipeline.get_sample_sheet_path(), - # cores_per_task=config['tellread_cores']) - - # if 'SeqCountsJob' not in self.skip_steps: - # job.run(callback=self.job_callback) - - # if successful, set self.reports_path + # for other isntances of generate_sequence_counts in other objects + # the sequence counting needs to be done; however, for PacBio we + # already have done it and just need to merge the results. + gz_files = glob(f'{self.raw_fastq_files_path}/*/*.fastq.gz') + data, missing_files = [], [] + + for gzf in gz_files: + cf = gzf.replace('.fastq.gz', '.counts.txt') + sn = basename(cf).replace('.counts.txt', '') + if not exists(cf): + missing_files.append(sn) + continue + with open(cf, 'r') as fh: + counts = fh.read().strip() + data.append({'SampleID': sn, '# Reads': counts}) + + if missing_files: + raise ValueError(f'Missing count files: {missing_files}') + + df = pd.DataFrame(data) self.reports_path = join(self.pipeline.output_path, 'SeqCounts.csv') - open(self.reports_path, 'w').write( - 'SampleID,# Reads\nA1,100') + df.to_csv(self.reports_path, index=False) def integrate_results(self): pass diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json index f6a88a5a..4e4eb7cb 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -8,23 +8,20 @@ "nprocs": 16, "queue": "qiita", "wallclock_time_in_minutes": 216, - "modules_to_load": [ - "bclconvert_3.7.5" - ], - "executable_path": "bcl-convert", - "per_process_memory_limit": "10gb" + "modules_to_load": [], + "executable_path": "", + "per_process_memory_limit": "1gb" }, "nu-qc": { "nodes": 1, "cpus_per_task": 8, "queue": "qiita", "wallclock_time_in_minutes": 240, - "minimap2_databases": ["/databases/minimap2/db_1.mmi", "/databases/minimap2/db_2.mmi"], - "modules_to_load": [ - "fastp_0.20.1", - "samtools_1.12", - "minimap2_2.18" + "minimap2_databases": [ + "/databases/minimap2/db_1.mmi", + "/databases/minimap2/db_2.mmi" ], + "modules_to_load": ["fastp_0.20.1", "samtools_1.12", "minimap2_2.18"], "fastp_executable_path": "fastp", "minimap2_executable_path": "minimap2", "samtools_executable_path": "samtools", @@ -49,9 +46,7 @@ "queue": "qiita", "nthreads": 16, "wallclock_time_in_minutes": 60, - "modules_to_load": [ - "fastqc_0.11.5" - ], + "modules_to_load": ["fastqc_0.11.5"], "fastqc_executable_path": "fastqc", "multiqc_executable_path": "multiqc", "multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml", From 6311723e593894282e327b146e8c0bf2f60b0150 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 11:24:00 -0600 Subject: [PATCH 10/42] init changes to def _inject_data(self, wf): --- src/qp_klp/scripts/pacbio_commands.py | 2 +- .../pacbio_metagenomic.json | 2 +- tests/test_PacBioWorflow.py | 70 ++++++------------- 3 files changed, 24 insertions(+), 50 deletions(-) diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index ff0c0369..5671909e 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -39,7 +39,7 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir): cmd = (f'bam2fastq -j 1 -o {fn} -c 9 ' f'{files[bc]}; ' f'fqtools count {fn}.fastq.gz > ' - f'{od}/{sn}.counts.txt') + f'{fn}.counts.txt') commands.append(cmd) if missing_files: diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json 
b/tests/data/configuration_profiles/pacbio_metagenomic.json index 4e4eb7cb..1e4a60ce 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -5,7 +5,7 @@ "configuration": { "pacbio_convert": { "nodes": 1, - "nprocs": 16, + "nprocs": 28, "queue": "qiita", "wallclock_time_in_minutes": 216, "modules_to_load": [], diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 12005ae4..9152c685 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -12,8 +12,10 @@ from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) from qp_klp.Assays import ASSAY_NAME_METAGENOMIC from shutil import rmtree +from pathlib import Path from qiita_client.testing import PluginTestCase from sequence_processing_pipeline.PipelineError import PipelineError +import pandas as pd class WorkflowFactoryTests(PluginTestCase): @@ -40,30 +42,19 @@ def _inject_data(self, wf): '''This is a helper method for testing that all the steps are run when testing wf.execute_pipeline() ''' - pass - # samples = wf.pipeline.sample_sheet.samples - # tellseq = True - # if 'index' in dict(samples[0]).keys(): - # tellseq = False - - # # inject Convert/NuQC/FastQC/MultiQC/GenPrepFileJob files so we can - # # move down the pipeline; first let's create the base folders - # if tellseq: - # reports_dir = f'{self.output_dir}/TellReadJob/' - # convert_dir = f'{self.output_dir}/TRIntegrateJob/integrated' - # makedirs(convert_dir, exist_ok=True) - # else: - # convert_dir = f'{self.output_dir}/ConvertJob' - # reports_dir = f'{self.output_dir}/ConvertJob/Reports' - # makedirs(convert_dir, exist_ok=True) - # Path(f'{convert_dir}/job_completed').touch() + samples = wf.pipeline.sample_sheet.samples + + convert_dir = f'{self.output_dir}/ConvertJob' + reports_dir = f'{self.output_dir}/ConvertJob/Reports' + makedirs(convert_dir, exist_ok=True) + makedirs(reports_dir, exist_ok=True) + Path(f'{convert_dir}/job_completed').touch() # tellread_dir = f'{self.output_dir}/TellReadJob' # nuqc_dir = f'{self.output_dir}/NuQCJob' # fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' # multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' # genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' # '211021_A00000_0000_SAMPLE/') - # makedirs(reports_dir, exist_ok=True) # makedirs(nuqc_dir, exist_ok=True) # makedirs(fastqc_dir, exist_ok=True) # makedirs(multiqc_dir, exist_ok=True) @@ -77,29 +68,17 @@ def _inject_data(self, wf): # exist_ok=True) # # then loop over samples and stage all fastq.gz files - # dstats = [] - # for i, sample in enumerate(samples): - # rp = sample["Sample_ID"] - # sp = sample["Sample_Project"] - - # # ConvertJob - # if tellseq: - # bid = sample['barcode_id'] - # dstats.append( - # {'Lane': sample['Lane'], 'SampleID': rp, - # 'Sample_Project': sp, 'Barcode': bid, - # '# Reads': 2}) - # dname = f'{convert_dir}/{sp}' - # copyfile(self.gz_source, f'{dname}/{bid}.R1.fastq.gz') - # copyfile(self.gz_source, f'{dname}/{bid}.R2.fastq.gz') - # else: - # dstats.append( - # {'Lane': sample['Lane'], 'SampleID': rp, - # 'Sample_Project': sp, 'Index': sample['index'], - # '# Reads': 2}) - # dname = f'{convert_dir}/{sp}' - # Path(f'{dname}/{rp}_L001_R1_001.fastq.gz').touch() - # Path(f'{dname}/{rp}_L001_R2_001.fastq.gz').touch() + dstats = [] + for i, sample in enumerate(samples): + rp = sample["Sample_ID"] + sp = sample["Sample_Project"] + dstats.append( + {'Lane': sample['Lane'], 'SampleID': rp, + 'Sample_Project': sp, 'Index': sample['index'], + '# 
Reads': 2}) + dname = f'{convert_dir}/{sp}' + Path(f'{dname}/{rp}_L001_R1_001.fastq.gz').touch() + Path(f'{dname}/{rp}_L001_R2_001.fastq.gz').touch() # # NuQCJob # dname = f'{nuqc_dir}/filtered_sequences/{sp}' @@ -111,13 +90,8 @@ def _inject_data(self, wf): # Path(f'{gprep_base}_L001_R1_001.fastq.gz').touch() # Path(f'{gprep_base}_L001_R2_001.fastq.gz').touch() - # # this is required by the Convert step - # if tellseq: - # pd.DataFrame(dstats).set_index('SampleID').to_csv( - # f'{tellread_dir}/sample_index_list_TellReadJob.txt') - # else: - # pd.DataFrame(dstats).set_index('SampleID').to_csv( - # f'{reports_dir}/Demultiplex_Stats.csv') + pd.DataFrame(dstats).set_index('SampleID').to_csv( + f'{reports_dir}/Demultiplex_Stats.csv') # # generating the "*.completed" files # for i in range(len(samples)*3): From 40afadf10ed332ac577670c5b9da6340badfd651 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 13:11:05 -0600 Subject: [PATCH 11/42] read_length --- src/qp_klp/Assays.py | 3 ++- src/qp_klp/Protocol.py | 3 +++ src/qp_klp/WorkflowFactory.py | 3 +++ src/sequence_processing_pipeline/NuQCJob.py | 22 +++++++++++++++---- .../pacbio_metagenomic.json | 2 +- tests/test_PacBioWorflow.py | 5 ++--- 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/qp_klp/Assays.py b/src/qp_klp/Assays.py index 3ad6e873..e2fc0ede 100644 --- a/src/qp_klp/Assays.py +++ b/src/qp_klp/Assays.py @@ -517,7 +517,8 @@ def qc_reads(self): bucket_size=config['bucket_size'], length_limit=config['length_limit'], cores_per_task=config['cores_per_task'], - files_regex=self.files_regex) + files_regex=self.files_regex, + read_length=self.read_length) if 'NuQCJob' not in self.skip_steps: job.run(callback=self.job_callback) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 41e5dc46..1b0eb71e 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -86,6 +86,7 @@ class Illumina(Protocol): # here) both RTAComplete.txt and RunInfo.xml should reside in the root of # the run directory. 
required_files = ['RTAComplete.txt', 'RunInfo.xml'] + read_length = 'short' def __init__(self) -> None: super().__init__() @@ -178,6 +179,7 @@ def generate_sequence_counts(self): class TellSeq(Protocol): protocol_type = PROTOCOL_NAME_TELLSEQ + read_length = 'short' def convert_raw_to_fastq(self): config = self.pipeline.get_software_configuration('tell-seq') @@ -395,6 +397,7 @@ def _post_process_file(self, fastq_file, mapping, lane): class PacBio(Protocol): protocol_type = PROTOCOL_NAME_PACBIO_SMRT + read_length = 'long' def convert_raw_to_fastq(self): config = self.pipeline.get_software_configuration('pacbio_convert') diff --git a/src/qp_klp/WorkflowFactory.py b/src/qp_klp/WorkflowFactory.py index 8d51cee6..870449e5 100644 --- a/src/qp_klp/WorkflowFactory.py +++ b/src/qp_klp/WorkflowFactory.py @@ -78,6 +78,9 @@ def generate_workflow(cls, **kwargs): "sample-sheet or a mapping-file.") for workflow in WorkflowFactory.WORKFLOWS: + if workflow.read_length not in {'short', 'long'}: + raise ValueError('Invalid read_length: ' + f'{workflow.read_length} for {workflow}') if workflow.assay_type == assay_type: if workflow.protocol_type == instrument_type: # return instantiated workflow object diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 640da755..fabc42eb 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -24,7 +24,8 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, samtools_path, modules_to_load, qiita_job_id, max_array_length, known_adapters_path, movi_path, gres_value, pmls_path, additional_fastq_tags, bucket_size=8, - length_limit=100, cores_per_task=4, files_regex='SPP'): + length_limit=100, cores_per_task=4, files_regex='SPP', + read_length='short'): """ Submit a slurm job where the contents of fastq_root_dir are processed using fastp, minimap2, and samtools. Human-genome sequences will be @@ -52,6 +53,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, :param additional_fastq_tags: A list of fastq tags to preserve during filtering. :param files_regex: the FILES_REGEX to use for parsing files + :param read_length: the FILES_REGEX to use for parsing files """ super().__init__(fastq_root_dir, output_path, @@ -417,6 +419,17 @@ def _generate_mmi_filter_cmds(self, working_dir): tags = " -T %s" % ','.join(self.additional_fastq_tags) t_switch = " -y" + if self.read_length == 'short': + minimap2_prefix = f'minimap2 -2 -ax sr{t_switch}' + minimap2_subfix = '-a' + samtools_params = '-f 12 -F 256' + elif self.read_length == 'long': + minimap2_prefix = 'minimap2 -2 -ax map-hifi' + minimap2_subfix = '--no-pairing' + samtools_params = '-f 4 -F 256' + else: + raise ValueError(f'minimap2 prefix not set for {self.read_length}') + for count, mmi_db_path in enumerate(self.mmi_file_paths): if count == 0: # prime initial state with unfiltered file and create first of @@ -432,9 +445,10 @@ def _generate_mmi_filter_cmds(self, working_dir): input = tmp_file1 output = tmp_file2 - cmds.append(f"minimap2 -2 -ax sr{t_switch} -t {cores_to_allocate} " - f"{mmi_db_path} {input} -a | samtools fastq -@ " - f"{cores_to_allocate} -f 12 -F 256{tags} > " + cmds.append(f"{minimap2_prefix} -t {cores_to_allocate} " + f"{mmi_db_path} {input} {minimap2_subfix} | " + "samtools fastq -@ " + f"{cores_to_allocate} {samtools_params}{tags} > " f"{output}") # rename the latest tmp file to the final output filename. 
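Note on the long-read branch above: for read_length == 'long', _generate_mmi_filter_cmds renders a single-ended HiFi mapping (minimap2 -2 -ax map-hifi ... --no-pairing) followed by a samtools pass that keeps only unmapped, i.e. non-host, reads and drops secondary alignments (-f 4 -F 256), instead of the paired short-read form (-ax sr ... -f 12 -F 256). A rough sketch of one rendered command, assuming a single reference .mmi, 16 cores per task and no additional fastq tags (the paths and core count here are illustrative, taken from the test profile rather than a real run):

    minimap2 -2 -ax map-hifi -t 16 /databases/minimap2/db_1.mmi seqs.interleaved.fastq --no-pairing | \
        samtools fastq -@ 16 -f 4 -F 256 > seqs.interleaved.filter_alignment.fastq
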
diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json index 1e4a60ce..19168ba3 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -9,7 +9,7 @@ "queue": "qiita", "wallclock_time_in_minutes": 216, "modules_to_load": [], - "executable_path": "", + "executable_path": "bam2fastq", "per_process_memory_limit": "1gb" }, "nu-qc": { diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 9152c685..437689c9 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -77,8 +77,7 @@ def _inject_data(self, wf): 'Sample_Project': sp, 'Index': sample['index'], '# Reads': 2}) dname = f'{convert_dir}/{sp}' - Path(f'{dname}/{rp}_L001_R1_001.fastq.gz').touch() - Path(f'{dname}/{rp}_L001_R2_001.fastq.gz').touch() + Path(f'{dname}/{rp}_R1_001.fastq.gz').touch() # # NuQCJob # dname = f'{nuqc_dir}/filtered_sequences/{sp}' @@ -118,7 +117,7 @@ def test_pacbio_metagenomic_workflow_creation(self): self.assertEqual(wf.protocol_type, PROTOCOL_NAME_PACBIO_SMRT) self.assertEqual(wf.assay_type, ASSAY_NAME_METAGENOMIC) - # self._inject_data(wf) + self._inject_data(wf) # ConvertJob/ConvertJob.sh # Metagenomic is a valid data type in the default qiita test From 52f4c533e7f3613b2f5487079410110004764df8 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 13:38:20 -0600 Subject: [PATCH 12/42] pmls_extra_parameters --- src/sequence_processing_pipeline/NuQCJob.py | 61 +++++++++++-------- .../templates/nuqc_job.sh | 6 +- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index fabc42eb..a68331bd 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -83,6 +83,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.pmls_path = pmls_path self.additional_fastq_tags = additional_fastq_tags self.audit_folders = ['filtered_sequences'] + self.read_length = read_length # for projects that use sequence_processing_pipeline as a dependency, # jinja_env must be set to sequence_processing_pipeline's root path, @@ -492,6 +493,14 @@ def _generate_job_script(self, max_bucket_size): # files can be created. (${jobd}) mmi_filter_cmds = self._generate_mmi_filter_cmds("${jobd}") + if self.read_length == 'short': + pmls_extra_parameters = '' + elif self.read_length == 'long': + pmls_extra_parameters = '"max" 21' + else: + raise ValueError( + f'pmls_extra_parameters not set for {self.read_length}') + with open(job_script_path, mode="w", encoding="utf-8") as f: # the job resources should come from a configuration file @@ -499,31 +508,33 @@ def _generate_job_script(self, max_bucket_size): # processing begins. mtl = ' '.join(self.modules_to_load) - f.write(template.render(job_name=job_name, - queue_name=self.queue_name, - # should be 4 * 24 * 60 = 4 days - wall_time_limit=self.wall_time_limit, - mem_in_gb=self.jmem, - # Note NuQCJob now maps node_count to - # SLURM -N parameter to act like other - # Job classes. 
- # self.node_count should be 1 - node_count=self.node_count, - # cores-per-task (-c) should be 4 - cores_per_task=self.cores_per_task, - knwn_adpt_path=self.known_adapters_path, - output_path=self.output_path, - html_path=html_path, - json_path=json_path, - demux_path=demux_path, - temp_dir=self.temp_dir, - splitter_binary=splitter_binary, - modules_to_load=mtl, - length_limit=self.length_limit, - gres_value=self.gres_value, - movi_path=self.movi_path, - mmi_filter_cmds=mmi_filter_cmds, - pmls_path=self.pmls_path)) + f.write(template.render( + job_name=job_name, + queue_name=self.queue_name, + # should be 4 * 24 * 60 = 4 days + wall_time_limit=self.wall_time_limit, + mem_in_gb=self.jmem, + # Note NuQCJob now maps node_count to + # SLURM -N parameter to act like other + # Job classes. + # self.node_count should be 1 + node_count=self.node_count, + # cores-per-task (-c) should be 4 + cores_per_task=self.cores_per_task, + knwn_adpt_path=self.known_adapters_path, + output_path=self.output_path, + html_path=html_path, + json_path=json_path, + demux_path=demux_path, + temp_dir=self.temp_dir, + splitter_binary=splitter_binary, + modules_to_load=mtl, + length_limit=self.length_limit, + gres_value=self.gres_value, + movi_path=self.movi_path, + mmi_filter_cmds=mmi_filter_cmds, + pmls_extra_parameters=pmls_extra_parameters, + pmls_path=self.pmls_path)) return job_script_path diff --git a/src/sequence_processing_pipeline/templates/nuqc_job.sh b/src/sequence_processing_pipeline/templates/nuqc_job.sh index 7ada81f4..962bba24 100644 --- a/src/sequence_processing_pipeline/templates/nuqc_job.sh +++ b/src/sequence_processing_pipeline/templates/nuqc_job.sh @@ -125,10 +125,10 @@ function mux-runner () { --index /scratch/movi_hg38_chm13_hprc94 \ --read ${seq_reads_filter_alignment} \ --stdout | gzip > ${jobd}/seqs.movi.txt.gz - - python {{pmls_path}} <(zcat ${jobd}/seqs.movi.txt.gz) | \ + + python {{pmls_path}} <(zcat ${jobd}/seqs.movi.txt.gz) {{pmls_extra_parameters}} | \ seqtk subseq ${seq_reads_filter_alignment} - > ${jobd}/seqs.final.fastq - + {{splitter_binary}} ${jobd}/seqs.final.fastq \ ${jobd}/reads.r1.fastq ${delimiter} ${r1_tag} & {{splitter_binary}} ${jobd}/seqs.final.fastq \ From e18cf5df8fe59b714ae4d960bd3ea5c7313fef1b Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 14:07:33 -0600 Subject: [PATCH 13/42] rm index in _inject_data --- src/qp_klp/Protocol.py | 3 +- src/qp_klp/scripts/pacbio_commands.py | 5 +- src/sequence_processing_pipeline/Commands.py | 78 ++++++++++++------- .../ConvertJob.py | 8 +- src/sequence_processing_pipeline/NuQCJob.py | 6 +- tests/test_PacBioWorflow.py | 6 +- 6 files changed, 68 insertions(+), 38 deletions(-) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 1b0eb71e..a6ba180b 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -443,7 +443,7 @@ def generate_sequence_counts(self): for gzf in gz_files: cf = gzf.replace('.fastq.gz', '.counts.txt') - sn = basename(cf).replace('.counts.txt', '') + sn = basename(cf).replace('_R1.counts.txt', '') if not exists(cf): missing_files.append(sn) continue @@ -456,6 +456,7 @@ def generate_sequence_counts(self): df = pd.DataFrame(data) self.reports_path = join(self.pipeline.output_path, + 'ConvertJob', 'SeqCounts.csv') df.to_csv(self.reports_path, index=False) diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index 5671909e..37dd6048 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -17,7 +17,8 @@ 
@click.argument('sample_list', required=True) @click.argument('run_folder', required=True) @click.argument('outdir', required=True) -def generate_bam2fastq_commands(sample_list, run_folder, outdir): +@click.argument('threads', required=True, default=1) +def generate_bam2fastq_commands(sample_list, run_folder, outdir, threads): """Generates the bam2fastq commands""" df = pd.read_csv(sample_list, sep='\t') files = {f.split('.')[-2]: f @@ -36,7 +37,7 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir): od = f'{outdir}/{pn}' makedirs(od, exist_ok=True) fn = f'{od}/{sn}_R1' - cmd = (f'bam2fastq -j 1 -o {fn} -c 9 ' + cmd = (f'bam2fastq -j {threads} -o {fn} -c 9 ' f'{files[bc]}; ' f'fqtools count {fn}.fastq.gz > ' f'{fn}.counts.txt') diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index f2a196b2..cfa60c2c 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -6,12 +6,13 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, - batch_prefix): + batch_prefix, allow_fwd_only=False): '''Partitions input fastqs to coarse bins :param data_location_path: Path to the ConvertJob directory. :param max_file_list_size_in_gb: Upper threshold for file-size. :param batch_prefix: Path + file-name prefix for output-files. + :param allow_fwd_only: ignore rev match, helpful for long reads. :return: The number of output-files created, size of largest bin. ''' # to prevent issues w/filenames like the ones below from being mistaken @@ -37,31 +38,56 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, bucket_size = 0 max_bucket_size = 0 - for a, b in iter_paired_files(fastq_paths): - r1_size = os.stat(a).st_size - r2_size = os.stat(b).st_size - - output_base = os.path.dirname(a).split('/')[-1] - if current_size + r1_size > max_size: - # bucket is full. - if bucket_size > max_bucket_size: - max_bucket_size = bucket_size - - # reset bucket_size. - bucket_size = r1_size + r2_size - - if fp is not None: - fp.close() - - split_offset += 1 - current_size = r1_size - fp = open(batch_prefix + '-%d' % split_offset, 'w') - else: - # add to bucket_size - bucket_size += r1_size + r2_size - current_size += r1_size - - fp.write("%s\t%s\t%s\n" % (a, b, output_base)) + if allow_fwd_only: + for a in fastq_paths: + r1_size = os.stat(a).st_size + output_base = os.path.dirname(a).split('/')[-1] + if current_size + r1_size > max_size: + # bucket is full. + if bucket_size > max_bucket_size: + max_bucket_size = bucket_size + + # reset bucket_size. + bucket_size = r1_size + + if fp is not None: + fp.close() + + split_offset += 1 + current_size = r1_size + fp = open(batch_prefix + '-%d' % split_offset, 'w') + else: + # add to bucket_size + bucket_size += r1_size + current_size += r1_size + + fp.write("%s\t%s\n" % (a, output_base)) + else: + for a, b in iter_paired_files(fastq_paths): + r1_size = os.stat(a).st_size + r2_size = os.stat(b).st_size + + output_base = os.path.dirname(a).split('/')[-1] + if current_size + r1_size > max_size: + # bucket is full. + if bucket_size > max_bucket_size: + max_bucket_size = bucket_size + + # reset bucket_size. 
+ bucket_size = r1_size + r2_size + + if fp is not None: + fp.close() + + split_offset += 1 + current_size = r1_size + fp = open(batch_prefix + '-%d' % split_offset, 'w') + else: + # add to bucket_size + bucket_size += r1_size + r2_size + current_size += r1_size + + fp.write("%s\t%s\t%s\n" % (a, b, output_base)) if fp is not None: fp.close() diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index a752d67d..3772ae86 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -433,8 +433,10 @@ def _generate_job_script(self): Generate a Torque job script for processing supplied root_directory. :return: The path to the newly-created job-script. """ - lines = [] + threads_per_job = 2 + parallel_jobs = int(self.nprocs/threads_per_job) + lines = [] lines.append("#!/bin/bash") lines.append(f"#SBATCH --job-name {self.qiita_job_id}_{self.job_name}") lines.append(f"#SBATCH -p {self.queue_name}") @@ -463,8 +465,8 @@ def _generate_job_script(self): lines.append( f'pacbio_generate_bam2fastq_commands ../sample_list.tsv ' - f'{self.root_dir} {self.output_path} ' - f'| parallel -j {self.nprocs}') + f'{self.root_dir} {self.output_path} {threads_per_job}' + f'| parallel -j {parallel_jobs}') with open(self.job_script_path, 'w') as f: for line in lines: diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index a68331bd..7b9288da 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -239,9 +239,9 @@ def run(self, callback=None): batch_count = 0 max_size = 0 else: - batch_count, max_size = split_similar_size_bins(self.root_dir, - self.bucket_size, - batch_location) + batch_count, max_size = split_similar_size_bins( + self.root_dir, self.bucket_size, batch_location, + self.read_length == 'long') job_script_path = self._generate_job_script(max_size) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 437689c9..8ed636b0 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -73,9 +73,9 @@ def _inject_data(self, wf): rp = sample["Sample_ID"] sp = sample["Sample_Project"] dstats.append( - {'Lane': sample['Lane'], 'SampleID': rp, - 'Sample_Project': sp, 'Index': sample['index'], - '# Reads': 2}) + {'SampleID': rp, + 'Sample_Project': sp, + '# Reads': 2}) dname = f'{convert_dir}/{sp}' Path(f'{dname}/{rp}_R1_001.fastq.gz').touch() From 0734a6ef3c81d1c4f1b8faa321482968a49d5b9d Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 15:10:47 -0600 Subject: [PATCH 14/42] dstats --- tests/test_PacBioWorflow.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 8ed636b0..46327cfd 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -70,14 +70,11 @@ def _inject_data(self, wf): # # then loop over samples and stage all fastq.gz files dstats = [] for i, sample in enumerate(samples): - rp = sample["Sample_ID"] + sn = sample['Sample_Name'], sp = sample["Sample_Project"] - dstats.append( - {'SampleID': rp, - 'Sample_Project': sp, - '# Reads': 2}) + dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' - Path(f'{dname}/{rp}_R1_001.fastq.gz').touch() + Path(f'{dname}/{sn}_R1.fastq.gz').touch() # # NuQCJob # dname = f'{nuqc_dir}/filtered_sequences/{sp}' From 281bf7578968a7a9ad85d72f9b4a181f9360be8c Mon Sep 17 00:00:00 2001 
From: Antonio Gonzalez Date: Wed, 3 Sep 2025 16:42:32 -0600 Subject: [PATCH 15/42] nuqc_job_single.sh --- .../ConvertJob.py | 2 +- src/sequence_processing_pipeline/NuQCJob.py | 3 +- .../templates/nuqc_job_single.sh | 171 ++++++++++++++++++ .../pacbio/good_pacbio_metagv10.csv | 2 +- 4 files changed, 175 insertions(+), 3 deletions(-) create mode 100644 src/sequence_processing_pipeline/templates/nuqc_job_single.sh diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 3772ae86..4e99a8aa 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -466,7 +466,7 @@ def _generate_job_script(self): lines.append( f'pacbio_generate_bam2fastq_commands ../sample_list.tsv ' f'{self.root_dir} {self.output_path} {threads_per_job}' - f'| parallel -j {parallel_jobs}') + f' | parallel -j {parallel_jobs}') with open(self.job_script_path, 'w') as f: for line in lines: diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 7b9288da..540c4762 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -468,7 +468,6 @@ def _generate_job_script(self, max_bucket_size): return None job_script_path = join(self.output_path, 'process_all_fastq_files.sh') - template = self.jinja_env.get_template("nuqc_job.sh") job_name = f'{self.qiita_job_id}_{self.job_name}' @@ -495,8 +494,10 @@ def _generate_job_script(self, max_bucket_size): if self.read_length == 'short': pmls_extra_parameters = '' + template = self.jinja_env.get_template("nuqc_job.sh") elif self.read_length == 'long': pmls_extra_parameters = '"max" 21' + template = self.jinja_env.get_template("nuqc_job_single.sh") else: raise ValueError( f'pmls_extra_parameters not set for {self.read_length}') diff --git a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh new file mode 100644 index 00000000..dc991fb6 --- /dev/null +++ b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh @@ -0,0 +1,171 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH -p {{queue_name}} +### wall-time-limit in minutes +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +### Note cores_per_task maps to fastp & minimap2 thread counts +### as well as sbatch -c. demux threads remains fixed at 1. +### Note -c set to 4 and thread counts set to 7 during testing. +#SBATCH -c {{cores_per_task}} +### Commented out for now, but there is a possibility it will be needed +### in the future. +###SBATCH --gres=node_jobs:{{gres_value}} +#SBATCH --constraint="amd" + + +echo "---------------" +echo "Run details:" +echo "$SLURM_JOB_NAME $SLURM_JOB_ID $SLURMD_NODENAME $SLURM_ARRAY_TASK_ID" +echo "---------------" + +if [[ -z "${SLURM_ARRAY_TASK_ID}" ]]; then + echo "Not operating within an array" + exit 1 +fi +if [[ -z ${PREFIX} ]]; then + echo "PREFIX is not set" + exit 1 +fi +if [[ -z ${OUTPUT} ]]; then + echo "OUTPUT is not set" + exit 1 +fi + +conda activate qp-knight-lab-processing-2022.03 +module load {{modules_to_load}} + +set -x +set -e +set -o pipefail + +export FILES=$(printf "%s-%d" ${PREFIX} ${SLURM_ARRAY_TASK_ID}) +if [[ ! -f ${FILES} ]]; then + logger ${FILES} not found + exit 1 +fi +# set a temp directory, make a new unique one under it and +# make sure we clean up as we're dumping to shm +# DO NOT do this casually. 
Only do a clean up like this if +# you know for sure TMPDIR is what you want. + +WKDIR=${OUTPUT}/ +TMPDIR=${OUTPUT} +export TMPDIR=${TMPDIR} +export TMPDIR=$(mktemp -d) +echo $TMPDIR + +mkdir -p ${WKDIR}/fastp_reports_dir/html +mkdir -p ${WKDIR}/fastp_reports_dir/json + +export ADAPTER_ONLY_OUTPUT=${OUTPUT}/only-adapter-filtered +mkdir -p ${ADAPTER_ONLY_OUTPUT} + +function cleanup { + echo "Removing $TMPDIR" + rm -fr $TMPDIR + unset TMPDIR +} +trap cleanup EXIT + +export delimiter=::MUX:: +export r1_tag=/1 +function mux-runner () { + n=$(wc -l ${FILES} | cut -f 1 -d" ") + + jobd=${TMPDIR} + id_map=${jobd}/id_map + seqs_reads=${jobd}/seqs.interleaved.fastq + seq_reads_filter_alignment=${jobd}/seqs.interleaved.filter_alignment.fastq + + for i in $(seq 1 ${n}) + do + line=$(head -n ${i} ${FILES} | tail -n 1) + r1=$(echo ${line} | cut -f 1 -d" ") + base=$(echo ${line} | cut -f 2 -d" ") + r1_name=$(basename ${r1} .fastq.gz) + r_adapter_only=${ADAPTER_ONLY_OUTPUT}/${r1_name}.interleave.fastq.gz + + s_name=$(basename "${r1}" | sed -r 's/\.fastq\.gz//') + html_name=$(echo "$s_name.html") + json_name=$(echo "$s_name.json") + + echo -e "${i}\t${r1_name}\t${base}" >> ${id_map} + + # movi, in the current version, works on the interleaved version of the + # fwd/rev reads so we are gonna take advantage fastp default output + # to minimize steps. Additionally, movi expects the input to not be + # gz, so we are not going to compress seqs_r1 + + fastp \ + -l {{length_limit}} \ + -i ${r1} \ + -w {{cores_per_task}} \ + --adapter_fasta {{knwn_adpt_path}} \ + --html {{html_path}}/${html_name} \ + --json {{json_path}}/${json_name} \ + --stdout | gzip > ${r_adapter_only} + + # multiplex and write adapter filtered data all at once + zcat ${r_adapter_only} | \ + sed -r "1~4s/^@(.*)/@${i}${delimiter}\1/" \ + >> ${seqs_reads} + done + + # minimap/samtools pair commands are now generated in NuQCJob._generate_mmi_filter_cmds() + # and passed to this template. + {{mmi_filter_cmds}} + + {{movi_path}} query \ + --index /scratch/movi_hg38_chm13_hprc94 \ + --read ${seq_reads_filter_alignment} \ + --stdout | gzip > ${jobd}/seqs.movi.txt.gz + + python {{pmls_path}} <(zcat ${jobd}/seqs.movi.txt.gz) {{pmls_extra_parameters}} | \ + seqtk subseq ${seq_reads_filter_alignment} - > ${jobd}/seqs.final.fastq + + {{splitter_binary}} ${jobd}/seqs.final.fastq \ + ${jobd}/reads.r1.fastq ${delimiter} ${r1_tag} & + wait + + # keep seqs.movi.txt and migrate it to NuQCJob directory. + mv ${jobd}/seqs.movi.txt.gz {{output_path}}/logs/seqs.movi.${SLURM_ARRAY_TASK_ID}.txt.gz +} +export -f mux-runner + + +function demux-runner () { + n_demux_jobs=${SLURM_CPUS_PER_TASK} + jobd=${TMPDIR} + id_map=${jobd}/id_map + seqs_r1=${jobd}/reads.r1.fastq.paired.fq + + id_map=${jobd}/id_map + if [[ ! -f ${id_map} ]]; then + echo "No samples..." 
+ return + fi + + for idx in $(seq 0 ${n_demux_jobs}) + do + python {{demux_path}} \ + --id-map ${id_map} \ + --infile <(cat ${seqs_r1}) \ + --output ${OUTPUT} \ + --task ${idx} \ + --maxtask ${n_demux_jobs} & + done + wait +} +export -f demux-runner + +mux-runner + +mkdir -p ${OUTPUT} + +echo "$(date) :: demux start" +demux-runner +echo "$(date) :: demux stop" + +touch ${OUTPUT}/${SLURM_JOB_NAME}.${SLURM_ARRAY_TASK_ID}.completed diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv index f2407be1..786c4b1f 100644 --- a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -8,7 +8,7 @@ Assay,Metagenomic,,,,, Description,,,,,, ,,,,,, [Data],,,,,, -Sample_ID,Sample_Name,Sample_Plate,well_id_384,Sample_Project,Well_description,Lane +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 From 1aa102cd0befe70c467414f7da2524acc17688a8 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 17:13:25 -0600 Subject: [PATCH 16/42] rm extra , --- tests/test_PacBioWorflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 46327cfd..fd35c159 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -69,8 +69,8 @@ def _inject_data(self, wf): # # then loop over samples and stage all fastq.gz files dstats = [] - for i, sample in enumerate(samples): - sn = sample['Sample_Name'], + for sample in samples: + sn = sample['Sample_Name'] sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' From 01c862de03fe5b3373adb2a8f6c66ff64ed8f752 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 03:10:50 -0600 Subject: [PATCH 17/42] print Profile selected --- src/sequence_processing_pipeline/Pipeline.py | 2 ++ tests/test_PacBioWorflow.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index cf0a83cd..667bd6d8 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -445,6 +445,8 @@ def _configure_profile(self): f"{self.assay_type}) was not found. 
Please notify" " an administrator") + print(f'Profile selected: {selected_profile}') + self.config_profile = selected_profile def _directory_check(self, directory_path, create=False): diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index fd35c159..3cddfbc1 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -74,6 +74,7 @@ def _inject_data(self, wf): sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' + makedirs(dname, exist_ok=True) Path(f'{dname}/{sn}_R1.fastq.gz').touch() # # NuQCJob From 05036b71c8c5029b05b27d9bad48d7b7c79c1a92 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 08:28:15 -0600 Subject: [PATCH 18/42] counts.txt in _inject_data --- .../pacbio_metagenomic.json | 17 +++++++++++------ tests/test_PacBioWorflow.py | 2 ++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json index 19168ba3..0981ffc7 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -16,25 +16,30 @@ "nodes": 1, "cpus_per_task": 8, "queue": "qiita", - "wallclock_time_in_minutes": 240, + "wallclock_time_in_minutes": 2440, "minimap2_databases": [ "/databases/minimap2/db_1.mmi", "/databases/minimap2/db_2.mmi" ], - "modules_to_load": ["fastp_0.20.1", "samtools_1.12", "minimap2_2.18"], + "modules_to_load": [ + "gcc_9.3.0", + "fastp_0.20.1", + "samtools_1.12", + "minimap2_2.28" + ], "fastp_executable_path": "fastp", "minimap2_executable_path": "minimap2", "samtools_executable_path": "samtools", - "job_total_memory_limit": "20", + "job_total_memory_limit": "110", "job_max_array_length": 1000, "known_adapters_path": "fastp_known_adapters_formatted.fna", - "bucket_size": 8, + "bucket_size": 18, "length_limit": 100, - "cores_per_task": 4, + "cores_per_task": 16, "movi_executable_path": "/home/user/user_dir/Movi/build/movi-default", "gres_value": 4, "pmls_path": "/home/user/user_dir/human_host_filtration/scripts/qiita_filter_pmls.py", - "additional_fastq_tags": ["BX"] + "additional_fastq_tags": [] }, "seqpro": { "seqpro_path": "seqpro", diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 3cddfbc1..9c355e6e 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -76,6 +76,8 @@ def _inject_data(self, wf): dname = f'{convert_dir}/{sp}' makedirs(dname, exist_ok=True) Path(f'{dname}/{sn}_R1.fastq.gz').touch() + with open(f'{dname}/{sn}_R1.counts.txt', 'w') as f: + f.write("2") # # NuQCJob # dname = f'{nuqc_dir}/filtered_sequences/{sp}' From d5adee0ebb2544fe97b1901142199a3bbbd7fdb9 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 10:00:11 -0600 Subject: [PATCH 19/42] demux_just_fwd --- pyproject.toml | 1 + src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/sequence_processing_pipeline/Commands.py | 79 +++++++++++++++++++ .../templates/nuqc_job_single.sh | 2 +- .../pacbio_metagenomic.json | 2 +- 5 files changed, 83 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 64a74943..3ee2658b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,4 +59,5 @@ dependencies = [ configure_klp = "qp_klp.scripts.configure_klp:config" start_klp = "qp_klp.scripts.start_klp:execute" demux = "sequence_processing_pipeline.scripts.cli:demux" +demux_single = "sequence_processing_pipeline.scripts.cli:demux_single" 
pacbio_generate_bam2fastq_commands = "qp_klp.scripts.pacbio_commands:generate_bam2fastq_commands" diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index aae032f8..a1815fff 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -4,6 +4,7 @@ from .Assays import ASSAY_NAME_METAGENOMIC from .FailedSamplesRecord import FailedSamplesRecord from .Workflows import Workflow +import pandas as pd class PacBioMetagenomicWorkflow(Workflow, Metagenomic, PacBio): @@ -40,7 +41,6 @@ def __init__(self, **kwargs): self.fsr = FailedSamplesRecord(self.kwargs['output_dir'], self.pipeline.sample_sheet.samples) - import pandas as pd samples = [ {'barcode': sample['Sample_ID'], 'sample_name': sample['Sample_Name'], diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index cfa60c2c..68effc2e 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -192,3 +192,82 @@ def demux(id_map, fp, out_d, task, maxtask): for d in openfps.values(): for f in d.values(): f.close() + + +def demux_just_fwd(id_map, fp, out_d, task, maxtask): + """Split infile data based in provided map""" + delimiter = '::MUX::' + mode = 'wt' + ext = '.fastq.gz' + sep = '/' + rec = '@' + + openfps = {} + + for offset, (idx, r1, outbase) in enumerate(id_map): + if offset % maxtask == task: + idx = rec + idx + + # setup output locations + outdir = out_d + sep + outbase + fullname_r1 = outdir + sep + r1 + ext + + # we have seen in lustre that sometime this line + # can have a raise condition; making sure it doesn't break + # things + try: + os.makedirs(outdir, exist_ok=True) + except FileExistsError: + pass + current_fp_r1 = gzip.open(fullname_r1, mode) + current_fp = {'1': current_fp_r1} + openfps[idx] = current_fp + + # setup a parser + seq_id = iter(fp) + seq = iter(fp) + dumb = iter(fp) + qual = iter(fp) + + for i, s, d, q in zip(seq_id, seq, dumb, qual): + # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1' + # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1 BX:Z:TATGACACATGCGGCCCT' # noqa + # '@baz/1 + + # NB: from 6d794a37-12cd-4f8e-95d6-72a4b8a1ec1c's only-adapter-filtered results: # noqa + # @A00953:244:HYHYWDSXY:3:1101:14082:3740 1:N:0:CCGTAAGA+TCTAACGC + + fname_encoded, sid = i.split(delimiter, 1) + + if fname_encoded not in openfps: + continue + + current_fp = openfps[fname_encoded] + + # remove '\n' from sid and split on all whitespace. + tmp = sid.strip().split() + + if len(tmp) == 1: + # sequence id line contains no optional metadata. + # don't change sid. 
+ # -1 is \n + orientation = sid[-2] + sid = rec + sid + elif len(tmp) == 2: + sid = tmp[0] + metadata = tmp[1] + # no '\n' + orientation = sid[-1] + # hexdump confirms separator is ' ', not '\t' + sid = rec + sid + ' ' + metadata + '\n' + else: + raise ValueError(f"'{sid}' is not a recognized form") + + current_fp[orientation].write(sid) + current_fp[orientation].write(s) + current_fp[orientation].write(d) + current_fp[orientation].write(q) + + for d in openfps.values(): + for f in d.values(): + f.close() diff --git a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh index dc991fb6..1ad6db54 100644 --- a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh +++ b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh @@ -149,7 +149,7 @@ function demux-runner () { for idx in $(seq 0 ${n_demux_jobs}) do - python {{demux_path}} \ + demux_just_fwd \ --id-map ${id_map} \ --infile <(cat ${seqs_r1}) \ --output ${OUTPUT} \ diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json index 0981ffc7..6cd9fc95 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -51,7 +51,7 @@ "queue": "qiita", "nthreads": 16, "wallclock_time_in_minutes": 60, - "modules_to_load": ["fastqc_0.11.5"], + "modules_to_load": ["fastqc_0.12.1"], "fastqc_executable_path": "fastqc", "multiqc_executable_path": "multiqc", "multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml", From 2fcfc28981b4a69841c955071bd2bab4870a10fa Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 13:51:04 -0600 Subject: [PATCH 20/42] demux_single -? demux_just_fwd --- pyproject.toml | 2 +- src/sequence_processing_pipeline/Commands.py | 13 ++++++++++++- .../templates/nuqc_job_single.sh | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3ee2658b..36fa6ca9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,5 @@ dependencies = [ configure_klp = "qp_klp.scripts.configure_klp:config" start_klp = "qp_klp.scripts.start_klp:execute" demux = "sequence_processing_pipeline.scripts.cli:demux" -demux_single = "sequence_processing_pipeline.scripts.cli:demux_single" +demux_just_fwd = "sequence_processing_pipeline.scripts.cli:demux_just_fwd" pacbio_generate_bam2fastq_commands = "qp_klp.scripts.pacbio_commands:generate_bam2fastq_commands" diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index 68effc2e..19fc455e 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -194,7 +194,18 @@ def demux(id_map, fp, out_d, task, maxtask): f.close() -def demux_just_fwd(id_map, fp, out_d, task, maxtask): +def demux_just_fwd(id_map_fp, fp_fp, out_d, task, maxtask): + with open(id_map_fp, 'r') as f: + id_map = f.readlines() + id_map = [line.strip().split('\t') for line in id_map] + + # fp needs to be an open file handle. + # ensure task and maxtask are proper ints when coming from cmd-line. 
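    # (Illustrative values, not from a real run.) Each id_map row is
    # tab-separated as written by nuqc_job_single.sh
    # ("<mux index>\t<R1 basename>\t<output base>"), e.g.:
    #     1	3A_R1	MyProject_6123
    # demux_just_fwd_processing would route that row's reads to
    # <output>/MyProject_6123/3A_R1.fastq.gz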
+ with open(fp_fp, 'r') as fp: + demux_just_fwd_processing(id_map, fp, out_d, int(task), int(maxtask)) + + +def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): """Split infile data based in provided map""" delimiter = '::MUX::' mode = 'wt' diff --git a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh index 1ad6db54..f223e63d 100644 --- a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh +++ b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh @@ -33,7 +33,7 @@ if [[ -z ${OUTPUT} ]]; then exit 1 fi -conda activate qp-knight-lab-processing-2022.03 +conda activate klp-2025-05 module load {{modules_to_load}} set -x From 27f49d104214a8a050be41581703be3f97d01580 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 14:06:23 -0600 Subject: [PATCH 21/42] add cli for demux_just_fwd --- pyproject.toml | 2 +- src/sequence_processing_pipeline/Commands.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 36fa6ca9..09a38ea5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,5 @@ dependencies = [ configure_klp = "qp_klp.scripts.configure_klp:config" start_klp = "qp_klp.scripts.start_klp:execute" demux = "sequence_processing_pipeline.scripts.cli:demux" -demux_just_fwd = "sequence_processing_pipeline.scripts.cli:demux_just_fwd" +demux_just_fwd = "sequence_processing_pipeline.Commands:demux_just_fwd" pacbio_generate_bam2fastq_commands = "qp_klp.scripts.pacbio_commands:generate_bam2fastq_commands" diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index 19fc455e..78f04818 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -1,6 +1,7 @@ import glob import gzip import os +import click from sequence_processing_pipeline.util import (iter_paired_files, determine_orientation) @@ -194,6 +195,17 @@ def demux(id_map, fp, out_d, task, maxtask): f.close() +@click.group() +def cli(): + pass + + +@cli.command() +@click.option('--id-map', type=click.Path(exists=True), required=True) +@click.option('--infile', type=click.Path(exists=True), required=True) +@click.option('--output', type=click.Path(exists=True), required=True) +@click.option('--task', type=int, required=True) +@click.option('--maxtask', type=int, required=True) def demux_just_fwd(id_map_fp, fp_fp, out_d, task, maxtask): with open(id_map_fp, 'r') as f: id_map = f.readlines() From dea4fd948206697d339699dafe9b241dd212bfb8 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 5 Sep 2025 07:45:53 -0600 Subject: [PATCH 22/42] fix demux_just_fwd params --- src/sequence_processing_pipeline/Commands.py | 8 +-- src/sequence_processing_pipeline/Pipeline.py | 2 - tests/test_PacBioWorflow.py | 69 ++++++++++---------- 3 files changed, 38 insertions(+), 41 deletions(-) diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index 78f04818..5d574c1a 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -206,15 +206,15 @@ def cli(): @click.option('--output', type=click.Path(exists=True), required=True) @click.option('--task', type=int, required=True) @click.option('--maxtask', type=int, required=True) -def demux_just_fwd(id_map_fp, fp_fp, out_d, task, maxtask): - with open(id_map_fp, 'r') as f: +def demux_just_fwd(id_map, infile, output, task, 
maxtask): + with open(id_map, 'r') as f: id_map = f.readlines() id_map = [line.strip().split('\t') for line in id_map] # fp needs to be an open file handle. # ensure task and maxtask are proper ints when coming from cmd-line. - with open(fp_fp, 'r') as fp: - demux_just_fwd_processing(id_map, fp, out_d, int(task), int(maxtask)) + with open(infile, 'r') as fp: + demux_just_fwd_processing(id_map, fp, output, int(task), int(maxtask)) def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index 667bd6d8..cf0a83cd 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -445,8 +445,6 @@ def _configure_profile(self): f"{self.assay_type}) was not found. Please notify" " an administrator") - print(f'Profile selected: {selected_profile}') - self.config_profile = selected_profile def _directory_check(self, directory_path, create=False): diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 9c355e6e..6234665f 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -9,13 +9,14 @@ from unittest import main from os import makedirs from os.path import dirname, abspath, join, exists -from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) -from qp_klp.Assays import ASSAY_NAME_METAGENOMIC +from shutil import copyfile from shutil import rmtree from pathlib import Path -from qiita_client.testing import PluginTestCase -from sequence_processing_pipeline.PipelineError import PipelineError import pandas as pd +from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) +from sequence_processing_pipeline.PipelineError import PipelineError +from qp_klp.Assays import ASSAY_NAME_METAGENOMIC +from qiita_client.testing import PluginTestCase class WorkflowFactoryTests(PluginTestCase): @@ -49,23 +50,22 @@ def _inject_data(self, wf): makedirs(convert_dir, exist_ok=True) makedirs(reports_dir, exist_ok=True) Path(f'{convert_dir}/job_completed').touch() - # tellread_dir = f'{self.output_dir}/TellReadJob' - # nuqc_dir = f'{self.output_dir}/NuQCJob' - # fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' - # multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' - # genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' - # '211021_A00000_0000_SAMPLE/') - # makedirs(nuqc_dir, exist_ok=True) - # makedirs(fastqc_dir, exist_ok=True) - # makedirs(multiqc_dir, exist_ok=True) - # makedirs(genprep_dir, exist_ok=True) - # # now let's create the required project folders - # for project in wf.pipeline.get_project_info(): - # sp = project['project_name'] - # makedirs(f'{convert_dir}/{sp}', exist_ok=True) - # makedirs(f'{nuqc_dir}/filtered_sequences/{sp}', exist_ok=True) - # makedirs(f'{genprep_dir}/{sp}/filtered_sequences/', - # exist_ok=True) + nuqc_dir = f'{self.output_dir}/NuQCJob' + fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' + multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' + genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' + '211021_A00000_0000_SAMPLE/') + makedirs(nuqc_dir, exist_ok=True) + makedirs(fastqc_dir, exist_ok=True) + makedirs(multiqc_dir, exist_ok=True) + makedirs(genprep_dir, exist_ok=True) + # now let's create the required project folders + for project in wf.pipeline.get_project_info(): + sp = project['project_name'] + makedirs(f'{convert_dir}/{sp}', exist_ok=True) + makedirs(f'{nuqc_dir}/filtered_sequences/{sp}', exist_ok=True) + makedirs(f'{genprep_dir}/{sp}/filtered_sequences/', + 
exist_ok=True) # # then loop over samples and stage all fastq.gz files dstats = [] @@ -74,28 +74,25 @@ def _inject_data(self, wf): sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' - makedirs(dname, exist_ok=True) Path(f'{dname}/{sn}_R1.fastq.gz').touch() with open(f'{dname}/{sn}_R1.counts.txt', 'w') as f: f.write("2") - # # NuQCJob - # dname = f'{nuqc_dir}/filtered_sequences/{sp}' - # copyfile(self.gz_source, f'{dname}/{rp}_L001_R1_001.fastq.gz') - # copyfile(self.gz_source, f'{dname}/{rp}_L001_R2_001.fastq.gz') + # NuQCJob + dname = f'{nuqc_dir}/filtered_sequences/{sp}' + copyfile(self.gz_source, f'{dname}/{sn}_R1.fastq.gz') - # # GenPrepFileJob - # gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{rp}' - # Path(f'{gprep_base}_L001_R1_001.fastq.gz').touch() - # Path(f'{gprep_base}_L001_R2_001.fastq.gz').touch() + # GenPrepFileJob + gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{sn}' + Path(f'{gprep_base}_R1.fastq.gz').touch() pd.DataFrame(dstats).set_index('SampleID').to_csv( f'{reports_dir}/Demultiplex_Stats.csv') - # # generating the "*.completed" files - # for i in range(len(samples)*3): - # Path(f'{fastqc_dir}/FastQCJob_{i}.completed').touch() - # Path(f'{multiqc_dir}/MultiQCJob_{i}.completed').touch() + # generating the "*.completed" files + for i in range(len(samples)*3): + Path(f'{fastqc_dir}/FastQCJob_{i}.completed').touch() + Path(f'{multiqc_dir}/MultiQCJob_{i}.completed').touch() def test_pacbio_metagenomic_workflow_creation(self): kwargs = {"uif_path": "tests/data/sample-sheets/metagenomic/" @@ -118,7 +115,9 @@ def test_pacbio_metagenomic_workflow_creation(self): self.assertEqual(wf.assay_type, ASSAY_NAME_METAGENOMIC) self._inject_data(wf) - # ConvertJob/ConvertJob.sh + + # we can add some checks/tests of the initial scripts (mainly Convert) + # but not doing now as is not required # Metagenomic is a valid data type in the default qiita test # database but job-id: 78901 doesn't exist; however, if we get From e596ade624a382bc9f2a409cecc27e23c77cded5 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 5 Sep 2025 11:22:23 -0600 Subject: [PATCH 23/42] rm demux_just_fwd_processing splitter_binary --- src/sequence_processing_pipeline/Commands.py | 22 +++---------------- .../templates/nuqc_job_single.sh | 6 +---- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index 5d574c1a..d8f45496 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -252,6 +252,9 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): dumb = iter(fp) qual = iter(fp) + # there is only fwd so the orientation is always '1' + orientation = '1' + for i, s, d, q in zip(seq_id, seq, dumb, qual): # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1' # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1 BX:Z:TATGACACATGCGGCCCT' # noqa @@ -267,25 +270,6 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): current_fp = openfps[fname_encoded] - # remove '\n' from sid and split on all whitespace. - tmp = sid.strip().split() - - if len(tmp) == 1: - # sequence id line contains no optional metadata. - # don't change sid. 
- # -1 is \n - orientation = sid[-2] - sid = rec + sid - elif len(tmp) == 2: - sid = tmp[0] - metadata = tmp[1] - # no '\n' - orientation = sid[-1] - # hexdump confirms separator is ' ', not '\t' - sid = rec + sid + ' ' + metadata + '\n' - else: - raise ValueError(f"'{sid}' is not a recognized form") - current_fp[orientation].write(sid) current_fp[orientation].write(s) current_fp[orientation].write(d) diff --git a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh index f223e63d..d06ac833 100644 --- a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh +++ b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh @@ -125,10 +125,6 @@ function mux-runner () { python {{pmls_path}} <(zcat ${jobd}/seqs.movi.txt.gz) {{pmls_extra_parameters}} | \ seqtk subseq ${seq_reads_filter_alignment} - > ${jobd}/seqs.final.fastq - {{splitter_binary}} ${jobd}/seqs.final.fastq \ - ${jobd}/reads.r1.fastq ${delimiter} ${r1_tag} & - wait - # keep seqs.movi.txt and migrate it to NuQCJob directory. mv ${jobd}/seqs.movi.txt.gz {{output_path}}/logs/seqs.movi.${SLURM_ARRAY_TASK_ID}.txt.gz } @@ -139,7 +135,7 @@ function demux-runner () { n_demux_jobs=${SLURM_CPUS_PER_TASK} jobd=${TMPDIR} id_map=${jobd}/id_map - seqs_r1=${jobd}/reads.r1.fastq.paired.fq + seqs_r1=${jobd}/seqs.final.fastq id_map=${jobd}/id_map if [[ ! -f ${id_map} ]]; then From b27e7d6f9a38651d2085e1501b4ee24346993786 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 5 Sep 2025 13:39:54 -0600 Subject: [PATCH 24/42] self.files_regex = long_read --- src/qp_klp/Assays.py | 1 + src/qp_klp/PacBioMetagenomicWorkflow.py | 5 +---- src/sequence_processing_pipeline/util.py | 10 ++++++++-- tests/test_PacBioWorflow.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/qp_klp/Assays.py b/src/qp_klp/Assays.py index e2fc0ede..7ed35cf3 100644 --- a/src/qp_klp/Assays.py +++ b/src/qp_klp/Assays.py @@ -494,6 +494,7 @@ def qc_reads(self): # is not defined, just fallback to the SPP expected default regex if not hasattr(self, 'files_regex'): self.files_regex = 'SPP' + # base quality control used by multiple Assay types. 
job = NuQCJob(self.raw_fastq_files_path, self.pipeline.output_path, diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index a1815fff..c8fb011c 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -26,10 +26,7 @@ def __init__(self, **kwargs): if 'overwrite_prep_with_original' in self.kwargs: self.overwrite_prep_with_original = \ self.kwargs['overwrite_prep_with_original'] - if 'files_regex' in self.kwargs: - self.files_regex = self.kwargs['files_regex'] - else: - self.files_regex = 'SPP' + self.files_regex = 'long_read' self.pipeline = Pipeline(self.kwargs['config_fp'], self.kwargs['run_identifier'], self.kwargs['uif_path'], diff --git a/src/sequence_processing_pipeline/util.py b/src/sequence_processing_pipeline/util.py index f38b38ee..b07b55fa 100644 --- a/src/sequence_processing_pipeline/util.py +++ b/src/sequence_processing_pipeline/util.py @@ -4,8 +4,8 @@ PAIR_UNDERSCORE = (REC(r'_R1_'), '_R1_', '_R2_') PAIR_DOT = (REC(r'\.R1\.'), '.R1.', '.R2.') -SIMPLE_PAIR_UNDERSCAOTE = (REC(r'_R1'), '_R1', '_R2') -PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT, SIMPLE_PAIR_UNDERSCAOTE) +SIMPLE_PAIR_UNDERSCORE = (REC(r'_R1'), '_R1', '_R2') +PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT, SIMPLE_PAIR_UNDERSCORE) FILES_REGEX = { 'SPP': { 'fastq': REC(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.fastq\.gz$'), @@ -32,6 +32,12 @@ 'html': REC(r'^(.*).html$'), 'json': REC(r'^(.*).json$') }, + 'long_read': { + 'fastq': REC(r'^(.*)_R1.fastq\.gz$'), + 'interleave_fastq': REC(r'^(.*)_R1.interleave\.fastq\.gz$'), + 'html': REC(r'^(.*)_R1.html$'), + 'json': REC(r'^(.*)_R1.json$') + }, } diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 6234665f..7429f8df 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -54,7 +54,7 @@ def _inject_data(self, wf): fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' - '211021_A00000_0000_SAMPLE/') + 'r11111_20250101_111111/') makedirs(nuqc_dir, exist_ok=True) makedirs(fastqc_dir, exist_ok=True) makedirs(multiqc_dir, exist_ok=True) From 166651667e2e5efde1874d8ff3e999272e00b85c Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Sat, 6 Sep 2025 09:31:41 -0600 Subject: [PATCH 25/42] sample_id_column --- src/sequence_processing_pipeline/NuQCJob.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 540c4762..fdf7d5b2 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -379,8 +379,15 @@ def _process_sample_sheet(self): raise PipelineError(s) sample_ids = [] + # long_reads use the Sample_Name as the main sample identifier, + # everything else uses Sample_ID; we might want to change this + # in the future for everything to use Sample_Name + sample_id_column = 'Sample_ID' + if self.read_length == 'long': + sample_id_column = 'Sample_Name' for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + sample_ids.append( + (sample[sample_id_column], sample['Sample_Project'])) bioinformatics = sheet.Bioinformatics From 0141440d89c94fbb51a89936ec9911463facf245 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Sat, 6 Sep 2025 10:22:52 -0600 Subject: [PATCH 26/42] mv self.read_length = read_length up --- src/sequence_processing_pipeline/NuQCJob.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index fdf7d5b2..49f71179 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -61,6 +61,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, [fastp_path, minimap2_path, samtools_path], max_array_length, modules_to_load=modules_to_load) + self.read_length = read_length self.sample_sheet_path = sample_sheet_path self._file_check(self.sample_sheet_path) metadata = self._process_sample_sheet() @@ -83,7 +84,6 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.pmls_path = pmls_path self.additional_fastq_tags = additional_fastq_tags self.audit_folders = ['filtered_sequences'] - self.read_length = read_length # for projects that use sequence_processing_pipeline as a dependency, # jinja_env must be set to sequence_processing_pipeline's root path, From 2b5700fba9fe4feef27c9987110ca6bb446cf27f Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Sat, 6 Sep 2025 12:49:55 -0600 Subject: [PATCH 27/42] _filter_empty_fastq_files --- src/sequence_processing_pipeline/NuQCJob.py | 25 +++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 49f71179..01242718 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -156,15 +156,22 @@ def _filter_empty_fastq_files(self, filtered_directory, files = glob(join(filtered_directory, f'*.{self.suffix}')) - for r1, r2 in iter_paired_files(files): - full_path = join(filtered_directory, r1) - full_path_reverse = join(filtered_directory, r2) - if stat(full_path).st_size <= minimum_bytes or stat( - full_path_reverse).st_size <= minimum_bytes: - logging.debug(f'moving {full_path} and {full_path_reverse}' - f' to empty list.') - empty_list.append(full_path) - empty_list.append(full_path_reverse) + if self.read_length == 'long': + for r1 in files: + full_path = join(filtered_directory, r1) + if stat(full_path).st_size <= minimum_bytes: + logging.debug(f'moving {full_path} to empty list.') + empty_list.append(full_path) + else: + for r1, r2 in iter_paired_files(files): + full_path = join(filtered_directory, r1) + full_path_reverse = join(filtered_directory, r2) + if stat(full_path).st_size <= minimum_bytes or stat( + full_path_reverse).st_size <= minimum_bytes: + logging.debug(f'moving {full_path} and {full_path_reverse}' + f' to empty list.') + empty_list.append(full_path) + empty_list.append(full_path_reverse) if empty_list: logging.debug(f'making directory {empty_files_directory}') From cc29d39d8805d59b3d4503b1001f0dad998436ca Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Sat, 6 Sep 2025 20:31:23 -0600 Subject: [PATCH 28/42] zip_longest --- src/sequence_processing_pipeline/FastQCJob.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/sequence_processing_pipeline/FastQCJob.py b/src/sequence_processing_pipeline/FastQCJob.py index 9d34cb04..27a1c7c4 100644 --- a/src/sequence_processing_pipeline/FastQCJob.py +++ b/src/sequence_processing_pipeline/FastQCJob.py @@ -2,6 +2,7 @@ from jinja2 import Environment from json import dumps import logging +from itertools import zip_longest from os import listdir, makedirs from os.path import join, basename from re import sub @@ -156,15 +157,16 @@ def _scan_fastq_files(self, 
is_raw_input=False): # gather a list of names from the outputs of previous jobs. project_names = [] - for proj_name, fltr_type, fastq_fp, fwd_files, rev_files in projects: + for proj_name, fltr_type, _, fwd_files, rev_files in projects: project_names.append(proj_name) p_path = partial(join, self.output_path, 'fastqc', proj_name) output_dir = p_path('bclconvert' if is_raw_input else fltr_type) makedirs(output_dir, exist_ok=True) - for some_fwd_file, some_rev_file in zip(fwd_files, rev_files): - fastqc_results.append((some_fwd_file, some_rev_file, - output_dir)) + for sff, srf in zip_longest(fwd_files, rev_files): + if srf is None: + srf = '' + fastqc_results.append((sff, srf, output_dir)) # remove duplicates project_names = list(set(project_names)) From 3a94746e6cffa950b7ce2052ecfd093a143d20f6 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Mon, 8 Sep 2025 07:25:27 -0600 Subject: [PATCH 29/42] raw_reads_r1r2 --- src/qp_klp/Protocol.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index a6ba180b..e3c8f972 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -449,7 +449,9 @@ def generate_sequence_counts(self): continue with open(cf, 'r') as fh: counts = fh.read().strip() - data.append({'SampleID': sn, '# Reads': counts}) + data.append({'Sample_ID': sn, + 'raw_reads_r1r2': counts, + 'Lane': self.lane_number}) if missing_files: raise ValueError(f'Missing count files: {missing_files}') From 6e12dd5b93ed3b1dcaf9f2674d67f5ebff62acc4 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 9 Sep 2025 10:07:52 -0600 Subject: [PATCH 30/42] fastq.gz -> trimmed.fastq.gz --- tests/test_PacBioWorflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 7429f8df..9183e01c 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -80,11 +80,11 @@ def _inject_data(self, wf): # NuQCJob dname = f'{nuqc_dir}/filtered_sequences/{sp}' - copyfile(self.gz_source, f'{dname}/{sn}_R1.fastq.gz') + copyfile(self.gz_source, f'{dname}/{sn}_R1.trimmed.fastq.gz') # GenPrepFileJob gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{sn}' - Path(f'{gprep_base}_R1.fastq.gz').touch() + Path(f'{gprep_base}_R1.trimmed.fastq.gz').touch() pd.DataFrame(dstats).set_index('SampleID').to_csv( f'{reports_dir}/Demultiplex_Stats.csv') From b2a308efa15f2b033dced3b3d5b71c49cb544a3b Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 9 Sep 2025 16:15:23 -0600 Subject: [PATCH 31/42] barcode --- src/qp_klp/PacBioMetagenomicWorkflow.py | 3 ++- src/qp_klp/scripts/pacbio_commands.py | 3 ++- src/sequence_processing_pipeline/NuQCJob.py | 8 +------- .../metagenomic/pacbio/good_pacbio_metagv10.csv | 8 ++++---- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index c8fb011c..580566f3 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -41,7 +41,8 @@ def __init__(self, **kwargs): samples = [ {'barcode': sample['Sample_ID'], 'sample_name': sample['Sample_Name'], - 'project_name': sample['Sample_Project']} + 'project_name': sample['Sample_Project'], + 'lane': sample['Lane']} for sample in self.pipeline.sample_sheet.samples] df = pd.DataFrame(samples) sample_list_fp = f"{self.kwargs['output_dir']}/sample_list.tsv" diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index 
37dd6048..1a4c2af3 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -31,12 +31,13 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir, threads): bc = row['barcode'] sn = row['sample_name'] pn = row['project_name'] + lane = row['lane'] if bc not in files: missing_files.append(bc) continue od = f'{outdir}/{pn}' makedirs(od, exist_ok=True) - fn = f'{od}/{sn}_R1' + fn = f'{od}/{sn}_L00{lane}_R1_001' cmd = (f'bam2fastq -j {threads} -o {fn} -c 9 ' f'{files[bc]}; ' f'fqtools count {fn}.fastq.gz > ' diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 01242718..8fa693c4 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -386,15 +386,9 @@ def _process_sample_sheet(self): raise PipelineError(s) sample_ids = [] - # long_reads use the Sample_Name as the main sample identifier, - # everything else uses Sample_ID; we might want to change this - # in the future for everything to use Sample_Name - sample_id_column = 'Sample_ID' - if self.read_length == 'long': - sample_id_column = 'Sample_Name' for sample in sheet.samples: sample_ids.append( - (sample[sample_id_column], sample['Sample_Project'])) + (sample['Sample_ID'], sample['Sample_Project'])) bioinformatics = sheet.Bioinformatics diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv index 786c4b1f..de0bc699 100644 --- a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -8,10 +8,10 @@ Assay,Metagenomic,,,,, Description,,,,,, ,,,,,, [Data],,,,,, -Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane -bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 -bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 -bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 +Sample_ID,barcode_id,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane +3A,bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 +4A,bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 +5B,bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 ,,,,,, [Bioinformatics],,,,,, Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates, From 1da1abb95c58455adb9d5ddf5c18ff2f2fa28f6d Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 9 Sep 2025 17:04:51 -0600 Subject: [PATCH 32/42] barcode -> barcode_id --- src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- .../pacbio/good_pacbio_metagv10.csv | 54 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index 580566f3..f9f12cf4 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -39,7 +39,7 @@ def __init__(self, **kwargs): self.pipeline.sample_sheet.samples) samples = [ - {'barcode': sample['Sample_ID'], + {'barcode': sample['barcode_id'], 'sample_name': sample['Sample_Name'], 'project_name': sample['Sample_Project'], 'lane': sample['Lane']} diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv index de0bc699..b2134a64 100644 --- 
a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -1,27 +1,27 @@ -[Header],,,,,, -SheetType,pacbio_metag,,,,, -SheetVersion,10,,,,, -Investigator Name,Knight,,,,, -Experiment Name,RKL_experiment,,,,, -Date,2025-08-29,,,,, -Assay,Metagenomic,,,,, -Description,,,,,, -,,,,,, -[Data],,,,,, -Sample_ID,barcode_id,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane -3A,bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 -4A,bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 -5B,bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 -,,,,,, -[Bioinformatics],,,,,, -Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates, -MyProject_6123,6123,False,some protocol,some description,False, -,,,,,, -[Contact],,,,,, -Sample_Project,Email,,,,, -MyProject_6123,foo@bar.org,,,,, -,,,,,, -[SampleContext],,,,,, -sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,, -5B,control blank,6123,,,, -,,,,,, +[Header],,,,,,, +SheetType,pacbio_metag,,,,,, +SheetVersion,10,,,,,, +Investigator Name,Knight,,,,,, +Experiment Name,RKL_experiment,,,,,, +Date,2025-08-29,,,,,, +Assay,Metagenomic,,,,,, +Description,,,,,,, +,,,,,,, +[Data],,,,,,, +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane,barcode_id +3A,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1,bc3011 +4A,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1,bc0112 +5B,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1,bc9992 +,,,,,,, +[Bioinformatics],,,,,,, +Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates,, +MyProject_6123,6123,False,some protocol,some description,False,, +,,,,,,, +[Contact],,,,,,, +Sample_Project,Email,,,,,, +MyProject_6123,foo@bar.org,,,,,, +,,,,,,, +[SampleContext],,,,,,, +sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,,, +5B,control blank,6123,,,,, +,,,,,,, From e600bfb0662e2acead61033916e70f9c0a2770cb Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 9 Sep 2025 18:25:04 -0600 Subject: [PATCH 33/42] S000 --- src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/qp_klp/scripts/pacbio_commands.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index f9f12cf4..bcc6e883 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -40,7 +40,7 @@ def __init__(self, **kwargs): samples = [ {'barcode': sample['barcode_id'], - 'sample_name': sample['Sample_Name'], + 'sample_name': sample['Sample_ID'], 'project_name': sample['Sample_Project'], 'lane': sample['Lane']} for sample in self.pipeline.sample_sheet.samples] diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index 1a4c2af3..7caf26c3 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -36,8 +36,9 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir, threads): missing_files.append(bc) continue od = f'{outdir}/{pn}' + makedirs(od, exist_ok=True) - fn = f'{od}/{sn}_L00{lane}_R1_001' + fn = f'{od}/{sn}_S000_L00{lane}_R1_001' cmd = (f'bam2fastq -j {threads} -o {fn} -c 9 ' f'{files[bc]}; ' f'fqtools count {fn}.fastq.gz > ' From d73b6a73cd4cc03679363ff8933c6999d5774ea2 Mon 
Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 06:42:54 -0600 Subject: [PATCH 34/42] del raw_reverse_seqs --- src/qp_klp/PacBioMetagenomicWorkflow.py | 1 - src/qp_klp/Workflows.py | 5 +++++ src/sequence_processing_pipeline/util.py | 6 ------ tests/test_PacBioWorflow.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index bcc6e883..0348ab1d 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -26,7 +26,6 @@ def __init__(self, **kwargs): if 'overwrite_prep_with_original' in self.kwargs: self.overwrite_prep_with_original = \ self.kwargs['overwrite_prep_with_original'] - self.files_regex = 'long_read' self.pipeline = Pipeline(self.kwargs['config_fp'], self.kwargs['run_identifier'], self.kwargs['uif_path'], diff --git a/src/qp_klp/Workflows.py b/src/qp_klp/Workflows.py index 19ab808b..1520d1f2 100644 --- a/src/qp_klp/Workflows.py +++ b/src/qp_klp/Workflows.py @@ -8,6 +8,7 @@ import logging from shutil import rmtree from .Assays import ASSAY_NAME_AMPLICON +from metapool.sample_sheet import PROTOCOL_NAME_PACBIO_SMRT from sequence_processing_pipeline.util import determine_orientation @@ -600,6 +601,10 @@ def _get_postqc_fastq_files(self, out_dir, project): if self.pipeline.pipeline_type != ASSAY_NAME_AMPLICON: del (files['raw_barcodes']) + # PacBio doesn't have reverse reads + if self.protocol_type == PROTOCOL_NAME_PACBIO_SMRT: + del (files['raw_reverse_seqs']) + # confirm expected lists of reads are not empty. for f_type in files: if not files[f_type]: diff --git a/src/sequence_processing_pipeline/util.py b/src/sequence_processing_pipeline/util.py index b07b55fa..bacf8306 100644 --- a/src/sequence_processing_pipeline/util.py +++ b/src/sequence_processing_pipeline/util.py @@ -32,12 +32,6 @@ 'html': REC(r'^(.*).html$'), 'json': REC(r'^(.*).json$') }, - 'long_read': { - 'fastq': REC(r'^(.*)_R1.fastq\.gz$'), - 'interleave_fastq': REC(r'^(.*)_R1.interleave\.fastq\.gz$'), - 'html': REC(r'^(.*)_R1.html$'), - 'json': REC(r'^(.*)_R1.json$') - }, } diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 9183e01c..0ca11211 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -13,7 +13,7 @@ from shutil import rmtree from pathlib import Path import pandas as pd -from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) +from metapool.sample_sheet import PROTOCOL_NAME_PACBIO_SMRT from sequence_processing_pipeline.PipelineError import PipelineError from qp_klp.Assays import ASSAY_NAME_METAGENOMIC from qiita_client.testing import PluginTestCase From c2b0a3e0ea75fea539f9a83fd5a7cb5c8351491e Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 07:03:18 -0600 Subject: [PATCH 35/42] test filenames --- tests/test_PacBioWorflow.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 0ca11211..cfd342bb 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -70,21 +70,22 @@ def _inject_data(self, wf): # # then loop over samples and stage all fastq.gz files dstats = [] for sample in samples: - sn = sample['Sample_Name'] + sn = sample['Sample_ID'] sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' - Path(f'{dname}/{sn}_R1.fastq.gz').touch() + Path(f'{dname}/{sn}_S000_L001_R1_001.fastq.gz').touch() with 
open(f'{dname}/{sn}_R1.counts.txt', 'w') as f: f.write("2") # NuQCJob dname = f'{nuqc_dir}/filtered_sequences/{sp}' - copyfile(self.gz_source, f'{dname}/{sn}_R1.trimmed.fastq.gz') + copyfile(self.gz_source, + f'{dname}/{sn}_S000_L001_R1_001.trimmed.fastq.gz') # GenPrepFileJob gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{sn}' - Path(f'{gprep_base}_R1.trimmed.fastq.gz').touch() + Path(f'{gprep_base}_S000_L001_R1_001.trimmed.fastq.gz').touch() pd.DataFrame(dstats).set_index('SampleID').to_csv( f'{reports_dir}/Demultiplex_Stats.csv') From b2b4d60d514bbfc5948e79ddd4898884d878808d Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 07:43:49 -0600 Subject: [PATCH 36/42] S000_L001_R1_001.counts.txt --- tests/test_PacBioWorflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index cfd342bb..5f73efa5 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -75,7 +75,7 @@ def _inject_data(self, wf): dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' Path(f'{dname}/{sn}_S000_L001_R1_001.fastq.gz').touch() - with open(f'{dname}/{sn}_R1.counts.txt', 'w') as f: + with open(f'{dname}/{sn}_S000_L001_R1_001.counts.txt', 'w') as f: f.write("2") # NuQCJob From 0051f4999d3c5f94fe15675f4ee0fc5141a48621 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 09:05:08 -0600 Subject: [PATCH 37/42] {rec}{sid} --- src/sequence_processing_pipeline/Commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index d8f45496..9337ba4f 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -270,7 +270,7 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): current_fp = openfps[fname_encoded] - current_fp[orientation].write(sid) + current_fp[orientation].write(f'{rec}{sid}') current_fp[orientation].write(s) current_fp[orientation].write(d) current_fp[orientation].write(q) From 1418cbcd4b4259d68e8e1bef81f135fe5c8773c1 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 11:23:38 -0600 Subject: [PATCH 38/42] rm touch for gz files --- tests/test_PacBioWorflow.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 5f73efa5..4129df9c 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -74,7 +74,8 @@ def _inject_data(self, wf): sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' - Path(f'{dname}/{sn}_S000_L001_R1_001.fastq.gz').touch() + copyfile(self.gz_source, + f'{dname}/{sn}_S000_L001_R1_001.fastq.gz') with open(f'{dname}/{sn}_S000_L001_R1_001.counts.txt', 'w') as f: f.write("2") @@ -85,7 +86,8 @@ def _inject_data(self, wf): # GenPrepFileJob gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{sn}' - Path(f'{gprep_base}_S000_L001_R1_001.trimmed.fastq.gz').touch() + copyfile(self.gz_source, + f'{gprep_base}_S000_L001_R1_001.trimmed.fastq.gz') pd.DataFrame(dstats).set_index('SampleID').to_csv( f'{reports_dir}/Demultiplex_Stats.csv') From 62e388e7b687750789b716ff3af6fa4f6beac761 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 14:18:28 -0600 Subject: [PATCH 39/42] add_default_workflow --- src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/qp_klp/Protocol.py | 2 +- 
 src/qp_klp/Workflows.py | 6 +-
 src/qp_klp/scripts/pacbio_commands.py | 5 ++
 .../ConvertJob.py | 2 +-
 src/sequence_processing_pipeline/NuQCJob.py | 7 +--
 src/sequence_processing_pipeline/Pipeline.py | 55 +++++++------------
 .../pacbio/good_pacbio_metagv10.csv | 8 +--
 tests/test_PacBioWorflow.py | 9 +--
 9 files changed, 42 insertions(+), 54 deletions(-)

diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py
index 0348ab1d..d57fe61b 100644
--- a/src/qp_klp/PacBioMetagenomicWorkflow.py
+++ b/src/qp_klp/PacBioMetagenomicWorkflow.py
@@ -14,7 +14,7 @@ def __init__(self, **kwargs):
         self.mandatory_attributes = ['qclient', 'uif_path',
                                      'lane_number', 'config_fp',
                                      'run_identifier', 'output_dir', 'job_id',
-                                     'lane_number', 'is_restart']
+                                     'is_restart']

         self.confirm_mandatory_attributes()

diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py
index e3c8f972..5f49248b 100644
--- a/src/qp_klp/Protocol.py
+++ b/src/qp_klp/Protocol.py
@@ -435,7 +435,7 @@ def convert_raw_to_fastq(self):
         return failed_samples

     def generate_sequence_counts(self):
-        # for other isntances of generate_sequence_counts in other objects
+        # for other instances of generate_sequence_counts in other objects
         # the sequence counting needs to be done; however, for PacBio we
         # already have done it and just need to merge the results.
         gz_files = glob(f'{self.raw_fastq_files_path}/*/*.fastq.gz')
diff --git a/src/qp_klp/Workflows.py b/src/qp_klp/Workflows.py
index 1520d1f2..1f979bf0 100644
--- a/src/qp_klp/Workflows.py
+++ b/src/qp_klp/Workflows.py
@@ -626,7 +626,11 @@ def _load_prep_into_qiita(self, qclient, prep_id, artifact_name,
                  'prep_id': prep_id,
                  'artifact_type': atype,
                  'command_artifact_name': artifact_name,
-                 'add_default_workflow': True,
+                 # this will block adding the default workflow to the
+                 # long read / pacbio processing, which is desirable
+                 # until we have a processing pipeline.
+                 'add_default_workflow': self.read_length != 'long',
+                 # 'add_default_workflow': True,
                  'files': dumps(fastq_files)}

         job_id = qclient.post('/qiita_db/artifact/', data=pdata)['job_id']
diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py
index 7caf26c3..b0ab0717 100755
--- a/src/qp_klp/scripts/pacbio_commands.py
+++ b/src/qp_klp/scripts/pacbio_commands.py
@@ -21,6 +21,11 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir, threads):
     """Generates the bam2fastq commands"""
     df = pd.read_csv(sample_list, sep='\t')
+
+    # pacbio raw files are in a hifi_reads folder, within multiple folders
+    # (1_A01, 2_A02, etc.) inside the run-id folder; they are named
+    # m[run-id]XXX.hifi_reads.[barcode].bam, so to find the [barcode] we
+    # can split on '.' and take the second-to-last element [-2].
     files = {f.split('.')[-2]: f
              for f in glob(f'{run_folder}/*/hifi_reads/*.bam')}
diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py
index 4e99a8aa..6fb4967d 100644
--- a/src/sequence_processing_pipeline/ConvertJob.py
+++ b/src/sequence_processing_pipeline/ConvertJob.py
@@ -476,7 +476,7 @@ def _generate_job_script(self):

     def run(self, callback=None):
         """
-        Run BCL2Fastq/BCLConvert conversion
+        Run pacbio_generate_bam2fastq_commands, which runs bam2fastq
         :param callback: optional function taking two parameters (id, status)
                          that is called when a running process's status is
                          changed.
diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 8fa693c4..8e1a31f6 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -53,7 +53,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, :param additional_fastq_tags: A list of fastq tags to preserve during filtering. :param files_regex: the FILES_REGEX to use for parsing files - :param read_length: the FILES_REGEX to use for parsing files + :param read_length: string defining the read length: long/short. """ super().__init__(fastq_root_dir, output_path, @@ -376,10 +376,7 @@ def _process_sample_sheet(self): raise PipelineError(s) header = sheet.Header - if 'chemistry' in header: - chemistry = header['chemistry'] - else: - chemistry = '' + chemistry = header.get('chemistry', '') if header['Assay'] not in Pipeline.assay_types: s = "Assay value '%s' is not recognized." % header['Assay'] diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index cf0a83cd..d0060d6b 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -36,18 +36,8 @@ def _get_instrument_id(run_directory): # if RunInfo.xml doesn't exist, this might be a PacBio # folder, let's check for it if not exists(run_info): - pacbio_metadata_fps = glob( - f'{run_directory}/*/metadata/*.metadata.xml') - if pacbio_metadata_fps: - run_info = pacbio_metadata_fps[0] - tree = ET.parse(run_info) - mtag = tree.getroot().tag.split('}')[0] + '}' - instrument_text = ( - f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') - run = ET.parse(run_info).find(instrument_text) - text = run.attrib['TimeStampedName'] - else: - raise ValueError(f"'{run_info}' doesn't exist") + run = InstrumentUtils._get_pacbio_run_str(run_info, run_directory) + text = run.attrib['TimeStampedName'] else: text = ET.parse(run_info).find(instrument_text).text @@ -59,6 +49,21 @@ def get_instrument_type(run_directory): return get_model_by_instrument_id( instrument_id, model_key=PROFILE_NAME_KEY) + @staticmethod + def _get_pacbio_run_str(run_info, run_directory): + pacbio_metadata_fps = glob( + f'{run_directory}/*/metadata/*.metadata.xml') + if not pacbio_metadata_fps: + raise ValueError(f"'{run_info}' doesn't exist") + + run_info = pacbio_metadata_fps[0] + tree = ET.parse(run_info) + mtag = tree.getroot().tag.split('}')[0] + '}' + instrument_text = ( + f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') + run = ET.parse(run_info).find(instrument_text) + return run + @staticmethod def _get_date(run_directory): run_info = join(run_directory, 'RunInfo.xml') @@ -66,18 +71,8 @@ def _get_date(run_directory): # if RunInfo.xml doesn't exist, this might be a PacBio # folder, let's check for it if not exists(run_info): - pacbio_metadata_fps = glob( - f'{run_directory}/*/metadata/*.metadata.xml') - if pacbio_metadata_fps: - run_info = pacbio_metadata_fps[0] - tree = ET.parse(run_info) - mtag = tree.getroot().tag.split('}')[0] + '}' - instrument_text = ( - f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') - run = ET.parse(run_info).find(instrument_text) - date_string = run.attrib['CreatedAt'].split('T')[0] - else: - raise ValueError(f"'{run_info}' doesn't exist") + run = InstrumentUtils._get_pacbio_run_str(run_info, run_directory) + date_string = run.attrib['CreatedAt'].split('T')[0] else: date_string = ET.parse(run_info).find('Run/Date').text @@ -688,23 +683,13 @@ def generate_sample_info_files(self, 
dereplicated_input_file_paths): controls_in_proj_df['description'] = controls_in_proj_df[TEMP_KEY] controls_in_proj_df.drop(columns=[TEMP_KEY], inplace=True) controls_in_proj_df['collection_timestamp'] = \ - self.get_date_from_run_id() + InstrumentUtils._get_date(self.run_dir) controls_in_proj_df = controls_in_proj_df[Pipeline.sif_header] controls_in_proj_df.to_csv(curr_fp, sep='\t', index=False) return paths - def get_date_from_run_id(self): - # assume all run_ids begin with coded datestamp: - # 210518_... - # allow exception if substrings cannot convert to int - # or if array indexes are out of bounds. - year = int(self.run_id[0:2]) + 2000 - month = int(self.run_id[2:4]) - day = int(self.run_id[4:6]) - return f'{year}-{month}-{day}' - def get_sample_ids(self): ''' Returns list of sample-ids sourced from sample-sheet or pre-prep file diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv index b2134a64..c18e8f33 100644 --- a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -8,10 +8,10 @@ Assay,Metagenomic,,,,,, Description,,,,,,, ,,,,,,, [Data],,,,,,, -Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane,barcode_id -3A,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1,bc3011 -4A,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1,bc0112 -5B,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1,bc9992 +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,barcode_id,Sample_Project,Well_description,Lane +3A,3A,sample_plate_1,A1,bc3011,MyProject_6123,sample_plate_1.3A.A1,1 +4A,4A,sample_plate_1,A2,bc0112,MyProject_6123,sample_plate_1.4A.A2,1 +5B,5B,sample_plate_1,A3,bc9992,MyProject_6123,sample_plate_1.5B.A3,1 ,,,,,,, [Bioinformatics],,,,,,, Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates,, diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 4129df9c..bc1f1e46 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -14,7 +14,6 @@ from pathlib import Path import pandas as pd from metapool.sample_sheet import PROTOCOL_NAME_PACBIO_SMRT -from sequence_processing_pipeline.PipelineError import PipelineError from qp_klp.Assays import ASSAY_NAME_METAGENOMIC from qiita_client.testing import PluginTestCase @@ -67,7 +66,7 @@ def _inject_data(self, wf): makedirs(f'{genprep_dir}/{sp}/filtered_sequences/', exist_ok=True) - # # then loop over samples and stage all fastq.gz files + # then loop over samples and stage all fastq.gz files dstats = [] for sample in samples: sn = sample['Sample_ID'] @@ -128,10 +127,8 @@ def test_pacbio_metagenomic_workflow_creation(self): # and the system is trying to create the preps. 
# Note: Qiita job_id's are UUID in the database and this tests # uses 78901 as the job_id so the db complains about the format - # with self.assertRaisesRegex(RuntimeError, 'invalid input ' - # 'syntax for type uuid: "78901"'): - with self.assertRaisesRegex(PipelineError, 'There are no fastq ' - 'files for FastQCJob'): + with self.assertRaisesRegex(RuntimeError, 'invalid input ' + 'syntax for type uuid: "78901"'): wf.execute_pipeline() From 27efe2d41ea3e092ac251d4f6cf8fbbca69fd954 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 14:26:18 -0600 Subject: [PATCH 40/42] fixing counts --- src/qp_klp/Protocol.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 5f49248b..4191d757 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -443,7 +443,8 @@ def generate_sequence_counts(self): for gzf in gz_files: cf = gzf.replace('.fastq.gz', '.counts.txt') - sn = basename(cf).replace('_R1.counts.txt', '') + sn = basename(cf).replace( + f'_S000_L00{self.lane_number}_R1_001.counts.txt', '') if not exists(cf): missing_files.append(sn) continue From a323f466b75c7807835a2e28f1e856dbb3eb5b04 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 14:49:10 -0600 Subject: [PATCH 41/42] fix TestPipeline.test_generate_sample_information_files --- tests/test_Pipeline.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_Pipeline.py b/tests/test_Pipeline.py index b1cc502e..d27d73f2 100644 --- a/tests/test_Pipeline.py +++ b/tests/test_Pipeline.py @@ -400,7 +400,7 @@ def test_generate_sample_information_files(self): exp_first_lines = { f'{self.good_run_id}_StudyA_13059_blanks.tsv': - 'BLANK1.1A\t2021-10-21\t193\t' + 'BLANK1.1A\t2017-05-23\t193\t' 'Control\tNegative\tSterile w' 'ater blank\tSterile water blank\turban biome\tres' 'earch facility\tsterile wate' @@ -411,7 +411,7 @@ def test_generate_sample_information_files(self): 'StudyA\tTRUE\t' 'UCSD\tFALSE', f'{self.good_run_id}_StudyB_11661_blanks.tsv': - 'BLANK.40.12G\t2021-10-21\t193\tControl' + 'BLANK.40.12G\t2017-05-23\t193\tControl' '\tNegative\tSterile water blank\tSterile water blank\turban ' 'biome\tresearch facility\tsterile water' '\tmisc environment\tUSA:CA:San Diego\tB' @@ -419,7 +419,7 @@ def test_generate_sample_information_files(self): 'nk\tmetagenome\t256318\tBLANK.40.12G\t' 'StudyB\tTRUE\tUCSD\tFALSE', f'{self.good_run_id}_StudyC_6123_blanks.tsv': - 'BLANK.41.12G\t2021-10-21\t193\tControl' + 'BLANK.41.12G\t2017-05-23\t193\tControl' '\tNegative\tSterile water blank\tSterile water blank\turban' ' biome\tresearch facility\tsterile wat' 'er\tmisc environment\tUSA:CA:San Diego' @@ -430,7 +430,7 @@ def test_generate_sample_information_files(self): exp_last_lines = { f'{self.good_run_id}_StudyA_13059_blanks.tsv': - 'BLANK4.4H\t2021-10-21\t193\t' + 'BLANK4.4H\t2017-05-23\t193\t' 'Control\tNegative\tSterile w' 'ater blank\tSterile water blank\turban biome\tres' 'earch facility\tsterile wate' @@ -441,7 +441,7 @@ def test_generate_sample_information_files(self): 'StudyA\tTRUE\t' 'UCSD\tFALSE', f'{self.good_run_id}_StudyB_11661_blanks.tsv': - 'BLANK.43.12H\t2021-10-21\t193\tControl' + 'BLANK.43.12H\t2017-05-23\t193\tControl' '\tNegative\tSterile water blank\tSterile water blank\turban' ' biome\tresearch facility\tsterile wat' 'er\tmisc environment\tUSA:CA:San Diego' @@ -449,7 +449,7 @@ def test_generate_sample_information_files(self): ' blank\tmetagenome\t256318\tBLANK.43.1' 
                '2H\tStudyB\tTRUE\tUCSD\tFALSE',
             f'{self.good_run_id}_StudyC_6123_blanks.tsv':
-                'BLANK.41.12G\t2021-10-21\t193\tContro'
+                'BLANK.41.12G\t2017-05-23\t193\tContro'
                 'l\tNegative\tSterile water blank\tSterile water blank\turb'
                 'an biome\tresearch facility\tsterile '
                 'water\tmisc environment\tUSA:CA:San D'

From 48010d349345e9e76ab19922c30ca16c09644858 Mon Sep 17 00:00:00 2001
From: Antonio Gonzalez
Date: Wed, 10 Sep 2025 16:21:14 -0600
Subject: [PATCH 42/42] restart changes

---
 src/qp_klp/PacBioMetagenomicWorkflow.py | 5 +++++
 src/qp_klp/Workflows.py | 11 ++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py
index d57fe61b..7a83d8e5 100644
--- a/src/qp_klp/PacBioMetagenomicWorkflow.py
+++ b/src/qp_klp/PacBioMetagenomicWorkflow.py
@@ -5,6 +5,7 @@
 from .FailedSamplesRecord import FailedSamplesRecord
 from .Workflows import Workflow
 import pandas as pd
+from os.path import join


 class PacBioMetagenomicWorkflow(Workflow, Metagenomic, PacBio):
@@ -53,6 +54,10 @@ def __init__(self, **kwargs):
         self.is_restart = bool(self.kwargs['is_restart'])

         if self.is_restart is True:
+            self.raw_fastq_files_path = join(self.pipeline.output_path,
+                                             'ConvertJob')
+            self.reports_path = join(self.raw_fastq_files_path,
+                                     'SeqCounts.csv')
             self.determine_steps_to_skip()

         # this is a convenience member to allow testing w/out updating Qiita.
diff --git a/src/qp_klp/Workflows.py b/src/qp_klp/Workflows.py
index 1f979bf0..9155004c 100644
--- a/src/qp_klp/Workflows.py
+++ b/src/qp_klp/Workflows.py
@@ -626,12 +626,13 @@ def _load_prep_into_qiita(self, qclient, prep_id, artifact_name,
                  'prep_id': prep_id,
                  'artifact_type': atype,
                  'command_artifact_name': artifact_name,
-                 # this will block adding the default workflow to the
-                 # long read / pacbio processing, which is desirable
-                 # until we have a processing pipeline.
-                 'add_default_workflow': self.read_length != 'long',
-                 # 'add_default_workflow': True,
                  'files': dumps(fastq_files)}
+        # this will block adding the default workflow to the
+        # long read / pacbio processing, which is desirable
+        # until we have a processing pipeline - note that
+        # the key is only added when read_length is _not_ 'long'
+        if self.read_length != 'long':
+            pdata['add_default_workflow'] = True

         job_id = qclient.post('/qiita_db/artifact/', data=pdata)['job_id']