From 5d65b4802a22142856d21ef0e80cc501f0f33fb8 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 27 Aug 2025 10:50:19 -0600 Subject: [PATCH 01/42] initial changes --- src/qp_klp/Protocol.py | 92 +++++++++++++- src/qp_klp/RevioMetagenomicWorkflow.py | 59 +++++++++ src/qp_klp/WorkflowFactory.py | 4 +- .../ConvertJob.py | 116 ++++++++++++++++++ 4 files changed, 268 insertions(+), 3 deletions(-) create mode 100644 src/qp_klp/RevioMetagenomicWorkflow.py diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 1a3d805e..6aee5d81 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -1,4 +1,5 @@ -from sequence_processing_pipeline.ConvertJob import ConvertJob +from sequence_processing_pipeline.ConvertJob import ( + ConvertJob, ConvertRevioBam2FastqJob) from sequence_processing_pipeline.TellReadJob import TellReadJob from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob @@ -8,7 +9,9 @@ from re import match from os import makedirs, rename, walk from metapool import load_sample_sheet -from metapool.sample_sheet import PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ +from metapool.sample_sheet import ( + PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ, + PROTOCOL_NAME_PACBIO) import pandas as pd from glob import glob from qiita_client.util import system_call @@ -369,3 +372,88 @@ def _post_process_file(self, fastq_file, mapping, lane): rename(fastq_file, final_path) return final_path + + +class PacBio(Protocol): + protocol_type = PROTOCOL_NAME_PACBIO + + def convert_raw_to_fastq(self): + config = self.pipeline.get_software_configuration('revio') + + job = ConvertRevioBam2FastqJob( + self.pipeline.run_dir, + self.pipeline.output_path, + self.pipeline.input_file_path, + config['queue'], + config['nodes'], + config['wallclock_time_in_minutes'], + config['tellread_mem_limit'], + config['modules_to_load'], + self.master_qiita_job_id, + config['reference_base'], + config['reference_map'], + config['sing_script_path'], + config['tellread_cores']) + + self.raw_fastq_files_path = join(self.pipeline.output_path, + 'ConvertJob') + # self.my_sil_path = join(self.pipeline.output_path, + # 'TellReadJob', + # 'sample_index_list_TellReadJob.txt') + + # if ConvertJob already completed, then skip the over the time- + # consuming portion but populate the needed member variables. + if 'ConvertJob' not in self.skip_steps: + job.run(callback=self.job_callback) + + # audit the results to determine which samples failed to convert + # properly. Append these to the failed-samples report and also + # return the list directly to the caller. + failed_samples = job.audit() + if hasattr(self, 'fsr'): + # NB 16S does not require a failed samples report and + # it is not performed by SPP. 
+ self.fsr.write(failed_samples, job.__class__.__name__) + + return failed_samples + + # def generate_sequence_counts(self): + # config = self.pipeline.get_software_configuration('tell-seq') + + # files_to_count_path = join(self.pipeline.output_path, + # 'files_to_count.txt') + + # with open(files_to_count_path, 'w') as f: + # for root, _, files in walk(self.raw_fastq_files_path): + # for _file in files: + # if determine_orientation(_file) in ['R1', 'R2']: + # print(join(root, _file), file=f) + + # job = SeqCountsJob(self.pipeline.run_dir, + # self.pipeline.output_path, + # config['queue'], + # config['nodes'], + # config['wallclock_time_in_minutes'], + # config['normcount_mem_limit'], + # config['modules_to_load'], + # self.master_qiita_job_id, + # config['job_max_array_length'], + # files_to_count_path, + # self.pipeline.get_sample_sheet_path(), + # cores_per_task=config['tellread_cores']) + + # if 'SeqCountsJob' not in self.skip_steps: + # job.run(callback=self.job_callback) + + # # if successful, set self.reports_path + # self.reports_path = join(self.pipeline.output_path, + # 'SeqCountsJob', + # 'SeqCounts.csv') + + # # Do not add an entry to fsr because w/respect to counting, either + # # all jobs are going to fail or none are going to fail. It's not + # # likely that we're going to fail to count sequences for only some + # # of the samples. + + def integrate_results(self): + pass diff --git a/src/qp_klp/RevioMetagenomicWorkflow.py b/src/qp_klp/RevioMetagenomicWorkflow.py new file mode 100644 index 00000000..c3eb80c9 --- /dev/null +++ b/src/qp_klp/RevioMetagenomicWorkflow.py @@ -0,0 +1,59 @@ +from .Protocol import PacBio +from sequence_processing_pipeline.Pipeline import Pipeline +from .Assays import Metagenomic +from .Assays import ASSAY_NAME_METAGENOMIC +from .FailedSamplesRecord import FailedSamplesRecord +from .Workflows import Workflow + + +class RevioMetagenomicWorkflow(Workflow, Metagenomic, PacBio): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.mandatory_attributes = ['qclient', 'uif_path', + 'lane_number', 'config_fp', + 'run_identifier', 'output_dir', 'job_id', + 'lane_number', 'is_restart'] + + self.confirm_mandatory_attributes() + + # second stage initializer that could conceivably be pushed down into + # specific children requiring specific parameters. + self.qclient = self.kwargs['qclient'] + + self.overwrite_prep_with_original = False + if 'overwrite_prep_with_original' in self.kwargs: + self.overwrite_prep_with_original = \ + self.kwargs['overwrite_prep_with_original'] + if 'files_regex' in self.kwargs: + self.files_regex = self.kwargs['files_regex'] + else: + self.files_regex = 'SPP' + self.pipeline = Pipeline(self.kwargs['config_fp'], + self.kwargs['run_identifier'], + self.kwargs['uif_path'], + self.kwargs['output_dir'], + self.kwargs['job_id'], + ASSAY_NAME_METAGENOMIC, + lane_number=self.kwargs['lane_number']) + + self.fsr = FailedSamplesRecord(self.kwargs['output_dir'], + self.pipeline.sample_sheet.samples) + + self.master_qiita_job_id = self.kwargs['job_id'] + + self.lane_number = self.kwargs['lane_number'] + self.is_restart = bool(self.kwargs['is_restart']) + + if self.is_restart is True: + self.determine_steps_to_skip() + + # this is a convenience member to allow testing w/out updating Qiita. 
+ self.update = True + + if 'update_qiita' in kwargs: + if not isinstance(kwargs['update_qiita'], bool): + raise ValueError("value for 'update_qiita' must be of " + "type bool") + + self.update = kwargs['update_qiita'] diff --git a/src/qp_klp/WorkflowFactory.py b/src/qp_klp/WorkflowFactory.py index 81b6e024..4853bcd0 100644 --- a/src/qp_klp/WorkflowFactory.py +++ b/src/qp_klp/WorkflowFactory.py @@ -3,6 +3,7 @@ from .StandardMetatranscriptomicWorkflow import \ StandardMetatranscriptomicWorkflow from .TellseqMetagenomicWorkflow import TellSeqMetagenomicWorkflow +from .RevioMetagenomicWorkflow import RevioMetagenomicWorkflow from sequence_processing_pipeline.Pipeline import Pipeline from metapool import load_sample_sheet from metapool.sample_sheet import SAMPLE_SHEETS_BY_PROTOCOL as SSBP @@ -14,7 +15,8 @@ class WorkflowFactory(): WORKFLOWS = [StandardMetagenomicWorkflow, StandardMetatranscriptomicWorkflow, StandardAmpliconWorkflow, - TellSeqMetagenomicWorkflow] + TellSeqMetagenomicWorkflow, + RevioMetagenomicWorkflow] @classmethod def _get_instrument_type(cls, sheet): diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 3a521d8c..6ec1d74d 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -27,6 +27,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param node_count: The number of nodes to request. :param nprocs: The maximum number of parallel processes to use. :param wall_time_limit: A hard time limit (in min) to bound processing. + :param pmem: Amount of memory to be used. :param bcl_tool_path: The path to either bcl2fastq or bcl-convert. :param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id @@ -379,3 +380,118 @@ def _make_msg_base(col_name): # be provided in the destination path. dst_fp = join(self.output_path, dest_project, split(src_fp)[1]) copyfile(src_fp, dst_fp) + + +class ConvertRevioBam2FastqJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, nprocs, wall_time_limit, pmem, bam2fastq_path, + modules_to_load, qiita_job_id): + """ + ConvertRevioBam2FastqJob provides a convenient way to run bam2fastq + on a directory revio bam files to generate fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param nprocs: The maximum number of parallel processes to use. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param pmem: Amount of memory to be used. + :param bam2fastq_path: The path to either bcl2fastq or bcl-convert. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + """ + super().__init__(run_dir, + output_path, + 'ConvertJob', + [bam2fastq_path], + 1000, + modules_to_load=modules_to_load) + + # for metagenomics pipelines, sample_sheet_path will reflect a real + # sample_sheet file. For amplicon pipelines, sample_sheet_path will + # reference a dummy sample_sheet file. 
+ self.sample_sheet_path = sample_sheet_path + self.queue_name = queue_name + self.node_count = node_count + self.nprocs = nprocs + self.wall_time_limit = wall_time_limit + self.pmem = pmem + self.bcl_tool = bam2fastq_path + self.qiita_job_id = qiita_job_id + self.job_script_path = join(self.output_path, f"{self.job_name}.sh") + self.suffix = 'fastq.gz' + self.fastq_paths = None + self.info = None + + self._file_check(self.sample_sheet_path) + + self._generate_job_script() + + def _generate_job_script(self): + """ + Generate a Torque job script for processing supplied root_directory. + :return: The path to the newly-created job-script. + """ + lines = [] + + lines.append("#!/bin/bash") + lines.append(f"#SBATCH --job-name {self.qiita_job_id}_{self.job_name}") + lines.append(f"#SBATCH -p {self.queue_name}") + lines.append(f'#SBATCH -N {self.node_count}') + lines.append(f'#SBATCH -n {self.nprocs}') + lines.append("#SBATCH --time %d" % self.wall_time_limit) + + # send an email to the list of users defined below when a job starts, + # terminates, or aborts. This is used to confirm that the package's + # own reporting mechanism is reporting correctly. + lines.append("#SBATCH --mail-type=ALL") + + # list of users to be contacted independently of this package's + # notification system, when a job starts, terminates, or gets aborted. + lines.append("#SBATCH --mail-user qiita.help@gmail.com") + + lines.append(f"#SBATCH --mem-per-cpu {self.pmem}") + + lines.append("set -x") + lines.append('date') + lines.append('hostname') + lines.append(f'cd {self.root_dir}') + + if self.modules_to_load: + lines.append("module load " + ' '.join(self.modules_to_load)) + + # for sample + # parallel -j 2 --joblog build.log echo 'Built {}' ::: app1 app2 app3 + # bam2fastq -o ${OUT}/${BARCODE} + # ${BASE}/${Run_prefix}.hifi_reads.${BARCODE}.bam + + with open(self.job_script_path, 'w') as f: + for line in lines: + # remove long spaces in some lines. + line = re.sub(r'\s+', ' ', line) + f.write(f"{line}\n") + + def run(self, callback=None): + """ + Run BCL2Fastq/BCLConvert conversion + :param callback: optional function taking two parameters (id, status) + that is called when a running process's status is + changed. + :return: + """ + try: + job_info = self.submit_job(self.job_script_path, + exec_from=self.log_path, + callback=callback) + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + self. 
mark_job_completed() + + logging.info(f'Successful job: {job_info}') From 55ab2106c8de623e6f27534ed25a7e22749b1a6e Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Mon, 1 Sep 2025 12:01:54 -0600 Subject: [PATCH 02/42] init push --- .github/workflows/qiita-plugin-ci.yml | 2 +- ...rkflow.py => PacBioMetagenomicWorkflow.py} | 12 +- src/qp_klp/Protocol.py | 116 +++++++------ src/qp_klp/WorkflowFactory.py | 4 +- .../ConvertJob.py | 21 ++- src/sequence_processing_pipeline/NuQCJob.py | 11 +- src/sequence_processing_pipeline/Pipeline.py | 18 +- .../pacbio_metagenomic.json | 64 +++++++ .../m11111_20250101_111111_s4.metadata.xml | 11 ++ tests/test_PacBioWorflow.py | 164 ++++++++++++++++++ 10 files changed, 337 insertions(+), 86 deletions(-) rename src/qp_klp/{RevioMetagenomicWorkflow.py => PacBioMetagenomicWorkflow.py} (83%) create mode 100644 tests/data/configuration_profiles/pacbio_metagenomic.json create mode 100644 tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml create mode 100644 tests/test_PacBioWorflow.py diff --git a/.github/workflows/qiita-plugin-ci.yml b/.github/workflows/qiita-plugin-ci.yml index 46d280dc..92dceaac 100644 --- a/.github/workflows/qiita-plugin-ci.yml +++ b/.github/workflows/qiita-plugin-ci.yml @@ -83,7 +83,7 @@ jobs: # seqtk is a conda-installed requirement for metapool and isn't # installed automatically by its setup.py. conda config --add channels bioconda - conda create -q --yes -n klp python=3.9 'seqtk>=1.4' + conda create -q --yes -n klp python=3.9 'seqtk>=1.4' pbtk conda activate klp diff --git a/src/qp_klp/RevioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py similarity index 83% rename from src/qp_klp/RevioMetagenomicWorkflow.py rename to src/qp_klp/PacBioMetagenomicWorkflow.py index c3eb80c9..3036755b 100644 --- a/src/qp_klp/RevioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -6,7 +6,7 @@ from .Workflows import Workflow -class RevioMetagenomicWorkflow(Workflow, Metagenomic, PacBio): +class PacBioMetagenomicWorkflow(Workflow, Metagenomic, PacBio): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -40,6 +40,16 @@ def __init__(self, **kwargs): self.fsr = FailedSamplesRecord(self.kwargs['output_dir'], self.pipeline.sample_sheet.samples) + import pandas as pd + samples = [ + {'barcode': sample['Sample_ID'], + 'sample_name': sample['Sample_Name'], + 'project_name': sample['Sample_Project']} + for sample in self.pipeline.sample_sheet.samples] + df = pd.DataFrame(samples) + sample_list_fp = f"{self.kwargs['output_dir']}/sample_list.tsv" + df.to_csv(sample_list_fp, sep='t') + self.master_qiita_job_id = self.kwargs['job_id'] self.lane_number = self.kwargs['lane_number'] diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 6aee5d81..71e740e9 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -1,17 +1,17 @@ from sequence_processing_pipeline.ConvertJob import ( - ConvertJob, ConvertRevioBam2FastqJob) + ConvertJob, ConvertPacBioBam2FastqJob) from sequence_processing_pipeline.TellReadJob import TellReadJob from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob from sequence_processing_pipeline.PipelineError import PipelineError from sequence_processing_pipeline.util import determine_orientation -from os.path import join, split, basename, dirname from re import match from os import makedirs, rename, walk +from os.path import join, split, basename, dirname, exists from 
metapool import load_sample_sheet from metapool.sample_sheet import ( PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ, - PROTOCOL_NAME_PACBIO) + PROTOCOL_NAME_PACBIO_SMRT) import pandas as pd from glob import glob from qiita_client.util import system_call @@ -82,6 +82,25 @@ def subsample_reads(self): class Illumina(Protocol): protocol_type = PROTOCOL_NAME_ILLUMINA + # required files for successful operation for Illumina (making the default + # here) both RTAComplete.txt and RunInfo.xml should reside in the root of + # the run directory. + required_files = ['RTAComplete.txt', 'RunInfo.xml'] + + def __init__(self) -> None: + super().__init__() + + for some_file in self.required_files: + if not exists(join(self.run_dir, some_file)): + raise PipelineError(f"required file '{some_file}' is not " + f"present in {self.run_dir}.") + + # verify that RunInfo.xml file is readable. + try: + fp = open(join(self.run_dir, 'RunInfo.xml')) + fp.close() + except PermissionError: + raise PipelineError('RunInfo.xml is present, but not readable') def convert_raw_to_fastq(self): def get_config(command): @@ -375,31 +394,26 @@ def _post_process_file(self, fastq_file, mapping, lane): class PacBio(Protocol): - protocol_type = PROTOCOL_NAME_PACBIO + protocol_type = PROTOCOL_NAME_PACBIO_SMRT def convert_raw_to_fastq(self): - config = self.pipeline.get_software_configuration('revio') + config = self.pipeline.get_software_configuration('pacbio_convert') - job = ConvertRevioBam2FastqJob( + job = ConvertPacBioBam2FastqJob( self.pipeline.run_dir, self.pipeline.output_path, self.pipeline.input_file_path, config['queue'], config['nodes'], + config['nprocs'], config['wallclock_time_in_minutes'], - config['tellread_mem_limit'], + config['per_process_memory_limit'], + config['executable_path'], config['modules_to_load'], - self.master_qiita_job_id, - config['reference_base'], - config['reference_map'], - config['sing_script_path'], - config['tellread_cores']) + self.master_qiita_job_id) self.raw_fastq_files_path = join(self.pipeline.output_path, 'ConvertJob') - # self.my_sil_path = join(self.pipeline.output_path, - # 'TellReadJob', - # 'sample_index_list_TellReadJob.txt') # if ConvertJob already completed, then skip the over the time- # consuming portion but populate the needed member variables. @@ -409,7 +423,7 @@ def convert_raw_to_fastq(self): # audit the results to determine which samples failed to convert # properly. Append these to the failed-samples report and also # return the list directly to the caller. - failed_samples = job.audit() + failed_samples = job.audit(self.pipeline.get_sample_ids()) if hasattr(self, 'fsr'): # NB 16S does not require a failed samples report and # it is not performed by SPP. 
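For reference, the rewritten convert_raw_to_fastq above only reads a handful of keys from the 'pacbio_convert' section of the active configuration profile. A minimal sketch of that section as a Python dict, with placeholder values only (an actual profile is added by this patch under tests/data/configuration_profiles/pacbio_metagenomic.json):

    # sketch of the 'pacbio_convert' profile section convert_raw_to_fastq assumes;
    # values below are placeholders, not the real site configuration.
    pacbio_convert = {
        "queue": "qiita",                    # scheduler queue name
        "nodes": 1,                          # node_count
        "nprocs": 16,                        # max parallel processes
        "wallclock_time_in_minutes": 216,    # wall_time_limit
        "per_process_memory_limit": "10gb",  # pmem
        "executable_path": "bam2fastq",      # bam2fastq_path (placeholder)
        "modules_to_load": [],               # Linux modules, if any
    }

Each key maps positionally onto the ConvertPacBioBam2FastqJob arguments shown in the hunk above.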
@@ -417,43 +431,39 @@ def convert_raw_to_fastq(self): return failed_samples - # def generate_sequence_counts(self): - # config = self.pipeline.get_software_configuration('tell-seq') - - # files_to_count_path = join(self.pipeline.output_path, - # 'files_to_count.txt') - - # with open(files_to_count_path, 'w') as f: - # for root, _, files in walk(self.raw_fastq_files_path): - # for _file in files: - # if determine_orientation(_file) in ['R1', 'R2']: - # print(join(root, _file), file=f) - - # job = SeqCountsJob(self.pipeline.run_dir, - # self.pipeline.output_path, - # config['queue'], - # config['nodes'], - # config['wallclock_time_in_minutes'], - # config['normcount_mem_limit'], - # config['modules_to_load'], - # self.master_qiita_job_id, - # config['job_max_array_length'], - # files_to_count_path, - # self.pipeline.get_sample_sheet_path(), - # cores_per_task=config['tellread_cores']) - - # if 'SeqCountsJob' not in self.skip_steps: - # job.run(callback=self.job_callback) - - # # if successful, set self.reports_path - # self.reports_path = join(self.pipeline.output_path, - # 'SeqCountsJob', - # 'SeqCounts.csv') - - # # Do not add an entry to fsr because w/respect to counting, either - # # all jobs are going to fail or none are going to fail. It's not - # # likely that we're going to fail to count sequences for only some - # # of the samples. + def generate_sequence_counts(self): + # config = self.pipeline.get_software_configuration('tell-seq') + + # files_to_count_path = join(self.pipeline.output_path, + # 'files_to_count.txt') + + # with open(files_to_count_path, 'w') as f: + # for root, _, files in walk(self.raw_fastq_files_path): + # for _file in files: + # if determine_orientation(_file) in ['R1', 'R2']: + # print(join(root, _file), file=f) + + # job = SeqCountsJob(self.pipeline.run_dir, + # self.pipeline.output_path, + # config['queue'], + # config['nodes'], + # config['wallclock_time_in_minutes'], + # config['normcount_mem_limit'], + # config['modules_to_load'], + # self.master_qiita_job_id, + # config['job_max_array_length'], + # files_to_count_path, + # self.pipeline.get_sample_sheet_path(), + # cores_per_task=config['tellread_cores']) + + # if 'SeqCountsJob' not in self.skip_steps: + # job.run(callback=self.job_callback) + + # if successful, set self.reports_path + self.reports_path = join(self.pipeline.output_path, + 'SeqCounts.csv') + open(self.reports_path, 'w').write( + 'SampleID,# Reads\nA1,100') def integrate_results(self): pass diff --git a/src/qp_klp/WorkflowFactory.py b/src/qp_klp/WorkflowFactory.py index 4853bcd0..8d51cee6 100644 --- a/src/qp_klp/WorkflowFactory.py +++ b/src/qp_klp/WorkflowFactory.py @@ -3,7 +3,7 @@ from .StandardMetatranscriptomicWorkflow import \ StandardMetatranscriptomicWorkflow from .TellseqMetagenomicWorkflow import TellSeqMetagenomicWorkflow -from .RevioMetagenomicWorkflow import RevioMetagenomicWorkflow +from .PacBioMetagenomicWorkflow import PacBioMetagenomicWorkflow from sequence_processing_pipeline.Pipeline import Pipeline from metapool import load_sample_sheet from metapool.sample_sheet import SAMPLE_SHEETS_BY_PROTOCOL as SSBP @@ -16,7 +16,7 @@ class WorkflowFactory(): StandardMetatranscriptomicWorkflow, StandardAmpliconWorkflow, TellSeqMetagenomicWorkflow, - RevioMetagenomicWorkflow] + PacBioMetagenomicWorkflow] @classmethod def _get_instrument_type(cls, sheet): diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 6ec1d74d..e67227f4 100644 --- 
a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -382,13 +382,13 @@ def _make_msg_base(col_name): copyfile(src_fp, dst_fp) -class ConvertRevioBam2FastqJob(Job): +class ConvertPacBioBam2FastqJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, nprocs, wall_time_limit, pmem, bam2fastq_path, modules_to_load, qiita_job_id): """ - ConvertRevioBam2FastqJob provides a convenient way to run bam2fastq - on a directory revio bam files to generate fastq files. + ConvertPacBioBam2FastqJob provides a convenient way to run bam2fastq + on a directory PacBio bam files to generate fastq files. :param run_dir: The 'run' directory that contains BCL files. :param output_path: Path where all pipeline-generated files live. :param sample_sheet_path: The path to a sample-sheet. @@ -417,7 +417,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.nprocs = nprocs self.wall_time_limit = wall_time_limit self.pmem = pmem - self.bcl_tool = bam2fastq_path + self.bam2fastq_path = bam2fastq_path self.qiita_job_id = qiita_job_id self.job_script_path = join(self.output_path, f"{self.job_name}.sh") self.suffix = 'fastq.gz' @@ -461,10 +461,13 @@ def _generate_job_script(self): if self.modules_to_load: lines.append("module load " + ' '.join(self.modules_to_load)) - # for sample - # parallel -j 2 --joblog build.log echo 'Built {}' ::: app1 app2 app3 - # bam2fastq -o ${OUT}/${BARCODE} - # ${BASE}/${Run_prefix}.hifi_reads.${BARCODE}.bam + lines.append(f"sample_list={self.root_dir}/sample_list.tsv") + lines.append( + 'while read bc sn pn; do echo ' + '"bam2fastq -j 1 -o ${sn}.fastq.gz -c 9 ' + '${pn}/${Run_prefix}.hifi_reads.${bn}.bam"; ' + 'fqtools count ${sn}.fastq.gz > ${sn}.counts.txt;' + 'done < ${sample_list}') with open(self.job_script_path, 'w') as f: for line in lines: @@ -492,6 +495,6 @@ def run(self, callback=None): info.insert(0, str(e)) raise JobFailedError('\n'.join(info)) - self. mark_job_completed() + self.mark_job_completed() logging.info(f'Successful job: {job_info}') diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 7273473e..640da755 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -118,10 +118,12 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, def _validate_project_data(self): # Validate project settings in [Bioinformatics] for project in self.project_data: - if project['ForwardAdapter'] == 'NA': + if 'ForwardAdapter' not in project or \ + project['ForwardAdapter'] == 'NA': project['ForwardAdapter'] = None - if project['ReverseAdapter'] == 'NA': + if 'ReverseAdapter' not in project or \ + project['ReverseAdapter'] == 'NA': project['ReverseAdapter'] = None if project['ForwardAdapter'] is None: @@ -364,7 +366,10 @@ def _process_sample_sheet(self): raise PipelineError(s) header = sheet.Header - chemistry = header['chemistry'] + if 'chemistry' in header: + chemistry = header['chemistry'] + else: + chemistry = '' if header['Assay'] not in Pipeline.assay_types: s = "Assay value '%s' is not recognized." 
% header['Assay'] diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index 4fa5c6b2..1749d3c9 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -41,7 +41,7 @@ def _get_instrument_id(run_directory): @staticmethod def get_instrument_type(run_directory): - instrument_id = InstrumentUtils._get_instrument_id(run_directory) + instrument_id = basename(run_directory) return get_model_by_instrument_id( instrument_id, model_key=PROFILE_NAME_KEY) @@ -244,22 +244,6 @@ def __init__(self, configuration_file_path, run_id, input_file_path, raise PipelineError(f"A run-dir for '{self.run_id}' could not be " "found") - # required files for successful operation - # both RTAComplete.txt and RunInfo.xml should reside in the root of - # the run directory. - required_files = ['RTAComplete.txt', 'RunInfo.xml'] - for some_file in required_files: - if not exists(join(self.run_dir, some_file)): - raise PipelineError(f"required file '{some_file}' is not " - f"present in {self.run_dir}.") - - # verify that RunInfo.xml file is readable. - try: - fp = open(join(self.run_dir, 'RunInfo.xml')) - fp.close() - except PermissionError: - raise PipelineError('RunInfo.xml is present, but not readable') - self.input_file_path = input_file_path if pipeline_type == Pipeline.AMPLICON_PTYPE: diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json new file mode 100644 index 00000000..f6a88a5a --- /dev/null +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -0,0 +1,64 @@ +{ + "profile": { + "instrument_type": "Revio", + "assay_type": "Metagenomic", + "configuration": { + "pacbio_convert": { + "nodes": 1, + "nprocs": 16, + "queue": "qiita", + "wallclock_time_in_minutes": 216, + "modules_to_load": [ + "bclconvert_3.7.5" + ], + "executable_path": "bcl-convert", + "per_process_memory_limit": "10gb" + }, + "nu-qc": { + "nodes": 1, + "cpus_per_task": 8, + "queue": "qiita", + "wallclock_time_in_minutes": 240, + "minimap2_databases": ["/databases/minimap2/db_1.mmi", "/databases/minimap2/db_2.mmi"], + "modules_to_load": [ + "fastp_0.20.1", + "samtools_1.12", + "minimap2_2.18" + ], + "fastp_executable_path": "fastp", + "minimap2_executable_path": "minimap2", + "samtools_executable_path": "samtools", + "job_total_memory_limit": "20", + "job_max_array_length": 1000, + "known_adapters_path": "fastp_known_adapters_formatted.fna", + "bucket_size": 8, + "length_limit": 100, + "cores_per_task": 4, + "movi_executable_path": "/home/user/user_dir/Movi/build/movi-default", + "gres_value": 4, + "pmls_path": "/home/user/user_dir/human_host_filtration/scripts/qiita_filter_pmls.py", + "additional_fastq_tags": ["BX"] + }, + "seqpro": { + "seqpro_path": "seqpro", + "modules_to_load": [] + }, + "fastqc": { + "nodes": 1, + "nprocs": 16, + "queue": "qiita", + "nthreads": 16, + "wallclock_time_in_minutes": 60, + "modules_to_load": [ + "fastqc_0.11.5" + ], + "fastqc_executable_path": "fastqc", + "multiqc_executable_path": "multiqc", + "multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml", + "job_total_memory_limit": "20gb", + "job_pool_size": 30, + "job_max_array_length": 1000 + } + } + } +} diff --git a/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml b/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml new file mode 100644 index 00000000..d4ba2c2d --- 
/dev/null +++ b/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml @@ -0,0 +1,11 @@ + + + + Default + 2025-01-01 + + + + + \ No newline at end of file diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py new file mode 100644 index 00000000..12005ae4 --- /dev/null +++ b/tests/test_PacBioWorflow.py @@ -0,0 +1,164 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ----------------------------------------------------------------------------- +from qp_klp.WorkflowFactory import WorkflowFactory +from unittest import main +from os import makedirs +from os.path import dirname, abspath, join, exists +from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) +from qp_klp.Assays import ASSAY_NAME_METAGENOMIC +from shutil import rmtree +from qiita_client.testing import PluginTestCase +from sequence_processing_pipeline.PipelineError import PipelineError + + +class WorkflowFactoryTests(PluginTestCase): + def setUp(self): + self.base_dir = dirname(abspath(__file__)) + self.output_dir = join(self.base_dir, 'test_output') + self.remove_these = [self.output_dir] + + self.gz_source = f'{self.base_dir}/data/dummy.fastq.gz' + + def tearDown(self): + for fp in self.remove_these: + if exists(fp): + rmtree(fp) + + def _create_directory(self, fp): + # just making sure that we always start with a clean folder: + if exists(fp): + rmtree(fp) + makedirs(fp) + self.remove_these.append(fp) + + def _inject_data(self, wf): + '''This is a helper method for testing that all the steps are run + when testing wf.execute_pipeline() + ''' + pass + # samples = wf.pipeline.sample_sheet.samples + # tellseq = True + # if 'index' in dict(samples[0]).keys(): + # tellseq = False + + # # inject Convert/NuQC/FastQC/MultiQC/GenPrepFileJob files so we can + # # move down the pipeline; first let's create the base folders + # if tellseq: + # reports_dir = f'{self.output_dir}/TellReadJob/' + # convert_dir = f'{self.output_dir}/TRIntegrateJob/integrated' + # makedirs(convert_dir, exist_ok=True) + # else: + # convert_dir = f'{self.output_dir}/ConvertJob' + # reports_dir = f'{self.output_dir}/ConvertJob/Reports' + # makedirs(convert_dir, exist_ok=True) + # Path(f'{convert_dir}/job_completed').touch() + # tellread_dir = f'{self.output_dir}/TellReadJob' + # nuqc_dir = f'{self.output_dir}/NuQCJob' + # fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' + # multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' + # genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' + # '211021_A00000_0000_SAMPLE/') + # makedirs(reports_dir, exist_ok=True) + # makedirs(nuqc_dir, exist_ok=True) + # makedirs(fastqc_dir, exist_ok=True) + # makedirs(multiqc_dir, exist_ok=True) + # makedirs(genprep_dir, exist_ok=True) + # # now let's create the required project folders + # for project in wf.pipeline.get_project_info(): + # sp = project['project_name'] + # makedirs(f'{convert_dir}/{sp}', exist_ok=True) + # makedirs(f'{nuqc_dir}/filtered_sequences/{sp}', exist_ok=True) + # makedirs(f'{genprep_dir}/{sp}/filtered_sequences/', + # exist_ok=True) + + # # then loop over samples and stage all fastq.gz files + # dstats = [] + # for i, sample in enumerate(samples): + # rp = sample["Sample_ID"] + # sp = sample["Sample_Project"] + + # # ConvertJob + # if tellseq: + # bid = sample['barcode_id'] + # 
dstats.append( + # {'Lane': sample['Lane'], 'SampleID': rp, + # 'Sample_Project': sp, 'Barcode': bid, + # '# Reads': 2}) + # dname = f'{convert_dir}/{sp}' + # copyfile(self.gz_source, f'{dname}/{bid}.R1.fastq.gz') + # copyfile(self.gz_source, f'{dname}/{bid}.R2.fastq.gz') + # else: + # dstats.append( + # {'Lane': sample['Lane'], 'SampleID': rp, + # 'Sample_Project': sp, 'Index': sample['index'], + # '# Reads': 2}) + # dname = f'{convert_dir}/{sp}' + # Path(f'{dname}/{rp}_L001_R1_001.fastq.gz').touch() + # Path(f'{dname}/{rp}_L001_R2_001.fastq.gz').touch() + + # # NuQCJob + # dname = f'{nuqc_dir}/filtered_sequences/{sp}' + # copyfile(self.gz_source, f'{dname}/{rp}_L001_R1_001.fastq.gz') + # copyfile(self.gz_source, f'{dname}/{rp}_L001_R2_001.fastq.gz') + + # # GenPrepFileJob + # gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{rp}' + # Path(f'{gprep_base}_L001_R1_001.fastq.gz').touch() + # Path(f'{gprep_base}_L001_R2_001.fastq.gz').touch() + + # # this is required by the Convert step + # if tellseq: + # pd.DataFrame(dstats).set_index('SampleID').to_csv( + # f'{tellread_dir}/sample_index_list_TellReadJob.txt') + # else: + # pd.DataFrame(dstats).set_index('SampleID').to_csv( + # f'{reports_dir}/Demultiplex_Stats.csv') + + # # generating the "*.completed" files + # for i in range(len(samples)*3): + # Path(f'{fastqc_dir}/FastQCJob_{i}.completed').touch() + # Path(f'{multiqc_dir}/MultiQCJob_{i}.completed').touch() + + def test_pacbio_metagenomic_workflow_creation(self): + kwargs = {"uif_path": "tests/data/sample-sheets/metagenomic/" + "pacbio/good_pacbio_metagv10.csv", + "qclient": self.qclient, + "lane_number": "1", + "config_fp": "tests/configuration.json", + "run_identifier": "r11111_20250101_111111", + "output_dir": self.output_dir, + "job_id": "78901", + "is_restart": False + } + + self._create_directory(kwargs['output_dir']) + + wf = WorkflowFactory.generate_workflow(**kwargs) + + # confirm that the proper type of workflow was generated. + self.assertEqual(wf.protocol_type, PROTOCOL_NAME_PACBIO_SMRT) + self.assertEqual(wf.assay_type, ASSAY_NAME_METAGENOMIC) + + # self._inject_data(wf) + # ConvertJob/ConvertJob.sh + + # Metagenomic is a valid data type in the default qiita test + # database but job-id: 78901 doesn't exist; however, if we get + # to here, it means that all steps have run to completion + # and the system is trying to create the preps. 
+ # Note: Qiita job_id's are UUID in the database and this tests + # uses 78901 as the job_id so the db complains about the format + # with self.assertRaisesRegex(RuntimeError, 'invalid input ' + # 'syntax for type uuid: "78901"'): + with self.assertRaisesRegex(PipelineError, 'There are no fastq ' + 'files for FastQCJob'): + wf.execute_pipeline() + + +if __name__ == '__main__': + main() From 4439472c72f4790fef2c03fb46543049a1e0e12b Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 2 Sep 2025 06:56:07 -0600 Subject: [PATCH 03/42] fix InstrumentUtils --- src/sequence_processing_pipeline/Pipeline.py | 50 ++++++++++++++----- .../m11111_20250101_111111_s4.metadata.xml | 6 +-- tests/test_Pipeline.py | 5 +- 3 files changed, 44 insertions(+), 17 deletions(-) rename tests/data/{ => sample_run_directories}/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml (84%) diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index 1749d3c9..cf0a83cd 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -3,6 +3,7 @@ from json.decoder import JSONDecodeError from os import makedirs, listdir, walk from os.path import join, exists, isdir, basename +from glob import glob from metapool import (load_sample_sheet, AmpliconSampleSheet, is_blank, parse_project_name, SAMPLE_NAME_KEY, QIITA_ID_KEY, PROJECT_SHORT_NAME_KEY, PROJECT_FULL_NAME_KEY, @@ -30,18 +31,31 @@ class InstrumentUtils(): @staticmethod def _get_instrument_id(run_directory): run_info = join(run_directory, 'RunInfo.xml') + instrument_text = 'Run/Instrument' + # if RunInfo.xml doesn't exist, this might be a PacBio + # folder, let's check for it if not exists(run_info): - raise ValueError(f"'{run_info}' doesn't exist") + pacbio_metadata_fps = glob( + f'{run_directory}/*/metadata/*.metadata.xml') + if pacbio_metadata_fps: + run_info = pacbio_metadata_fps[0] + tree = ET.parse(run_info) + mtag = tree.getroot().tag.split('}')[0] + '}' + instrument_text = ( + f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') + run = ET.parse(run_info).find(instrument_text) + text = run.attrib['TimeStampedName'] + else: + raise ValueError(f"'{run_info}' doesn't exist") + else: + text = ET.parse(run_info).find(instrument_text).text - with open(run_info) as f: - # Instrument element should appear in all valid RunInfo.xml - # files. - return ET.fromstring(f.read()).find('Run/Instrument').text + return text @staticmethod def get_instrument_type(run_directory): - instrument_id = basename(run_directory) + instrument_id = InstrumentUtils._get_instrument_id(run_directory) return get_model_by_instrument_id( instrument_id, model_key=PROFILE_NAME_KEY) @@ -49,13 +63,23 @@ def get_instrument_type(run_directory): def _get_date(run_directory): run_info = join(run_directory, 'RunInfo.xml') + # if RunInfo.xml doesn't exist, this might be a PacBio + # folder, let's check for it if not exists(run_info): - raise ValueError(f"'{run_info}' doesn't exist") - - with open(run_info) as f: - # Date element should appear in all valid RunInfo.xml - # files. 
- date_string = ET.fromstring(f.read()).find('Run/Date').text + pacbio_metadata_fps = glob( + f'{run_directory}/*/metadata/*.metadata.xml') + if pacbio_metadata_fps: + run_info = pacbio_metadata_fps[0] + tree = ET.parse(run_info) + mtag = tree.getroot().tag.split('}')[0] + '}' + instrument_text = ( + f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') + run = ET.parse(run_info).find(instrument_text) + date_string = run.attrib['CreatedAt'].split('T')[0] + else: + raise ValueError(f"'{run_info}' doesn't exist") + else: + date_string = ET.parse(run_info).find('Run/Date').text # date is recorded in RunInfo.xml in various formats. Iterate # through all known formats until the date is properly parsed. @@ -64,7 +88,7 @@ def _get_date(run_directory): # UTC time w/out confirming the machines were actually set for # and/or reporting UTC time. formats = ["%y%m%d", "%Y-%m-%dT%H:%M:%s", "%Y-%m-%dT%H:%M:%SZ", - "%m/%d/%Y %I:%M:%S %p"] + "%m/%d/%Y %I:%M:%S %p", "%Y-%m-%d",] for format in formats: try: diff --git a/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml b/tests/data/sample_run_directories/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml similarity index 84% rename from tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml rename to tests/data/sample_run_directories/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml index d4ba2c2d..8bc3a16c 100644 --- a/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml +++ b/tests/data/sample_run_directories/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml @@ -4,8 +4,8 @@ Default 2025-01-01 - + - \ No newline at end of file + + diff --git a/tests/test_Pipeline.py b/tests/test_Pipeline.py index f1f69e2f..ebe4a495 100644 --- a/tests/test_Pipeline.py +++ b/tests/test_Pipeline.py @@ -2431,7 +2431,10 @@ def test_instrument_utils(self): 'date': '2023-12-15'}, '150629_K1001_0511_AH5L7GBCXX': {'id': 'K1001', 'type': 'HiSeq 4000', - 'date': '2015-06-29'}} + 'date': '2015-06-29'}, + 'r11111_20250101_111111': {'id': 'r11111', + 'type': 'Revio', + 'date': '2025-01-01'}} run_directories = [] for root, dirs, files in walk(self.path('sample_run_directories')): From 6bf358678399b99711a5c8e02424cec85285d5c9 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 2 Sep 2025 09:53:40 -0600 Subject: [PATCH 04/42] rm test_required_file_checks --- .../pacbio/good_pacbio_metagv10.csv | 27 +++++++ tests/test_Pipeline.py | 73 ------------------- 2 files changed, 27 insertions(+), 73 deletions(-) create mode 100644 tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv new file mode 100644 index 00000000..f2407be1 --- /dev/null +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -0,0 +1,27 @@ +[Header],,,,,, +SheetType,pacbio_metag,,,,, +SheetVersion,10,,,,, +Investigator Name,Knight,,,,, +Experiment Name,RKL_experiment,,,,, +Date,2025-08-29,,,,, +Assay,Metagenomic,,,,, +Description,,,,,, +,,,,,, +[Data],,,,,, +Sample_ID,Sample_Name,Sample_Plate,well_id_384,Sample_Project,Well_description,Lane +bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 +bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 +bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 +,,,,,, 
+[Bioinformatics],,,,,, +Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates, +MyProject_6123,6123,False,some protocol,some description,False, +,,,,,, +[Contact],,,,,, +Sample_Project,Email,,,,, +MyProject_6123,foo@bar.org,,,,, +,,,,,, +[SampleContext],,,,,, +sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,, +5B,control blank,6123,,,, +,,,,,, diff --git a/tests/test_Pipeline.py b/tests/test_Pipeline.py index ebe4a495..b1cc502e 100644 --- a/tests/test_Pipeline.py +++ b/tests/test_Pipeline.py @@ -207,43 +207,6 @@ def test_get_orig_names_from_sheet_with_replicates(self): self.assertEqual(set(obs), exp) - def test_required_file_checks(self): - # begin this test by deleting the RunInfo.txt file and verifying that - # Pipeline object will raise an Error. - self.delete_runinfo_file() - - with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" - "' is not present."): - Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, - self.output_file_path, - self.qiita_id, Pipeline.METAGENOMIC_PTYPE) - - # delete RTAComplete.txt and recreate RunInfo.txt file to verify that - # an Error is raised when only RTAComplete.txt is missing. - self.delete_rtacomplete_file() - self.create_runinfo_file() - - with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" - ".txt' is not present."): - Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, - self.output_file_path, - self.qiita_id, Pipeline.METAGENOMIC_PTYPE) - - # make RunInfo.xml file unreadable and verify that Pipeline object - # raises the expected Error. - self.create_rtacomplete_file() - self.make_runinfo_file_unreadable() - - with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, bu" - "t not readable"): - Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, - self.output_file_path, - self.qiita_id, Pipeline.METAGENOMIC_PTYPE) - self.make_runinfo_file_readable() - def test_creation(self): # Pipeline should assert due to config_file with self.assertRaises(PipelineError) as e: @@ -1772,42 +1735,6 @@ def delete_rtacomplete_file(self): # make method idempotent pass - def test_required_file_checks(self): - # begin this test by deleting the RunInfo.txt file and verifying that - # Pipeline object will raise an Error. - self.delete_runinfo_file() - - with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" - "' is not present."): - Pipeline(self.good_config_file, self.good_run_id, - self.good_mapping_file_path, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - - # delete RTAComplete.txt and recreate RunInfo.txt file to verify that - # an Error is raised when only RTAComplete.txt is missing. - self.delete_rtacomplete_file() - self.create_runinfo_file() - - with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" - ".txt' is not present."): - Pipeline(self.good_config_file, self.good_run_id, - self.good_mapping_file_path, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - - # make RunInfo.xml file unreadable and verify that Pipeline object - # raises the expected Error. 
- self.create_rtacomplete_file() - self.make_runinfo_file_unreadable() - - with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, " - "but not readable"): - Pipeline(self.good_config_file, self.good_run_id, - self.good_mapping_file_path, self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - self.make_runinfo_file_readable() - def test_creation(self): # Pipeline should assert due to config_file with self.assertRaises(PipelineError) as e: From 411a1195274e458113df2f7a3185d7d0ee9b6397 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 2 Sep 2025 11:53:53 -0600 Subject: [PATCH 05/42] m11111_20250101_111111_s4 --- .../metadata/m11111_20250101_111111_s4.metadata.xml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml diff --git a/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml b/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml new file mode 100644 index 00000000..8bc3a16c --- /dev/null +++ b/tests/data/r11111_20250101_111111/1_A01/metadata/m11111_20250101_111111_s4.metadata.xml @@ -0,0 +1,11 @@ + + + + Default + 2025-01-01 + + + + + + From e8a2d0b9d50486d2c5844f0a7c91b22729beee59 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 2 Sep 2025 16:09:05 -0600 Subject: [PATCH 06/42] add conda --- src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/sequence_processing_pipeline/ConvertJob.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index 3036755b..96040f16 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -48,7 +48,7 @@ def __init__(self, **kwargs): for sample in self.pipeline.sample_sheet.samples] df = pd.DataFrame(samples) sample_list_fp = f"{self.kwargs['output_dir']}/sample_list.tsv" - df.to_csv(sample_list_fp, sep='t') + df.to_csv(sample_list_fp, sep='\t', header=False, index=False) self.master_qiita_job_id = self.kwargs['job_id'] diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index e67227f4..4c6093ff 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -456,16 +456,15 @@ def _generate_job_script(self): lines.append("set -x") lines.append('date') lines.append('hostname') - lines.append(f'cd {self.root_dir}') - - if self.modules_to_load: - lines.append("module load " + ' '.join(self.modules_to_load)) + lines.append('conda activate klp-2025-05') + lines.append(f'cd {self.output_path}') - lines.append(f"sample_list={self.root_dir}/sample_list.tsv") + lines.append("sample_list=${PWD}/sample_list.tsv") lines.append( 'while read bc sn pn; do echo ' '"bam2fastq -j 1 -o ${sn}.fastq.gz -c 9 ' - '${pn}/${Run_prefix}.hifi_reads.${bn}.bam"; ' + f'{self.root_dir}' + '/*/hifi_reads/*{bc}*.bam"; ' 'fqtools count ${sn}.fastq.gz > ${sn}.counts.txt;' 'done < ${sample_list}') From 5e16ab11bfd4a064fc74f701983860bc20c39ba4 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 06:47:08 -0600 Subject: [PATCH 07/42] pacbio_generate_bam2fastq_commands --- pyproject.toml | 1 + src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/qp_klp/scripts/pacbio_commands.py | 49 +++++++++++++++++++ .../ConvertJob.py | 10 ++-- 4 files changed, 54 insertions(+), 8 deletions(-) create mode 100755 
src/qp_klp/scripts/pacbio_commands.py diff --git a/pyproject.toml b/pyproject.toml index fb74ed59..64a74943 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,3 +59,4 @@ dependencies = [ configure_klp = "qp_klp.scripts.configure_klp:config" start_klp = "qp_klp.scripts.start_klp:execute" demux = "sequence_processing_pipeline.scripts.cli:demux" +pacbio_generate_bam2fastq_commands = "qp_klp.scripts.pacbio_commands:generate_bam2fastq_commands" diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index 96040f16..aae032f8 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -48,7 +48,7 @@ def __init__(self, **kwargs): for sample in self.pipeline.sample_sheet.samples] df = pd.DataFrame(samples) sample_list_fp = f"{self.kwargs['output_dir']}/sample_list.tsv" - df.to_csv(sample_list_fp, sep='\t', header=False, index=False) + df.to_csv(sample_list_fp, sep='\t', index=False) self.master_qiita_job_id = self.kwargs['job_id'] diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py new file mode 100755 index 00000000..611c0e94 --- /dev/null +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ----------------------------------------------------------------------------- +import click +import pandas as pd +from glob import glob +from os import makedirs + + +@click.command() +@click.argument('sample_list', required=True) +@click.argument('run_folder', required=True) +@click.argument('outdir', required=True) +def generate_bam2fastq_commands(sample_list, run_folder, outdir): + """Generates the bam2fastq commands""" + df = pd.read_csv(sample_list) + files = {f.split('.')[-2]: f + for f in glob(f'{run_folder}/*/hifi_reads/*.bam')} + + makedirs(outdir, exist_ok=True) + + commands, missing_files = [], [] + for row in df.iterrows(): + bc = row['barcode'] + sn = row['sample_name'] + pn = row['project_name'] + if bc not in files: + missing_files.append(bc) + continue + od = f'{outdir}/{pn}' + makedirs(od, exist_ok=True) + cmd = (f'bam2fastq -j 1 -o {od}/{sn} -c 9 ' + f'{files[bc]}; ' + f'fqtools count {od}/{sn}.fastq.gz > ' + f'{od}/{sn}.counts.txt') + commands.append(cmd) + + if missing_files: + raise ValueError( + f'{run_folder} is missing barcodes: {missing_files}') + + for cmd in commands: + print(cmd) diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 4c6093ff..17b7ca9e 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -459,14 +459,10 @@ def _generate_job_script(self): lines.append('conda activate klp-2025-05') lines.append(f'cd {self.output_path}') - lines.append("sample_list=${PWD}/sample_list.tsv") lines.append( - 'while read bc sn pn; do echo ' - '"bam2fastq -j 1 -o ${sn}.fastq.gz -c 9 ' - f'{self.root_dir}' - '/*/hifi_reads/*{bc}*.bam"; ' - 'fqtools count ${sn}.fastq.gz > ${sn}.counts.txt;' - 'done < ${sample_list}') + f'pacbio_generate_bam2fastq_commands ../sample_list.tsv ' + f'{self.root_dir} {self.output_path} ' + f'| parallel -j {self.node_count}') with open(self.job_script_path, 'w') as f: for line in lines: From 
827a37238a7f114b68e690f98af797dc1d59b348 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 07:43:58 -0600 Subject: [PATCH 08/42] self.node_count -> self.nprocs --- src/qp_klp/scripts/pacbio_commands.py | 9 +++++---- src/sequence_processing_pipeline/ConvertJob.py | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index 611c0e94..ff0c0369 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -19,14 +19,14 @@ @click.argument('outdir', required=True) def generate_bam2fastq_commands(sample_list, run_folder, outdir): """Generates the bam2fastq commands""" - df = pd.read_csv(sample_list) + df = pd.read_csv(sample_list, sep='\t') files = {f.split('.')[-2]: f for f in glob(f'{run_folder}/*/hifi_reads/*.bam')} makedirs(outdir, exist_ok=True) commands, missing_files = [], [] - for row in df.iterrows(): + for _, row in df.iterrows(): bc = row['barcode'] sn = row['sample_name'] pn = row['project_name'] @@ -35,9 +35,10 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir): continue od = f'{outdir}/{pn}' makedirs(od, exist_ok=True) - cmd = (f'bam2fastq -j 1 -o {od}/{sn} -c 9 ' + fn = f'{od}/{sn}_R1' + cmd = (f'bam2fastq -j 1 -o {fn} -c 9 ' f'{files[bc]}; ' - f'fqtools count {od}/{sn}.fastq.gz > ' + f'fqtools count {fn}.fastq.gz > ' f'{od}/{sn}.counts.txt') commands.append(cmd) diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 17b7ca9e..a752d67d 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -454,15 +454,17 @@ def _generate_job_script(self): lines.append(f"#SBATCH --mem-per-cpu {self.pmem}") lines.append("set -x") + lines.append('set -e') lines.append('date') lines.append('hostname') + lines.append('source ~/.bash_profile') lines.append('conda activate klp-2025-05') lines.append(f'cd {self.output_path}') lines.append( f'pacbio_generate_bam2fastq_commands ../sample_list.tsv ' f'{self.root_dir} {self.output_path} ' - f'| parallel -j {self.node_count}') + f'| parallel -j {self.nprocs}') with open(self.job_script_path, 'w') as f: for line in lines: From 551bfd18edd15bbbd1e45de2f18b409a10ce014c Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 08:11:51 -0600 Subject: [PATCH 09/42] generate_sequence_counts --- src/qp_klp/Protocol.py | 51 ++++++++----------- .../pacbio_metagenomic.json | 21 +++----- 2 files changed, 29 insertions(+), 43 deletions(-) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 71e740e9..41e5dc46 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -432,38 +432,29 @@ def convert_raw_to_fastq(self): return failed_samples def generate_sequence_counts(self): - # config = self.pipeline.get_software_configuration('tell-seq') - - # files_to_count_path = join(self.pipeline.output_path, - # 'files_to_count.txt') - - # with open(files_to_count_path, 'w') as f: - # for root, _, files in walk(self.raw_fastq_files_path): - # for _file in files: - # if determine_orientation(_file) in ['R1', 'R2']: - # print(join(root, _file), file=f) - - # job = SeqCountsJob(self.pipeline.run_dir, - # self.pipeline.output_path, - # config['queue'], - # config['nodes'], - # config['wallclock_time_in_minutes'], - # config['normcount_mem_limit'], - # config['modules_to_load'], - # self.master_qiita_job_id, - # config['job_max_array_length'], - # files_to_count_path, - 
# self.pipeline.get_sample_sheet_path(), - # cores_per_task=config['tellread_cores']) - - # if 'SeqCountsJob' not in self.skip_steps: - # job.run(callback=self.job_callback) - - # if successful, set self.reports_path + # for other isntances of generate_sequence_counts in other objects + # the sequence counting needs to be done; however, for PacBio we + # already have done it and just need to merge the results. + gz_files = glob(f'{self.raw_fastq_files_path}/*/*.fastq.gz') + data, missing_files = [], [] + + for gzf in gz_files: + cf = gzf.replace('.fastq.gz', '.counts.txt') + sn = basename(cf).replace('.counts.txt', '') + if not exists(cf): + missing_files.append(sn) + continue + with open(cf, 'r') as fh: + counts = fh.read().strip() + data.append({'SampleID': sn, '# Reads': counts}) + + if missing_files: + raise ValueError(f'Missing count files: {missing_files}') + + df = pd.DataFrame(data) self.reports_path = join(self.pipeline.output_path, 'SeqCounts.csv') - open(self.reports_path, 'w').write( - 'SampleID,# Reads\nA1,100') + df.to_csv(self.reports_path, index=False) def integrate_results(self): pass diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json index f6a88a5a..4e4eb7cb 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -8,23 +8,20 @@ "nprocs": 16, "queue": "qiita", "wallclock_time_in_minutes": 216, - "modules_to_load": [ - "bclconvert_3.7.5" - ], - "executable_path": "bcl-convert", - "per_process_memory_limit": "10gb" + "modules_to_load": [], + "executable_path": "", + "per_process_memory_limit": "1gb" }, "nu-qc": { "nodes": 1, "cpus_per_task": 8, "queue": "qiita", "wallclock_time_in_minutes": 240, - "minimap2_databases": ["/databases/minimap2/db_1.mmi", "/databases/minimap2/db_2.mmi"], - "modules_to_load": [ - "fastp_0.20.1", - "samtools_1.12", - "minimap2_2.18" + "minimap2_databases": [ + "/databases/minimap2/db_1.mmi", + "/databases/minimap2/db_2.mmi" ], + "modules_to_load": ["fastp_0.20.1", "samtools_1.12", "minimap2_2.18"], "fastp_executable_path": "fastp", "minimap2_executable_path": "minimap2", "samtools_executable_path": "samtools", @@ -49,9 +46,7 @@ "queue": "qiita", "nthreads": 16, "wallclock_time_in_minutes": 60, - "modules_to_load": [ - "fastqc_0.11.5" - ], + "modules_to_load": ["fastqc_0.11.5"], "fastqc_executable_path": "fastqc", "multiqc_executable_path": "multiqc", "multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml", From 6311723e593894282e327b146e8c0bf2f60b0150 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 11:24:00 -0600 Subject: [PATCH 10/42] init changes to def _inject_data(self, wf): --- src/qp_klp/scripts/pacbio_commands.py | 2 +- .../pacbio_metagenomic.json | 2 +- tests/test_PacBioWorflow.py | 70 ++++++------------- 3 files changed, 24 insertions(+), 50 deletions(-) diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index ff0c0369..5671909e 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -39,7 +39,7 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir): cmd = (f'bam2fastq -j 1 -o {fn} -c 9 ' f'{files[bc]}; ' f'fqtools count {fn}.fastq.gz > ' - f'{od}/{sn}.counts.txt') + f'{fn}.counts.txt') commands.append(cmd) if missing_files: diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json 
b/tests/data/configuration_profiles/pacbio_metagenomic.json index 4e4eb7cb..1e4a60ce 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -5,7 +5,7 @@ "configuration": { "pacbio_convert": { "nodes": 1, - "nprocs": 16, + "nprocs": 28, "queue": "qiita", "wallclock_time_in_minutes": 216, "modules_to_load": [], diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 12005ae4..9152c685 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -12,8 +12,10 @@ from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) from qp_klp.Assays import ASSAY_NAME_METAGENOMIC from shutil import rmtree +from pathlib import Path from qiita_client.testing import PluginTestCase from sequence_processing_pipeline.PipelineError import PipelineError +import pandas as pd class WorkflowFactoryTests(PluginTestCase): @@ -40,30 +42,19 @@ def _inject_data(self, wf): '''This is a helper method for testing that all the steps are run when testing wf.execute_pipeline() ''' - pass - # samples = wf.pipeline.sample_sheet.samples - # tellseq = True - # if 'index' in dict(samples[0]).keys(): - # tellseq = False - - # # inject Convert/NuQC/FastQC/MultiQC/GenPrepFileJob files so we can - # # move down the pipeline; first let's create the base folders - # if tellseq: - # reports_dir = f'{self.output_dir}/TellReadJob/' - # convert_dir = f'{self.output_dir}/TRIntegrateJob/integrated' - # makedirs(convert_dir, exist_ok=True) - # else: - # convert_dir = f'{self.output_dir}/ConvertJob' - # reports_dir = f'{self.output_dir}/ConvertJob/Reports' - # makedirs(convert_dir, exist_ok=True) - # Path(f'{convert_dir}/job_completed').touch() + samples = wf.pipeline.sample_sheet.samples + + convert_dir = f'{self.output_dir}/ConvertJob' + reports_dir = f'{self.output_dir}/ConvertJob/Reports' + makedirs(convert_dir, exist_ok=True) + makedirs(reports_dir, exist_ok=True) + Path(f'{convert_dir}/job_completed').touch() # tellread_dir = f'{self.output_dir}/TellReadJob' # nuqc_dir = f'{self.output_dir}/NuQCJob' # fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' # multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' # genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' # '211021_A00000_0000_SAMPLE/') - # makedirs(reports_dir, exist_ok=True) # makedirs(nuqc_dir, exist_ok=True) # makedirs(fastqc_dir, exist_ok=True) # makedirs(multiqc_dir, exist_ok=True) @@ -77,29 +68,17 @@ def _inject_data(self, wf): # exist_ok=True) # # then loop over samples and stage all fastq.gz files - # dstats = [] - # for i, sample in enumerate(samples): - # rp = sample["Sample_ID"] - # sp = sample["Sample_Project"] - - # # ConvertJob - # if tellseq: - # bid = sample['barcode_id'] - # dstats.append( - # {'Lane': sample['Lane'], 'SampleID': rp, - # 'Sample_Project': sp, 'Barcode': bid, - # '# Reads': 2}) - # dname = f'{convert_dir}/{sp}' - # copyfile(self.gz_source, f'{dname}/{bid}.R1.fastq.gz') - # copyfile(self.gz_source, f'{dname}/{bid}.R2.fastq.gz') - # else: - # dstats.append( - # {'Lane': sample['Lane'], 'SampleID': rp, - # 'Sample_Project': sp, 'Index': sample['index'], - # '# Reads': 2}) - # dname = f'{convert_dir}/{sp}' - # Path(f'{dname}/{rp}_L001_R1_001.fastq.gz').touch() - # Path(f'{dname}/{rp}_L001_R2_001.fastq.gz').touch() + dstats = [] + for i, sample in enumerate(samples): + rp = sample["Sample_ID"] + sp = sample["Sample_Project"] + dstats.append( + {'Lane': sample['Lane'], 'SampleID': rp, + 'Sample_Project': sp, 'Index': sample['index'], + '# 
Reads': 2}) + dname = f'{convert_dir}/{sp}' + Path(f'{dname}/{rp}_L001_R1_001.fastq.gz').touch() + Path(f'{dname}/{rp}_L001_R2_001.fastq.gz').touch() # # NuQCJob # dname = f'{nuqc_dir}/filtered_sequences/{sp}' @@ -111,13 +90,8 @@ def _inject_data(self, wf): # Path(f'{gprep_base}_L001_R1_001.fastq.gz').touch() # Path(f'{gprep_base}_L001_R2_001.fastq.gz').touch() - # # this is required by the Convert step - # if tellseq: - # pd.DataFrame(dstats).set_index('SampleID').to_csv( - # f'{tellread_dir}/sample_index_list_TellReadJob.txt') - # else: - # pd.DataFrame(dstats).set_index('SampleID').to_csv( - # f'{reports_dir}/Demultiplex_Stats.csv') + pd.DataFrame(dstats).set_index('SampleID').to_csv( + f'{reports_dir}/Demultiplex_Stats.csv') # # generating the "*.completed" files # for i in range(len(samples)*3): From 40afadf10ed332ac577670c5b9da6340badfd651 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 13:11:05 -0600 Subject: [PATCH 11/42] read_length --- src/qp_klp/Assays.py | 3 ++- src/qp_klp/Protocol.py | 3 +++ src/qp_klp/WorkflowFactory.py | 3 +++ src/sequence_processing_pipeline/NuQCJob.py | 22 +++++++++++++++---- .../pacbio_metagenomic.json | 2 +- tests/test_PacBioWorflow.py | 5 ++--- 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/qp_klp/Assays.py b/src/qp_klp/Assays.py index 3ad6e873..e2fc0ede 100644 --- a/src/qp_klp/Assays.py +++ b/src/qp_klp/Assays.py @@ -517,7 +517,8 @@ def qc_reads(self): bucket_size=config['bucket_size'], length_limit=config['length_limit'], cores_per_task=config['cores_per_task'], - files_regex=self.files_regex) + files_regex=self.files_regex, + read_length=self.read_length) if 'NuQCJob' not in self.skip_steps: job.run(callback=self.job_callback) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 41e5dc46..1b0eb71e 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -86,6 +86,7 @@ class Illumina(Protocol): # here) both RTAComplete.txt and RunInfo.xml should reside in the root of # the run directory. 
required_files = ['RTAComplete.txt', 'RunInfo.xml'] + read_length = 'short' def __init__(self) -> None: super().__init__() @@ -178,6 +179,7 @@ def generate_sequence_counts(self): class TellSeq(Protocol): protocol_type = PROTOCOL_NAME_TELLSEQ + read_length = 'short' def convert_raw_to_fastq(self): config = self.pipeline.get_software_configuration('tell-seq') @@ -395,6 +397,7 @@ def _post_process_file(self, fastq_file, mapping, lane): class PacBio(Protocol): protocol_type = PROTOCOL_NAME_PACBIO_SMRT + read_length = 'long' def convert_raw_to_fastq(self): config = self.pipeline.get_software_configuration('pacbio_convert') diff --git a/src/qp_klp/WorkflowFactory.py b/src/qp_klp/WorkflowFactory.py index 8d51cee6..870449e5 100644 --- a/src/qp_klp/WorkflowFactory.py +++ b/src/qp_klp/WorkflowFactory.py @@ -78,6 +78,9 @@ def generate_workflow(cls, **kwargs): "sample-sheet or a mapping-file.") for workflow in WorkflowFactory.WORKFLOWS: + if workflow.read_length not in {'short', 'long'}: + raise ValueError('Invalid read_length: ' + f'{workflow.read_length} for {workflow}') if workflow.assay_type == assay_type: if workflow.protocol_type == instrument_type: # return instantiated workflow object diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 640da755..fabc42eb 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -24,7 +24,8 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, samtools_path, modules_to_load, qiita_job_id, max_array_length, known_adapters_path, movi_path, gres_value, pmls_path, additional_fastq_tags, bucket_size=8, - length_limit=100, cores_per_task=4, files_regex='SPP'): + length_limit=100, cores_per_task=4, files_regex='SPP', + read_length='short'): """ Submit a slurm job where the contents of fastq_root_dir are processed using fastp, minimap2, and samtools. Human-genome sequences will be @@ -52,6 +53,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, :param additional_fastq_tags: A list of fastq tags to preserve during filtering. :param files_regex: the FILES_REGEX to use for parsing files + :param read_length: the FILES_REGEX to use for parsing files """ super().__init__(fastq_root_dir, output_path, @@ -417,6 +419,17 @@ def _generate_mmi_filter_cmds(self, working_dir): tags = " -T %s" % ','.join(self.additional_fastq_tags) t_switch = " -y" + if self.read_length == 'short': + minimap2_prefix = f'minimap2 -2 -ax sr{t_switch}' + minimap2_subfix = '-a' + samtools_params = '-f 12 -F 256' + elif self.read_length == 'long': + minimap2_prefix = 'minimap2 -2 -ax map-hifi' + minimap2_subfix = '--no-pairing' + samtools_params = '-f 4 -F 256' + else: + raise ValueError(f'minimap2 prefix not set for {self.read_length}') + for count, mmi_db_path in enumerate(self.mmi_file_paths): if count == 0: # prime initial state with unfiltered file and create first of @@ -432,9 +445,10 @@ def _generate_mmi_filter_cmds(self, working_dir): input = tmp_file1 output = tmp_file2 - cmds.append(f"minimap2 -2 -ax sr{t_switch} -t {cores_to_allocate} " - f"{mmi_db_path} {input} -a | samtools fastq -@ " - f"{cores_to_allocate} -f 12 -F 256{tags} > " + cmds.append(f"{minimap2_prefix} -t {cores_to_allocate} " + f"{mmi_db_path} {input} {minimap2_subfix} | " + "samtools fastq -@ " + f"{cores_to_allocate} {samtools_params}{tags} > " f"{output}") # rename the latest tmp file to the final output filename. 
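Note on the long-read branch above: for read_length == 'long', _generate_mmi_filter_cmds renders a single-ended HiFi mapping (minimap2 -2 -ax map-hifi ... --no-pairing) followed by a samtools pass that keeps only unmapped, i.e. non-host, reads and drops secondary alignments (-f 4 -F 256), instead of the paired short-read form (-ax sr ... -f 12 -F 256). A rough sketch of one rendered command, assuming a single reference .mmi, 16 cores per task and no additional fastq tags (the paths and core count here are illustrative, taken from the test profile rather than a real run):

    minimap2 -2 -ax map-hifi -t 16 /databases/minimap2/db_1.mmi seqs.interleaved.fastq --no-pairing | \
        samtools fastq -@ 16 -f 4 -F 256 > seqs.interleaved.filter_alignment.fastq
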
diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json index 1e4a60ce..19168ba3 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -9,7 +9,7 @@ "queue": "qiita", "wallclock_time_in_minutes": 216, "modules_to_load": [], - "executable_path": "", + "executable_path": "bam2fastq", "per_process_memory_limit": "1gb" }, "nu-qc": { diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 9152c685..437689c9 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -77,8 +77,7 @@ def _inject_data(self, wf): 'Sample_Project': sp, 'Index': sample['index'], '# Reads': 2}) dname = f'{convert_dir}/{sp}' - Path(f'{dname}/{rp}_L001_R1_001.fastq.gz').touch() - Path(f'{dname}/{rp}_L001_R2_001.fastq.gz').touch() + Path(f'{dname}/{rp}_R1_001.fastq.gz').touch() # # NuQCJob # dname = f'{nuqc_dir}/filtered_sequences/{sp}' @@ -118,7 +117,7 @@ def test_pacbio_metagenomic_workflow_creation(self): self.assertEqual(wf.protocol_type, PROTOCOL_NAME_PACBIO_SMRT) self.assertEqual(wf.assay_type, ASSAY_NAME_METAGENOMIC) - # self._inject_data(wf) + self._inject_data(wf) # ConvertJob/ConvertJob.sh # Metagenomic is a valid data type in the default qiita test From 52f4c533e7f3613b2f5487079410110004764df8 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 13:38:20 -0600 Subject: [PATCH 12/42] pmls_extra_parameters --- src/sequence_processing_pipeline/NuQCJob.py | 61 +++++++++++-------- .../templates/nuqc_job.sh | 6 +- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index fabc42eb..a68331bd 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -83,6 +83,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.pmls_path = pmls_path self.additional_fastq_tags = additional_fastq_tags self.audit_folders = ['filtered_sequences'] + self.read_length = read_length # for projects that use sequence_processing_pipeline as a dependency, # jinja_env must be set to sequence_processing_pipeline's root path, @@ -492,6 +493,14 @@ def _generate_job_script(self, max_bucket_size): # files can be created. (${jobd}) mmi_filter_cmds = self._generate_mmi_filter_cmds("${jobd}") + if self.read_length == 'short': + pmls_extra_parameters = '' + elif self.read_length == 'long': + pmls_extra_parameters = '"max" 21' + else: + raise ValueError( + f'pmls_extra_parameters not set for {self.read_length}') + with open(job_script_path, mode="w", encoding="utf-8") as f: # the job resources should come from a configuration file @@ -499,31 +508,33 @@ def _generate_job_script(self, max_bucket_size): # processing begins. mtl = ' '.join(self.modules_to_load) - f.write(template.render(job_name=job_name, - queue_name=self.queue_name, - # should be 4 * 24 * 60 = 4 days - wall_time_limit=self.wall_time_limit, - mem_in_gb=self.jmem, - # Note NuQCJob now maps node_count to - # SLURM -N parameter to act like other - # Job classes. 
- # self.node_count should be 1 - node_count=self.node_count, - # cores-per-task (-c) should be 4 - cores_per_task=self.cores_per_task, - knwn_adpt_path=self.known_adapters_path, - output_path=self.output_path, - html_path=html_path, - json_path=json_path, - demux_path=demux_path, - temp_dir=self.temp_dir, - splitter_binary=splitter_binary, - modules_to_load=mtl, - length_limit=self.length_limit, - gres_value=self.gres_value, - movi_path=self.movi_path, - mmi_filter_cmds=mmi_filter_cmds, - pmls_path=self.pmls_path)) + f.write(template.render( + job_name=job_name, + queue_name=self.queue_name, + # should be 4 * 24 * 60 = 4 days + wall_time_limit=self.wall_time_limit, + mem_in_gb=self.jmem, + # Note NuQCJob now maps node_count to + # SLURM -N parameter to act like other + # Job classes. + # self.node_count should be 1 + node_count=self.node_count, + # cores-per-task (-c) should be 4 + cores_per_task=self.cores_per_task, + knwn_adpt_path=self.known_adapters_path, + output_path=self.output_path, + html_path=html_path, + json_path=json_path, + demux_path=demux_path, + temp_dir=self.temp_dir, + splitter_binary=splitter_binary, + modules_to_load=mtl, + length_limit=self.length_limit, + gres_value=self.gres_value, + movi_path=self.movi_path, + mmi_filter_cmds=mmi_filter_cmds, + pmls_extra_parameters=pmls_extra_parameters, + pmls_path=self.pmls_path)) return job_script_path diff --git a/src/sequence_processing_pipeline/templates/nuqc_job.sh b/src/sequence_processing_pipeline/templates/nuqc_job.sh index 7ada81f4..962bba24 100644 --- a/src/sequence_processing_pipeline/templates/nuqc_job.sh +++ b/src/sequence_processing_pipeline/templates/nuqc_job.sh @@ -125,10 +125,10 @@ function mux-runner () { --index /scratch/movi_hg38_chm13_hprc94 \ --read ${seq_reads_filter_alignment} \ --stdout | gzip > ${jobd}/seqs.movi.txt.gz - - python {{pmls_path}} <(zcat ${jobd}/seqs.movi.txt.gz) | \ + + python {{pmls_path}} <(zcat ${jobd}/seqs.movi.txt.gz) {{pmls_extra_parameters}} | \ seqtk subseq ${seq_reads_filter_alignment} - > ${jobd}/seqs.final.fastq - + {{splitter_binary}} ${jobd}/seqs.final.fastq \ ${jobd}/reads.r1.fastq ${delimiter} ${r1_tag} & {{splitter_binary}} ${jobd}/seqs.final.fastq \ From e18cf5df8fe59b714ae4d960bd3ea5c7313fef1b Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 14:07:33 -0600 Subject: [PATCH 13/42] rm index in _inject_data --- src/qp_klp/Protocol.py | 3 +- src/qp_klp/scripts/pacbio_commands.py | 5 +- src/sequence_processing_pipeline/Commands.py | 78 ++++++++++++------- .../ConvertJob.py | 8 +- src/sequence_processing_pipeline/NuQCJob.py | 6 +- tests/test_PacBioWorflow.py | 6 +- 6 files changed, 68 insertions(+), 38 deletions(-) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 1b0eb71e..a6ba180b 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -443,7 +443,7 @@ def generate_sequence_counts(self): for gzf in gz_files: cf = gzf.replace('.fastq.gz', '.counts.txt') - sn = basename(cf).replace('.counts.txt', '') + sn = basename(cf).replace('_R1.counts.txt', '') if not exists(cf): missing_files.append(sn) continue @@ -456,6 +456,7 @@ def generate_sequence_counts(self): df = pd.DataFrame(data) self.reports_path = join(self.pipeline.output_path, + 'ConvertJob', 'SeqCounts.csv') df.to_csv(self.reports_path, index=False) diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index 5671909e..37dd6048 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -17,7 +17,8 @@ 
@click.argument('sample_list', required=True) @click.argument('run_folder', required=True) @click.argument('outdir', required=True) -def generate_bam2fastq_commands(sample_list, run_folder, outdir): +@click.argument('threads', required=True, default=1) +def generate_bam2fastq_commands(sample_list, run_folder, outdir, threads): """Generates the bam2fastq commands""" df = pd.read_csv(sample_list, sep='\t') files = {f.split('.')[-2]: f @@ -36,7 +37,7 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir): od = f'{outdir}/{pn}' makedirs(od, exist_ok=True) fn = f'{od}/{sn}_R1' - cmd = (f'bam2fastq -j 1 -o {fn} -c 9 ' + cmd = (f'bam2fastq -j {threads} -o {fn} -c 9 ' f'{files[bc]}; ' f'fqtools count {fn}.fastq.gz > ' f'{fn}.counts.txt') diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index f2a196b2..cfa60c2c 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -6,12 +6,13 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, - batch_prefix): + batch_prefix, allow_fwd_only=False): '''Partitions input fastqs to coarse bins :param data_location_path: Path to the ConvertJob directory. :param max_file_list_size_in_gb: Upper threshold for file-size. :param batch_prefix: Path + file-name prefix for output-files. + :param allow_fwd_only: ignore rev match, helpful for long reads. :return: The number of output-files created, size of largest bin. ''' # to prevent issues w/filenames like the ones below from being mistaken @@ -37,31 +38,56 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, bucket_size = 0 max_bucket_size = 0 - for a, b in iter_paired_files(fastq_paths): - r1_size = os.stat(a).st_size - r2_size = os.stat(b).st_size - - output_base = os.path.dirname(a).split('/')[-1] - if current_size + r1_size > max_size: - # bucket is full. - if bucket_size > max_bucket_size: - max_bucket_size = bucket_size - - # reset bucket_size. - bucket_size = r1_size + r2_size - - if fp is not None: - fp.close() - - split_offset += 1 - current_size = r1_size - fp = open(batch_prefix + '-%d' % split_offset, 'w') - else: - # add to bucket_size - bucket_size += r1_size + r2_size - current_size += r1_size - - fp.write("%s\t%s\t%s\n" % (a, b, output_base)) + if allow_fwd_only: + for a in fastq_paths: + r1_size = os.stat(a).st_size + output_base = os.path.dirname(a).split('/')[-1] + if current_size + r1_size > max_size: + # bucket is full. + if bucket_size > max_bucket_size: + max_bucket_size = bucket_size + + # reset bucket_size. + bucket_size = r1_size + + if fp is not None: + fp.close() + + split_offset += 1 + current_size = r1_size + fp = open(batch_prefix + '-%d' % split_offset, 'w') + else: + # add to bucket_size + bucket_size += r1_size + current_size += r1_size + + fp.write("%s\t%s\n" % (a, output_base)) + else: + for a, b in iter_paired_files(fastq_paths): + r1_size = os.stat(a).st_size + r2_size = os.stat(b).st_size + + output_base = os.path.dirname(a).split('/')[-1] + if current_size + r1_size > max_size: + # bucket is full. + if bucket_size > max_bucket_size: + max_bucket_size = bucket_size + + # reset bucket_size. 
+ bucket_size = r1_size + r2_size + + if fp is not None: + fp.close() + + split_offset += 1 + current_size = r1_size + fp = open(batch_prefix + '-%d' % split_offset, 'w') + else: + # add to bucket_size + bucket_size += r1_size + r2_size + current_size += r1_size + + fp.write("%s\t%s\t%s\n" % (a, b, output_base)) if fp is not None: fp.close() diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index a752d67d..3772ae86 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -433,8 +433,10 @@ def _generate_job_script(self): Generate a Torque job script for processing supplied root_directory. :return: The path to the newly-created job-script. """ - lines = [] + threads_per_job = 2 + parallel_jobs = int(self.nprocs/threads_per_job) + lines = [] lines.append("#!/bin/bash") lines.append(f"#SBATCH --job-name {self.qiita_job_id}_{self.job_name}") lines.append(f"#SBATCH -p {self.queue_name}") @@ -463,8 +465,8 @@ def _generate_job_script(self): lines.append( f'pacbio_generate_bam2fastq_commands ../sample_list.tsv ' - f'{self.root_dir} {self.output_path} ' - f'| parallel -j {self.nprocs}') + f'{self.root_dir} {self.output_path} {threads_per_job}' + f'| parallel -j {parallel_jobs}') with open(self.job_script_path, 'w') as f: for line in lines: diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index a68331bd..7b9288da 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -239,9 +239,9 @@ def run(self, callback=None): batch_count = 0 max_size = 0 else: - batch_count, max_size = split_similar_size_bins(self.root_dir, - self.bucket_size, - batch_location) + batch_count, max_size = split_similar_size_bins( + self.root_dir, self.bucket_size, batch_location, + self.read_length == 'long') job_script_path = self._generate_job_script(max_size) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 437689c9..8ed636b0 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -73,9 +73,9 @@ def _inject_data(self, wf): rp = sample["Sample_ID"] sp = sample["Sample_Project"] dstats.append( - {'Lane': sample['Lane'], 'SampleID': rp, - 'Sample_Project': sp, 'Index': sample['index'], - '# Reads': 2}) + {'SampleID': rp, + 'Sample_Project': sp, + '# Reads': 2}) dname = f'{convert_dir}/{sp}' Path(f'{dname}/{rp}_R1_001.fastq.gz').touch() From 0734a6ef3c81d1c4f1b8faa321482968a49d5b9d Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 15:10:47 -0600 Subject: [PATCH 14/42] dstats --- tests/test_PacBioWorflow.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 8ed636b0..46327cfd 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -70,14 +70,11 @@ def _inject_data(self, wf): # # then loop over samples and stage all fastq.gz files dstats = [] for i, sample in enumerate(samples): - rp = sample["Sample_ID"] + sn = sample['Sample_Name'], sp = sample["Sample_Project"] - dstats.append( - {'SampleID': rp, - 'Sample_Project': sp, - '# Reads': 2}) + dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' - Path(f'{dname}/{rp}_R1_001.fastq.gz').touch() + Path(f'{dname}/{sn}_R1.fastq.gz').touch() # # NuQCJob # dname = f'{nuqc_dir}/filtered_sequences/{sp}' From 281bf7578968a7a9ad85d72f9b4a181f9360be8c Mon Sep 17 00:00:00 2001 
From: Antonio Gonzalez Date: Wed, 3 Sep 2025 16:42:32 -0600 Subject: [PATCH 15/42] nuqc_job_single.sh --- .../ConvertJob.py | 2 +- src/sequence_processing_pipeline/NuQCJob.py | 3 +- .../templates/nuqc_job_single.sh | 171 ++++++++++++++++++ .../pacbio/good_pacbio_metagv10.csv | 2 +- 4 files changed, 175 insertions(+), 3 deletions(-) create mode 100644 src/sequence_processing_pipeline/templates/nuqc_job_single.sh diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py index 3772ae86..4e99a8aa 100644 --- a/src/sequence_processing_pipeline/ConvertJob.py +++ b/src/sequence_processing_pipeline/ConvertJob.py @@ -466,7 +466,7 @@ def _generate_job_script(self): lines.append( f'pacbio_generate_bam2fastq_commands ../sample_list.tsv ' f'{self.root_dir} {self.output_path} {threads_per_job}' - f'| parallel -j {parallel_jobs}') + f' | parallel -j {parallel_jobs}') with open(self.job_script_path, 'w') as f: for line in lines: diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 7b9288da..540c4762 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -468,7 +468,6 @@ def _generate_job_script(self, max_bucket_size): return None job_script_path = join(self.output_path, 'process_all_fastq_files.sh') - template = self.jinja_env.get_template("nuqc_job.sh") job_name = f'{self.qiita_job_id}_{self.job_name}' @@ -495,8 +494,10 @@ def _generate_job_script(self, max_bucket_size): if self.read_length == 'short': pmls_extra_parameters = '' + template = self.jinja_env.get_template("nuqc_job.sh") elif self.read_length == 'long': pmls_extra_parameters = '"max" 21' + template = self.jinja_env.get_template("nuqc_job_single.sh") else: raise ValueError( f'pmls_extra_parameters not set for {self.read_length}') diff --git a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh new file mode 100644 index 00000000..dc991fb6 --- /dev/null +++ b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh @@ -0,0 +1,171 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH -p {{queue_name}} +### wall-time-limit in minutes +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +### Note cores_per_task maps to fastp & minimap2 thread counts +### as well as sbatch -c. demux threads remains fixed at 1. +### Note -c set to 4 and thread counts set to 7 during testing. +#SBATCH -c {{cores_per_task}} +### Commented out for now, but there is a possibility it will be needed +### in the future. +###SBATCH --gres=node_jobs:{{gres_value}} +#SBATCH --constraint="amd" + + +echo "---------------" +echo "Run details:" +echo "$SLURM_JOB_NAME $SLURM_JOB_ID $SLURMD_NODENAME $SLURM_ARRAY_TASK_ID" +echo "---------------" + +if [[ -z "${SLURM_ARRAY_TASK_ID}" ]]; then + echo "Not operating within an array" + exit 1 +fi +if [[ -z ${PREFIX} ]]; then + echo "PREFIX is not set" + exit 1 +fi +if [[ -z ${OUTPUT} ]]; then + echo "OUTPUT is not set" + exit 1 +fi + +conda activate qp-knight-lab-processing-2022.03 +module load {{modules_to_load}} + +set -x +set -e +set -o pipefail + +export FILES=$(printf "%s-%d" ${PREFIX} ${SLURM_ARRAY_TASK_ID}) +if [[ ! -f ${FILES} ]]; then + logger ${FILES} not found + exit 1 +fi +# set a temp directory, make a new unique one under it and +# make sure we clean up as we're dumping to shm +# DO NOT do this casually. 
Only do a clean up like this if +# you know for sure TMPDIR is what you want. + +WKDIR=${OUTPUT}/ +TMPDIR=${OUTPUT} +export TMPDIR=${TMPDIR} +export TMPDIR=$(mktemp -d) +echo $TMPDIR + +mkdir -p ${WKDIR}/fastp_reports_dir/html +mkdir -p ${WKDIR}/fastp_reports_dir/json + +export ADAPTER_ONLY_OUTPUT=${OUTPUT}/only-adapter-filtered +mkdir -p ${ADAPTER_ONLY_OUTPUT} + +function cleanup { + echo "Removing $TMPDIR" + rm -fr $TMPDIR + unset TMPDIR +} +trap cleanup EXIT + +export delimiter=::MUX:: +export r1_tag=/1 +function mux-runner () { + n=$(wc -l ${FILES} | cut -f 1 -d" ") + + jobd=${TMPDIR} + id_map=${jobd}/id_map + seqs_reads=${jobd}/seqs.interleaved.fastq + seq_reads_filter_alignment=${jobd}/seqs.interleaved.filter_alignment.fastq + + for i in $(seq 1 ${n}) + do + line=$(head -n ${i} ${FILES} | tail -n 1) + r1=$(echo ${line} | cut -f 1 -d" ") + base=$(echo ${line} | cut -f 2 -d" ") + r1_name=$(basename ${r1} .fastq.gz) + r_adapter_only=${ADAPTER_ONLY_OUTPUT}/${r1_name}.interleave.fastq.gz + + s_name=$(basename "${r1}" | sed -r 's/\.fastq\.gz//') + html_name=$(echo "$s_name.html") + json_name=$(echo "$s_name.json") + + echo -e "${i}\t${r1_name}\t${base}" >> ${id_map} + + # movi, in the current version, works on the interleaved version of the + # fwd/rev reads so we are gonna take advantage fastp default output + # to minimize steps. Additionally, movi expects the input to not be + # gz, so we are not going to compress seqs_r1 + + fastp \ + -l {{length_limit}} \ + -i ${r1} \ + -w {{cores_per_task}} \ + --adapter_fasta {{knwn_adpt_path}} \ + --html {{html_path}}/${html_name} \ + --json {{json_path}}/${json_name} \ + --stdout | gzip > ${r_adapter_only} + + # multiplex and write adapter filtered data all at once + zcat ${r_adapter_only} | \ + sed -r "1~4s/^@(.*)/@${i}${delimiter}\1/" \ + >> ${seqs_reads} + done + + # minimap/samtools pair commands are now generated in NuQCJob._generate_mmi_filter_cmds() + # and passed to this template. + {{mmi_filter_cmds}} + + {{movi_path}} query \ + --index /scratch/movi_hg38_chm13_hprc94 \ + --read ${seq_reads_filter_alignment} \ + --stdout | gzip > ${jobd}/seqs.movi.txt.gz + + python {{pmls_path}} <(zcat ${jobd}/seqs.movi.txt.gz) {{pmls_extra_parameters}} | \ + seqtk subseq ${seq_reads_filter_alignment} - > ${jobd}/seqs.final.fastq + + {{splitter_binary}} ${jobd}/seqs.final.fastq \ + ${jobd}/reads.r1.fastq ${delimiter} ${r1_tag} & + wait + + # keep seqs.movi.txt and migrate it to NuQCJob directory. + mv ${jobd}/seqs.movi.txt.gz {{output_path}}/logs/seqs.movi.${SLURM_ARRAY_TASK_ID}.txt.gz +} +export -f mux-runner + + +function demux-runner () { + n_demux_jobs=${SLURM_CPUS_PER_TASK} + jobd=${TMPDIR} + id_map=${jobd}/id_map + seqs_r1=${jobd}/reads.r1.fastq.paired.fq + + id_map=${jobd}/id_map + if [[ ! -f ${id_map} ]]; then + echo "No samples..." 
+ return + fi + + for idx in $(seq 0 ${n_demux_jobs}) + do + python {{demux_path}} \ + --id-map ${id_map} \ + --infile <(cat ${seqs_r1}) \ + --output ${OUTPUT} \ + --task ${idx} \ + --maxtask ${n_demux_jobs} & + done + wait +} +export -f demux-runner + +mux-runner + +mkdir -p ${OUTPUT} + +echo "$(date) :: demux start" +demux-runner +echo "$(date) :: demux stop" + +touch ${OUTPUT}/${SLURM_JOB_NAME}.${SLURM_ARRAY_TASK_ID}.completed diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv index f2407be1..786c4b1f 100644 --- a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -8,7 +8,7 @@ Assay,Metagenomic,,,,, Description,,,,,, ,,,,,, [Data],,,,,, -Sample_ID,Sample_Name,Sample_Plate,well_id_384,Sample_Project,Well_description,Lane +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 From 1aa102cd0befe70c467414f7da2524acc17688a8 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 3 Sep 2025 17:13:25 -0600 Subject: [PATCH 16/42] rm extra , --- tests/test_PacBioWorflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 46327cfd..fd35c159 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -69,8 +69,8 @@ def _inject_data(self, wf): # # then loop over samples and stage all fastq.gz files dstats = [] - for i, sample in enumerate(samples): - sn = sample['Sample_Name'], + for sample in samples: + sn = sample['Sample_Name'] sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' From 01c862de03fe5b3373adb2a8f6c66ff64ed8f752 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 03:10:50 -0600 Subject: [PATCH 17/42] print Profile selected --- src/sequence_processing_pipeline/Pipeline.py | 2 ++ tests/test_PacBioWorflow.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index cf0a83cd..667bd6d8 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -445,6 +445,8 @@ def _configure_profile(self): f"{self.assay_type}) was not found. 
Please notify" " an administrator") + print(f'Profile selected: {selected_profile}') + self.config_profile = selected_profile def _directory_check(self, directory_path, create=False): diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index fd35c159..3cddfbc1 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -74,6 +74,7 @@ def _inject_data(self, wf): sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' + makedirs(dname, exist_ok=True) Path(f'{dname}/{sn}_R1.fastq.gz').touch() # # NuQCJob From 05036b71c8c5029b05b27d9bad48d7b7c79c1a92 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 08:28:15 -0600 Subject: [PATCH 18/42] counts.txt in _inject_data --- .../pacbio_metagenomic.json | 17 +++++++++++------ tests/test_PacBioWorflow.py | 2 ++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json index 19168ba3..0981ffc7 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -16,25 +16,30 @@ "nodes": 1, "cpus_per_task": 8, "queue": "qiita", - "wallclock_time_in_minutes": 240, + "wallclock_time_in_minutes": 2440, "minimap2_databases": [ "/databases/minimap2/db_1.mmi", "/databases/minimap2/db_2.mmi" ], - "modules_to_load": ["fastp_0.20.1", "samtools_1.12", "minimap2_2.18"], + "modules_to_load": [ + "gcc_9.3.0", + "fastp_0.20.1", + "samtools_1.12", + "minimap2_2.28" + ], "fastp_executable_path": "fastp", "minimap2_executable_path": "minimap2", "samtools_executable_path": "samtools", - "job_total_memory_limit": "20", + "job_total_memory_limit": "110", "job_max_array_length": 1000, "known_adapters_path": "fastp_known_adapters_formatted.fna", - "bucket_size": 8, + "bucket_size": 18, "length_limit": 100, - "cores_per_task": 4, + "cores_per_task": 16, "movi_executable_path": "/home/user/user_dir/Movi/build/movi-default", "gres_value": 4, "pmls_path": "/home/user/user_dir/human_host_filtration/scripts/qiita_filter_pmls.py", - "additional_fastq_tags": ["BX"] + "additional_fastq_tags": [] }, "seqpro": { "seqpro_path": "seqpro", diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 3cddfbc1..9c355e6e 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -76,6 +76,8 @@ def _inject_data(self, wf): dname = f'{convert_dir}/{sp}' makedirs(dname, exist_ok=True) Path(f'{dname}/{sn}_R1.fastq.gz').touch() + with open(f'{dname}/{sn}_R1.counts.txt', 'w') as f: + f.write("2") # # NuQCJob # dname = f'{nuqc_dir}/filtered_sequences/{sp}' From d5adee0ebb2544fe97b1901142199a3bbbd7fdb9 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 10:00:11 -0600 Subject: [PATCH 19/42] demux_just_fwd --- pyproject.toml | 1 + src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/sequence_processing_pipeline/Commands.py | 79 +++++++++++++++++++ .../templates/nuqc_job_single.sh | 2 +- .../pacbio_metagenomic.json | 2 +- 5 files changed, 83 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 64a74943..3ee2658b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,4 +59,5 @@ dependencies = [ configure_klp = "qp_klp.scripts.configure_klp:config" start_klp = "qp_klp.scripts.start_klp:execute" demux = "sequence_processing_pipeline.scripts.cli:demux" +demux_single = "sequence_processing_pipeline.scripts.cli:demux_single" 
pacbio_generate_bam2fastq_commands = "qp_klp.scripts.pacbio_commands:generate_bam2fastq_commands" diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index aae032f8..a1815fff 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -4,6 +4,7 @@ from .Assays import ASSAY_NAME_METAGENOMIC from .FailedSamplesRecord import FailedSamplesRecord from .Workflows import Workflow +import pandas as pd class PacBioMetagenomicWorkflow(Workflow, Metagenomic, PacBio): @@ -40,7 +41,6 @@ def __init__(self, **kwargs): self.fsr = FailedSamplesRecord(self.kwargs['output_dir'], self.pipeline.sample_sheet.samples) - import pandas as pd samples = [ {'barcode': sample['Sample_ID'], 'sample_name': sample['Sample_Name'], diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index cfa60c2c..68effc2e 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -192,3 +192,82 @@ def demux(id_map, fp, out_d, task, maxtask): for d in openfps.values(): for f in d.values(): f.close() + + +def demux_just_fwd(id_map, fp, out_d, task, maxtask): + """Split infile data based in provided map""" + delimiter = '::MUX::' + mode = 'wt' + ext = '.fastq.gz' + sep = '/' + rec = '@' + + openfps = {} + + for offset, (idx, r1, outbase) in enumerate(id_map): + if offset % maxtask == task: + idx = rec + idx + + # setup output locations + outdir = out_d + sep + outbase + fullname_r1 = outdir + sep + r1 + ext + + # we have seen in lustre that sometime this line + # can have a raise condition; making sure it doesn't break + # things + try: + os.makedirs(outdir, exist_ok=True) + except FileExistsError: + pass + current_fp_r1 = gzip.open(fullname_r1, mode) + current_fp = {'1': current_fp_r1} + openfps[idx] = current_fp + + # setup a parser + seq_id = iter(fp) + seq = iter(fp) + dumb = iter(fp) + qual = iter(fp) + + for i, s, d, q in zip(seq_id, seq, dumb, qual): + # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1' + # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1 BX:Z:TATGACACATGCGGCCCT' # noqa + # '@baz/1 + + # NB: from 6d794a37-12cd-4f8e-95d6-72a4b8a1ec1c's only-adapter-filtered results: # noqa + # @A00953:244:HYHYWDSXY:3:1101:14082:3740 1:N:0:CCGTAAGA+TCTAACGC + + fname_encoded, sid = i.split(delimiter, 1) + + if fname_encoded not in openfps: + continue + + current_fp = openfps[fname_encoded] + + # remove '\n' from sid and split on all whitespace. + tmp = sid.strip().split() + + if len(tmp) == 1: + # sequence id line contains no optional metadata. + # don't change sid. 
+ # -1 is \n + orientation = sid[-2] + sid = rec + sid + elif len(tmp) == 2: + sid = tmp[0] + metadata = tmp[1] + # no '\n' + orientation = sid[-1] + # hexdump confirms separator is ' ', not '\t' + sid = rec + sid + ' ' + metadata + '\n' + else: + raise ValueError(f"'{sid}' is not a recognized form") + + current_fp[orientation].write(sid) + current_fp[orientation].write(s) + current_fp[orientation].write(d) + current_fp[orientation].write(q) + + for d in openfps.values(): + for f in d.values(): + f.close() diff --git a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh index dc991fb6..1ad6db54 100644 --- a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh +++ b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh @@ -149,7 +149,7 @@ function demux-runner () { for idx in $(seq 0 ${n_demux_jobs}) do - python {{demux_path}} \ + demux_just_fwd \ --id-map ${id_map} \ --infile <(cat ${seqs_r1}) \ --output ${OUTPUT} \ diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json index 0981ffc7..6cd9fc95 100644 --- a/tests/data/configuration_profiles/pacbio_metagenomic.json +++ b/tests/data/configuration_profiles/pacbio_metagenomic.json @@ -51,7 +51,7 @@ "queue": "qiita", "nthreads": 16, "wallclock_time_in_minutes": 60, - "modules_to_load": ["fastqc_0.11.5"], + "modules_to_load": ["fastqc_0.12.1"], "fastqc_executable_path": "fastqc", "multiqc_executable_path": "multiqc", "multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml", From 2fcfc28981b4a69841c955071bd2bab4870a10fa Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 13:51:04 -0600 Subject: [PATCH 20/42] demux_single -? demux_just_fwd --- pyproject.toml | 2 +- src/sequence_processing_pipeline/Commands.py | 13 ++++++++++++- .../templates/nuqc_job_single.sh | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3ee2658b..36fa6ca9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,5 @@ dependencies = [ configure_klp = "qp_klp.scripts.configure_klp:config" start_klp = "qp_klp.scripts.start_klp:execute" demux = "sequence_processing_pipeline.scripts.cli:demux" -demux_single = "sequence_processing_pipeline.scripts.cli:demux_single" +demux_just_fwd = "sequence_processing_pipeline.scripts.cli:demux_just_fwd" pacbio_generate_bam2fastq_commands = "qp_klp.scripts.pacbio_commands:generate_bam2fastq_commands" diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index 68effc2e..19fc455e 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -194,7 +194,18 @@ def demux(id_map, fp, out_d, task, maxtask): f.close() -def demux_just_fwd(id_map, fp, out_d, task, maxtask): +def demux_just_fwd(id_map_fp, fp_fp, out_d, task, maxtask): + with open(id_map_fp, 'r') as f: + id_map = f.readlines() + id_map = [line.strip().split('\t') for line in id_map] + + # fp needs to be an open file handle. + # ensure task and maxtask are proper ints when coming from cmd-line. 
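    # (Illustrative values, not from a real run.) Each id_map row is
    # tab-separated as written by nuqc_job_single.sh
    # ("<mux index>\t<R1 basename>\t<output base>"), e.g.:
    #     1	3A_R1	MyProject_6123
    # demux_just_fwd_processing would route that row's reads to
    # <output>/MyProject_6123/3A_R1.fastq.gz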
+ with open(fp_fp, 'r') as fp: + demux_just_fwd_processing(id_map, fp, out_d, int(task), int(maxtask)) + + +def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): """Split infile data based in provided map""" delimiter = '::MUX::' mode = 'wt' diff --git a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh index 1ad6db54..f223e63d 100644 --- a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh +++ b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh @@ -33,7 +33,7 @@ if [[ -z ${OUTPUT} ]]; then exit 1 fi -conda activate qp-knight-lab-processing-2022.03 +conda activate klp-2025-05 module load {{modules_to_load}} set -x From 27f49d104214a8a050be41581703be3f97d01580 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 4 Sep 2025 14:06:23 -0600 Subject: [PATCH 21/42] add cli for demux_just_fwd --- pyproject.toml | 2 +- src/sequence_processing_pipeline/Commands.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 36fa6ca9..09a38ea5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,5 @@ dependencies = [ configure_klp = "qp_klp.scripts.configure_klp:config" start_klp = "qp_klp.scripts.start_klp:execute" demux = "sequence_processing_pipeline.scripts.cli:demux" -demux_just_fwd = "sequence_processing_pipeline.scripts.cli:demux_just_fwd" +demux_just_fwd = "sequence_processing_pipeline.Commands:demux_just_fwd" pacbio_generate_bam2fastq_commands = "qp_klp.scripts.pacbio_commands:generate_bam2fastq_commands" diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index 19fc455e..78f04818 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -1,6 +1,7 @@ import glob import gzip import os +import click from sequence_processing_pipeline.util import (iter_paired_files, determine_orientation) @@ -194,6 +195,17 @@ def demux(id_map, fp, out_d, task, maxtask): f.close() +@click.group() +def cli(): + pass + + +@cli.command() +@click.option('--id-map', type=click.Path(exists=True), required=True) +@click.option('--infile', type=click.Path(exists=True), required=True) +@click.option('--output', type=click.Path(exists=True), required=True) +@click.option('--task', type=int, required=True) +@click.option('--maxtask', type=int, required=True) def demux_just_fwd(id_map_fp, fp_fp, out_d, task, maxtask): with open(id_map_fp, 'r') as f: id_map = f.readlines() From dea4fd948206697d339699dafe9b241dd212bfb8 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 5 Sep 2025 07:45:53 -0600 Subject: [PATCH 22/42] fix demux_just_fwd params --- src/sequence_processing_pipeline/Commands.py | 8 +-- src/sequence_processing_pipeline/Pipeline.py | 2 - tests/test_PacBioWorflow.py | 69 ++++++++++---------- 3 files changed, 38 insertions(+), 41 deletions(-) diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index 78f04818..5d574c1a 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -206,15 +206,15 @@ def cli(): @click.option('--output', type=click.Path(exists=True), required=True) @click.option('--task', type=int, required=True) @click.option('--maxtask', type=int, required=True) -def demux_just_fwd(id_map_fp, fp_fp, out_d, task, maxtask): - with open(id_map_fp, 'r') as f: +def demux_just_fwd(id_map, infile, output, task, 
maxtask): + with open(id_map, 'r') as f: id_map = f.readlines() id_map = [line.strip().split('\t') for line in id_map] # fp needs to be an open file handle. # ensure task and maxtask are proper ints when coming from cmd-line. - with open(fp_fp, 'r') as fp: - demux_just_fwd_processing(id_map, fp, out_d, int(task), int(maxtask)) + with open(infile, 'r') as fp: + demux_just_fwd_processing(id_map, fp, output, int(task), int(maxtask)) def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index 667bd6d8..cf0a83cd 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -445,8 +445,6 @@ def _configure_profile(self): f"{self.assay_type}) was not found. Please notify" " an administrator") - print(f'Profile selected: {selected_profile}') - self.config_profile = selected_profile def _directory_check(self, directory_path, create=False): diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 9c355e6e..6234665f 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -9,13 +9,14 @@ from unittest import main from os import makedirs from os.path import dirname, abspath, join, exists -from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) -from qp_klp.Assays import ASSAY_NAME_METAGENOMIC +from shutil import copyfile from shutil import rmtree from pathlib import Path -from qiita_client.testing import PluginTestCase -from sequence_processing_pipeline.PipelineError import PipelineError import pandas as pd +from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) +from sequence_processing_pipeline.PipelineError import PipelineError +from qp_klp.Assays import ASSAY_NAME_METAGENOMIC +from qiita_client.testing import PluginTestCase class WorkflowFactoryTests(PluginTestCase): @@ -49,23 +50,22 @@ def _inject_data(self, wf): makedirs(convert_dir, exist_ok=True) makedirs(reports_dir, exist_ok=True) Path(f'{convert_dir}/job_completed').touch() - # tellread_dir = f'{self.output_dir}/TellReadJob' - # nuqc_dir = f'{self.output_dir}/NuQCJob' - # fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' - # multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' - # genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' - # '211021_A00000_0000_SAMPLE/') - # makedirs(nuqc_dir, exist_ok=True) - # makedirs(fastqc_dir, exist_ok=True) - # makedirs(multiqc_dir, exist_ok=True) - # makedirs(genprep_dir, exist_ok=True) - # # now let's create the required project folders - # for project in wf.pipeline.get_project_info(): - # sp = project['project_name'] - # makedirs(f'{convert_dir}/{sp}', exist_ok=True) - # makedirs(f'{nuqc_dir}/filtered_sequences/{sp}', exist_ok=True) - # makedirs(f'{genprep_dir}/{sp}/filtered_sequences/', - # exist_ok=True) + nuqc_dir = f'{self.output_dir}/NuQCJob' + fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' + multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' + genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' + '211021_A00000_0000_SAMPLE/') + makedirs(nuqc_dir, exist_ok=True) + makedirs(fastqc_dir, exist_ok=True) + makedirs(multiqc_dir, exist_ok=True) + makedirs(genprep_dir, exist_ok=True) + # now let's create the required project folders + for project in wf.pipeline.get_project_info(): + sp = project['project_name'] + makedirs(f'{convert_dir}/{sp}', exist_ok=True) + makedirs(f'{nuqc_dir}/filtered_sequences/{sp}', exist_ok=True) + makedirs(f'{genprep_dir}/{sp}/filtered_sequences/', + 
exist_ok=True) # # then loop over samples and stage all fastq.gz files dstats = [] @@ -74,28 +74,25 @@ def _inject_data(self, wf): sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' - makedirs(dname, exist_ok=True) Path(f'{dname}/{sn}_R1.fastq.gz').touch() with open(f'{dname}/{sn}_R1.counts.txt', 'w') as f: f.write("2") - # # NuQCJob - # dname = f'{nuqc_dir}/filtered_sequences/{sp}' - # copyfile(self.gz_source, f'{dname}/{rp}_L001_R1_001.fastq.gz') - # copyfile(self.gz_source, f'{dname}/{rp}_L001_R2_001.fastq.gz') + # NuQCJob + dname = f'{nuqc_dir}/filtered_sequences/{sp}' + copyfile(self.gz_source, f'{dname}/{sn}_R1.fastq.gz') - # # GenPrepFileJob - # gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{rp}' - # Path(f'{gprep_base}_L001_R1_001.fastq.gz').touch() - # Path(f'{gprep_base}_L001_R2_001.fastq.gz').touch() + # GenPrepFileJob + gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{sn}' + Path(f'{gprep_base}_R1.fastq.gz').touch() pd.DataFrame(dstats).set_index('SampleID').to_csv( f'{reports_dir}/Demultiplex_Stats.csv') - # # generating the "*.completed" files - # for i in range(len(samples)*3): - # Path(f'{fastqc_dir}/FastQCJob_{i}.completed').touch() - # Path(f'{multiqc_dir}/MultiQCJob_{i}.completed').touch() + # generating the "*.completed" files + for i in range(len(samples)*3): + Path(f'{fastqc_dir}/FastQCJob_{i}.completed').touch() + Path(f'{multiqc_dir}/MultiQCJob_{i}.completed').touch() def test_pacbio_metagenomic_workflow_creation(self): kwargs = {"uif_path": "tests/data/sample-sheets/metagenomic/" @@ -118,7 +115,9 @@ def test_pacbio_metagenomic_workflow_creation(self): self.assertEqual(wf.assay_type, ASSAY_NAME_METAGENOMIC) self._inject_data(wf) - # ConvertJob/ConvertJob.sh + + # we can add some checks/tests of the initial scripts (mainly Convert) + # but not doing now as is not required # Metagenomic is a valid data type in the default qiita test # database but job-id: 78901 doesn't exist; however, if we get From e596ade624a382bc9f2a409cecc27e23c77cded5 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 5 Sep 2025 11:22:23 -0600 Subject: [PATCH 23/42] rm demux_just_fwd_processing splitter_binary --- src/sequence_processing_pipeline/Commands.py | 22 +++---------------- .../templates/nuqc_job_single.sh | 6 +---- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index 5d574c1a..d8f45496 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -252,6 +252,9 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): dumb = iter(fp) qual = iter(fp) + # there is only fwd so the orientation is always '1' + orientation = '1' + for i, s, d, q in zip(seq_id, seq, dumb, qual): # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1' # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1 BX:Z:TATGACACATGCGGCCCT' # noqa @@ -267,25 +270,6 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): current_fp = openfps[fname_encoded] - # remove '\n' from sid and split on all whitespace. - tmp = sid.strip().split() - - if len(tmp) == 1: - # sequence id line contains no optional metadata. - # don't change sid. 
- # -1 is \n - orientation = sid[-2] - sid = rec + sid - elif len(tmp) == 2: - sid = tmp[0] - metadata = tmp[1] - # no '\n' - orientation = sid[-1] - # hexdump confirms separator is ' ', not '\t' - sid = rec + sid + ' ' + metadata + '\n' - else: - raise ValueError(f"'{sid}' is not a recognized form") - current_fp[orientation].write(sid) current_fp[orientation].write(s) current_fp[orientation].write(d) diff --git a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh index f223e63d..d06ac833 100644 --- a/src/sequence_processing_pipeline/templates/nuqc_job_single.sh +++ b/src/sequence_processing_pipeline/templates/nuqc_job_single.sh @@ -125,10 +125,6 @@ function mux-runner () { python {{pmls_path}} <(zcat ${jobd}/seqs.movi.txt.gz) {{pmls_extra_parameters}} | \ seqtk subseq ${seq_reads_filter_alignment} - > ${jobd}/seqs.final.fastq - {{splitter_binary}} ${jobd}/seqs.final.fastq \ - ${jobd}/reads.r1.fastq ${delimiter} ${r1_tag} & - wait - # keep seqs.movi.txt and migrate it to NuQCJob directory. mv ${jobd}/seqs.movi.txt.gz {{output_path}}/logs/seqs.movi.${SLURM_ARRAY_TASK_ID}.txt.gz } @@ -139,7 +135,7 @@ function demux-runner () { n_demux_jobs=${SLURM_CPUS_PER_TASK} jobd=${TMPDIR} id_map=${jobd}/id_map - seqs_r1=${jobd}/reads.r1.fastq.paired.fq + seqs_r1=${jobd}/seqs.final.fastq id_map=${jobd}/id_map if [[ ! -f ${id_map} ]]; then From b27e7d6f9a38651d2085e1501b4ee24346993786 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 5 Sep 2025 13:39:54 -0600 Subject: [PATCH 24/42] self.files_regex = long_read --- src/qp_klp/Assays.py | 1 + src/qp_klp/PacBioMetagenomicWorkflow.py | 5 +---- src/sequence_processing_pipeline/util.py | 10 ++++++++-- tests/test_PacBioWorflow.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/qp_klp/Assays.py b/src/qp_klp/Assays.py index e2fc0ede..7ed35cf3 100644 --- a/src/qp_klp/Assays.py +++ b/src/qp_klp/Assays.py @@ -494,6 +494,7 @@ def qc_reads(self): # is not defined, just fallback to the SPP expected default regex if not hasattr(self, 'files_regex'): self.files_regex = 'SPP' + # base quality control used by multiple Assay types. 
job = NuQCJob(self.raw_fastq_files_path, self.pipeline.output_path, diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index a1815fff..c8fb011c 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -26,10 +26,7 @@ def __init__(self, **kwargs): if 'overwrite_prep_with_original' in self.kwargs: self.overwrite_prep_with_original = \ self.kwargs['overwrite_prep_with_original'] - if 'files_regex' in self.kwargs: - self.files_regex = self.kwargs['files_regex'] - else: - self.files_regex = 'SPP' + self.files_regex = 'long_read' self.pipeline = Pipeline(self.kwargs['config_fp'], self.kwargs['run_identifier'], self.kwargs['uif_path'], diff --git a/src/sequence_processing_pipeline/util.py b/src/sequence_processing_pipeline/util.py index f38b38ee..b07b55fa 100644 --- a/src/sequence_processing_pipeline/util.py +++ b/src/sequence_processing_pipeline/util.py @@ -4,8 +4,8 @@ PAIR_UNDERSCORE = (REC(r'_R1_'), '_R1_', '_R2_') PAIR_DOT = (REC(r'\.R1\.'), '.R1.', '.R2.') -SIMPLE_PAIR_UNDERSCAOTE = (REC(r'_R1'), '_R1', '_R2') -PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT, SIMPLE_PAIR_UNDERSCAOTE) +SIMPLE_PAIR_UNDERSCORE = (REC(r'_R1'), '_R1', '_R2') +PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT, SIMPLE_PAIR_UNDERSCORE) FILES_REGEX = { 'SPP': { 'fastq': REC(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.fastq\.gz$'), @@ -32,6 +32,12 @@ 'html': REC(r'^(.*).html$'), 'json': REC(r'^(.*).json$') }, + 'long_read': { + 'fastq': REC(r'^(.*)_R1.fastq\.gz$'), + 'interleave_fastq': REC(r'^(.*)_R1.interleave\.fastq\.gz$'), + 'html': REC(r'^(.*)_R1.html$'), + 'json': REC(r'^(.*)_R1.json$') + }, } diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 6234665f..7429f8df 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -54,7 +54,7 @@ def _inject_data(self, wf): fastqc_dir = f'{self.output_dir}/FastQCJob/logs/' multiqc_dir = f'{self.output_dir}/MultiQCJob/logs/' genprep_dir = (f'{self.output_dir}/GenPrepFileJob/' - '211021_A00000_0000_SAMPLE/') + 'r11111_20250101_111111/') makedirs(nuqc_dir, exist_ok=True) makedirs(fastqc_dir, exist_ok=True) makedirs(multiqc_dir, exist_ok=True) From 166651667e2e5efde1874d8ff3e999272e00b85c Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Sat, 6 Sep 2025 09:31:41 -0600 Subject: [PATCH 25/42] sample_id_column --- src/sequence_processing_pipeline/NuQCJob.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 540c4762..fdf7d5b2 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -379,8 +379,15 @@ def _process_sample_sheet(self): raise PipelineError(s) sample_ids = [] + # long_reads use the Sample_Name as the main sample identifier, + # everything else uses Sample_ID; we might want to change this + # in the future for everything to use Sample_Name + sample_id_column = 'Sample_ID' + if self.read_length == 'long': + sample_id_column = 'Sample_Name' for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + sample_ids.append( + (sample[sample_id_column], sample['Sample_Project'])) bioinformatics = sheet.Bioinformatics From 0141440d89c94fbb51a89936ec9911463facf245 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Sat, 6 Sep 2025 10:22:52 -0600 Subject: [PATCH 26/42] mv self.read_length = read_length up --- src/sequence_processing_pipeline/NuQCJob.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index fdf7d5b2..49f71179 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -61,6 +61,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, [fastp_path, minimap2_path, samtools_path], max_array_length, modules_to_load=modules_to_load) + self.read_length = read_length self.sample_sheet_path = sample_sheet_path self._file_check(self.sample_sheet_path) metadata = self._process_sample_sheet() @@ -83,7 +84,6 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.pmls_path = pmls_path self.additional_fastq_tags = additional_fastq_tags self.audit_folders = ['filtered_sequences'] - self.read_length = read_length # for projects that use sequence_processing_pipeline as a dependency, # jinja_env must be set to sequence_processing_pipeline's root path, From 2b5700fba9fe4feef27c9987110ca6bb446cf27f Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Sat, 6 Sep 2025 12:49:55 -0600 Subject: [PATCH 27/42] _filter_empty_fastq_files --- src/sequence_processing_pipeline/NuQCJob.py | 25 +++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 49f71179..01242718 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -156,15 +156,22 @@ def _filter_empty_fastq_files(self, filtered_directory, files = glob(join(filtered_directory, f'*.{self.suffix}')) - for r1, r2 in iter_paired_files(files): - full_path = join(filtered_directory, r1) - full_path_reverse = join(filtered_directory, r2) - if stat(full_path).st_size <= minimum_bytes or stat( - full_path_reverse).st_size <= minimum_bytes: - logging.debug(f'moving {full_path} and {full_path_reverse}' - f' to empty list.') - empty_list.append(full_path) - empty_list.append(full_path_reverse) + if self.read_length == 'long': + for r1 in files: + full_path = join(filtered_directory, r1) + if stat(full_path).st_size <= minimum_bytes: + logging.debug(f'moving {full_path} to empty list.') + empty_list.append(full_path) + else: + for r1, r2 in iter_paired_files(files): + full_path = join(filtered_directory, r1) + full_path_reverse = join(filtered_directory, r2) + if stat(full_path).st_size <= minimum_bytes or stat( + full_path_reverse).st_size <= minimum_bytes: + logging.debug(f'moving {full_path} and {full_path_reverse}' + f' to empty list.') + empty_list.append(full_path) + empty_list.append(full_path_reverse) if empty_list: logging.debug(f'making directory {empty_files_directory}') From cc29d39d8805d59b3d4503b1001f0dad998436ca Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Sat, 6 Sep 2025 20:31:23 -0600 Subject: [PATCH 28/42] zip_longest --- src/sequence_processing_pipeline/FastQCJob.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/sequence_processing_pipeline/FastQCJob.py b/src/sequence_processing_pipeline/FastQCJob.py index 9d34cb04..27a1c7c4 100644 --- a/src/sequence_processing_pipeline/FastQCJob.py +++ b/src/sequence_processing_pipeline/FastQCJob.py @@ -2,6 +2,7 @@ from jinja2 import Environment from json import dumps import logging +from itertools import zip_longest from os import listdir, makedirs from os.path import join, basename from re import sub @@ -156,15 +157,16 @@ def _scan_fastq_files(self, 
is_raw_input=False): # gather a list of names from the outputs of previous jobs. project_names = [] - for proj_name, fltr_type, fastq_fp, fwd_files, rev_files in projects: + for proj_name, fltr_type, _, fwd_files, rev_files in projects: project_names.append(proj_name) p_path = partial(join, self.output_path, 'fastqc', proj_name) output_dir = p_path('bclconvert' if is_raw_input else fltr_type) makedirs(output_dir, exist_ok=True) - for some_fwd_file, some_rev_file in zip(fwd_files, rev_files): - fastqc_results.append((some_fwd_file, some_rev_file, - output_dir)) + for sff, srf in zip_longest(fwd_files, rev_files): + if srf is None: + srf = '' + fastqc_results.append((sff, srf, output_dir)) # remove duplicates project_names = list(set(project_names)) From 3a94746e6cffa950b7ce2052ecfd093a143d20f6 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Mon, 8 Sep 2025 07:25:27 -0600 Subject: [PATCH 29/42] raw_reads_r1r2 --- src/qp_klp/Protocol.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index a6ba180b..e3c8f972 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -449,7 +449,9 @@ def generate_sequence_counts(self): continue with open(cf, 'r') as fh: counts = fh.read().strip() - data.append({'SampleID': sn, '# Reads': counts}) + data.append({'Sample_ID': sn, + 'raw_reads_r1r2': counts, + 'Lane': self.lane_number}) if missing_files: raise ValueError(f'Missing count files: {missing_files}') From 6e12dd5b93ed3b1dcaf9f2674d67f5ebff62acc4 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 9 Sep 2025 10:07:52 -0600 Subject: [PATCH 30/42] fastq.gz -> trimmed.fastq.gz --- tests/test_PacBioWorflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 7429f8df..9183e01c 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -80,11 +80,11 @@ def _inject_data(self, wf): # NuQCJob dname = f'{nuqc_dir}/filtered_sequences/{sp}' - copyfile(self.gz_source, f'{dname}/{sn}_R1.fastq.gz') + copyfile(self.gz_source, f'{dname}/{sn}_R1.trimmed.fastq.gz') # GenPrepFileJob gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{sn}' - Path(f'{gprep_base}_R1.fastq.gz').touch() + Path(f'{gprep_base}_R1.trimmed.fastq.gz').touch() pd.DataFrame(dstats).set_index('SampleID').to_csv( f'{reports_dir}/Demultiplex_Stats.csv') From b2a308efa15f2b033dced3b3d5b71c49cb544a3b Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 9 Sep 2025 16:15:23 -0600 Subject: [PATCH 31/42] barcode --- src/qp_klp/PacBioMetagenomicWorkflow.py | 3 ++- src/qp_klp/scripts/pacbio_commands.py | 3 ++- src/sequence_processing_pipeline/NuQCJob.py | 8 +------- .../metagenomic/pacbio/good_pacbio_metagv10.csv | 8 ++++---- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index c8fb011c..580566f3 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -41,7 +41,8 @@ def __init__(self, **kwargs): samples = [ {'barcode': sample['Sample_ID'], 'sample_name': sample['Sample_Name'], - 'project_name': sample['Sample_Project']} + 'project_name': sample['Sample_Project'], + 'lane': sample['Lane']} for sample in self.pipeline.sample_sheet.samples] df = pd.DataFrame(samples) sample_list_fp = f"{self.kwargs['output_dir']}/sample_list.tsv" diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index 
37dd6048..1a4c2af3 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -31,12 +31,13 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir, threads): bc = row['barcode'] sn = row['sample_name'] pn = row['project_name'] + lane = row['lane'] if bc not in files: missing_files.append(bc) continue od = f'{outdir}/{pn}' makedirs(od, exist_ok=True) - fn = f'{od}/{sn}_R1' + fn = f'{od}/{sn}_L00{lane}_R1_001' cmd = (f'bam2fastq -j {threads} -o {fn} -c 9 ' f'{files[bc]}; ' f'fqtools count {fn}.fastq.gz > ' diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 01242718..8fa693c4 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -386,15 +386,9 @@ def _process_sample_sheet(self): raise PipelineError(s) sample_ids = [] - # long_reads use the Sample_Name as the main sample identifier, - # everything else uses Sample_ID; we might want to change this - # in the future for everything to use Sample_Name - sample_id_column = 'Sample_ID' - if self.read_length == 'long': - sample_id_column = 'Sample_Name' for sample in sheet.samples: sample_ids.append( - (sample[sample_id_column], sample['Sample_Project'])) + (sample['Sample_ID'], sample['Sample_Project'])) bioinformatics = sheet.Bioinformatics diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv index 786c4b1f..de0bc699 100644 --- a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -8,10 +8,10 @@ Assay,Metagenomic,,,,, Description,,,,,, ,,,,,, [Data],,,,,, -Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane -bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 -bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 -bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 +Sample_ID,barcode_id,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane +3A,bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 +4A,bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 +5B,bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 ,,,,,, [Bioinformatics],,,,,, Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates, From 1da1abb95c58455adb9d5ddf5c18ff2f2fa28f6d Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 9 Sep 2025 17:04:51 -0600 Subject: [PATCH 32/42] barcode -> barcode_id --- src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- .../pacbio/good_pacbio_metagv10.csv | 54 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index 580566f3..f9f12cf4 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -39,7 +39,7 @@ def __init__(self, **kwargs): self.pipeline.sample_sheet.samples) samples = [ - {'barcode': sample['Sample_ID'], + {'barcode': sample['barcode_id'], 'sample_name': sample['Sample_Name'], 'project_name': sample['Sample_Project'], 'lane': sample['Lane']} diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv index de0bc699..b2134a64 100644 --- 
a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -1,27 +1,27 @@ -[Header],,,,,, -SheetType,pacbio_metag,,,,, -SheetVersion,10,,,,, -Investigator Name,Knight,,,,, -Experiment Name,RKL_experiment,,,,, -Date,2025-08-29,,,,, -Assay,Metagenomic,,,,, -Description,,,,,, -,,,,,, -[Data],,,,,, -Sample_ID,barcode_id,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane -3A,bc3011,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1 -4A,bc0112,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1 -5B,bc9992,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1 -,,,,,, -[Bioinformatics],,,,,, -Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates, -MyProject_6123,6123,False,some protocol,some description,False, -,,,,,, -[Contact],,,,,, -Sample_Project,Email,,,,, -MyProject_6123,foo@bar.org,,,,, -,,,,,, -[SampleContext],,,,,, -sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,, -5B,control blank,6123,,,, -,,,,,, +[Header],,,,,,, +SheetType,pacbio_metag,,,,,, +SheetVersion,10,,,,,, +Investigator Name,Knight,,,,,, +Experiment Name,RKL_experiment,,,,,, +Date,2025-08-29,,,,,, +Assay,Metagenomic,,,,,, +Description,,,,,,, +,,,,,,, +[Data],,,,,,, +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane,barcode_id +3A,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1,bc3011 +4A,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1,bc0112 +5B,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1,bc9992 +,,,,,,, +[Bioinformatics],,,,,,, +Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates,, +MyProject_6123,6123,False,some protocol,some description,False,, +,,,,,,, +[Contact],,,,,,, +Sample_Project,Email,,,,,, +MyProject_6123,foo@bar.org,,,,,, +,,,,,,, +[SampleContext],,,,,,, +sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,,, +5B,control blank,6123,,,,, +,,,,,,, From e600bfb0662e2acead61033916e70f9c0a2770cb Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Tue, 9 Sep 2025 18:25:04 -0600 Subject: [PATCH 33/42] S000 --- src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/qp_klp/scripts/pacbio_commands.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index f9f12cf4..bcc6e883 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -40,7 +40,7 @@ def __init__(self, **kwargs): samples = [ {'barcode': sample['barcode_id'], - 'sample_name': sample['Sample_Name'], + 'sample_name': sample['Sample_ID'], 'project_name': sample['Sample_Project'], 'lane': sample['Lane']} for sample in self.pipeline.sample_sheet.samples] diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py index 1a4c2af3..7caf26c3 100755 --- a/src/qp_klp/scripts/pacbio_commands.py +++ b/src/qp_klp/scripts/pacbio_commands.py @@ -36,8 +36,9 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir, threads): missing_files.append(bc) continue od = f'{outdir}/{pn}' + makedirs(od, exist_ok=True) - fn = f'{od}/{sn}_L00{lane}_R1_001' + fn = f'{od}/{sn}_S000_L00{lane}_R1_001' cmd = (f'bam2fastq -j {threads} -o {fn} -c 9 ' f'{files[bc]}; ' f'fqtools count {fn}.fastq.gz > ' From d73b6a73cd4cc03679363ff8933c6999d5774ea2 Mon 
Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 06:42:54 -0600 Subject: [PATCH 34/42] del raw_reverse_seqs --- src/qp_klp/PacBioMetagenomicWorkflow.py | 1 - src/qp_klp/Workflows.py | 5 +++++ src/sequence_processing_pipeline/util.py | 6 ------ tests/test_PacBioWorflow.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py index bcc6e883..0348ab1d 100644 --- a/src/qp_klp/PacBioMetagenomicWorkflow.py +++ b/src/qp_klp/PacBioMetagenomicWorkflow.py @@ -26,7 +26,6 @@ def __init__(self, **kwargs): if 'overwrite_prep_with_original' in self.kwargs: self.overwrite_prep_with_original = \ self.kwargs['overwrite_prep_with_original'] - self.files_regex = 'long_read' self.pipeline = Pipeline(self.kwargs['config_fp'], self.kwargs['run_identifier'], self.kwargs['uif_path'], diff --git a/src/qp_klp/Workflows.py b/src/qp_klp/Workflows.py index 19ab808b..1520d1f2 100644 --- a/src/qp_klp/Workflows.py +++ b/src/qp_klp/Workflows.py @@ -8,6 +8,7 @@ import logging from shutil import rmtree from .Assays import ASSAY_NAME_AMPLICON +from metapool.sample_sheet import PROTOCOL_NAME_PACBIO_SMRT from sequence_processing_pipeline.util import determine_orientation @@ -600,6 +601,10 @@ def _get_postqc_fastq_files(self, out_dir, project): if self.pipeline.pipeline_type != ASSAY_NAME_AMPLICON: del (files['raw_barcodes']) + # PacBio doesn't have reverse reads + if self.protocol_type == PROTOCOL_NAME_PACBIO_SMRT: + del (files['raw_reverse_seqs']) + # confirm expected lists of reads are not empty. for f_type in files: if not files[f_type]: diff --git a/src/sequence_processing_pipeline/util.py b/src/sequence_processing_pipeline/util.py index b07b55fa..bacf8306 100644 --- a/src/sequence_processing_pipeline/util.py +++ b/src/sequence_processing_pipeline/util.py @@ -32,12 +32,6 @@ 'html': REC(r'^(.*).html$'), 'json': REC(r'^(.*).json$') }, - 'long_read': { - 'fastq': REC(r'^(.*)_R1.fastq\.gz$'), - 'interleave_fastq': REC(r'^(.*)_R1.interleave\.fastq\.gz$'), - 'html': REC(r'^(.*)_R1.html$'), - 'json': REC(r'^(.*)_R1.json$') - }, } diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 9183e01c..0ca11211 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -13,7 +13,7 @@ from shutil import rmtree from pathlib import Path import pandas as pd -from metapool.sample_sheet import (PROTOCOL_NAME_PACBIO_SMRT) +from metapool.sample_sheet import PROTOCOL_NAME_PACBIO_SMRT from sequence_processing_pipeline.PipelineError import PipelineError from qp_klp.Assays import ASSAY_NAME_METAGENOMIC from qiita_client.testing import PluginTestCase From c2b0a3e0ea75fea539f9a83fd5a7cb5c8351491e Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 07:03:18 -0600 Subject: [PATCH 35/42] test filenames --- tests/test_PacBioWorflow.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 0ca11211..cfd342bb 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -70,21 +70,22 @@ def _inject_data(self, wf): # # then loop over samples and stage all fastq.gz files dstats = [] for sample in samples: - sn = sample['Sample_Name'] + sn = sample['Sample_ID'] sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' - Path(f'{dname}/{sn}_R1.fastq.gz').touch() + Path(f'{dname}/{sn}_S000_L001_R1_001.fastq.gz').touch() with 
open(f'{dname}/{sn}_R1.counts.txt', 'w') as f: f.write("2") # NuQCJob dname = f'{nuqc_dir}/filtered_sequences/{sp}' - copyfile(self.gz_source, f'{dname}/{sn}_R1.trimmed.fastq.gz') + copyfile(self.gz_source, + f'{dname}/{sn}_S000_L001_R1_001.trimmed.fastq.gz') # GenPrepFileJob gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{sn}' - Path(f'{gprep_base}_R1.trimmed.fastq.gz').touch() + Path(f'{gprep_base}_S000_L001_R1_001.trimmed.fastq.gz').touch() pd.DataFrame(dstats).set_index('SampleID').to_csv( f'{reports_dir}/Demultiplex_Stats.csv') From b2b4d60d514bbfc5948e79ddd4898884d878808d Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 07:43:49 -0600 Subject: [PATCH 36/42] S000_L001_R1_001.counts.txt --- tests/test_PacBioWorflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index cfd342bb..5f73efa5 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -75,7 +75,7 @@ def _inject_data(self, wf): dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' Path(f'{dname}/{sn}_S000_L001_R1_001.fastq.gz').touch() - with open(f'{dname}/{sn}_R1.counts.txt', 'w') as f: + with open(f'{dname}/{sn}_S000_L001_R1_001.counts.txt', 'w') as f: f.write("2") # NuQCJob From 0051f4999d3c5f94fe15675f4ee0fc5141a48621 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 09:05:08 -0600 Subject: [PATCH 37/42] {rec}{sid} --- src/sequence_processing_pipeline/Commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sequence_processing_pipeline/Commands.py b/src/sequence_processing_pipeline/Commands.py index d8f45496..9337ba4f 100644 --- a/src/sequence_processing_pipeline/Commands.py +++ b/src/sequence_processing_pipeline/Commands.py @@ -270,7 +270,7 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask): current_fp = openfps[fname_encoded] - current_fp[orientation].write(sid) + current_fp[orientation].write(f'{rec}{sid}') current_fp[orientation].write(s) current_fp[orientation].write(d) current_fp[orientation].write(q) From 1418cbcd4b4259d68e8e1bef81f135fe5c8773c1 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 11:23:38 -0600 Subject: [PATCH 38/42] rm touch for gz files --- tests/test_PacBioWorflow.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 5f73efa5..4129df9c 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -74,7 +74,8 @@ def _inject_data(self, wf): sp = sample["Sample_Project"] dstats.append({'SampleID': sn, '# Reads': 2}) dname = f'{convert_dir}/{sp}' - Path(f'{dname}/{sn}_S000_L001_R1_001.fastq.gz').touch() + copyfile(self.gz_source, + f'{dname}/{sn}_S000_L001_R1_001.fastq.gz') with open(f'{dname}/{sn}_S000_L001_R1_001.counts.txt', 'w') as f: f.write("2") @@ -85,7 +86,8 @@ def _inject_data(self, wf): # GenPrepFileJob gprep_base = f'{genprep_dir}/{sp}/filtered_sequences/{sn}' - Path(f'{gprep_base}_S000_L001_R1_001.trimmed.fastq.gz').touch() + copyfile(self.gz_source, + f'{gprep_base}_S000_L001_R1_001.trimmed.fastq.gz') pd.DataFrame(dstats).set_index('SampleID').to_csv( f'{reports_dir}/Demultiplex_Stats.csv') From 62e388e7b687750789b716ff3af6fa4f6beac761 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 14:18:28 -0600 Subject: [PATCH 39/42] add_default_workflow --- src/qp_klp/PacBioMetagenomicWorkflow.py | 2 +- src/qp_klp/Protocol.py | 2 +- 
 src/qp_klp/Workflows.py | 6 +-
 src/qp_klp/scripts/pacbio_commands.py | 5 ++
 .../ConvertJob.py | 2 +-
 src/sequence_processing_pipeline/NuQCJob.py | 7 +--
 src/sequence_processing_pipeline/Pipeline.py | 55 +++++++------------
 .../pacbio/good_pacbio_metagv10.csv | 8 +--
 tests/test_PacBioWorflow.py | 9 +--
 9 files changed, 42 insertions(+), 54 deletions(-)

diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py
index 0348ab1d..d57fe61b 100644
--- a/src/qp_klp/PacBioMetagenomicWorkflow.py
+++ b/src/qp_klp/PacBioMetagenomicWorkflow.py
@@ -14,7 +14,7 @@ def __init__(self, **kwargs):
         self.mandatory_attributes = ['qclient', 'uif_path',
                                      'lane_number', 'config_fp',
                                      'run_identifier', 'output_dir', 'job_id',
-                                     'lane_number', 'is_restart']
+                                     'is_restart']

         self.confirm_mandatory_attributes()

diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py
index e3c8f972..5f49248b 100644
--- a/src/qp_klp/Protocol.py
+++ b/src/qp_klp/Protocol.py
@@ -435,7 +435,7 @@ def convert_raw_to_fastq(self):
         return failed_samples

     def generate_sequence_counts(self):
-        # for other isntances of generate_sequence_counts in other objects
+        # for other instances of generate_sequence_counts in other objects
         # the sequence counting needs to be done; however, for PacBio we
         # already have done it and just need to merge the results.
         gz_files = glob(f'{self.raw_fastq_files_path}/*/*.fastq.gz')
diff --git a/src/qp_klp/Workflows.py b/src/qp_klp/Workflows.py
index 1520d1f2..1f979bf0 100644
--- a/src/qp_klp/Workflows.py
+++ b/src/qp_klp/Workflows.py
@@ -626,7 +626,11 @@ def _load_prep_into_qiita(self, qclient, prep_id, artifact_name,
                  'prep_id': prep_id,
                  'artifact_type': atype,
                  'command_artifact_name': artifact_name,
-                 'add_default_workflow': True,
+                 # this will block adding the default workflow to the
+                 # long read / pacbio processing, which is desirable
+                 # until we have a processing pipeline.
+                 'add_default_workflow': self.read_length != 'long',
+                 # 'add_default_workflow': True,
                  'files': dumps(fastq_files)}

         job_id = qclient.post('/qiita_db/artifact/', data=pdata)['job_id']
diff --git a/src/qp_klp/scripts/pacbio_commands.py b/src/qp_klp/scripts/pacbio_commands.py
index 7caf26c3..b0ab0717 100755
--- a/src/qp_klp/scripts/pacbio_commands.py
+++ b/src/qp_klp/scripts/pacbio_commands.py
@@ -21,6 +21,11 @@ def generate_bam2fastq_commands(sample_list, run_folder, outdir, threads):
     """Generates the bam2fastq commands"""
     df = pd.read_csv(sample_list, sep='\t')
+
+    # pacbio raw files are in a hifi_reads folder, within multiple folders
+    # (1_A01, 2_A02, etc.) inside the run-id folder; they are named
+    # m[run-id]XXX.hifi_reads.[barcode].bam, so to find the [barcode] we
+    # can split on '.' and take the second-to-last element [-2].
     files = {f.split('.')[-2]: f
              for f in glob(f'{run_folder}/*/hifi_reads/*.bam')}
diff --git a/src/sequence_processing_pipeline/ConvertJob.py b/src/sequence_processing_pipeline/ConvertJob.py
index 4e99a8aa..6fb4967d 100644
--- a/src/sequence_processing_pipeline/ConvertJob.py
+++ b/src/sequence_processing_pipeline/ConvertJob.py
@@ -476,7 +476,7 @@ def _generate_job_script(self):

     def run(self, callback=None):
         """
-        Run BCL2Fastq/BCLConvert conversion
+        Run pacbio_generate_bam2fastq_commands, which runs bam2fastq
         :param callback: optional function taking two parameters (id, status)
                          that is called when a running process's status is
                          changed.
diff --git a/src/sequence_processing_pipeline/NuQCJob.py b/src/sequence_processing_pipeline/NuQCJob.py index 8fa693c4..8e1a31f6 100644 --- a/src/sequence_processing_pipeline/NuQCJob.py +++ b/src/sequence_processing_pipeline/NuQCJob.py @@ -53,7 +53,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, :param additional_fastq_tags: A list of fastq tags to preserve during filtering. :param files_regex: the FILES_REGEX to use for parsing files - :param read_length: the FILES_REGEX to use for parsing files + :param read_length: string defining the read length: long/short. """ super().__init__(fastq_root_dir, output_path, @@ -376,10 +376,7 @@ def _process_sample_sheet(self): raise PipelineError(s) header = sheet.Header - if 'chemistry' in header: - chemistry = header['chemistry'] - else: - chemistry = '' + chemistry = header.get('chemistry', '') if header['Assay'] not in Pipeline.assay_types: s = "Assay value '%s' is not recognized." % header['Assay'] diff --git a/src/sequence_processing_pipeline/Pipeline.py b/src/sequence_processing_pipeline/Pipeline.py index cf0a83cd..d0060d6b 100644 --- a/src/sequence_processing_pipeline/Pipeline.py +++ b/src/sequence_processing_pipeline/Pipeline.py @@ -36,18 +36,8 @@ def _get_instrument_id(run_directory): # if RunInfo.xml doesn't exist, this might be a PacBio # folder, let's check for it if not exists(run_info): - pacbio_metadata_fps = glob( - f'{run_directory}/*/metadata/*.metadata.xml') - if pacbio_metadata_fps: - run_info = pacbio_metadata_fps[0] - tree = ET.parse(run_info) - mtag = tree.getroot().tag.split('}')[0] + '}' - instrument_text = ( - f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') - run = ET.parse(run_info).find(instrument_text) - text = run.attrib['TimeStampedName'] - else: - raise ValueError(f"'{run_info}' doesn't exist") + run = InstrumentUtils._get_pacbio_run_str(run_info, run_directory) + text = run.attrib['TimeStampedName'] else: text = ET.parse(run_info).find(instrument_text).text @@ -59,6 +49,21 @@ def get_instrument_type(run_directory): return get_model_by_instrument_id( instrument_id, model_key=PROFILE_NAME_KEY) + @staticmethod + def _get_pacbio_run_str(run_info, run_directory): + pacbio_metadata_fps = glob( + f'{run_directory}/*/metadata/*.metadata.xml') + if not pacbio_metadata_fps: + raise ValueError(f"'{run_info}' doesn't exist") + + run_info = pacbio_metadata_fps[0] + tree = ET.parse(run_info) + mtag = tree.getroot().tag.split('}')[0] + '}' + instrument_text = ( + f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') + run = ET.parse(run_info).find(instrument_text) + return run + @staticmethod def _get_date(run_directory): run_info = join(run_directory, 'RunInfo.xml') @@ -66,18 +71,8 @@ def _get_date(run_directory): # if RunInfo.xml doesn't exist, this might be a PacBio # folder, let's check for it if not exists(run_info): - pacbio_metadata_fps = glob( - f'{run_directory}/*/metadata/*.metadata.xml') - if pacbio_metadata_fps: - run_info = pacbio_metadata_fps[0] - tree = ET.parse(run_info) - mtag = tree.getroot().tag.split('}')[0] + '}' - instrument_text = ( - f'{mtag}ExperimentContainer/{mtag}Runs/{mtag}Run') - run = ET.parse(run_info).find(instrument_text) - date_string = run.attrib['CreatedAt'].split('T')[0] - else: - raise ValueError(f"'{run_info}' doesn't exist") + run = InstrumentUtils._get_pacbio_run_str(run_info, run_directory) + date_string = run.attrib['CreatedAt'].split('T')[0] else: date_string = ET.parse(run_info).find('Run/Date').text @@ -688,23 +683,13 @@ def generate_sample_info_files(self, 
dereplicated_input_file_paths): controls_in_proj_df['description'] = controls_in_proj_df[TEMP_KEY] controls_in_proj_df.drop(columns=[TEMP_KEY], inplace=True) controls_in_proj_df['collection_timestamp'] = \ - self.get_date_from_run_id() + InstrumentUtils._get_date(self.run_dir) controls_in_proj_df = controls_in_proj_df[Pipeline.sif_header] controls_in_proj_df.to_csv(curr_fp, sep='\t', index=False) return paths - def get_date_from_run_id(self): - # assume all run_ids begin with coded datestamp: - # 210518_... - # allow exception if substrings cannot convert to int - # or if array indexes are out of bounds. - year = int(self.run_id[0:2]) + 2000 - month = int(self.run_id[2:4]) - day = int(self.run_id[4:6]) - return f'{year}-{month}-{day}' - def get_sample_ids(self): ''' Returns list of sample-ids sourced from sample-sheet or pre-prep file diff --git a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv index b2134a64..c18e8f33 100644 --- a/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv +++ b/tests/data/sample-sheets/metagenomic/pacbio/good_pacbio_metagv10.csv @@ -8,10 +8,10 @@ Assay,Metagenomic,,,,,, Description,,,,,,, ,,,,,,, [Data],,,,,,, -Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Sample_Project,Well_description,Lane,barcode_id -3A,3A,sample_plate_1,A1,MyProject_6123,sample_plate_1.3A.A1,1,bc3011 -4A,4A,sample_plate_1,A2,MyProject_6123,sample_plate_1.4A.A2,1,bc0112 -5B,5B,sample_plate_1,A3,MyProject_6123,sample_plate_1.5B.A3,1,bc9992 +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,barcode_id,Sample_Project,Well_description,Lane +3A,3A,sample_plate_1,A1,bc3011,MyProject_6123,sample_plate_1.3A.A1,1 +4A,4A,sample_plate_1,A2,bc0112,MyProject_6123,sample_plate_1.4A.A2,1 +5B,5B,sample_plate_1,A3,bc9992,MyProject_6123,sample_plate_1.5B.A3,1 ,,,,,,, [Bioinformatics],,,,,,, Sample_Project,QiitaID,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates,, diff --git a/tests/test_PacBioWorflow.py b/tests/test_PacBioWorflow.py index 4129df9c..bc1f1e46 100644 --- a/tests/test_PacBioWorflow.py +++ b/tests/test_PacBioWorflow.py @@ -14,7 +14,6 @@ from pathlib import Path import pandas as pd from metapool.sample_sheet import PROTOCOL_NAME_PACBIO_SMRT -from sequence_processing_pipeline.PipelineError import PipelineError from qp_klp.Assays import ASSAY_NAME_METAGENOMIC from qiita_client.testing import PluginTestCase @@ -67,7 +66,7 @@ def _inject_data(self, wf): makedirs(f'{genprep_dir}/{sp}/filtered_sequences/', exist_ok=True) - # # then loop over samples and stage all fastq.gz files + # then loop over samples and stage all fastq.gz files dstats = [] for sample in samples: sn = sample['Sample_ID'] @@ -128,10 +127,8 @@ def test_pacbio_metagenomic_workflow_creation(self): # and the system is trying to create the preps. 
# Note: Qiita job_id's are UUID in the database and this tests # uses 78901 as the job_id so the db complains about the format - # with self.assertRaisesRegex(RuntimeError, 'invalid input ' - # 'syntax for type uuid: "78901"'): - with self.assertRaisesRegex(PipelineError, 'There are no fastq ' - 'files for FastQCJob'): + with self.assertRaisesRegex(RuntimeError, 'invalid input ' + 'syntax for type uuid: "78901"'): wf.execute_pipeline() From 27efe2d41ea3e092ac251d4f6cf8fbbca69fd954 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 14:26:18 -0600 Subject: [PATCH 40/42] fixing counts --- src/qp_klp/Protocol.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py index 5f49248b..4191d757 100644 --- a/src/qp_klp/Protocol.py +++ b/src/qp_klp/Protocol.py @@ -443,7 +443,8 @@ def generate_sequence_counts(self): for gzf in gz_files: cf = gzf.replace('.fastq.gz', '.counts.txt') - sn = basename(cf).replace('_R1.counts.txt', '') + sn = basename(cf).replace( + f'_S000_L00{self.lane_number}_R1_001.counts.txt', '') if not exists(cf): missing_files.append(sn) continue From a323f466b75c7807835a2e28f1e856dbb3eb5b04 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Wed, 10 Sep 2025 14:49:10 -0600 Subject: [PATCH 41/42] fix TestPipeline.test_generate_sample_information_files --- tests/test_Pipeline.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_Pipeline.py b/tests/test_Pipeline.py index b1cc502e..d27d73f2 100644 --- a/tests/test_Pipeline.py +++ b/tests/test_Pipeline.py @@ -400,7 +400,7 @@ def test_generate_sample_information_files(self): exp_first_lines = { f'{self.good_run_id}_StudyA_13059_blanks.tsv': - 'BLANK1.1A\t2021-10-21\t193\t' + 'BLANK1.1A\t2017-05-23\t193\t' 'Control\tNegative\tSterile w' 'ater blank\tSterile water blank\turban biome\tres' 'earch facility\tsterile wate' @@ -411,7 +411,7 @@ def test_generate_sample_information_files(self): 'StudyA\tTRUE\t' 'UCSD\tFALSE', f'{self.good_run_id}_StudyB_11661_blanks.tsv': - 'BLANK.40.12G\t2021-10-21\t193\tControl' + 'BLANK.40.12G\t2017-05-23\t193\tControl' '\tNegative\tSterile water blank\tSterile water blank\turban ' 'biome\tresearch facility\tsterile water' '\tmisc environment\tUSA:CA:San Diego\tB' @@ -419,7 +419,7 @@ def test_generate_sample_information_files(self): 'nk\tmetagenome\t256318\tBLANK.40.12G\t' 'StudyB\tTRUE\tUCSD\tFALSE', f'{self.good_run_id}_StudyC_6123_blanks.tsv': - 'BLANK.41.12G\t2021-10-21\t193\tControl' + 'BLANK.41.12G\t2017-05-23\t193\tControl' '\tNegative\tSterile water blank\tSterile water blank\turban' ' biome\tresearch facility\tsterile wat' 'er\tmisc environment\tUSA:CA:San Diego' @@ -430,7 +430,7 @@ def test_generate_sample_information_files(self): exp_last_lines = { f'{self.good_run_id}_StudyA_13059_blanks.tsv': - 'BLANK4.4H\t2021-10-21\t193\t' + 'BLANK4.4H\t2017-05-23\t193\t' 'Control\tNegative\tSterile w' 'ater blank\tSterile water blank\turban biome\tres' 'earch facility\tsterile wate' @@ -441,7 +441,7 @@ def test_generate_sample_information_files(self): 'StudyA\tTRUE\t' 'UCSD\tFALSE', f'{self.good_run_id}_StudyB_11661_blanks.tsv': - 'BLANK.43.12H\t2021-10-21\t193\tControl' + 'BLANK.43.12H\t2017-05-23\t193\tControl' '\tNegative\tSterile water blank\tSterile water blank\turban' ' biome\tresearch facility\tsterile wat' 'er\tmisc environment\tUSA:CA:San Diego' @@ -449,7 +449,7 @@ def test_generate_sample_information_files(self): ' blank\tmetagenome\t256318\tBLANK.43.1' 
                '2H\tStudyB\tTRUE\tUCSD\tFALSE',
             f'{self.good_run_id}_StudyC_6123_blanks.tsv':
-                'BLANK.41.12G\t2021-10-21\t193\tContro'
+                'BLANK.41.12G\t2017-05-23\t193\tContro'
                 'l\tNegative\tSterile water blank\tSterile water blank\turb'
                 'an biome\tresearch facility\tsterile '
                 'water\tmisc environment\tUSA:CA:San D'

From 48010d349345e9e76ab19922c30ca16c09644858 Mon Sep 17 00:00:00 2001
From: Antonio Gonzalez
Date: Wed, 10 Sep 2025 16:21:14 -0600
Subject: [PATCH 42/42] restart changes

---
 src/qp_klp/PacBioMetagenomicWorkflow.py | 5 +++++
 src/qp_klp/Workflows.py | 11 ++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/qp_klp/PacBioMetagenomicWorkflow.py b/src/qp_klp/PacBioMetagenomicWorkflow.py
index d57fe61b..7a83d8e5 100644
--- a/src/qp_klp/PacBioMetagenomicWorkflow.py
+++ b/src/qp_klp/PacBioMetagenomicWorkflow.py
@@ -5,6 +5,7 @@
 from .FailedSamplesRecord import FailedSamplesRecord
 from .Workflows import Workflow
 import pandas as pd
+from os.path import join


 class PacBioMetagenomicWorkflow(Workflow, Metagenomic, PacBio):
@@ -53,6 +54,10 @@ def __init__(self, **kwargs):
         self.is_restart = bool(self.kwargs['is_restart'])

         if self.is_restart is True:
+            self.raw_fastq_files_path = join(self.pipeline.output_path,
+                                             'ConvertJob')
+            self.reports_path = join(self.raw_fastq_files_path,
+                                     'SeqCounts.csv')
             self.determine_steps_to_skip()

         # this is a convenience member to allow testing w/out updating Qiita.
diff --git a/src/qp_klp/Workflows.py b/src/qp_klp/Workflows.py
index 1f979bf0..9155004c 100644
--- a/src/qp_klp/Workflows.py
+++ b/src/qp_klp/Workflows.py
@@ -626,12 +626,13 @@ def _load_prep_into_qiita(self, qclient, prep_id, artifact_name,
                  'prep_id': prep_id,
                  'artifact_type': atype,
                  'command_artifact_name': artifact_name,
-                 # this will block adding the default workflow to the
-                 # long read / pacbio processing, which is desirable
-                 # until we have a processing pipeline.
-                 'add_default_workflow': self.read_length != 'long',
-                 # 'add_default_workflow': True,
                  'files': dumps(fastq_files)}
+        # this will block adding the default workflow to the
+        # long read / pacbio processing, which is desirable
+        # until we have a processing pipeline - note that
+        # the key is only added when read_length is _not_ 'long'
+        if self.read_length != 'long':
+            pdata['add_default_workflow'] = True

         job_id = qclient.post('/qiita_db/artifact/', data=pdata)['job_id']