diff --git a/.github/workflows/qiita-plugin-ci.yml b/.github/workflows/qiita-plugin-ci.yml index 2671001..9d4abd8 100644 --- a/.github/workflows/qiita-plugin-ci.yml +++ b/.github/workflows/qiita-plugin-ci.yml @@ -87,9 +87,9 @@ jobs: pip --quiet install -U pip pip --quiet install https://github.com/qiita-spots/qtp-job-output-folder/archive/refs/heads/main.zip + export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9" pip install -e . pip --quiet install coveralls - export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9" configure_qtp_job_output_folder --env-script "source /home/runner/.profile; conda activate qp_pacbio_2025.9" --ca-cert $QIITA_ROOTCA_CERT configure_qp_pacbio --env-script 'source /home/runner/.profile; conda activate qp_pacbio_2025.9; export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9"' --ca-cert $QIITA_ROOTCA_CERT @@ -134,8 +134,9 @@ jobs: export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg export PYTHONWARNINGS="ignore:Certificate for localhost has no \`subjectAltName\`" + export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9" - pytest qp_pacbio --doctest-modules --cov=qp_pacbio --cov-report=lcov + pytest qp_pacbio --doctest-modules --cov=qp_pacbio --cov-report=lcov --ignore=qp_pacbio/data - uses: codecov/codecov-action@v3 with: diff --git a/README.rst b/README.rst index 7b0a6e1..96b4142 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,18 @@ |Build Status| |Coverage Status| -Qiita plugin to process Pacbio. The plugin follows these processing steps: +Qiita plugin to process PacBio reads; it currently provides two commands for Qiita: + +* **Woltka v0.1.7, minimap2**: generates taxonomic and functional profiles against WoLr2; + the expected outputs are BIOM artifacts + +* **PacBio processing**: runs steps 1 to 7 in the image below. The expected output + is a main folder with per-sample folders, each containing the following outputs: + + * **MAG** folder: all Metagenome-Assembled Genomes (MAGs) generated for that sample + * **LCG** folder: all Long-Circular Genomes (LCGs) generated for that sample that are over 512kb in size - approximately 515,000 bases (about half a million) + * **small_LCG** folder: all Long-Circular Genomes (LCGs) generated for that sample that are under 512kb in size + * **[sample-name].noLCG.fna.gz**: the non-LCG contigs used for MAG generation + * **[sample-name].checkm.txt.gz**: MAG quality information from CheckM v1.2.3 ..
image:: images/PacBioProcessing.png diff --git a/data/templates/2.get-circular-genomes.sbatch b/data/templates/2.get-circular-genomes.sbatch deleted file mode 100644 index faa26c5..0000000 --- a/data/templates/2.get-circular-genomes.sbatch +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -#SBATCH -J {{job_name}} -#SBATCH -p qiita -#SBATCH -N {{node_count}} -#SBATCH -n {{nprocs}} -#SBATCH --time {{wall_time_limit}} -#SBATCH --mem {{mem_in_gb}}G -#SBATCH -o {{output}}/step-2/logs/%x-%A_%a.out -#SBATCH -e {{output}}/step-2/logs/%x-%A_%a.err -#SBATCH --array {{array_params}} - -source ~/.bashrc -set -e -conda activate {{conda_environment}} -cd {{output}}/step-1 - -step=${SLURM_ARRAY_TASK_ID} -input=$(head -n $step {{output}}/sample_list.txt | tail -n 1) -sample_name=`echo $input | awk '{print $1}'` -filename=`echo $input | awk '{print $2}'` -fn=`basename ${filename}` - -# updating the GUI when task 1 runs -if [[ "$step" == "1" ]]; then - python -c "from qp_pacbio.util import client_connect; qclient = client_connect('{{url}}'); qclient.update_job_step('{{qjid}}', 'Running step 2: ${SLURM_ARRAY_JOB_ID}')" -fi - -cat ${sample_name}.p_ctg.gfa | awk '$1=="S" && ($2 ~ /.c$/) {printf ">%s\n%s\n", $2, $3} ' > ../step-2/${sample_name}_circ.fa -seqkit split --by-id ../step-2/${sample_name}_circ.fa -O ../step-2/${sample_name}_split - -### get all contigs for each sample -cat ${sample_name}.p_ctg.gfa | awk '$1=="S" {printf ">%s\n%s\n", $2, $3} ' > ../step-2/${sample_name}_all_contigs.fa - -cd ../step-2/${sample_name}_split - -### remove small circular genomes -find . -type f -size -512k -exec rm -f {} + -### extract fasta id for all the genomes in the split folder -for f in *.fa; do - k=${f##*/} - n=${f%.*} - grep -E "^>" $f >> circular_id.txt -done - -sed -i 's/>//' circular_id.txt -seqkit grep -v -f circular_id.txt ../${sample_name}_all_contigs.fa > ../${sample_name}_noLCG.fa - -shopt -s nullglob - -FILES=({{output}}/step-2/${sample_name}_split/*.fa) -if [ -f $FILES ]; then - lcg_folder={{result_fp}}/${sample_name}/LCG/ - mkdir -p ${lcg_folder} - for f in `ls {{output}}/step-2/${sample_name}_split/*.fa`; do - sn=`basename ${f/_circ/}`; - sn=${sn/part_/}; - cat $f | gzip > ${lcg_folder}/${sn/.fa/.fna}.gz; - done -fi - -if [ -f ${{output}}/step-2/${sample_name}_noLCG.fa ]; then - cat {{output}}/step-2/${sample_name}_noLCG.fa | gzip > {{result_fp}}/${sample_name}/${sample_name}.noLCG.fna.gz -fi diff --git a/pyproject.toml b/pyproject.toml index adae62d..00f2b0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,13 +4,15 @@ build-backend = "setuptools.build_meta" [tool.setuptools] packages = ["qp_pacbio"] -include-package-data = true + +[tool.setuptools.package-data] +"qp_pacbio" = ["data/*"] [project] name = "qp_pacbio" # version strings must comply with PEP 440: # https://peps.python.org/pep-0440/ -version = "2025.09" +version = "2025.11" authors = [{ name = "Qiita Development Team", email = "qiita.help@gmail.com" }] description = "Qiita Plugin: PacBio Processing" readme = "README.rst" @@ -39,11 +41,15 @@ dependencies = [ 'pytest-cov', 'numpy', 'Jinja2', + 'PyYAML', + 'micov', "qiita-files@https://github.com/qiita-spots/qiita-files/archive/master.zip", "qiita_client@https://github.com/qiita-spots/qiita_client/archive/master.zip", + "woltka@git+https://github.com/qiyunzhu/woltka.git#egg=woltka", ] [project.scripts] configure_qp_pacbio = "qp_pacbio.scripts:config" start_qp_pacbio = "qp_pacbio.scripts:execute" finish_qp_pacbio = "qp_pacbio.scripts:finish_qp_pacbio" +biom_merge_pacbio = 
"qp_pacbio.scripts:biom_merge" diff --git a/qp_pacbio/__init__.py b/qp_pacbio/__init__.py index 997fdbd..dec18ca 100644 --- a/qp_pacbio/__init__.py +++ b/qp_pacbio/__init__.py @@ -16,8 +16,8 @@ # minimap2 command # -req_params = {"artifact": ("integer", ["per_sample_FASTQ"])} -opt_params = dict() +req_params = {"artifact": ("artifact", ["per_sample_FASTQ"])} +opt_params = {"Database": ['choice:["WoLr2"]', "WoLr2"]} outputs = { # taxonomic "Per genome Predictions": "BIOM", @@ -27,8 +27,7 @@ "KEGG Enzyme (EC)": "BIOM", "KEGG Pathway": "BIOM", } -dflt_param_set = dict() - +dflt_param_set = {"WoLr2": {"Database": "WoLr2"}} minimap2_cmd = QiitaCommand( "Woltka v0.1.7, minimap2", "Functional and Taxonomic Predictions", @@ -45,12 +44,11 @@ # req_params = { - "artifact": ("integer", ["per_sample_FASTQ"]), + "artifact": ("artifact", ["per_sample_FASTQ"]), } -opt_params = dict() +opt_params = {"Processing": ['choice:["default"]', "default"]} outputs = {"output": "job-output-folder"} -dflt_param_set = dict() - +dflt_param_set = {"default": {"Processing": "default"}} pacbio_processing_cmd = QiitaCommand( "PacBio processing", "Default PacBio processing for Metagenomic Data", diff --git a/qp_pacbio/data/resources.yaml b/qp_pacbio/data/resources.yaml new file mode 100644 index 0000000..d201c6f --- /dev/null +++ b/qp_pacbio/data/resources.yaml @@ -0,0 +1,60 @@ +PacBio processing: + step-1: + node_count: 1 + nprocs: 16 + wall_time_limit: 1-00:00:00 + mem_in_gb: 200 + max_tasks: 16 + step-2: + node_count: 1 + nprocs: 1 + wall_time_limit: 00:10:00 + mem_in_gb: 2 + max_tasks: 16 + step-3: + node_count: 1 + nprocs: 8 + wall_time_limit: 01:00:00 + mem_in_gb: 10 + max_tasks: 16 + step-4: + node_count: 1 + nprocs: 8 + wall_time_limit: 01:00:00 + mem_in_gb: 6 + max_tasks: 16 + step-5: + node_count: 1 + nprocs: 8 + wall_time_limit: 00:30:00 + mem_in_gb: 2 + max_tasks: 16 + step-6: + node_count: 1 + nprocs: 8 + wall_time_limit: 00:30:00 + mem_in_gb: 2 + max_tasks: 16 + step-7: + node_count: 1 + nprocs: 8 + wall_time_limit: 01:00:00 + mem_in_gb: 50 + max_tasks: 16 + finish: + node_count: 1 + nprocs: 1 + wall_time_limit: 00:10:00 + mem_in_gb: 10 +Woltka v0.1.7, minimap2: + minimap2: + node_count: 1 + nprocs: 16 + wall_time_limit: 10:00:00 + mem_in_gb: 60 + max_tasks: 16 + merge: + node_count: 1 + nprocs: 16 + wall_time_limit: 1-00:00:00 + mem_in_gb: 120 diff --git a/data/templates/1.hifiasm-meta_new.sbatch b/qp_pacbio/data/templates/1.hifiasm-meta_new.sbatch similarity index 91% rename from data/templates/1.hifiasm-meta_new.sbatch rename to qp_pacbio/data/templates/1.hifiasm-meta_new.sbatch index 7ea09cb..045c997 100644 --- a/data/templates/1.hifiasm-meta_new.sbatch +++ b/qp_pacbio/data/templates/1.hifiasm-meta_new.sbatch @@ -11,7 +11,7 @@ source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} cd {{output}}/step-1 step=${SLURM_ARRAY_TASK_ID} @@ -28,3 +28,4 @@ if [[ "$step" == "1" ]]; then fi hifiasm_meta -t {{nprocs}} -o {{output}}/step-1/${sample_name} ${filename} +touch {{output}}/step-1/completed_${SLURM_ARRAY_TASK_ID}.log diff --git a/qp_pacbio/data/templates/2.get-circular-genomes.sbatch b/qp_pacbio/data/templates/2.get-circular-genomes.sbatch new file mode 100644 index 0000000..0527edc --- /dev/null +++ b/qp_pacbio/data/templates/2.get-circular-genomes.sbatch @@ -0,0 +1,96 @@ +#!/bin/bash +#SBATCH -J {{job_name}} +#SBATCH -p qiita +#SBATCH -N {{node_count}} +#SBATCH -n {{nprocs}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -o 
diff --git a/data/templates/1.hifiasm-meta_new.sbatch b/qp_pacbio/data/templates/1.hifiasm-meta_new.sbatch similarity index 91% rename from data/templates/1.hifiasm-meta_new.sbatch rename to qp_pacbio/data/templates/1.hifiasm-meta_new.sbatch index 7ea09cb..045c997 100644 --- a/data/templates/1.hifiasm-meta_new.sbatch +++ b/qp_pacbio/data/templates/1.hifiasm-meta_new.sbatch @@ -11,7 +11,7 @@ source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} cd {{output}}/step-1 step=${SLURM_ARRAY_TASK_ID} @@ -28,3 +28,4 @@ if [[ "$step" == "1" ]]; then fi hifiasm_meta -t {{nprocs}} -o {{output}}/step-1/${sample_name} ${filename} +touch {{output}}/step-1/completed_${SLURM_ARRAY_TASK_ID}.log diff --git a/qp_pacbio/data/templates/2.get-circular-genomes.sbatch b/qp_pacbio/data/templates/2.get-circular-genomes.sbatch new file mode 100644 index 0000000..0527edc --- /dev/null +++ b/qp_pacbio/data/templates/2.get-circular-genomes.sbatch @@ -0,0 +1,99 @@ +#!/bin/bash +#SBATCH -J {{job_name}} +#SBATCH -p qiita +#SBATCH -N {{node_count}} +#SBATCH -n {{nprocs}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -o {{output}}/step-2/logs/%x-%A_%a.out +#SBATCH -e {{output}}/step-2/logs/%x-%A_%a.err +#SBATCH --array {{array_params}} + +source ~/.bashrc +set -e +{{conda_environment}} +cd {{output}}/step-1 + +step=${SLURM_ARRAY_TASK_ID} +input=$(head -n $step {{output}}/sample_list.txt | tail -n 1) +sample_name=`echo $input | awk '{print $1}'` +filename=`echo $input | awk '{print $2}'` +fn=`basename ${filename}` + +# updating the GUI when task 1 runs +if [[ "$step" == "1" ]]; then + python -c "from qp_pacbio.util import client_connect; qclient = client_connect('{{url}}'); qclient.update_job_step('{{qjid}}', 'Running step 2: ${SLURM_ARRAY_JOB_ID}')" +fi + +cat ${sample_name}.p_ctg.gfa | awk '$1=="S" && ($2 ~ /.c$/) {printf ">%s\n%s\n", $2, $3} ' > ../step-2/${sample_name}_circ.fa +seqkit split --by-id ../step-2/${sample_name}_circ.fa -O ../step-2/${sample_name}_split + +### get all contigs for each sample +cat ${sample_name}.p_ctg.gfa | awk '$1=="S" {printf ">%s\n%s\n", $2, $3} ' > ../step-2/${sample_name}_all_contigs.fa + +cd ../step-2/${sample_name}_split +# making a copy of the small_LCG before they are removed +mkdir -p {{output}}/step-2/${sample_name}_small_LCG +find . -maxdepth 1 -type f -size -512k -print0 | xargs -0 -r cp -t ../${sample_name}_small_LCG +### remove small circular genomes +find . -type f -size -512k -exec rm -f {} + + +# this can result in not having any files left so +# making sure we have files left +# +# extract fasta id for all the genomes in the split folder
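+# bash note: $FILES expands to just the first element of the FILES array and, +# since nullglob is not set, an unmatched *.fa glob stays literal, so the +# [ -f $FILES ] checks below double as a cheap "any .fa files left?" test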
-f "{{output}}/step-2/${sample_name}_noLCG.fa" ]]; then + touch {{output}}/step-3/completed_${SLURM_ARRAY_TASK_ID}.log + touch {{output}}/step-4/completed_${SLURM_ARRAY_TASK_ID}.log + touch {{output}}/step-5/completed_${SLURM_ARRAY_TASK_ID}.log + touch {{output}}/step-6/completed_${SLURM_ARRAY_TASK_ID}.log + touch {{output}}/step-7/completed_${SLURM_ARRAY_TASK_ID}.log +fi + +# saving small LCG, note that these are not processed downstrem so not +# relevant to the "complete" files +small_lcg_folder={{result_fp}}/${sample_name}/small_LCG/ +mkdir -p ${small_lcg_folder} +FILES=({{output}}/step-2/${sample_name}_small_LCG/*.fa) +if [ -f $FILES ]; then + for f in `ls {{output}}/step-2/${sample_name}_small_LCG/*.fa`; do + sn=`basename ${f/_circ/}`; + sn=${sn/part_/}; + cat $f | gzip > ${small_lcg_folder}/${sn/.fa/.fna}.gz; + done +fi diff --git a/data/templates/3.minimap2_assembly.sbatch b/qp_pacbio/data/templates/3.minimap2_assembly.sbatch similarity index 55% rename from data/templates/3.minimap2_assembly.sbatch rename to qp_pacbio/data/templates/3.minimap2_assembly.sbatch index eea2cf0..1cb939d 100644 --- a/data/templates/3.minimap2_assembly.sbatch +++ b/qp_pacbio/data/templates/3.minimap2_assembly.sbatch @@ -11,7 +11,7 @@ source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} cd {{output}} step=${SLURM_ARRAY_TASK_ID} @@ -28,5 +28,14 @@ fi folder=step-3/${sample_name}_binning mkdir -p ${folder} -minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${folder}/${sample_name}.sam step-2/${sample_name}_noLCG.fa ${filename} -samtools view -bS -@4 ${folder}/${sample_name}.sam | samtools sort -@4 -O bam -o ${folder}/${sample_name}.sorted.bam + +if [ -f step-2/${sample_name}_noLCG.fa ]; then + minimap2 -x map-hifi -I {{mem_in_gb}}G -t {{nprocs}} -a --MD --eqx -o ${folder}/${sample_name}.sam step-2/${sample_name}_noLCG.fa ${filename} + samtools view -bS -@4 ${folder}/${sample_name}.sam | samtools sort -@4 -O bam -o ${folder}/${sample_name}.sorted.bam +else + touch {{output}}/step-4/completed_${SLURM_ARRAY_TASK_ID}.log + touch {{output}}/step-5/completed_${SLURM_ARRAY_TASK_ID}.log + touch {{output}}/step-6/completed_${SLURM_ARRAY_TASK_ID}.log + touch {{output}}/step-7/completed_${SLURM_ARRAY_TASK_ID}.log +fi +touch {{output}}/step-3/completed_${SLURM_ARRAY_TASK_ID}.log diff --git a/data/templates/4.metawrap_binning_new.sbatch b/qp_pacbio/data/templates/4.metawrap_binning_new.sbatch similarity index 72% rename from data/templates/4.metawrap_binning_new.sbatch rename to qp_pacbio/data/templates/4.metawrap_binning_new.sbatch index c811659..8099888 100644 --- a/data/templates/4.metawrap_binning_new.sbatch +++ b/qp_pacbio/data/templates/4.metawrap_binning_new.sbatch @@ -11,7 +11,7 @@ source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} cd {{output}} step=${SLURM_ARRAY_TASK_ID} @@ -26,13 +26,11 @@ if [[ "$step" == "1" ]]; then python -c "from qp_pacbio.util import client_connect; qclient = client_connect('{{url}}'); qclient.update_job_step('{{qjid}}', 'Running step 4: ${SLURM_ARRAY_JOB_ID}')" fi -folder=step-4/${sample_name}_binning - -mkdir -p {{output}}/${folder}/work_files/ -cp {{output}}/step-3/${sample_name}_binning/${sample_name}.sorted.bam {{output}}/step-4/${sample_name}_binning/work_files/${sample_name}.bam -rm {{output}}/step-3/${sample_name}_binning/${sample_name}.sorted.bam +folder={{output}}/step-4/${sample_name}_binning/work_files/ +mkdir -p ${folder} +cp {{output}}/step-3/${sample_name}_binning/${sample_name}.sorted.bam 
+cp {{output}}/step-3/${sample_name}_binning/${sample_name}.sorted.bam ${folder}/${sample_name}.bam ln -s ${filename} {{output}}/step-4/${sample_name}_binning/work_files/${sample_name}.fastq metawrap binning -a {{output}}/step-2/${sample_name}_noLCG.fa -o {{output}}/step-4/${sample_name}_binning \ - -t {{nprocs}} -m 100 -l 16000 --single-end --metabat2 --maxbin2 --concoct --universal {{output}}/step-4/${sample_name}_binning/work_files/${sample_name}.fastq -rm -rf {{output}}/step-4/${sample_name}_binning/work_files + -t {{nprocs}} -m 100 -l 16000 --single-end --metabat2 --maxbin2 --concoct --universal ${folder}/${sample_name}.fastq +touch {{output}}/step-4/completed_${SLURM_ARRAY_TASK_ID}.log diff --git a/data/templates/5.DAS_Tools_prepare_batch3_test.sbatch b/qp_pacbio/data/templates/5.DAS_Tools_prepare_batch3_test.sbatch similarity index 93% rename from data/templates/5.DAS_Tools_prepare_batch3_test.sbatch rename to qp_pacbio/data/templates/5.DAS_Tools_prepare_batch3_test.sbatch index b6bc585..d53df1f 100755 --- a/data/templates/5.DAS_Tools_prepare_batch3_test.sbatch +++ b/qp_pacbio/data/templates/5.DAS_Tools_prepare_batch3_test.sbatch @@ -11,7 +11,7 @@ source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} cd {{output}} step=${SLURM_ARRAY_TASK_ID} @@ -30,7 +30,7 @@ folder=step-5/${sample_name}_refinement mkdir -p ${folder} ### make sure qiita have access to this DB path. Otherwise copy the folder to qiita-owned folders -DAS_db=/ddn_scratch/qiita/databases/qp-pacbio/DAS_Tool/db/ +DAS_db=/scratch/qp-pacbio/DAS_Tool/db/ cd {{output}}/step-4/${sample_name}_binning rm metabat2_bins/bin.unbinned.fa @@ -46,3 +46,5 @@ Fasta_to_Contig2Bin.sh -i ./metabat2_bins -e fa | awk 'BEGIN{FS=OFS="\t"}{print mkdir {{output}}/${folder}/${sample_name} DAS_Tool --bins=${sample_name}.concoct.tsv,${sample_name}.maxbin2.tsv,${sample_name}.metabat2.tsv --contigs={{output}}/step-2/${sample_name}_noLCG.fa --outputbasename={{output}}/${folder}/${sample_name}/${sample_name} --labels=CONCOCT,MaxBin,MetaBAT --threads={{nprocs}} --search_engine=diamond --dbDirectory=${DAS_db} --write_bins + +touch {{output}}/step-5/completed_${SLURM_ARRAY_TASK_ID}.log diff --git a/data/templates/6.MAG_rename.sbatch b/qp_pacbio/data/templates/6.MAG_rename.sbatch similarity index 93% rename from data/templates/6.MAG_rename.sbatch rename to qp_pacbio/data/templates/6.MAG_rename.sbatch index 2c86544..fb2557f 100644 --- a/data/templates/6.MAG_rename.sbatch +++ b/qp_pacbio/data/templates/6.MAG_rename.sbatch @@ -11,7 +11,7 @@ source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} cd {{output}} step=${SLURM_ARRAY_TASK_ID} @@ -44,3 +44,5 @@ for f in `ls {{output}}/step-6/${sample_name}_MAG_rename/${sample_name}*.fa`; do sn=`basename ${f}`; cat $f | gzip > ${mag_folder}/${sn/.fa/.fna}.gz; done + +touch {{output}}/step-6/completed_${SLURM_ARRAY_TASK_ID}.log diff --git a/data/templates/7.checkm_batch.sbatch b/qp_pacbio/data/templates/7.checkm_batch.sbatch similarity index 89% rename from data/templates/7.checkm_batch.sbatch rename to qp_pacbio/data/templates/7.checkm_batch.sbatch index 3a8bc64..ac5b65d 100644 --- a/data/templates/7.checkm_batch.sbatch +++ b/qp_pacbio/data/templates/7.checkm_batch.sbatch @@ -12,7 +12,7 @@ source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} cd {{output}} step=${SLURM_ARRAY_TASK_ID} @@ -38,4 +38,6 @@ cd ../ cp ${sample_name}_checkm_table.txt {{output}}/step-7/checkm_out/ ### Save results -cat {{output}}/step-7/checkm_out/${sample_name}*.txt | gzip > 
{{result_fp}}/${sample_name}/${sample_name}.txt.gz +cat {{output}}/step-7/checkm_out/${sample_name}*.txt | gzip > {{result_fp}}/${sample_name}/${sample_name}.checkm.txt.gz + +touch {{output}}/step-7/completed_${SLURM_ARRAY_TASK_ID}.log diff --git a/data/templates/8.lingenome_withmd.py b/qp_pacbio/data/templates/8.lingenome_withmd.py similarity index 100% rename from data/templates/8.lingenome_withmd.py rename to qp_pacbio/data/templates/8.lingenome_withmd.py diff --git a/data/templates/finish.pacbio.processing.sbatch b/qp_pacbio/data/templates/finish.pacbio.processing.sbatch similarity index 70% rename from data/templates/finish.pacbio.processing.sbatch rename to qp_pacbio/data/templates/finish.pacbio.processing.sbatch index d94512f..97bf60e 100644 --- a/data/templates/finish.pacbio.processing.sbatch +++ b/qp_pacbio/data/templates/finish.pacbio.processing.sbatch @@ -5,13 +5,12 @@ #SBATCH -n 1 #SBATCH --time {{wall_time_limit}} #SBATCH --mem {{mem_in_gb}}G -#SBATCH -o {{output}}/log-finish.out -#SBATCH -e {{output}}/log-finish.err - +#SBATCH -o {{output}}/finish/logs/finish.out +#SBATCH -e {{output}}/finish/logs/finish.err source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} cd {{output}} tar zcvf results/logs.tgz step*/logs/ diff --git a/data/templates/woltka_minimap2.sbatch b/qp_pacbio/data/templates/woltka_minimap2.sbatch similarity index 84% rename from data/templates/woltka_minimap2.sbatch rename to qp_pacbio/data/templates/woltka_minimap2.sbatch index 31ba89a..3cda720 100644 --- a/data/templates/woltka_minimap2.sbatch +++ b/qp_pacbio/data/templates/woltka_minimap2.sbatch @@ -11,10 +11,10 @@ source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} mkdir -p {{output}}/alignments cd {{output}}/ -db=/ddn_scratch/qiita_t/working_dir/tmp/db/WoLr2.mmi +db=/scratch/qp-pacbio/minimap2/WoLr2/WoLr2.map-hifi.mmi step=${SLURM_ARRAY_TASK_ID} input=$(head -n $step {{output}}/sample_list.txt | tail -n 1) @@ -27,6 +27,5 @@ fn=`basename ${filename}` minimap2 -x map-hifi -t {{nprocs}} -a \ --secondary=no --MD --eqx ${db} \ ${filename} | \ - samtools sort -@ {{nprocs}} - | \ - awk 'BEGIN { FS=OFS="\t" } /^@/ { print; next } { $10="*"; $11="*" } 1' | \ + awk 'BEGIN { FS=OFS="\t" } /^@/ { print; next } { $10="*"; $11="*" } 1' | grep -v "^@" | sort -k 1 | \ xz -1 -T1 > {{output}}/alignments/${sample_name}.sam.xz diff --git a/data/templates/woltka_minimap2_merge.sbatch b/qp_pacbio/data/templates/woltka_minimap2_merge.sbatch similarity index 78% rename from data/templates/woltka_minimap2_merge.sbatch rename to qp_pacbio/data/templates/woltka_minimap2_merge.sbatch index 6126ffb..95ccfed 100644 --- a/data/templates/woltka_minimap2_merge.sbatch +++ b/qp_pacbio/data/templates/woltka_minimap2_merge.sbatch @@ -10,12 +10,12 @@ source ~/.bashrc set -e -conda activate {{conda_environment}} +{{conda_environment}} cd {{output}}/ -tax=/projects/wol/qiyun/wol2/databases/minimap2/WoLr2.tax -coords=/projects/wol/qiyun/wol2/databases/minimap2/WoLr2.coords -len_map=/projects/wol/qiyun/wol2/databases/minimap2/WoLr2/length.map -functional_dir=/projects/wol/qiyun/wol2/databases/minimap2/WoLr2/ +tax=/scratch/qp-woltka/WoLr2/WoLr2.tax +coords=/scratch/qp-woltka/WoLr2/WoLr2.coords +len_map=/scratch/qp-woltka/WoLr2/genomes/length.map +functional_dir=/scratch/qp-woltka/WoLr2/function/kegg/ mkdir -p {{output}}/coverages/ @@ -25,25 +25,26 @@ for f in `ls alignments/*.sam.xz`; do mkdir -p ${of}; echo "woltka classify -i ${f} -o ${of}/none.biom --no-demux --lineage ${tax} --rank 
none --outcov {{output}}/coverages/"; echo "woltka classify -i ${f} -o ${of}/per-gene.biom --no-demux -c ${coords}"; -done | parallel -j {{node_count}} +done | parallel -j {{nprocs}} wait for f in `ls bioms/*/per-gene.biom`; do dn=`dirname ${f}`; - sn=`basename ${sn}`; + sn=`basename ${dn}`; echo "woltka collapse -i ${f} -m ${functional_dir}/orf-to-ko.map.xz -o ${dn}/ko.biom; " \ "woltka collapse -i ${dn}/ko.biom -m ${functional_dir}/ko-to-ec.map -o ${dn}/ec.biom; " \ "woltka collapse -i ${dn}/ko.biom -m ${functional_dir}/ko-to-reaction.map -o ${dn}/reaction.biom; " \ "woltka collapse -i ${dn}/reaction.biom -m ${functional_dir}/reaction-to-module.map -o ${dn}/module.biom; " \ "woltka collapse -i ${dn}/module.biom -m ${functional_dir}/module-to-pathway.map -o ${dn}/pathway.biom;" -done | parallel -j {{node_count}} +done | parallel -j {{nprocs}} wait -# MISSING: -# merge bioms! +biom_merge_pacbio --base {{output}} find {{output}}/coverages/ -iname "*.cov" > {{output}}/cov_files.txt micov consolidate --paths {{output}}/cov_files.txt --lengths ${len_map} --output {{output}}/coverages.tgz cd alignments tar -cvf ../alignments.tar *.sam.xz + +finish_qp_pacbio {{url}} {{qjid}} {{output}} diff --git a/qp_pacbio/qp_pacbio.py b/qp_pacbio/qp_pacbio.py index 6bccdcd..3bf2128 100644 --- a/qp_pacbio/qp_pacbio.py +++ b/qp_pacbio/qp_pacbio.py @@ -6,20 +6,24 @@ # The full license is in the file LICENSE, distributed with this software. # ----------------------------------------------------------------------------- from os import makedirs, mkdir -from os.path import basename, join, exists, getmtime -import pathlib +from os.path import basename, join, exists from shutil import copy2 +from glob import glob - -from jinja2 import Environment, BaseLoader, TemplateNotFound +import yaml +from os import environ +from jinja2 import Environment from qiita_client import ArtifactInfo from subprocess import run +from .util import KISSLoader, find_base_path + -CONDA_ENV = "qp_pacbio_2025.9" -MAX_WALL_1000 = 1000 -MAX_WALL_500 = 500 -T5_NAME = "5.DAS_Tools_prepare_batch3_test.sbatch" +JENV = Environment(loader=KISSLoader("data/templates")) +JGT = JENV.get_template +RESOURCES = yaml.safe_load(open(join(find_base_path(), "data/resources.yaml"))) +CONDA_ENVIRONMENT = environ["ENVIRONMENT"] +PACBIO_PROCESSING_STEPS = 7 def sbatch(args): @@ -51,23 +55,6 @@ def search_by_filename(fname, lookup): raise KeyError("Cannot determine run_prefix for %s" % original) -# taken from the Jinja docs (BaseLoader API): -# https://jinja.palletsprojects.com/en/3.0.x/api/ -class KISSLoader(BaseLoader): - def __init__(self, path): - base = pathlib.Path(__file__).parent.resolve() - self.path = join(base, path) - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path, encoding="utf-8") as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) - - def _write_slurm(path, template, **ctx): makedirs(path, exist_ok=True) makedirs(join(path, "logs"), exist_ok=True) @@ -95,145 +82,118 @@ def pacbio_generate_templates(out_dir, job_id, njobs, result_fp, url): url : str URL to update the status of the jobs """ - jinja_env = Environment(loader=KISSLoader("../data/templates")) + resources = RESOURCES["PacBio processing"] + + main_parameters = { + "conda_environment": CONDA_ENVIRONMENT, + "output": out_dir, + "url": url, + "qjid": job_id, + "result_fp": result_fp, + } # Step 1 - template1 = 
jinja_env.get_template("1.hifiasm-meta_new.sbatch") - _write_slurm( - join(out_dir, "step-1"), - template1, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"s1-{job_id}", - node_count=1, - nprocs=16, - wall_time_limit=MAX_WALL_1000, - mem_in_gb=300, - array_params=f"1-{njobs}%16", - url=url, - qjid=job_id, - ) + template1 = JGT("1.hifiasm-meta_new.sbatch") + step_resources = resources["step-1"] + params = main_parameters | { + "job_name": f"s1-{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + "array_params": f"1-{njobs}%{step_resources['max_tasks']}", + } + _write_slurm(join(out_dir, "step-1"), template1, **params) # Step 2 - template2 = jinja_env.get_template("2.get-circular-genomes.sbatch") - _write_slurm( - join(out_dir, "step-2"), - template2, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"s2-{job_id}", - node_count=1, - nprocs=1, - wall_time_limit=MAX_WALL_500, - mem_in_gb=16, - array_params=f"1-{njobs}%16", - result_fp=result_fp, - url=url, - qjid=job_id, - ) + template2 = JGT("2.get-circular-genomes.sbatch") + step_resources = resources["step-2"] + params = main_parameters | { + "job_name": f"s2-{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + "array_params": f"1-{njobs}%{step_resources['max_tasks']}", + } + _write_slurm(join(out_dir, "step-2"), template2, **params) # Step 3 - template3 = jinja_env.get_template("3.minimap2_assembly.sbatch") - _write_slurm( - join(out_dir, "step-3"), - template3, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"s3-{job_id}", - node_count=1, - nprocs=8, - wall_time_limit=MAX_WALL_500, - mem_in_gb=50, - array_params=f"1-{njobs}%16", - url=url, - qjid=job_id, - ) + template3 = JGT("3.minimap2_assembly.sbatch") + step_resources = resources["step-3"] + params = main_parameters | { + "job_name": f"s3-{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + "array_params": f"1-{njobs}%{step_resources['max_tasks']}", + } + _write_slurm(join(out_dir, "step-3"), template3, **params) # Step 4 - template4 = jinja_env.get_template("4.metawrap_binning_new.sbatch") - _write_slurm( - join(out_dir, "step-4"), - template4, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"s4-{job_id}", - node_count=1, - nprocs=8, - wall_time_limit=MAX_WALL_500, - mem_in_gb=50, - array_params=f"1-{njobs}%16", - url=url, - qjid=job_id, - ) + template4 = JGT("4.metawrap_binning_new.sbatch") + step_resources = resources["step-4"] + params = main_parameters | { + "job_name": f"s4-{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + "array_params": f"1-{njobs}%{step_resources['max_tasks']}", + } + _write_slurm(join(out_dir, "step-4"), template4, **params) # Step 5 (long filename isolated in T5_NAME) - template5 = jinja_env.get_template(T5_NAME) - _write_slurm( - join(out_dir, "step-5"), - template5, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"s5-{job_id}", - node_count=1, - nprocs=8, - wall_time_limit=MAX_WALL_500, - mem_in_gb=50, - 
array_params=f"1-{njobs}%16", - url=url, - qjid=job_id, - ) + template5 = JGT("5.DAS_Tools_prepare_batch3_test.sbatch") + step_resources = resources["step-5"] + params = main_parameters | { + "job_name": f"s5-{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + "array_params": f"1-{njobs}%{step_resources['max_tasks']}", + } + _write_slurm(join(out_dir, "step-5"), template5, **params) # Step 6 - template6 = jinja_env.get_template("6.MAG_rename.sbatch") - _write_slurm( - join(out_dir, "step-6"), - template6, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"s6-{job_id}", - node_count=1, - nprocs=8, - wall_time_limit=MAX_WALL_500, - mem_in_gb=50, - array_params=f"1-{njobs}%16", - result_fp=result_fp, - url=url, - qjid=job_id, - ) + template6 = JGT("6.MAG_rename.sbatch") + step_resources = resources["step-6"] + params = main_parameters | { + "job_name": f"s6-{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + "array_params": f"1-{njobs}%{step_resources['max_tasks']}", + } + _write_slurm(join(out_dir, "step-6"), template6, **params) # Step 7 - template7 = jinja_env.get_template("7.checkm_batch.sbatch") - _write_slurm( - join(out_dir, "step-7"), - template7, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"s7-{job_id}", - node_count=1, - nprocs=8, - wall_time_limit=MAX_WALL_500, - mem_in_gb=50, - array_params=f"1-{njobs}%16", - result_fp=result_fp, - url=url, - qjid=job_id, - ) + template7 = JGT("7.checkm_batch.sbatch") + step_resources = resources["step-7"] + params = main_parameters | { + "job_name": f"s7-{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + "array_params": f"1-{njobs}%{step_resources['max_tasks']}", + } + _write_slurm(join(out_dir, "step-7"), template7, **params) # finish command - letting qiita know that we are done - finish = jinja_env.get_template("finish.pacbio.processing.sbatch") - _write_slurm( - join(out_dir, "finish"), - finish, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"f-{job_id}", - node_count=1, - nprocs=1, - wall_time_limit=MAX_WALL_500, - mem_in_gb=50, - url=url, - qjid=job_id, - ) + finish = JGT("finish.pacbio.processing.sbatch") + step_resources = resources["finish"] + params = main_parameters | { + "job_name": f"f-{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + } + _write_slurm(join(out_dir, "finish"), finish, **params) def generate_sample_list(qclient, artifact_id, out_dir): @@ -292,6 +252,22 @@ def pacbio_processing(qclient, job_id, parameters, out_dir): makedirs(result_fp, exist_ok=True) qclient.update_job_step(job_id, "Commands finished") + # validating that all steps are complete + with open(join(out_dir, "sample_list.txt"), "r") as fp: + expected = len(fp.readlines()) + + failed_steps = [] + for step_id in range(1, PACBIO_PROCESSING_STEPS + 1, 1): + obs = glob(join(out_dir, f"step-{step_id}", "completed_*.log")) + if len(obs) != expected: + failed_steps.append(str(step_id)) + if failed_steps: + failed_steps = ", ".join(failed_steps) + 
message = ( + "These steps have fewer completed logs than expected: " + f"{failed_steps}; please email the admin" + ) + return (False, [], message) paths = [(f"{result_fp}/", "directory")] return ( @@ -334,7 +310,7 @@ def _coverage_copy(dest): ainfo = [] fp_biom = f"{out_dir}/none.biom" - fp_alng = f"{out_dir}/alignment.tar" + fp_alng = f"{out_dir}/alignments.tar" if exists(fp_biom) and exists(fp_alng): ainfo.append( ArtifactInfo( @@ -383,7 +359,7 @@ def _coverage_copy(dest): return True, ainfo, "" -def generate_minimap2_processing(qclient, job_id, out_dir, parameters): +def generate_minimap2_processing(qclient, job_id, out_dir, parameters, url): """generates slurm scripts for minimap2/woltka processing. Parameters @@ -396,12 +372,21 @@ Output directory. parameters : dict Parameters for this job. + url : str + URL to send the response to; used to finish the job. Returns ------- str, str Returns the two filepaths of the slurm scripts """ + resources = RESOURCES["Woltka v0.1.7, minimap2"] + main_parameters = { + "conda_environment": CONDA_ENVIRONMENT, + "output": out_dir, + "qjid": job_id, + } + qclient.update_job_step( job_id, "Step 1 of 4: Collecting info and generating submission" ) @@ -415,32 +400,30 @@ "Step 2 of 4: Creating submission templates", ) - jinja_env = Environment(loader=KISSLoader("../data/templates")) - GT = jinja_env.get_template - minimap2_template = GT("woltka_minimap2.sbatch") - minimap2_script = _write_slurm( - f"{out_dir}/minimap2", - minimap2_template, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"m2_{job_id}", - node_count=1, - nprocs=16, - wall_time_limit=MAX_WALL_1000, - mem_in_gb=120, - array_params=f"1-{njobs}%16", - ) - minimap2_merge_template = GT("woltka_minimap2_merge.sbatch") - minimap2_merge_script = _write_slurm( - f"{out_dir}/merge", - minimap2_merge_template, - conda_environment=CONDA_ENV, - output=out_dir, - job_name=f"me_{job_id}", - node_count=1, - nprocs=16, - wall_time_limit=MAX_WALL_1000, - mem_in_gb=120, - ) + m2t = JGT("woltka_minimap2.sbatch") + step_resources = resources["minimap2"] + params = main_parameters | { + "job_name": f"m2_{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + "array_params": f"1-{njobs}%{step_resources['max_tasks']}", + } + minimap2_script = _write_slurm(f"{out_dir}/minimap2", m2t, **params) + + m2mt = JGT("woltka_minimap2_merge.sbatch") + step_resources = resources["merge"] + params = main_parameters | { + "job_name": f"me_{job_id}", + "node_count": step_resources["node_count"], + "nprocs": step_resources["nprocs"], + "wall_time_limit": step_resources["wall_time_limit"], + "mem_in_gb": step_resources["mem_in_gb"], + "url": url, + }
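+    # url/qjid are rendered into the merge template because the merge job +    # itself calls finish_qp_pacbio once all per-sample work is done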
+    minimap2_merge_script = _write_slurm(f"{out_dir}/merge", m2mt, **params) return minimap2_script, minimap2_merge_script diff --git a/qp_pacbio/scripts.py b/qp_pacbio/scripts.py index 7b1e27a..ed76691 100644 --- a/qp_pacbio/scripts.py +++ b/qp_pacbio/scripts.py @@ -9,11 +9,17 @@ import click from os import makedirs from os.path import join +import biom as _biom +from glob import glob +import h5py + + from qp_pacbio import plugin from qp_pacbio.qp_pacbio import ( generate_minimap2_processing, generate_sample_list, pacbio_generate_templates, + PACBIO_PROCESSING_STEPS, ) from qp_pacbio.util import client_connect from subprocess import run, PIPE @@ -52,7 +58,7 @@ def execute(url, job_id, output_dir): if command == "Woltka v0.1.7, minimap2": main_fp, merge_fp = generate_minimap2_processing( - qclient, job_id, output_dir, parameters + qclient, job_id, output_dir, parameters, url ) # Submitting jobs and returning id @@ -72,10 +78,8 @@ njobs = generate_sample_list(qclient, artifact_id, output_dir) pacbio_generate_templates(output_dir, job_id, njobs, frp, url) - total_steps = 7 - - all_jids = [] - for step_id in range(1, total_steps + 1, 1): + for step_id in range(1, PACBIO_PROCESSING_STEPS + 1, 1): cmd = [ "sbatch", "--parsable", @@ -92,7 +96,7 @@ "sbatch", "--parsable", "--dependency", - f"afterok:{all_jids[-1]}", + f"afterany:{all_jids[-1]}", f"{output_dir}/finish/finish.slurm", ] task = run(cmd, stdout=PIPE) @@ -111,3 +115,53 @@ def finish_qp_pacbio(url, job_id, output_dir): """Executes the task given by job_id and puts the output in output_dir""" plugin(url, job_id, output_dir) + + +@click.command() +@click.option("--base", type=click.Path(exists=True), required=True) +def biom_merge(base): + """Merges all PacBio biom tables""" + chunk_size = 30 + for rank in ("none", "per-gene", "ko", "ec", "pathway"): + rank = rank + ".biom" + tables = glob(f"{base}/bioms/*/{rank}") + + if not tables: + continue + + full = None
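+        # merge in blocks of chunk_size tables so each concat call only +        # has to combine a bounded number of tables at once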
+        for block in range(0, len(tables), chunk_size): +            lblock = block + chunk_size +            chunk = tables[block:lblock] +            loaded = [] +            for c in chunk: +                skip = True +                if _biom.util.is_hdf5_file(c): +                    skip = False +                else: +                    with open(c) as fh: +                        for i, l in enumerate(fh): +                            if i >= 1 and l: +                                skip = False +                                break +                if not skip: +                    temp = _biom.load_table(c) +                    if temp.shape != (0, 0): +                        loaded.append(temp) + +            if not loaded: +                continue +            if full is None: +                if len(loaded) == 1: +                    full = loaded[0] +                else: +                    full = loaded[0].concat(loaded[1:]) +            else: +                full = full.concat(loaded) + +        if full is None: +            continue +        with h5py.File(f"{base}/{rank}", "w") as out: +            full.to_hdf5(out, "fast-merge") diff --git a/qp_pacbio/tests/test_pacbio.py b/qp_pacbio/tests/test_pacbio.py index 090b4b4..65d7370 100644 --- a/qp_pacbio/tests/test_pacbio.py +++ b/qp_pacbio/tests/test_pacbio.py @@ -20,33 +20,27 @@ generate_minimap2_processing, generate_sample_list, pacbio_generate_templates, + CONDA_ENVIRONMENT, ) -# Keep these in sync with your generator defaults -CONDA_ENV = "qp_pacbio_2025.9" -STEP1_NPROCS = 16 -STEP1_WALL = 1000 -STEP1_MEM_GB = 300 # generator uses 300G for step-1 -NODE_COUNT = 1 -PARTITION = "qiita" # Exact expected Step-1 script matching your template. # Escape ${...} -> ${{...}} and awk braces -> '{{print $1}}' etc. STEP_1_EXP = ( "#!/bin/bash\n" "#SBATCH -J s1-{job_id}\n" - f"#SBATCH -p {PARTITION}\n" - f"#SBATCH -N {NODE_COUNT}\n" - f"#SBATCH -n {STEP1_NPROCS}\n" - f"#SBATCH --time {STEP1_WALL}\n" - f"#SBATCH --mem {STEP1_MEM_GB}G\n" + "#SBATCH -p qiita\n" + "#SBATCH -N 1\n" + "#SBATCH -n 16\n" + "#SBATCH --time 1-00:00:00\n" + "#SBATCH --mem 200G\n" "#SBATCH -o {out_dir}/step-1/logs/%x-%A_%a.out\n" "#SBATCH -e {out_dir}/step-1/logs/%x-%A_%a.err\n" "#SBATCH --array 1-{njobs}%16\n" "\n" "source ~/.bashrc\n" "set -e\n" - f"conda activate {CONDA_ENV}\n" + f"{CONDA_ENVIRONMENT}\n" "cd {out_dir}/step-1\n" "\n" "step=${{SLURM_ARRAY_TASK_ID}}\n" @@ -65,8 +59,9 @@ "${{SLURM_ARRAY_JOB_ID}}')\"\n" "fi\n" "\n" - f"hifiasm_meta -t {STEP1_NPROCS} -o " - "{out_dir}/step-1/${{sample_name}} ${{filename}}" + "hifiasm_meta -t 16 -o " + "{out_dir}/step-1/${{sample_name}} ${{filename}}\n" + "touch {out_dir}/step-1/completed_${{SLURM_ARRAY_TASK_ID}}.log" ) @@ -167,9 +162,10 @@ def test_pacbio_profiling(self): out_dir = mkdtemp() self._clean_up_files.append(out_dir) + url = "https://test.test.edu/" # this should fail cause we don't have valid data main_fp, merge_fp = generate_minimap2_processing( - self.qclient, job_id, out_dir, params + self.qclient, job_id, out_dir, params, url ) with open(main_fp, "r") as f: obs_main = f.readlines() @@ -182,18 +178,18 @@ "#SBATCH -p qiita\n", "#SBATCH -N 1\n", "#SBATCH -n 16\n", - "#SBATCH --time 1000\n", - "#SBATCH --mem 120G\n", + "#SBATCH --time 10:00:00\n", + "#SBATCH --mem 60G\n", f"#SBATCH -o {out_dir}/minimap2/logs/%x-%A_%a.out\n", f"#SBATCH -e {out_dir}/minimap2/logs/%x-%A_%a.err\n", "#SBATCH --array 1-2%16\n", "\n", "source ~/.bashrc\n", "set -e\n", - "conda activate qp_pacbio_2025.9\n", + f"{CONDA_ENVIRONMENT}\n", f"mkdir -p {out_dir}/alignments\n", f"cd {out_dir}/\n", - "db=/ddn_scratch/qiita_t/working_dir/tmp/db/WoLr2.mmi\n", + "db=/scratch/qp-pacbio/minimap2/WoLr2/WoLr2.map-hifi.mmi\n", "\n", "step=${SLURM_ARRAY_TASK_ID}\n", f"input=$(head -n $step {out_dir}/sample_list.txt | tail -n 1)\n", @@ -206,33 +202,32 @@ "minimap2 -x map-hifi -t 16 -a \\\n", " --secondary=no --MD --eqx ${db} \\\n", " ${filename} | \\\n", - " samtools sort -@ 16 - | \\\n", ' awk \'BEGIN { FS=OFS="\\t" } /^@/ { print; next } ' - '{ $10="*"; $11="*" } 1\' | \\\n', + '{ $10="*"; $11="*" } 1\' | grep -v "^@" | sort -k 1 | \\\n', f" xz -1 -T1 > {out_dir}/alignments/${{sample_name}}.sam.xz", ] self.assertEqual(obs_main, exp_main) - db_path = "/projects/wol/qiyun/wol2/databases/minimap2" + db_path = "/scratch/qp-woltka/WoLr2" exp_merge = [ "#!/bin/bash\n", "#SBATCH -J me_my-job-id\n", "#SBATCH -p qiita\n", "#SBATCH -N 1\n", "#SBATCH -n 16\n", - "#SBATCH --time 1000\n", + "#SBATCH --time 1-00:00:00\n", "#SBATCH --mem 120G\n", f"#SBATCH -o {out_dir}/merge/logs/%x-%A_%a.out\n", f"#SBATCH -e {out_dir}/merge/logs/%x-%A_%a.err\n", "\n", "source ~/.bashrc\n", "set -e\n", - "conda activate qp_pacbio_2025.9\n", + f"{CONDA_ENVIRONMENT}\n", f"cd {out_dir}/\n", f"tax={db_path}/WoLr2.tax\n", f"coords={db_path}/WoLr2.coords\n", - f"len_map={db_path}/WoLr2/length.map\n", - f"functional_dir={db_path}/WoLr2/\n", + f"len_map={db_path}/genomes/length.map\n", + f"functional_dir={db_path}/function/kegg/\n", "\n", f"mkdir -p {out_dir}/coverages/\n", "\n", @@ -245,12 +240,12 @@ f'--rank none --outcov {out_dir}/coverages/";\n', ' echo "woltka classify -i ${f} -o ${of}/per-gene.biom ' '--no-demux -c ${coords}";\n', - "done | 
parallel -j 1\n", + "done | parallel -j 16\n", "wait\n", "\n", "for f in `ls bioms/*/per-gene.biom`; do\n", " dn=`dirname ${f}`;\n", - " sn=`basename ${sn}`;\n", + " sn=`basename ${dn}`;\n", ' echo "woltka collapse -i ${f} -m ${functional_dir}/' 'orf-to-ko.map.xz -o ${dn}/ko.biom; " \\\n', ' "woltka collapse -i ${dn}/ko.biom -m ${functional_dir}/' @@ -263,11 +258,10 @@ def test_pacbio_profiling(self): ' "woltka collapse -i ${dn}/module.biom -m ' "${functional_dir}/module-to-pathway.map " '-o ${dn}/pathway.biom;"\n', - "done | parallel -j 1\n", + "done | parallel -j 16\n", "wait\n", "\n", - "# MISSING:\n", - "# merge bioms!\n", + f"biom_merge_pacbio --base {out_dir}\n", "\n", ( f'find {out_dir}/coverages/ -iname "*.cov" ' @@ -277,7 +271,9 @@ def test_pacbio_profiling(self): f"--lengths ${{len_map}} --output {out_dir}/coverages.tgz\n", "\n", "cd alignments\n", - "tar -cvf ../alignments.tar *.sam.xz", + "tar -cvf ../alignments.tar *.sam.xz\n", + "\n", + f"finish_qp_pacbio {url} {job_id} {out_dir}", ] self.assertEqual(obs_merge, exp_merge) diff --git a/qp_pacbio/util.py b/qp_pacbio/util.py index 12a22bc..1df4930 100644 --- a/qp_pacbio/util.py +++ b/qp_pacbio/util.py @@ -6,15 +6,17 @@ # The full license is in the file LICENSE, distributed with this software. # ----------------------------------------------------------------------------- from os import environ -from os.path import join, expanduser +from os.path import join, expanduser, getmtime, exists from configparser import ConfigParser +import pathlib +from jinja2 import BaseLoader, TemplateNotFound from qiita_client import QiitaClient plugin_details = { "name": "qp-pacbio", - "version": "2025.09", + "version": "2025.11", "description": "PacBio processing", } @@ -38,3 +40,23 @@ def client_connect(url): config.get("oauth2", "SERVER_CERT"), ) return qclient + + +def find_base_path(): + return pathlib.Path(__file__).parent.resolve() + + +# taken from the Jinja docs (BaseLoader API): +# https://jinja.palletsprojects.com/en/3.0.x/api/ +class KISSLoader(BaseLoader): + def __init__(self, path): + self.path = join(find_base_path(), path) + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path, encoding="utf-8") as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) diff --git a/runner.py b/runner.py index f877974..7517724 100644 --- a/runner.py +++ b/runner.py @@ -17,7 +17,7 @@ # Using woltka because we don't have a qp_pacbio command plugin_details = { "name": "qp-woltka", - "version": "2024.09", + "version": "2025.11", "description": "Woltka", }