qiita-spots · jianshu93 · Nov 12, 2025 · Oct 28, 2025 · Oct 28, 2025 · Oct 28, 2025
diff --git a/.github/workflows/qiita-plugin-ci.yml b/.github/workflows/qiita-plugin-ci.yml
@@ -87,9 +87,9 @@ jobs:
           pip --quiet install -U pip
           pip --quiet install https://github.com/qiita-spots/qtp-job-output-folder/archive/refs/heads/main.zip
 
+          export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9"
           pip install -e .
           pip --quiet install coveralls
-          export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9"
 
           configure_qtp_job_output_folder --env-script "source /home/runner/.profile; conda activate qp_pacbio_2025.9" --ca-cert $QIITA_ROOTCA_CERT
           configure_qp_pacbio --env-script 'source /home/runner/.profile; conda activate qp_pacbio_2025.9; export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9"' --ca-cert $QIITA_ROOTCA_CERT
@@ -134,8 +134,9 @@ jobs:
           export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
           export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
           export PYTHONWARNINGS="ignore:Certificate for localhost has no \`subjectAltName\`"
+          export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9"
 
-          pytest qp_pacbio --doctest-modules --cov=qp_pacbio --cov-report=lcov
+          pytest qp_pacbio --doctest-modules --cov=qp_pacbio --cov-report=lcov --ignore=qp_pacbio/data
 
       - uses: codecov/codecov-action@v3
         with:

diff --git a/README.rst b/README.rst
@@ -1,6 +1,18 @@
 |Build Status| |Coverage Status|
 
-Qiita plugin to process Pacbio. The plugin follows these processing steps:
+Qiita plugin to process PacBio reads; it currently provides 2 commands for Qiita:
+
+* **Woltka v0.1.7, minimap2**: which generates feature and functional profiles against WoLr2;
+  the expected outputs are BIOM artifacts
+
+* **PacBio processing**: which goes from step 1 to 7 in the image below. The expected output
+  is a main folder with folders per-sample and folders for each of the different outputs, as follows:
+
+  * **MAG** folder: all Metagenome-Assembled Genomes (MAGs) generated for that sample
+  * **LCG** folder: all Long-Circular Genomes (LCGs) generated for that sample that are over 512kb in size - approximately 515,000 bases (half a million)
+  * **small_LCG** folder: all Long-Circular Genomes (LCGs) generated for that sample that are under 512kb in size
+  * **[sample-name].fna.gz**: the no LCG reads used for MAG generation
+  * **[sample-name].checkm.txt.gz**: MAG quality information from CheckM v1.2.3
 
 
 .. image:: images/PacBioProcessing.png

diff --git a/data/templates/2.get-circular-genomes.sbatch b/data/templates/2.get-circular-genomes.sbatch
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,13 +4,15 @@ build-backend = "setuptools.build_meta"
 
 [tool.setuptools]
 packages = ["qp_pacbio"]
-include-package-data = true
+
+[tool.setuptools.package-data]
+"qp_pacbio" = ["data/*"]
 
 [project]
 name = "qp_pacbio"
 # version strings must comply with PEP 440:
 # https://peps.python.org/pep-0440/
-version = "2025.09"
+version = "2025.11"
 authors = [{ name = "Qiita Development Team", email = "[email protected]" }]
 description = "Qiita Plugin: PacBio Processing"
 readme = "README.rst"
@@ -39,11 +41,15 @@ dependencies = [
     'pytest-cov',
     'numpy',
     'Jinja2',
+    'PyYAML',
+    'micov',
     "qiita-files@https://github.com/qiita-spots/qiita-files/archive/master.zip",
     "qiita_client@https://github.com/qiita-spots/qiita_client/archive/master.zip",
+    "woltka@git+https://github.com/qiyunzhu/woltka.git#egg=woltka",
 ]
 
 [project.scripts]
 configure_qp_pacbio = "qp_pacbio.scripts:config"
 start_qp_pacbio = "qp_pacbio.scripts:execute"
 finish_qp_pacbio = "qp_pacbio.scripts:finish_qp_pacbio"
+biom_merge_pacbio = "qp_pacbio.scripts:biom_merge"
diff --git a/qp_pacbio/__init__.py b/qp_pacbio/__init__.py
@@ -16,8 +16,8 @@
 # minimap2 command
 #
 
-req_params = {"artifact": ("integer", ["per_sample_FASTQ"])}
-opt_params = dict()
+req_params = {"artifact": ("artifact", ["per_sample_FASTQ"])}
+opt_params = {"Database": ['choice:["WoLr2"]', "WoLr2"]}
 outputs = {
     # taxonomic
     "Per genome Predictions": "BIOM",
@@ -27,8 +27,7 @@
     "KEGG Enzyme (EC)": "BIOM",
     "KEGG Pathway": "BIOM",
 }
-dflt_param_set = dict()
-
+dflt_param_set = {"WoLr2": {"Database": "WoLr2"}}
 minimap2_cmd = QiitaCommand(
     "Woltka v0.1.7, minimap2",
     "Functional and Taxonomic Predictions",
@@ -45,12 +44,11 @@
 #
 
 req_params = {
-    "artifact": ("integer", ["per_sample_FASTQ"]),
+    "artifact": ("artifact", ["per_sample_FASTQ"]),
 }
-opt_params = dict()
+opt_params = {"Processing": ['choice:["default"]', "default"]}
 outputs = {"output": "job-output-folder"}
-dflt_param_set = dict()
-
+dflt_param_set = {"default": {"Processing": "default"}}
 pacbio_processing_cmd = QiitaCommand(
     "PacBio processing",
     "Default PacBio processing for Metagenomic Data",

diff --git a/qp_pacbio/data/resources.yaml b/qp_pacbio/data/resources.yaml
@@ -0,0 +1,60 @@
+PacBio processing:
+  step-1:
+    node_count: 1
+    nprocs: 16
+    wall_time_limit: 1-00:00:00
+    mem_in_gb: 200
+    max_tasks: 16
+  step-2:
+    node_count: 1
+    nprocs: 1
+    wall_time_limit: 00:10:00
+    mem_in_gb: 2
+    max_tasks: 16
+  step-3:
+    node_count: 1
+    nprocs: 8
+    wall_time_limit: 01:00:00
+    mem_in_gb: 10
+    max_tasks: 16
+  step-4:
+    node_count: 1
+    nprocs: 8
+    wall_time_limit: 01:00:00
+    mem_in_gb: 6
+    max_tasks: 16
+  step-5:
+    node_count: 1
+    nprocs: 8
+    wall_time_limit: 00:30:00
+    mem_in_gb: 2
+    max_tasks: 16
+  step-6:
+    node_count: 1
+    nprocs: 8
+    wall_time_limit: 00:30:00
+    mem_in_gb: 2
+    max_tasks: 16
+  step-7:
+    node_count: 1
+    nprocs: 8
+    wall_time_limit: 01:00:00
+    mem_in_gb: 50
+    max_tasks: 16
+  finish:
+    node_count: 1
+    nprocs: 1
+    wall_time_limit: 00:10:00
+    mem_in_gb: 10
+Woltka v0.1.7, minimap2:
+  minimap2:
+    node_count: 1
+    nprocs: 16
+    wall_time_limit: 10:00:00
+    mem_in_gb: 60
+    max_tasks: 16
+  merge:
+    node_count: 1
+    nprocs: 16
+    wall_time_limit: 1-00:00:00
+    mem_in_gb: 120
diff --git a/data/templates/1.hifiasm-meta_new.sbatch → .../data/templates/1.hifiasm-meta_new.sbatch b/data/templates/1.hifiasm-meta_new.sbatch → .../data/templates/1.hifiasm-meta_new.sbatch
@@ -11,7 +11,7 @@
 
 source ~/.bashrc
 set -e
-conda activate {{conda_environment}}
+{{conda_environment}}
 cd {{output}}/step-1
 
 step=${SLURM_ARRAY_TASK_ID}
@@ -28,3 +28,4 @@ if [[ "$step" == "1" ]]; then
 fi
 
 hifiasm_meta -t {{nprocs}} -o {{output}}/step-1/${sample_name} ${filename}
+touch {{output}}/step-1/completed_${SLURM_ARRAY_TASK_ID}.log
diff --git a/qp_pacbio/data/templates/2.get-circular-genomes.sbatch b/qp_pacbio/data/templates/2.get-circular-genomes.sbatch
@@ -0,0 +1,96 @@
+#!/bin/bash
+#SBATCH -J {{job_name}}
+#SBATCH -p qiita
+#SBATCH -N {{node_count}}
+#SBATCH -n {{nprocs}}
+#SBATCH --time {{wall_time_limit}}
+#SBATCH --mem {{mem_in_gb}}G
+#SBATCH -o {{output}}/step-2/logs/%x-%A_%a.out
+#SBATCH -e {{output}}/step-2/logs/%x-%A_%a.err
+#SBATCH --array {{array_params}}
+
+source ~/.bashrc
+set -e
+{{conda_environment}}
+cd {{output}}/step-1
+
+step=${SLURM_ARRAY_TASK_ID}
+input=$(head -n $step {{output}}/sample_list.txt | tail -n 1)
+sample_name=`echo $input | awk '{print $1}'`
+filename=`echo $input | awk '{print $2}'`
+fn=`basename ${filename}`
+
+# updating the GUI when task 1 runs
+if [[ "$step" == "1" ]]; then
+    python -c "from qp_pacbio.util import client_connect; qclient = client_connect('{{url}}'); qclient.update_job_step('{{qjid}}', 'Running step 2: ${SLURM_ARRAY_JOB_ID}')"
+fi
+
+cat ${sample_name}.p_ctg.gfa | awk '$1=="S" && ($2 ~ /.c$/) {printf ">%s\n%s\n", $2, $3} ' > ../step-2/${sample_name}_circ.fa
+seqkit split --by-id ../step-2/${sample_name}_circ.fa -O ../step-2/${sample_name}_split
+
+### get all contigs for each sample
+cat ${sample_name}.p_ctg.gfa | awk '$1=="S" {printf ">%s\n%s\n", $2, $3} ' > ../step-2/${sample_name}_all_contigs.fa
+
+cd ../step-2/${sample_name}_split
+# making a copy of the small_LCG before they are removed
+mkdir -p {{output}}/step-2/${sample_name}_small_LCG
+find . -maxdepth 1 -type f -size -512k -print0 | xargs -0 -r cp -t ../${sample_name}_small_LCG
+### remove small circular genomes
+find . -type f -size -512k -exec rm -f {} +
+
+# this can result in not having any files left, so
+# make sure we still have files left
+#
+# extract fasta id for all the genomes in the split folder
+FILES=(*.fa)
+if [ -f $FILES ]; then
+    for f in *.fa; do
+        k=${f##*/}
+        n=${f%.*}
+        grep -E "^>" $f >> circular_id.txt
+    done
+    sed -i 's/>//' circular_id.txt
+    seqkit grep -v -f circular_id.txt ../${sample_name}_all_contigs.fa > ../${sample_name}_noLCG.fa
+else
+    cp ../${sample_name}_all_contigs.fa ../${sample_name}_noLCG.fa
+fi
+
+lcg_folder={{result_fp}}/${sample_name}/LCG/
+mkdir -p ${lcg_folder}
+FILES=({{output}}/step-2/${sample_name}_split/*.fa)
+if [ -f $FILES ]; then
+    for f in `ls {{output}}/step-2/${sample_name}_split/*.fa`; do
+        sn=`basename ${f/_circ/}`;
+        sn=${sn/part_/};
+        cat $f | gzip > ${lcg_folder}/${sn/.fa/.fna}.gz;
+    done
+fi
+
+mkdir -p {{result_fp}}/${sample_name}/
+if [ -f {{output}}/step-2/${sample_name}_noLCG.fa ]; then
+    cat {{output}}/step-2/${sample_name}_noLCG.fa | gzip > {{result_fp}}/${sample_name}/${sample_name}.noLCG.fna.gz
+fi
+
+touch {{output}}/step-2/completed_${SLURM_ARRAY_TASK_ID}.log
+# if the files don't exist, it means that this step didn't generate any
+# inputs for the next step; thus generating all the completed files
+if [[ ! -f "$FILES" && ! -f "{{output}}/step-2/${sample_name}_noLCG.fa" ]]; then
+    touch {{output}}/step-3/completed_${SLURM_ARRAY_TASK_ID}.log
+    touch {{output}}/step-4/completed_${SLURM_ARRAY_TASK_ID}.log
+    touch {{output}}/step-5/completed_${SLURM_ARRAY_TASK_ID}.log
+    touch {{output}}/step-6/completed_${SLURM_ARRAY_TASK_ID}.log
+    touch {{output}}/step-7/completed_${SLURM_ARRAY_TASK_ID}.log
+fi
+
+# saving small LCG; note that these are not processed downstream, so they are not
+# relevant to the "complete" files
+small_lcg_folder={{result_fp}}/${sample_name}/small_LCG/
+mkdir -p ${small_lcg_folder}
+FILES=({{output}}/step-2/${sample_name}_small_LCG/*.fa)
+if [ -f $FILES ]; then
+    for f in `ls {{output}}/step-2/${sample_name}_small_LCG/*.fa`; do
+        sn=`basename ${f/_circ/}`;
+        sn=${sn/part_/};
+        cat $f | gzip > ${small_lcg_folder}/${sn/.fa/.fna}.gz;
+    done
+fi
diff --git a/data/templates/3.minimap2_assembly.sbatch → ...data/templates/3.minimap2_assembly.sbatch b/data/templates/3.minimap2_assembly.sbatch → ...data/templates/3.minimap2_assembly.sbatch
@@ -11,7 +11,7 @@
 
 source ~/.bashrc
 set -e
-conda activate {{conda_environment}}
+{{conda_environment}}
 cd {{output}}
 
 step=${SLURM_ARRAY_TASK_ID}
@@ -28,5 +28,14 @@ fi
 
 folder=step-3/${sample_name}_binning
 mkdir -p ${folder}
-minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${folder}/${sample_name}.sam step-2/${sample_name}_noLCG.fa ${filename}
-samtools view -bS -@4 ${folder}/${sample_name}.sam | samtools sort -@4 -O bam -o ${folder}/${sample_name}.sorted.bam
+
+if [ -f step-2/${sample_name}_noLCG.fa ]; then
+    minimap2 -x map-hifi -I {{mem_in_gb}}G -t {{nprocs}} -a --MD --eqx -o ${folder}/${sample_name}.sam step-2/${sample_name}_noLCG.fa ${filename}
+    samtools view -bS -@4 ${folder}/${sample_name}.sam | samtools sort -@4 -O bam -o ${folder}/${sample_name}.sorted.bam
+else
+    touch {{output}}/step-4/completed_${SLURM_ARRAY_TASK_ID}.log
+    touch {{output}}/step-5/completed_${SLURM_ARRAY_TASK_ID}.log
+    touch {{output}}/step-6/completed_${SLURM_ARRAY_TASK_ID}.log
+    touch {{output}}/step-7/completed_${SLURM_ARRAY_TASK_ID}.log
+fi
+touch {{output}}/step-3/completed_${SLURM_ARRAY_TASK_ID}.log