1 change: 1 addition & 0 deletions .gitignore
@@ -12,3 +12,4 @@ samplesheet.csv
*.swp
input*
null/
tmp/
3 changes: 0 additions & 3 deletions .nf-core.yml
@@ -6,9 +6,6 @@ lint:
- docs/images/nf-core-scnanoseq_logo_dark.png
- docs/images/nf-core-scnanoseq_logo_light.png
- .gitignore
pipeline_todos:
- README.md
- main.nf
template_strings: false
nf_core_version: 3.0.2
org_path: null
4 changes: 2 additions & 2 deletions CITATIONS.md
@@ -38,9 +38,9 @@

> De Coster W, D'Hert S, Schultz DT, Cruts M, Van Broeckhoven C. NanoPack: visualizing and processing long-read sequencing data. Bioinformatics 2018 Aug 1; 34(15):2666-9 doi:10.1093/bioinformatics/bty149. PubMed PMID: 29547981; PubMed Central PMCID: PMC6061794.

- [Nanofilt](https://pubmed.ncbi.nlm.nih.gov/29547981/)
- [Chopper](https://doi.org/10.1093/bioinformatics/btad311)

> De Coster W, D'Hert S, Schultz DT, Cruts M, Van Broeckhoven C. NanoPack: visualizing and processing long-read sequencing data. Bioinformatics 2018 Aug 1; 34(15):2666-9 doi:10.1093/bioinformatics/bty149. PubMed PMID: 29547981; PubMed Central PMCID: PMC6061794.
> Wouter De Coster, Rosa Rademakers, NanoPack2: population-scale evaluation of long-read sequencing data, Bioinformatics, Volume 39, Issue 5, May 2023, btad311, https://doi.org/10.1093/bioinformatics/btad311

- [NanoPlot](https://pubmed.ncbi.nlm.nih.gov/29547981/)

13 changes: 12 additions & 1 deletion README.md
@@ -25,14 +25,25 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool

On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/scnanoseq/results).

## Test

A minimal local test run:

```bash
nextflow run main.nf \
    -profile test,apptainer \
    --input tmp/samples.csv \
    --outdir tmp/outdir
```

## Pipeline summary

![scnanoseq diagram](assets/scnanoseq_tube_map.png)

1. Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), [`NanoPlot`](https://github.com/wdecoster/NanoPlot), [`NanoComp`](https://github.com/wdecoster/nanocomp) and [`ToulligQC`](https://github.com/GenomiqueENS/toulligQC))
2. Unzip and split FASTQ ([`pigz`](https://github.com/madler/pigz))
1. Optional: Split FASTQ for faster processing ([`split`](https://linux.die.net/man/1/split))
3. Trim and filter reads ([`Nanofilt`](https://github.com/wdecoster/nanofilt))
3. Trim and filter reads ([`Chopper`](https://github.com/wdecoster/chopper))
4. Post trim QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), [`NanoPlot`](https://github.com/wdecoster/NanoPlot), [`NanoComp`](https://github.com/wdecoster/nanocomp) and [`ToulligQC`](https://github.com/GenomiqueENS/toulligQC))
5. Barcode detection using a custom whitelist or 10X whitelist. ([`BLAZE`](https://github.com/shimlab/BLAZE))
6. Extract barcodes. Consists of the following steps:
81 changes: 81 additions & 0 deletions bin/cat_fastq.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python3

import argparse
import gzip
import shutil
import os
import sys
from pathlib import Path

def is_gzipped(path: str) -> bool:
    """Check for the two-byte gzip magic number."""
    with open(path, 'rb') as fh:
        return fh.read(2) == b'\x1f\x8b'

def cat_files(input_files: list[str], output_file: str) -> None:
    """Concatenate FASTQ files into a single gzipped output.

    Gzipped inputs are appended as raw bytes (concatenated gzip streams form
    a valid gzip stream); plain-text inputs are compressed on the fly.
    """
    with open(output_file, 'wb') as f_out:
        for f_path in input_files:
            try:
                if is_gzipped(f_path):
                    with open(f_path, 'rb') as f_in:
                        shutil.copyfileobj(f_in, f_out)
                else:
                    print(f"Warning: {f_path} is not gzipped. Compressing it into the output.", file=sys.stderr)
                    with open(f_path, 'rb') as f_in, gzip.open(f_out, 'wb') as gz_out:
                        shutil.copyfileobj(f_in, gz_out)
            except Exception as e:
                sys.exit(f"Error processing file {f_path}: {e}")

def main():
    parser = argparse.ArgumentParser(description="Concatenate FASTQ files.")
    parser.add_argument("--prefix", type=str, required=True, help="Output file prefix.")
    parser.add_argument("--single_end", action="store_true", help="Input files are single-end.")
    parser.add_argument("--reads", nargs='+', required=True, help="List of input FASTQ files.")

    args = parser.parse_args()

    output_dir = Path(".")

    read_paths = [Path(f) for f in args.reads]

    if args.single_end:
        output_file = output_dir / f"{args.prefix}.merged.fastq.gz"
        if len(read_paths) == 1:
            print(f"Symlinking {read_paths[0]} to {output_file}...")
            os.symlink(read_paths[0], output_file)
            print("Symlink complete.")
        elif len(read_paths) > 1:
            print(f"Concatenating {len(read_paths)} single-end files to {output_file}...")
            cat_files([str(p) for p in read_paths], str(output_file))
            print("Concatenation complete.")
        else:
            print("Warning: no input files provided for single-end processing.", file=sys.stderr)
    else:  # Paired-end
        output_file_1 = output_dir / f"{args.prefix}_1.merged.fastq.gz"
        output_file_2 = output_dir / f"{args.prefix}_2.merged.fastq.gz"

        if len(read_paths) == 2:
            print(f"Symlinking {read_paths[0]} to {output_file_1}...")
            os.symlink(read_paths[0], output_file_1)
            print("R1 symlink complete.")
            print(f"Symlinking {read_paths[1]} to {output_file_2}...")
            os.symlink(read_paths[1], output_file_2)
            print("R2 symlink complete.")
        elif len(read_paths) > 2:
            if len(read_paths) % 2 != 0:
                sys.exit("Error: paired-end reads require an even number of files.")

            # Inputs are taken as alternating R1/R2 files
            read1_paths = [str(read_paths[i]) for i in range(0, len(read_paths), 2)]
            read2_paths = [str(read_paths[i]) for i in range(1, len(read_paths), 2)]

            print(f"Concatenating {len(read1_paths)} R1 files to {output_file_1}...")
            cat_files(read1_paths, str(output_file_1))
            print("R1 concatenation complete.")

            print(f"Concatenating {len(read2_paths)} R2 files to {output_file_2}...")
            cat_files(read2_paths, str(output_file_2))
            print("R2 concatenation complete.")
        else:  # fewer than 2 files
            sys.exit("Error: paired-end processing requires at least 2 input files.")


if __name__ == "__main__":
    main()
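For reference, a sketch of how the script would be invoked; the `--prefix`, `--single_end`, and `--reads` flags come from the argparse definitions above, while the sample and file names are hypothetical:

```bash
# Single-end: merge two gzipped FASTQ files into sample1.merged.fastq.gz
./bin/cat_fastq.py --prefix sample1 --single_end \
    --reads run1.fastq.gz run2.fastq.gz

# Paired-end: files are taken as alternating R1/R2
./bin/cat_fastq.py --prefix sample2 \
    --reads lane1_R1.fastq.gz lane1_R2.fastq.gz lane2_R1.fastq.gz lane2_R2.fastq.gz
```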
8 changes: 4 additions & 4 deletions conf/modules.config
@@ -295,7 +295,7 @@ if (!params.skip_trimming) {

if (params.split_amount > 0){
process {
withName: '.*:SPLIT_FILE' {
withName: '.*:SPLIT_SEQ' {
publishDir = [
enabled: false
]
@@ -312,13 +312,13 @@
}
}

// NANOFILT
// CHOPPER
if ( !params.skip_trimming ){
process {
withName:'.*:NANOFILT' {
withName:'.*:CHOPPER' {
ext.args = {
[
params.min_length ? "--length ${params.min_length}" : "",
params.min_length ? "--minlength ${params.min_length}" : "",
params.min_q_score ? "--quality ${params.min_q_score}" : ""
].join(' ').trim()
}
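To make the mapping concrete: with hypothetical parameter values `--min_length 500` and `--min_q_score 10` (and 4 CPUs assumed), the `ext.args` above would expand the CHOPPER module's command to roughly:

```bash
chopper -t 4 --minlength 500 --quality 10 --input sample1.fastq.gz \
    | gzip -c > sample1.filtered.fastq.gz
```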
8 changes: 4 additions & 4 deletions docs/output.md
@@ -11,7 +11,7 @@ The directories listed below will be created in the results directory after the
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

- [Preprocessing](#preprocessing)
- [Nanofilt](#nanofilt) - Read Quality Filtering and Trimming
- [Chopper](#chopper) - Read Quality Filtering and Trimming
- [Barcode Calling](#barcode-calling)
- [BLAZE](#blaze) - Barcode caller
- [Alignment](#alignment)
@@ -39,19 +39,19 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

## Preprocessing

### Nanofilt
### Chopper

<details markdown="1">
<summary>Output files</summary>

- `<sample_identifier>/`
- `fastq/`
- `trimmed_nanofilt/`
- `trimmed_chopper/`
- `*_filtered.fastq.gz`: The trimmed and filtered FASTQ. By default, filtering is primarily by read quality.

</details>

[Nanofilt](https://github.com/wdecoster/nanofilt) is a tool used for filtering and trimming of long read sequencing data.
[Chopper](https://github.com/wdecoster/chopper) is a tool used for filtering and trimming of long read sequencing data.

## Barcode Calling

50 changes: 50 additions & 0 deletions modules/local/chopper.nf
@@ -0,0 +1,50 @@
process CHOPPER {
tag "$meta.id"
label 'process_medium'

conda "bioconda::nanofilt=0.10.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/chopper:0.10.0--hcdda2d0_0':
'biocontainers/chopper:0.10.0--hcdda2d0_0' }"

input:
tuple val(meta), path(reads)

output:
tuple val(meta), path("*.filtered.fastq.gz"), emit: reads
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

"""
FILE_PREFIX=${prefix}
if [ ${params.split_amount} -gt 0 ]; then
IDX=\$(basename ${reads} | cut -f2 -d'.')
FILE_PREFIX=\${FILE_PREFIX}.\${IDX}
fi

chopper -t ${task.cpus} $args --input $reads | \\
gzip -c > \${FILE_PREFIX}.filtered.fastq.gz

cat <<-END_VERSIONS > versions.yml
"${task.process}":
chopper: \$( chopper --version | sed -e "s/chopper //g" )
END_VERSIONS
"""

stub:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch ${prefix}.filtered.fastq.gz

cat <<-END_VERSIONS > versions.yml
"${task.process}":
chopper: \$( chopper --version | sed -e "s/chopper //g" )
END_VERSIONS
"""
}
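A note on the `IDX` parsing in the script block: it assumes split chunks are named `<prefix>.<index>.fastq.gz`, as produced by the `SPLIT_SEQ` renaming below. A quick sanity check with a made-up file name:

```bash
reads=sample1.001.fastq.gz
IDX=$(basename "$reads" | cut -f2 -d'.')  # second dot-separated field
echo "$IDX"                               # prints: 001
```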
13 changes: 10 additions & 3 deletions modules/local/nanofilt.nf
@@ -11,7 +11,7 @@ process NANOFILT {
tuple val(meta), path(reads)

output:
tuple val(meta), path("*.filtered.fastq") , emit: reads
tuple val(meta), path("*.filtered.fastq.gz"), emit: reads
path "versions.yml" , emit: versions

when:
@@ -27,7 +27,13 @@
IDX=\$(basename ${reads} | cut -f2 -d'.')
FILE_PREFIX=\${FILE_PREFIX}.\${IDX}
fi
cat $reads | NanoFilt $args > \${FILE_PREFIX}.filtered.fastq

# If the input is gzipped, decompress it before filtering
if [[ "${reads}" == *.gz ]]; then
gunzip -c $reads | NanoFilt $args | gzip -c > \${FILE_PREFIX}.filtered.fastq.gz
else
cat $reads | NanoFilt $args | gzip -c > \${FILE_PREFIX}.filtered.fastq.gz
fi

cat <<-END_VERSIONS > versions.yml
"${task.process}":
@@ -36,8 +42,9 @@
"""

stub:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch ${prefix}.filtered.fastq
touch ${prefix}.filtered.fastq.gz

cat <<-END_VERSIONS > versions.yml
"${task.process}":
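As an aside, GNU gzip passes non-gzip input through unchanged when decompressing to stdout with `--force`, so the two branches above could likely be collapsed into one line; a sketch, not what the PR does:

```bash
# gzip -cdf decompresses gzip input and copies plain-text input through unchanged
gzip -cdf $reads | NanoFilt $args | gzip -c > ${FILE_PREFIX}.filtered.fastq.gz
```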
2 changes: 1 addition & 1 deletion modules/local/split_file.nf
@@ -41,4 +41,4 @@
split: \$(echo \$(split --version 2>&1 | head -n1 | sed 's#split (GNU coreutils) ##g'))
END_VERSIONS
"""
}
}
65 changes: 65 additions & 0 deletions modules/local/split_seq.nf
@@ -0,0 +1,65 @@
process SPLIT_SEQ {
tag "$meta.id"
label 'process_medium'

conda "bioconda::seqkit=2.10.0 conda-forge::sed=4.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/seqkit:2.10.0--h9ee0642_0' :
'quay.io/biocontainers/seqkit:2.10.0--h9ee0642_0' }"

input:
tuple val(meta), path(unsplit_file)
val file_ext
val split_amount

output:
tuple val(meta), path("output/*$file_ext"), emit: split_files
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
# Split the file by number of reads
seqkit -j ${task.cpus} split2 ${args} \\
-s ${split_amount} --out-dir output --force ${unsplit_file}

# rename files to have the correct extension
for file in ./output/*.part_*; do
if [[ -f "\$file" ]]; then
base_name=\$(basename "\$file")
# Remove .gz suffix if present
if [[ "\$base_name" == *.gz ]]; then
base_name_no_gz="\${base_name%.gz}"
else
base_name_no_gz="\$base_name"
fi
# Remove the remaining extension (after the last dot)
base_name_final="\${base_name_no_gz%.*}"
# Remove up to .part_
base_name_final="\${base_name_final#*.part_}"
# Rename the file
mv "\$file" "output/${prefix}.\${base_name_final}${file_ext}"
fi
done

cat <<-END_VERSIONS > versions.yml
"${task.process}":
seqkit: \$(echo \$(seqkit version | head -n1 | sed 's/seqkit version //g'))
END_VERSIONS
"""

stub:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch ${prefix}.part_001${file_ext}

cat <<-END_VERSIONS > versions.yml
"${task.process}":
seqkit: \$(echo \$(seqkit version | head -n1 | sed 's/seqkit version //g'))
END_VERSIONS
"""
}
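To illustrate the renaming loop, tracing a hypothetical seqkit output file through the parameter expansions (assuming `prefix=sample1` and `file_ext=.fastq.gz`):

```bash
file=./output/sample1.part_001.fastq.gz
base_name=$(basename "$file")                 # sample1.part_001.fastq.gz
base_name_no_gz="${base_name%.gz}"            # sample1.part_001.fastq
base_name_final="${base_name_no_gz%.*}"       # sample1.part_001
base_name_final="${base_name_final#*.part_}"  # 001
# mv target: output/sample1.001.fastq.gz
```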