galaxyproject · lsterck · Jun 12, 2025 · Jun 12, 2025 · Jun 13, 2025 · Jun 13, 2025
diff --git a/tools/chopper/.shed.yml b/tools/chopper/.shed.yml
@@ -0,0 +1,17 @@
+categories:
+- Sequence Analysis
+- Fastq Manipulation
+description: |
+    Filtering and trimming of fastq files with long read sequencing such as PacBio or ONT.
+long_description: |
+    Chopper is a Rust implementation of NanoFilt & NanoLyse, both originally written in Python. 
+    This tool, intended for long read sequencing such as PacBio or ONT, filters and trims a fastq
+    file.  Filtering can be done on metrics like average read quality and minimal or maximal read 
+    length, and applying a headcrop (start of read) and tailcrop (end of read) while printing the 
+    reads passing the filter.    
+    It is part of the NanoPack2 tool suite for visualizing and processing long read sequencing data.
+name: chopper
+owner: iuc
+remote_repository_url: "https://github.com/galaxyproject/tools-iuc/tree/main/tools/chopper/"
+homepage_url: "https://github.com/wdecoster/chopper"
+type: unrestricted
diff --git a/tools/chopper/chopper.xml b/tools/chopper/chopper.xml
@@ -0,0 +1,158 @@
+<tool id="chopper" name="Chopper" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.2">
+	<description>Filtering and trimming of long reads.</description>
+	<macros>
+		<token name="@TOOL_VERSION@">0.10.0</token>
+		<token name="@VERSION_SUFFIX@">0</token>
+	</macros>
+	<requirements>
+		<requirement type="package" version="@TOOL_VERSION@">chopper</requirement>
+	</requirements>
+	<version_command>chopper --version</version_command>
+	<command detect_errors="exit_code"><![CDATA[
+ chopper
+
+ --input $input
+
+ #if $contam
+	--contam $contam	
+ #end if
+
+ #if $option_params.quality
+	--quality $option_params.quality.value
+ #end if
+
+ #if $option_params.maxqual
+	--maxqual $option_params.maxqual.value
+ #end if
+
+ #if $option_params.minlength
+	--minlength $option_params.minlength.value
+ #end if
+
+ #if $option_params.maxlength
+	--maxlength $option_params.maxlength.value
+ #end if
+
+ #if $option_params.headcrop
+	--headcrop $option_params.headcrop.value
+ #end if
+
+ #if $option_params.tailcrop
+	--tailcrop $option_params.tailcrop.value
+ #end if
+
+ #if $option_params.mingc
+	--mingc $option_params.mingc.value
+ #end if
+
+ #if $option_params.maxgc
+	--maxgc $option_params.maxgc.value
+ #end if
+
+ ##output capture
+ #if $output_params.inverse == "yes"
+    $output_params.inverse
+ #end if 
+
+ #if $output_params.gzip == "no"
+	> $fq_filt 
+ #else
+	| gzip > $fq_filt_gz
+ #end if
+
+ ]]></command>
+	<inputs>
+		<param argument="--input" type="data" label="FASTQ file to check" format="fastq,fastq.gz,fastqsanger.gz,fastqsanger" />
+		<param argument="--contam" type="data" format="fasta" optional="True" label="Reference FASTA" help="FASTA file with reference to check potential contaminants against."/>	
+
+		<section name="option_params" title="Optional Parameters" expanded="True">
+			<param argument="--quality" type="integer" label="Minimal quality score" value="0" min="0" max="60" help="Sets a minimum Phred average quality score."/>
+			<param argument="--maxqual" type="integer" label="Maximal quality score" value="60" min="0" max="60" help="Sets a maximum Phred average quality score."/>
+			<param argument="--minlength" type="integer" label="Sets a minimum read length" value="1" min="1"  help="Minimal length of read to keep."  optional="True"/>
+			<param argument="--maxlength" type="integer" label="Sets a maximum read length" help="Maximal length of read to keep" optional="True"/>
+			<param argument="--headcrop" type="integer" optional="True" label="Headcrop" value="0" min="0" help="Trim N nucleotides from the start of a read."/>
+			<param argument="--tailcrop" type="integer" optional="True" label="Tailcrop" value="0" min="0" help="Trim N nucleotides from the end of a read."/>
+			<param argument="--mingc" type="float" optional="True" label="Minimum GC content" value="0.0" min="0.0" max="1.0" help="Sets a minimum GC content for reads to keep."/>
+			<param argument="--maxgc" type="float" optional="True" label="Maximum GC content" value="1.0" min="0.0" max="1.0" help="Sets a maximum GC content for reads to keep."/>
+			<param argument="--trim" type="integer" label="Q-score cutoff to trim read ends" value="0" min="0" max="60" help="Takes a quality score and will trim the ends of the reads if they are below the specified cut-off (window-size = 1)."/>
+		</section>
+
+		<section name="output_params" title="Output Parameters" expanded="False">
+			<param argument="--inverse" type="boolean" checked="false" truevalue="--inverse" falsevalue="" label="Output the opposite of the normal results" help="Reverse the output results (aka, the 'failed reads')"/>
+			<param name="gzip" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Gzip output data" help="Set to 'no' to NOT gzip the output file [default gzip output]."/>
+		</section>
+	</inputs>
+
+	<outputs>
+		<data name="fq_filt_gz" format="fastq.gz" label="${tool.name} on ${input.name} ($on_string), gzipped" >
+			<filter> output_params['gzip'] is True </filter>
+		</data>
+		<data name="fq_filt" format="fastq" label="${tool.name} on ${input.name} ($on_string)" >
+			<filter> output_params['gzip'] is False </filter>
+		</data>
+	</outputs>
+
+	<tests>
+		<test expect_num_outputs="1">
+			<param name="input" value="other-test.fastq"/>
+			<section name="output_params">
+				<param name="gzip" value="false"/>
+			</section>
+			<output name="fq_filt">
+				<assert_contents>
+					<has_text text="@35febf09-dcbc-424c-987e-9f3f80fe73a5"/>
+					<has_text text="@3fda06e9-62ef-4448-9993-b90124a793d5"/>
+					<has_text text="@19d9337f-4fb6-46e5-b484-14d05f562506"/>
+				</assert_contents>
+			</output>
+		</test>
+		<test expect_num_outputs="1">
+			<param name="input" value="other-test.fastq"/>
+			<param name="contam" value="random_contam.fa"/>
+			<output name="fq_filt" ftype="fastq.gz" decompress="true"> <!-- file="out2.fq.gz"/> -->
+				<assert_contents>
+					<has_text text="@35febf09-dcbc-424c-987e-9f3f80fe73a5"/>
+					<has_text text="@3fda06e9-62ef-4448-9993-b90124a793d5"/>
+					<has_text text="@19d9337f-4fb6-46e5-b484-14d05f562506"/>
+				</assert_contents>
+			</output>
+		</test>
+		<test expect_num_outputs="1">
+			<param name="input" value="testGC.fastq"/>
+			<section name="option_params">
+				<param name="mingc" value="0.3"/>
+				<param name="maxgc" value="0.8"/>
+			</section>
+			<section name="output_params">
+				<param name="gzip" value="false"/>
+			</section>
+			<output name="fq_filt" > 
+				<assert_contents>
+					<not_has_text text='@GC20'/>
+					<not_has_text text='@GC0'/>
+					<not_has_text text='@GC100'/>
+					<has_text text='@GC50'/>
+					<has_text text='@GC80'/>
+				</assert_contents>
+			</output>
+		</test>
+	</tests>
+	<help><![CDATA[
+**Chopper**
+
+Rust implementation of NanoFilt+NanoLyse, both originally written in Python. This tool, intended for long read sequencing such as PacBio or ONT, filters and trims a fastq file.
+Filtering is done on average read quality and minimal or maximal read length, and applying a headcrop (start of read) and tailcrop (end of read) while printing the reads passing the filter.
+
+Compared to the Python implementation the scope is to deliver the same results, almost the same functionality, at much faster execution times. At the moment this tool does not support filtering using a sequencing_summary file. 
+
+**More Information** 
+
+- **Official Repository**: `Chopper on GitHub`_
+
+.. _Chopper on GitHub: https://github.com/wdecoster/chopper
+
+	]]></help>
+	<citations>
+		<citation type="doi">10.1093/bioinformatics/btad311</citation>
+	</citations>
+</tool>
diff --git a/tools/chopper/test-data/other-test.fastq b/tools/chopper/test-data/other-test.fastq
diff --git a/tools/chopper/test-data/random_contam.fa b/tools/chopper/test-data/random_contam.fa
diff --git a/tools/chopper/test-data/testGC.fastq b/tools/chopper/test-data/testGC.fastq
@@ -0,0 +1,44 @@
+@GC0
+AAAAAAAAAA
++
+OOOOOOOOOO
+@GC10
+GAAAAAAAAA
++
+OOOOOOOOOO
+@GC20
+GCAAAAAAAA
++
+OOOOOOOOOO
+@GC30
+GCGAAAAAAA
++
+OOOOOOOOOO
+@GC40
+GCGCAAAAAA
++
+OOOOOOOOOO
+@GC50
+GCGCGAAAAA
++
+OOOOOOOOOO
+@GC60
+GCGCGCAAAA
++
+OOOOOOOOOO
+@GC40
+GCGCGCGAAA
++
+OOOOOOOOOO
+@GC80
+GCGCGCGCAA
++
+OOOOOOOOOO
+@GC90
+GCGCGCGCGA
++
+OOOOOOOOOO
+@GC100
+GCGCGCGCGC
++
+OOOOOOOOOO