galaxyproject · RZ9082 · Jun 18, 2025 · Jun 18, 2025 · Jun 18, 2025 · Jun 18, 2025
diff --git a/tools/tesseract/.shed.yml b/tools/tesseract/.shed.yml
@@ -0,0 +1,8 @@
+name: tesseract
+owner: iuc
+description: Tesseract is an OCR engine with support for unicode and the ability to recognize more than 100 languages out of the box.
+long_description: Tesseract is an OCR engine with support for unicode and the ability to recognize more than 100 languages out of the box.
+homepage_url: https://github.com/tesseract-ocr/tesseract
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/tesseract/
+categories:
+  - Natural Language Processing
diff --git a/tools/tesseract/macros.xml b/tools/tesseract/macros.xml
@@ -0,0 +1,23 @@
+<macros>
+    <token name="@TOOL_VERSION@">5.0.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">tesseract</requirement>
+        </requirements>
+    </xml>
+    <xml name="creators">
+        <creator>
+            <person givenName="Rand" familyName="Zoabi" url="https://github.com/RZ9082" identifier="https://orcid.org/0009-0000-2501-8053" />
+        </creator>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1145/1815330.1815339</citation>
+            <citation type="doi">10.1145/1577802.1577804</citation>
+            <citation type="doi">10.1145/1577802.1577809</citation>
+            <citation type="doi">10.1109/ICDAR.2009.257</citation>
+            <citation type="doi">10.1109/ICDAR.2007.4376991</citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/tesseract/tesseract.xml b/tools/tesseract/tesseract.xml
@@ -0,0 +1,247 @@
+<tool id="tesseract" name="Tesseract" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.2">
+    <description>Optical Character Recognition</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="creators" />
+    <command detect_errors="exit_code"><![CDATA[
+        echo '$input_file' | tr ',' '\n' > img_paths &&
+        tesseract img_paths output
+        --tessdata-dir '${tessdata.fields.path}'
+        #if $language:
+            -l ${ str($language).replace(",","+") }
+        #end if
+        --psm $psm
+        #if $dpi:
+            --dpi $dpi
+        #end if
+        #if $user_words
+            --user-words '$user_words'
+        #end if
+        #if $user_patterns
+            --user-patterns '$user_patterns'
+        #end if
+        #for $format in $output_formats
+            -c $format=1
+        #end for
+    ]]></command>
+    <inputs>
+        <param name="input_file" type="data" format="jpg,png,tif,tiff,bmp" label="Image file(s)" multiple="true"/>
+        <param name="tessdata" type="select" label="Tessdata" help="Language data models">
+            <options from_data_table="tessdata">
+                <column name="value" index="0"/>
+                <column name="name" index="1"/>
+                <column name="version" index="2"/>
+                <column name="path" index="3"/>
+                <filter type="sort_by" column="1"/>
+            </options>
+            <validator type="no_options" message="A built-in tesseract model is not available. Please ask the Galaxy admins to install one on the server."/>
+        </param>
+        <param name="user_words" type="data" format="txt" label="User words file" optional="true" help="The user words file allows you to specify a list of words that Tesseract should treat as known words. One word per line"/>
+        <param name="user_patterns" type="data" format="txt" label="User patterns file" optional="true" help="One pattern per line in UTF-8 format. For more information please visit the tesseract docs about patterns linked in the help section"/>
+        <param name="language" type="select" label="OCR Language(s)" multiple="true" help="In the case of a multilingual image(s), more the one language can be selected">
+            <option value="afr">Afrikaans</option>
+            <option value="amh">Amharic</option>
+            <option value="ara">Arabic</option>
+            <option value="asm">Assamese</option>
+            <option value="aze">Azerbaijani</option>
+            <option value="aze_cyrl">Azerbaijani - Cyrilic</option>
+            <option value="bel">Belarusian</option>
+            <option value="ben">Bengali</option>
+            <option value="bod">Tibetan</option>
+            <option value="bos">Bosnian</option>
+            <option value="bre">Breton</option>
+            <option value="bul">Bulgarian</option>
+            <option value="cat">Catalan; Valencian</option>
+            <option value="ceb">Cebuano</option>
+            <option value="ces">Czech</option>
+            <option value="chi_sim">Chinese simplified</option>
+            <option value="chi_tra">Chinese traditional</option>
+            <option value="chr">Cherokee</option>
+            <option value="cos">Corsican</option>
+            <option value="cym">Welsh</option>
+            <option value="dan">Danish</option>
+            <option value="deu">German</option>
+            <option value="deu_latf">German Fraktur Latin</option>
+            <option value="div">Dhivehi</option>
+            <option value="dzo">Dzongkha</option>
+            <option value="ell">Greek, Modern, 1453-</option>
+            <option value="eng">English</option>
+            <option value="enm">English, Middle, 1100-1500</option>
+            <option value="epo">Esperanto</option>
+            <option value="equ">Math / equation detection module</option>
+            <option value="est">Estonian</option>
+            <option value="eus">Basque</option>
+            <option value="fas">Persian</option>
+            <option value="fao">Faroese</option>
+            <option value="fil">Filipino</option>
+            <option value="fin">Finnish</option>
+            <option value="fra">French</option>
+            <option value="frm">French, Middle, ca.1400-1600</option>
+            <option value="fry">West Frisian</option>
+            <option value="gla">Scottish Gaelic</option>
+            <option value="gle">Irish</option>
+            <option value="glg">Galician</option>
+            <option value="grc">Greek, Ancient, to 1453</option>
+            <option value="guj">Gujarati</option>
+            <option value="hat">Haitian; Haitian Creole</option>
+            <option value="heb">Hebrew</option>
+            <option value="hin">Hindi</option>
+            <option value="hrv">Croatian</option>
+            <option value="hun">Hungarian</option>
+            <option value="hye">Armenian</option>
+            <option value="iku">Inuktitut</option>
+            <option value="ind">Indonesian</option>
+            <option value="isl">Icelandic</option>
+            <option value="ita">Italian</option>
+            <option value="ita_old">Italian - Old</option>
+            <option value="jav">Javanese</option>
+            <option value="jpn">Japanese</option>
+            <option value="kan">Kannada</option>
+            <option value="kat">Georgian</option>
+            <option value="kat_old">Georgian - Old</option>
+            <option value="kaz">Kazakh</option>
+            <option value="khm">Central Khmer</option>
+            <option value="kir">Kirghiz; Kyrgyz</option>
+            <option value="kmr">Kurdish Kurmanji</option>
+            <option value="kor">Korean</option>
+            <option value="kor_vert">Korean vertical</option>
+            <option value="lao">Lao</option>
+            <option value="lat">Latin</option>
+            <option value="lav">Latvian</option>
+            <option value="lit">Lithuanian</option>
+            <option value="ltz">Luxembourgish</option>
+            <option value="mal">Malayalam</option>
+            <option value="mar">Marathi</option>
+            <option value="mkd">Macedonian</option>
+            <option value="mlt">Maltese</option>
+            <option value="mon">Mongolian</option>
+            <option value="mri">Maori</option>
+            <option value="msa">Malay</option>
+            <option value="mya">Burmese</option>
+            <option value="nep">Nepali</option>
+            <option value="nld">Dutch; Flemish</option>
+            <option value="nor">Norwegian</option>
+            <option value="oci">Occitan post 1500</option>
+            <option value="ori">Oriya</option>
+            <option value="osd">Orientation and script detection module</option>
+            <option value="pan">Panjabi; Punjabi</option>
+            <option value="pol">Polish</option>
+            <option value="por">Portuguese</option>
+            <option value="pus">Pushto; Pashto</option>
+            <option value="que">Quechua</option>
+            <option value="ron">Romanian; Moldavian; Moldovan</option>
+            <option value="rus">Russian</option>
+            <option value="san">Sanskrit</option>
+            <option value="sin">Sinhala; Sinhalese</option>
+            <option value="slk">Slovak</option>
+            <option value="slv">Slovenian</option>
+            <option value="snd">Sindhi</option>
+            <option value="spa">Spanish; Castilian</option>
+            <option value="spa_old">Spanish; Castilian - Old</option>
+            <option value="sqi">Albanian</option>
+            <option value="srp">Serbian</option>
+            <option value="srp_latn">Serbian - Latin</option>
+            <option value="sun">Sundanese</option>
+            <option value="swa">Swahili</option>
+            <option value="swe">Swedish</option>
+            <option value="syr">Syriac</option>
+            <option value="tam">Tamil</option>
+            <option value="tat">Tatar</option>
+            <option value="tel">Telugu</option>
+            <option value="tgk">Tajik</option>
+            <option value="tha">Thai</option>
+            <option value="tir">Tigrinya</option>
+            <option value="ton">Tonga</option>
+            <option value="tur">Turkish</option>
+            <option value="uig">Uighur; Uyghur</option>
+            <option value="ukr">Ukrainian</option>
+            <option value="urd">Urdu</option>
+            <option value="uzb">Uzbek</option>
+            <option value="uzb_cyrl">Uzbek - Cyrilic</option>
+            <option value="vie">Vietnamese</option>
+            <option value="yid">Yiddish</option>
+            <option value="yor">Yoruba</option>
+        </param>
+        <param name="output_formats" type="select" label="Output format(s)" multiple="true" optional="false">
+            <option value="tessedit_create_txt" selected="true">Text</option>
+            <option value="tessedit_create_pdf">PDF</option>
+            <option value="tessedit_create_hocr">HOCR</option>
+            <option value="tessedit_create_tsv">TSV</option>
+        </param>
+        <param argument="--psm" type="select" label="Page Segmentation Mode (PSM)" help="How the page layout is interpreted." optional="true">
+            <option value="0">Orientation and script detection only</option>
+            <option value="1">Automatic page segmentation with OSD</option>
+            <option value="2">Automatic page segmentation, but no OSD, or OCR</option>
+            <option value="3" selected="true">Fully automatic page segmentation, but no OSD</option>
+            <option value="4">Assume a single column of text of variable sizes</option>
+            <option value="5">Assume a single uniform block of vertically aligned text</option>
+            <option value="6">Assume a single uniform block of text</option>
+            <option value="7">Treat the image as a single text line</option>
+            <option value="8">Treat the image as a single word</option>
+            <option value="9">Treat the image as a single word in a circle</option>
+            <option value="10">Treat the image as a single character</option>
+            <option value="11">Sparse text. Find as much text as possible in no particular order</option>
+            <option value="12">Sparse text with OSD</option>
+            <option value="13">Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific</option>
+        </param>
+        <param argument="--dpi" type="integer" label="Image DPI (dots per inch)" min="100" help="When left empty, the resolution in retrieved from the image metadata. If this information in not icluded, Tesseract will make a guess. Tesseract performes best on images with at least 300 dpi" optional="true" />
+    </inputs>
+    <outputs>
+        <data name="output_text" format="txt" from_work_dir="output.txt" label="${tool.name} on ${on_string}: Text">
+            <filter>'tessedit_create_txt' in output_formats</filter>
+        </data>
+        <data name="output_pdf" format="pdf" from_work_dir="output.pdf" label="${tool.name} on ${on_string}: PDF">
+            <filter>'tessedit_create_pdf' in output_formats</filter>
+        </data>
+        <data name="output_hocr" format="html" from_work_dir="output.hocr" label="${tool.name} on ${on_string}: HOCR">
+            <filter>'tessedit_create_hocr' in output_formats</filter>
+        </data>
+        <data name="output_tsv" format="tsv" from_work_dir="output.tsv" label="${tool.name} on ${on_string}: TSV">
+            <filter>'tessedit_create_tsv' in output_formats</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="2">
+            <param name="input_file" value="eurotext.png,test_image_cherokee.png"/>
+            <param name="tessdata" value="test_tessdata"/>
+            <param name="user_words" value="eng.user-words"/>
+            <param name="user_patterns" value="eng.user-patterns"/>
+            <param name="language" value="chr"/>
+            <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf"/>
+            <param name="psm" value="3"/>
+            <output name="output_text" file="output.txt"/>
+            <output name="output_pdf" file="output.pdf"/>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="input_file" value="test_image_cherokee.png"/>
+            <param name="tessdata" value="test_tessdata"/>
+            <param name="language" value="chr"/>
+            <param name="output_formats" value="tessedit_create_hocr,tessedit_create_tsv"/>
+            <param name="psm" value="11"/>
+            <output name="output_hocr">
+                <assert_contents>
+                    <has_text text="Ꮳ"/>
+                    <has_text text="ᏌᎠᏯᏙᏣᎠ"/>
+                    <has_size value="1785" delta="10"/>
+                </assert_contents>
+            </output>
+            <output name="output_tsv" file="output.tsv"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Tesseract OCR Tool
+------------------
+Tesseract is an OCR engine with support for unicode and the ability to recognize more than 100 languages out of the box.
+
+* `Tesseract User Manual <https://tesseract-ocr.github.io/tessdoc/>`_
+
+* `API example for user patterns <https://tesseract-ocr.github.io/tessdoc/APIExample-user_patterns.html>`_
+
+**License**
+
+* `Apache-2.0 <https://raw.githubusercontent.com/tesseract-ocr/tesseract/refs/heads/main/LICENSE>`_
+    ]]></help>
+    <expand macro="citations" />
+</tool>
diff --git a/tools/tesseract/test-data/eng.user-patterns b/tools/tesseract/test-data/eng.user-patterns
@@ -0,0 +1,2 @@
+1-\d\d\d-GOOG-411
+www.\n\\\*.com
diff --git a/tools/tesseract/test-data/eng.user-words b/tools/tesseract/test-data/eng.user-words
@@ -0,0 +1,5 @@
+the
+quick
+brown
+fox
+jumped
diff --git a/tools/tesseract/test-data/eurotext.png b/tools/tesseract/test-data/eurotext.png
diff --git a/tools/tesseract/test-data/output.pdf b/tools/tesseract/test-data/output.pdf
diff --git a/tools/tesseract/test-data/output.tsv b/tools/tesseract/test-data/output.tsv
@@ -0,0 +1,12 @@
+level	page_num	block_num	par_num	line_num	word_num	left	top	width	height	conf	text
+1	1	0	0	0	0	0	0	1024	1024	-1	
+2	1	1	0	0	0	92	296	875	130	-1	
+3	1	1	1	0	0	92	296	875	130	-1	
+4	1	1	1	1	0	92	296	875	130	-1	
+5	1	1	1	1	1	92	297	174	129	37.403927	Ꮳ
+5	1	1	1	1	2	286	296	681	130	16.962364	ᏌᎠᏯᏙᏣᎠ
+2	1	2	0	0	0	164	572	694	133	-1	
+3	1	2	1	0	0	164	572	694	133	-1	
+4	1	2	1	1	0	164	572	694	133	-1	
+5	1	2	1	1	1	164	568	364	184	53.049427	ᎪᎶᎩ)
+5	1	2	1	1	2	650	572	200	132	35.113586	Ꮾ(Ꭰ
diff --git a/tools/tesseract/test-data/output.txt b/tools/tesseract/test-data/output.txt
@@ -0,0 +1,14 @@
+ᎢᏂᏮ (ᏄᏌᎥᏟᏦ) (ᏏᎵᏣᎳᏲ). (Ꮅ9Ꮶ). ᎫᏢᏚᎥ
+ᎤᏙᏮᎢ (ᏂᏮ ᏕᏎ3Ꮷ,Ꮞ5Ꮪ6.ᏤᎦ “ᎥᏧᏃᎩ- Ꮱ90 ᏧᎾᏴ
+(Ꮭ ᏧᏄᏟᏦᎥᏙᏴᏣᎾᏚᏮ, 3Ꮪ 12. 506 ᎾᎴ Ꭼ--Ꮑ311
+ᎥᎢᏣᏁᎸ ᏄᏚᏢᎸ ᎠᏁ 1161 (ᏇᎳᏮᏌᏚᎥ (6. ᏟᏣᎥᎸ ᎥᏚ ᏚᏢᎸᏒ.
+ᎠᎴᎢ ..ᏚᏟᏂᏒᏮᎥ16” ᏏᎠᎱᎸᏄᎠᏮ ᎬᎿᏟᏂᏚ ᏚᏢᎱᎥᎸᏰᎥ
+(ᏏᏮᎢ ᏧᏮᎯᏒ (Ꭿ016(Ꮢ ᎻᏄᏌᏲᏁᏧ. ᏞᏮ ᎢᏣᏁᎸᎢᏧ ᏏᎢᏛᏌᏲ
+ᏄᎢᏄᏢᎠᎥᏧᏮᏅ ᏚᎸᏌᎥᏮ ᎠᎧᏄᎱ-ᏄᎶᏮᏚᏚᏌᏚ 16 ᏟᏂᎥᏮᏁ
+ᎠᏰᎸᎱᏮᏚᏚᏮᏔᏦ. ᏞᏘ ᏙᎾᎤᎥᏢᏮ (131ᎢᏣᎠ6 ᎱᎸᎠᎥᏧᎸ
+ᏚᎸ1(3 ᏚᏅᏣᏢᎢᎸ ᎥᎥ ᏟᏱᏒᏮ ᎠᎥᏪᎱᎾ. ᎬᎥ ᏃᎾᎱᎱᎾ
+ᎢᏁᎸᎢᎱᏫᏒ ᎱᏰᎠᎥᏧᎾ ᏚᏰᎥᎥᎸ ᏚᎾᏏᎱᏮ ᏮᎥ ᎠᎾᎥᎱᎾ
+ᎠᏮᎱᏮᏃᏣᏚᎾ. Ꭺ ᎢᎸᎠᏣᏚᏘ (ᏁᎪ1ᎢᏣᎥᏁ ᎢᎦᏁᎠᎥᏧᎸᎸ
+ᏚᎸ1(3 ᏚᎾᏣᎠᎢᏮ Ꮕ ᏟᎸᏣ ᎠᎱᎾᎬᏪᏌᎥᏣᎾᏚᎾ.
+Ꮳ ᏌᎠᏯᏙᏣᎠ
+ᎪᎶᎩ)1 Ꮾ6Ꭰ
diff --git a/tools/tesseract/test-data/tessdata.loc b/tools/tesseract/test-data/tessdata.loc
@@ -0,0 +1 @@
+test_tessdata	Default 4.1.0	4.1.0	${__HERE__}/tessdata
diff --git a/tools/tesseract/test-data/tessdata/chr.traineddata b/tools/tesseract/test-data/tessdata/chr.traineddata
diff --git a/tools/tesseract/test-data/test_image_cherokee.png b/tools/tesseract/test-data/test_image_cherokee.png
diff --git a/tools/tesseract/tool-data/tessdata.loc.sample b/tools/tesseract/tool-data/tessdata.loc.sample
@@ -0,0 +1,9 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a tessdata model folder. The tessdata.loc 
+#file needs this format (longer white space is the TAB character):
+
+#<unique_build_id>	<display_name>	<version>	<DB_folder_path>
+
+# for example:
+
+# tessdata	name 0.1	0.1	/data/tessdata
diff --git a/tools/tesseract/tool_data_table_conf.xml.sample b/tools/tesseract/tool_data_table_conf.xml.sample
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Location of tessdata files -->
+    <table name="tessdata" comment_char="#">
+        <columns>value, name, version, path</columns>
+        <file path="tool-data/tessdata.loc" />
+    </table>
+</tables>
diff --git a/tools/tesseract/tool_data_table_conf.xml.test b/tools/tesseract/tool_data_table_conf.xml.test
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Location of tessdata files -->
+    <table name="tessdata" comment_char="#">
+        <columns>value, name, version, path</columns>
+        <file path="${__HERE__}/test-data/tessdata.loc" />
+    </table>
+</tables>
-Original file line number
+Diff line change
@@ -0,0 +1,5 @@
+    the
+    quick
+    brown
+    fox
+    jumped
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		test_tessdata Default 4.1.0 4.1.0 ${__HERE__}/tessdata