Skip to content

Commit 529d1f6

Browse files
authored
Chore: put tesseract multiple languages splitter "+" in constant (#2226)
^^^
1 parent ac30268 commit 529d1f6

File tree

3 files changed

+9
-5
lines changed

3 files changed

+9
-5
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* **Refactor pdfminer code.** The pdfminer code is moved from `unstructured-inference` to `unstructured`.
77
* **Improve handling of auth data for fsspec connectors.** Leverage an extension of the dataclass paradigm to support a `sensitive` annotation for fields related to auth (i.e. passwords, tokens). Refactor all fsspec connectors to use explicit access configs rather than a generic dictionary.
88
* **Add glob support for fsspec connectors.** Similar to the glob support in the ingest local source connector, glob filters are now enabled on all fsspec-based source connectors to limit the files being partitioned.
9+
* Define a constant for the "+" splitter used to join and split Tesseract OCR language codes.
910

1011
### Features
1112

unstructured/partition/lang.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from unstructured.documents.elements import Element
88
from unstructured.logger import logger
9+
from unstructured.partition.utils.constants import TESSERACT_LANGUAGES_SPLITTER
910

1011
# pytesseract.get_languages(config="") only shows user installed language packs,
1112
# so manually include the list of all currently supported Tesseract languages
@@ -160,7 +161,7 @@ def prepare_languages_for_tesseract(languages: Optional[List[str]] = ["eng"]):
160161
)
161162
return "eng"
162163

163-
return "+".join(converted_languages)
164+
return TESSERACT_LANGUAGES_SPLITTER.join(converted_languages)
164165

165166

166167
def check_languages(languages: Optional[List[str]], ocr_languages: Optional[str]):
@@ -196,7 +197,7 @@ def convert_old_ocr_languages_to_languages(ocr_languages: str):
196197
Assumption: ocr_languages is in tesseract plus sign format
197198
"""
198199

199-
return ocr_languages.split("+")
200+
return ocr_languages.split(TESSERACT_LANGUAGES_SPLITTER)
200201

201202

202203
def convert_language_to_tesseract(lang: str) -> str:
@@ -223,17 +224,17 @@ def convert_language_to_tesseract(lang: str) -> str:
223224
# try to match ISO 639-3 code
224225
if lang_iso639.part3 in pytesseract_langs_3:
225226
matched_langcodes = _get_all_tesseract_langcodes_with_prefix(lang_iso639.part3)
226-
return "+".join(matched_langcodes)
227+
return TESSERACT_LANGUAGES_SPLITTER.join(matched_langcodes)
227228

228229
# try to match ISO 639-2b
229230
elif lang_iso639.part2b in pytesseract_langs_3:
230231
matched_langcodes = _get_all_tesseract_langcodes_with_prefix(lang_iso639.part2b)
231-
return "+".join(matched_langcodes)
232+
return TESSERACT_LANGUAGES_SPLITTER.join(matched_langcodes)
232233

233234
# try to match ISO 639-2t
234235
elif lang_iso639.part2t in pytesseract_langs_3:
235236
matched_langcodes = _get_all_tesseract_langcodes_with_prefix(lang_iso639.part2t)
236-
return "+".join(matched_langcodes)
237+
return TESSERACT_LANGUAGES_SPLITTER.join(matched_langcodes)
237238

238239
else:
239240
logger.warning(f"{lang} is not a language supported by Tesseract.")

unstructured/partition/utils/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,5 @@ class PartitionStrategy:
3636

3737
# this field is defined by pytesseract/unstructured.pytesseract
3838
TESSERACT_TEXT_HEIGHT = "height"
39+
40+
TESSERACT_LANGUAGES_SPLITTER = "+"

0 commit comments

Comments
 (0)