qurator-spk · kba · Oct 10, 2025 · Aug 19, 2025 · Aug 19, 2025 · Sep 19, 2025
diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml
@@ -24,24 +24,39 @@ jobs:
         sudo rm -rf "$AGENT_TOOLSDIRECTORY"
         df -h
     - uses: actions/checkout@v4
-    - uses: actions/cache@v4
+    - uses: actions/cache/restore@v4
       id: seg_model_cache
       with:
         path: models_layout_v0_5_0
-        key: ${{ runner.os }}-models
-    - uses: actions/cache@v4
+        key: seg-models
+    - uses: actions/cache/restore@v4
       id: ocr_model_cache
       with:
-        path: models_ocr_v0_5_0
-        key: ${{ runner.os }}-models
-    - uses: actions/cache@v4
+        path: models_ocr_v0_5_1
+        key: ocr-models
+    - uses: actions/cache/restore@v4
       id: bin_model_cache
       with:
         path: default-2021-03-09
-        key: ${{ runner.os }}-modelbin
+        key: bin-models
     - name: Download models
-      if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true' || steps.ocr_model_cache.outputs.cache-hit != true
+      if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true' || steps.ocr_model_cache.outputs.cache-hit != 'true'
       run: make models
+    - uses: actions/cache/save@v4
+      if: steps.seg_model_cache.outputs.cache-hit != 'true'
+      with:
+        path: models_layout_v0_5_0
+        key: seg-models
+    - uses: actions/cache/save@v4
+      if: steps.ocr_model_cache.outputs.cache-hit != 'true'
+      with:
+        path: models_ocr_v0_5_1
+        key: ocr-models
+    - uses: actions/cache/save@v4
+      if: steps.bin_model_cache.outputs.cache-hit != 'true'
+      with:
+        path: default-2021-03-09
+        key: bin-models
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:

diff --git a/.gitignore b/.gitignore
@@ -2,7 +2,11 @@
 __pycache__
 sbb_newspapers_org_image/pylint.log
 models_eynollah*
+models_ocr*
+models_layout*
+default-2021-03-09
 output.html
 /build
 /dist
 *.tif
+TAGS
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,33 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+Fixed:
+
+ * :fire: polygons: avoid invalid paths (use `Polygon.buffer()` instead of dilation etc.)
+ * `return_boxes_of_images_by_order_of_reading_new`: avoid Numpy.dtype mismatch, simplify
+ * `return_boxes_of_images_by_order_of_reading_new`: log any exceptions instead of ignoring
+ * `filter_contours_without_textline_inside`: avoid removing from duplicate lists twice
+ * `get_marginals`: exit early if no peaks found to avoid spurious overlap mask
+ * `get_smallest_skew`: after shifting search range of rotation angle, use overall best result
+ * Dockerfile: fix CUDA installation (cuDNN contested between Torch and TF due to extra OCR)
+ * OCR: re-instate missing methods and fix `utils_ocr` function calls
+ * :fire: writer: `SeparatorRegion` needs `SeparatorRegionType` (not `ImageRegionType`)
+   (commit f458e3e)
+ * tests: switch from `pytest-subtests` to `parametrize` so we can use `pytest-isolate`
+   (so CUDA memory gets freed between tests if running on GPU)
+
+Changed:
+
+ * polygons: slightly widen for regions and lines, increase for separators
+ * various refactorings, some code style and identifier improvements
+ * deskewing/multiprocessing: switch back to ProcessPoolExecutor (faster),
+   but use shared memory if necessary, and switch back from `loky` to stdlib,
+   and shut down in `del()` instead of `atexit`
+ * :fire: OCR: switch CNN-RNN model to `20250930` version compatible with TF 2.12 on CPU, too
+ * :fire: writer: use `@type='heading'` instead of `'header'` for headings
+ * CI: update+improve model caching
+
+
 ## [0.5.0] - 2025-09-26
 
 Fixed:

diff --git a/Dockerfile b/Dockerfile
@@ -40,6 +40,8 @@ RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename
 RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json
 # install everything and reduce image size
 RUN make install EXTRAS=OCR && rm -rf /build/eynollah
+# fixup for broken cuDNN installation (Torch pulls in 8.5.0, which is incompatible with Tensorflow)
+RUN pip install nvidia-cudnn-cu11==8.6.0.163
 # smoke test
 RUN eynollah --help
 

diff --git a/Makefile b/Makefile
@@ -13,12 +13,18 @@ DOCKER ?= docker
 #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
 #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
 SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1
+SEG_MODELFILE = $(notdir $(patsubst %?download=1,%,$(SEG_MODEL)))
+SEG_MODELNAME = $(SEG_MODELFILE:%.tar.gz=%)
 
 BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip
+BIN_MODELFILE = $(notdir $(BIN_MODEL))
+BIN_MODELNAME := default-2021-03-09
 
-OCR_MODEL := https://zenodo.org/records/17194824/files/models_ocr_v0_5_0.tar.gz?download=1
+OCR_MODEL := https://zenodo.org/records/17236998/files/models_ocr_v0_5_1.tar.gz?download=1
+OCR_MODELFILE = $(notdir $(patsubst %?download=1,%,$(OCR_MODEL)))
+OCR_MODELNAME = $(OCR_MODELFILE:%.tar.gz=%)
 
-PYTEST_ARGS ?= -vv
+PYTEST_ARGS ?= -vv --isolate
 
 # BEGIN-EVAL makefile-parser --make-help Makefile
 
@@ -31,7 +37,8 @@ help:
 	@echo "    install      Install package with pip"
 	@echo "    install-dev  Install editable with pip"
 	@echo "    deps-test    Install test dependencies with pip"
-	@echo "    models       Download and extract models to $(CURDIR)/models_layout_v0_5_0"
+	@echo "    models       Download and extract models to $(CURDIR):"
+	@echo "                 $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)"
 	@echo "    smoke-test   Run simple CLI check"
 	@echo "    ocrd-test    Run OCR-D CLI check"
 	@echo "    test         Run unit tests"
@@ -42,33 +49,29 @@ help:
 	@echo "    PYTEST_ARGS  pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]"
 	@echo "    SEG_MODEL    URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]"
 	@echo "    BIN_MODEL    URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]"
+	@echo "    OCR_MODEL    URL of 'models' archive to download for OCR 'test' [$(OCR_MODEL)]"
 	@echo ""
 
 # END-EVAL
 
 
-# Download and extract models to $(PWD)/models_layout_v0_5_0
+# Download and extract all models (binarization, segmentation, OCR) to $(CURDIR)
-models: models_layout_v0_5_0 models_ocr_v0_5_0 default-2021-03-09
+models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)
 
-models_layout_v0_5_0: models_layout_v0_5_0.tar.gz
-	tar zxf models_layout_v0_5_0.tar.gz
-
-models_layout_v0_5_0.tar.gz:
+$(BIN_MODELFILE):
+	wget -O $@ $(BIN_MODEL)
+$(SEG_MODELFILE):
 	wget -O $@ $(SEG_MODEL)
-
-models_ocr_v0_5_0: models_ocr_v0_5_0.tar.gz
-	tar zxf models_ocr_v0_5_0.tar.gz
-
-models_ocr_v0_5_0.tar.gz:
+$(OCR_MODELFILE):
 	wget -O $@ $(OCR_MODEL)
 
-default-2021-03-09: $(notdir $(BIN_MODEL))
-	unzip $(notdir $(BIN_MODEL))
-	mkdir $@
-	mv $(basename $(notdir $(BIN_MODEL))) $@
-
-$(notdir $(BIN_MODEL)):
-	wget $(BIN_MODEL)
+$(BIN_MODELNAME): $(BIN_MODELFILE)
+	mkdir -p $@
+	unzip -o -d $@ $<
+$(SEG_MODELNAME): $(SEG_MODELFILE)
+	tar zxf $<
+$(OCR_MODELNAME): $(OCR_MODELFILE)
+	tar zxf $<
 
 build:
 	$(PIP) install build
@@ -82,7 +85,10 @@ install:
 install-dev:
 	$(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)])
 
-deps-test: models_layout_v0_5_0
+ifeq (OCR,$(findstring OCR, $(EXTRAS)))
+deps-test: $(OCR_MODELNAME)
+endif
+deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME)
 	$(PIP) install -r requirements-test.txt
 
 smoke-test: TMPDIR != mktemp -d
@@ -123,9 +129,9 @@ ocrd-test: tests/resources/kant_aufklaerung_1784_0020.tif
 	$(RM) -r $(TMPDIR)
 
 # Run unit tests
-test: export MODELS_LAYOUT=$(CURDIR)/models_layout_v0_5_0
-test: export MODELS_OCR=$(CURDIR)/models_ocr_v0_5_0
-test: export MODELS_BIN=$(CURDIR)/default-2021-03-09
+test: export MODELS_LAYOUT=$(CURDIR)/$(SEG_MODELNAME)
+test: export MODELS_OCR=$(CURDIR)/$(OCR_MODELNAME)
+test: export MODELS_BIN=$(CURDIR)/$(BIN_MODELNAME)
 test:
 	$(PYTHON) -m pytest tests --durations=0 --continue-on-collection-errors $(PYTEST_ARGS)
 

diff --git a/requirements-test.txt b/requirements-test.txt
@@ -1,4 +1,4 @@
 pytest
-pytest-subtests
+pytest-isolate
 coverage[toml]
 black
diff --git a/requirements.txt b/requirements.txt
@@ -5,5 +5,4 @@ scikit-learn >= 0.23.2
 tensorflow < 2.13
 numba <= 0.58.1
 scikit-image
-loky
 biopython