From 31884c60db9d3c95855e250e895e79dba05bdc53 Mon Sep 17 00:00:00 2001 From: joschrew Date: Wed, 31 Jan 2024 15:22:10 +0100 Subject: [PATCH 01/13] Update dockerfile --- Dockerfile | 77 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index d818bc3..9190dfa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,65 @@ -FROM ocrd/core +FROM ocrd/core:v2.62.0 AS base +# set proper locales +ENV LANG C.UTF-8 +ENV LC_ALL C.UTF-8 + +# set TESSDATA_PREFIX +ENV TESSDATA_PREFIX /usr/local/share/tessdata + +ARG TESSERACT_REPOSITORY=https://github.com/tesseract-ocr/tesseract.git +ARG TESSERACT_REF=5.3.3 + +# set frontend non-interactive to silence interactive tzdata config +ARG DEBIAN_FRONTEND=noninteractive + + +# install common tools and tesseract build dependencies +# use provided leptonica +# tzdata required for proper timezone settings +RUN apt-get update && apt-get install -y \ + apt-utils \ + build-essential \ + g++ \ + git \ + libjpeg-dev \ + libgif-dev \ + libwebp-dev \ + libopenjp2-7-dev \ + libpng-dev \ + libtiff-dev \ + libtool \ + pkg-config \ + tzdata \ + xzgv \ + zlib1g-dev \ + libleptonica-dev \ + libpango1.0-dev \ + libicu-dev \ + autotools-dev \ + automake \ + libcurl4-nss-dev \ + libarchive-dev + +# set proper date and timezone in container +RUN echo "Europe/Berlin" > /etc/timezone +RUN ln -sf /usr/share/zoneinfo/Europe/Berlin /etc/localtime +RUN dpkg-reconfigure -f noninteractive tzdata + +# diagnostic output - check timezone settings +RUN cat /etc/timezone + +# clone and checkout desired tesseract version tag +RUN git clone ${TESSERACT_REPOSITORY} /build_tesseract +WORKDIR /build_tesseract +RUN git checkout ${TESSERACT_REF} + +# compile +RUN ./autogen.sh && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' +RUN LDFLAGS="-L/usr/local/lib" CFLAGS="-I/usr/local/include" make +RUN make +RUN make install && ldconfig + +# install ocrd-tesserocr (until here commands for installing tesseract-ocr) ARG VCS_REF ARG BUILD_DATE LABEL \ @@ -7,7 +68,6 @@ LABEL \ org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \ org.label-schema.build-date=$BUILD_DATE -ENV DEBIAN_FRONTEND noninteractive ENV PYTHONIOENCODING utf8 # avoid HOME/.local/share (hard to predict USER here) @@ -15,7 +75,7 @@ ENV PYTHONIOENCODING utf8 # (can still be overridden by derived stages) ENV XDG_DATA_HOME /usr/local/share -WORKDIR /build-ocrd +WORKDIR /build-ocrd_tesserocr COPY setup.py . COPY ocrd_tesserocr/ocrd-tool.json . COPY README.md . @@ -23,14 +83,15 @@ COPY requirements.txt . COPY requirements_test.txt . COPY ocrd_tesserocr ./ocrd_tesserocr COPY Makefile . -RUN make deps-ubuntu && \ - apt-get install -y --no-install-recommends \ - g++ \ +RUN apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ && make deps install \ - && rm -rf /build-ocrd \ + && rm -rf /build-ocrd_tesserocr \ && apt-get -y remove --auto-remove g++ libtesseract-dev make # PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root -RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p` +# next line causes failure because tesseract-ocr-eng not existing. Not sure if needed, so skipping +# RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p` RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata From 043b5c4711717564c7319b310d22fe0ee854caf0 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 5 Feb 2024 19:17:44 +0100 Subject: [PATCH 02/13] build tesseract and tesserocr --- .dockerignore | 2 ++ .gitignore | 1 + .gitmodules | 6 ++++++ Dockerfile | 42 +++++++++++++++--------------------------- Makefile | 34 +++++++++++++++++++++++++++++++++- repo/tesseract | 1 + repo/tesserocr | 1 + 7 files changed, 59 insertions(+), 28 deletions(-) create mode 100644 .gitmodules create mode 160000 repo/tesseract create mode 160000 repo/tesserocr diff --git a/.dockerignore b/.dockerignore index 27c70c6..b4a4830 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,6 +5,8 @@ !requirements_test.txt !LICENSE !README.md +!repo/tesserocr +!repo/tesseract # avoid .git and __pycache__ etc: !ocrd_tesserocr/**/*.py diff --git a/.gitignore b/.gitignore index 38b47c8..c84bd90 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ test-workspace /.coverage /htmlcov /.cache +build_tesseract diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..cfc2d01 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "repo/tesserocr"] + path = repo/tesserocr + url = https://github.com/sirfz/tesserocr/ +[submodule "repo/tesseract"] + path = repo/tesseract + url = https://github.com/tesseract-ocr/tesseract diff --git a/Dockerfile b/Dockerfile index 9190dfa..1204553 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,13 +2,20 @@ FROM ocrd/core:v2.62.0 AS base # set proper locales ENV LANG C.UTF-8 ENV LC_ALL C.UTF-8 +# install ocrd-tesserocr (until here commands for installing tesseract-ocr) +ARG VCS_REF +ARG BUILD_DATE +LABEL \ + maintainer="https://ocr-d.de/kontakt" \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \ + org.label-schema.build-date=$BUILD_DATE + +ENV PYTHONIOENCODING utf8 # set TESSDATA_PREFIX ENV TESSDATA_PREFIX /usr/local/share/tessdata -ARG TESSERACT_REPOSITORY=https://github.com/tesseract-ocr/tesseract.git -ARG TESSERACT_REF=5.3.3 - # set frontend non-interactive to silence interactive tzdata config ARG DEBIAN_FRONTEND=noninteractive @@ -46,29 +53,7 @@ RUN ln -sf /usr/share/zoneinfo/Europe/Berlin /etc/localtime RUN dpkg-reconfigure -f noninteractive tzdata # diagnostic output - check timezone settings -RUN cat /etc/timezone - -# clone and checkout desired tesseract version tag -RUN git clone ${TESSERACT_REPOSITORY} /build_tesseract -WORKDIR /build_tesseract -RUN git checkout ${TESSERACT_REF} - -# compile -RUN ./autogen.sh && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' -RUN LDFLAGS="-L/usr/local/lib" CFLAGS="-I/usr/local/include" make -RUN make -RUN make install && ldconfig - -# install ocrd-tesserocr (until here commands for installing tesseract-ocr) -ARG VCS_REF -ARG BUILD_DATE -LABEL \ - maintainer="https://ocr-d.de/kontakt" \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \ - org.label-schema.build-date=$BUILD_DATE - -ENV PYTHONIOENCODING utf8 +# RUN cat /etc/timezone # avoid HOME/.local/share (hard to predict USER here) # so let XDG_DATA_HOME coincide with fixed system location @@ -82,13 +67,16 @@ COPY README.md . COPY requirements.txt . COPY requirements_test.txt . COPY ocrd_tesserocr ./ocrd_tesserocr +COPY repo/tesserocr ./repo/tesserocr +COPY repo/tesseract ./repo/tesseract COPY Makefile . RUN apt-get install -y --no-install-recommends \ python3 \ python3-pip \ - && make deps install \ + && make deps install-tesseract install-tesserocr install \ && rm -rf /build-ocrd_tesserocr \ && apt-get -y remove --auto-remove g++ libtesseract-dev make + # PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root # next line causes failure because tesseract-ocr-eng not existing. Not sure if needed, so skipping # RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p` diff --git a/Makefile b/Makefile index e1648ae..40f3295 100644 --- a/Makefile +++ b/Makefile @@ -7,8 +7,20 @@ LOG_LEVEL = INFO PYTHONIOENCODING=utf8 LC_ALL = C.UTF-8 LANG = C.UTF-8 -export +ifdef VIRTUAL_ENV + TESSERACT_PREFIX = $(VIRTUAL_ENV) +else + TESSERACT_PREFIX = /usr/local +endif + +ifeq ($(PKG_CONFIG_PATH),) +PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig +else +PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig:$(PKG_CONFIG_PATH) +endif +export PKG_CONFIG_PATH +export # pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)' PYTEST_ARGS = @@ -85,6 +97,20 @@ docker: --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG) . +install-tesserocr: + cd repo/tesserocr; $(PIP) install . + +install-tesseract: + cd repo/tesseract; ./autogen.sh + mkdir -p $(CURDIR)/build_tesseract + cd $(CURDIR)/build_tesseract && $(CURDIR)/repo/tesseract/configure \ + --prefix=$(TESSERACT_PREFIX) \ + --disable-openmp \ + --disable-shared \ + 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC' ;\ + cd $(CURDIR)/build_tesseract && $(MAKE) install + if [[ "$(TESSERACT_PREFIX)" = "/usr"* ]];then ldconfig ;fi + # Install this package install: deps $(PIP) install . @@ -135,6 +161,12 @@ repo/assets: mkdir -p $(dir $@) git clone https://github.com/OCR-D/assets "$@" +.PHONY: clean +clean: assets-clean tesseract-clean + +tesseract-clean: + rm -rf $(CURDIR)/build_tesseract + cd repo/tesseract; make distclean .PHONY: assets-clean # Remove symlinks in test/assets diff --git a/repo/tesseract b/repo/tesseract new file mode 160000 index 0000000..8ee020e --- /dev/null +++ b/repo/tesseract @@ -0,0 +1 @@ +Subproject commit 8ee020e14cf5be4e3f0e9beb09b6b050a1871854 diff --git a/repo/tesserocr b/repo/tesserocr new file mode 160000 index 0000000..1f960e9 --- /dev/null +++ b/repo/tesserocr @@ -0,0 +1 @@ +Subproject commit 1f960e9e0714dcd5ebdcf86248269efb70ccca5b From 2c05dbcb407dcdb73f0b51083a4c38eaf4cdac13 Mon Sep 17 00:00:00 2001 From: joschrew Date: Thu, 8 Feb 2024 12:08:16 +0100 Subject: [PATCH 03/13] Dockerfile: add eng traineddata --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 1204553..33b0e0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -82,6 +82,7 @@ RUN apt-get install -y --no-install-recommends \ # RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p` RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata +RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata WORKDIR /data VOLUME /data From 42e2ba57ee4a17bc34f6355aa54e79b85dd97803 Mon Sep 17 00:00:00 2001 From: joschrew Date: Mon, 12 Feb 2024 10:09:07 +0100 Subject: [PATCH 04/13] Move deps install from Dockerfile to Makefile Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- Dockerfile | 39 +++++---------------------------------- Makefile | 32 ++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 46 deletions(-) diff --git a/Dockerfile b/Dockerfile index 33b0e0e..971c556 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,34 +19,6 @@ ENV TESSDATA_PREFIX /usr/local/share/tessdata # set frontend non-interactive to silence interactive tzdata config ARG DEBIAN_FRONTEND=noninteractive - -# install common tools and tesseract build dependencies -# use provided leptonica -# tzdata required for proper timezone settings -RUN apt-get update && apt-get install -y \ - apt-utils \ - build-essential \ - g++ \ - git \ - libjpeg-dev \ - libgif-dev \ - libwebp-dev \ - libopenjp2-7-dev \ - libpng-dev \ - libtiff-dev \ - libtool \ - pkg-config \ - tzdata \ - xzgv \ - zlib1g-dev \ - libleptonica-dev \ - libpango1.0-dev \ - libicu-dev \ - autotools-dev \ - automake \ - libcurl4-nss-dev \ - libarchive-dev - # set proper date and timezone in container RUN echo "Europe/Berlin" > /etc/timezone RUN ln -sf /usr/share/zoneinfo/Europe/Berlin /etc/localtime @@ -60,7 +32,7 @@ RUN dpkg-reconfigure -f noninteractive tzdata # (can still be overridden by derived stages) ENV XDG_DATA_HOME /usr/local/share -WORKDIR /build-ocrd_tesserocr +WORKDIR /build COPY setup.py . COPY ocrd_tesserocr/ocrd-tool.json . COPY README.md . @@ -70,11 +42,8 @@ COPY ocrd_tesserocr ./ocrd_tesserocr COPY repo/tesserocr ./repo/tesserocr COPY repo/tesseract ./repo/tesseract COPY Makefile . -RUN apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - && make deps install-tesseract install-tesserocr install \ - && rm -rf /build-ocrd_tesserocr \ +RUN make deps-ubuntu deps install-tesseract install-tesserocr install \ + && rm -rf /build \ && apt-get -y remove --auto-remove g++ libtesseract-dev make # PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root @@ -83,6 +52,8 @@ RUN apt-get install -y --no-install-recommends \ RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata +RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata +RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata WORKDIR /data VOLUME /data diff --git a/Makefile b/Makefile index 40f3295..8a22a0c 100644 --- a/Makefile +++ b/Makefile @@ -62,24 +62,32 @@ help: # Dependencies for deployment in an ubuntu/debian linux # (lib*-dev merely for building tesserocr with pip) -# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0, -# which is unsupported. Add the tesseract-ocr PPA -# from Alexander Pozdnyakov which provides 4.1.0. -# See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr -# for details.) deps-ubuntu: - apt-get install -y --no-install-recommends software-properties-common - -add-apt-repository -u -y ppa:alex-p/tesseract-ocr - apt-get install -y \ + apt-get update && apt-get install -y --no-install-recommends \ + apt-utils \ + build-essential \ g++ \ git \ python3 \ python3-pip \ - libtesseract-dev \ + libjpeg-dev \ + libgif-dev \ + libwebp-dev \ + libopenjp2-7-dev \ + libpng-dev \ + libtiff-dev \ + libtool \ + pkg-config \ + tzdata \ + xzgv \ + zlib1g-dev \ libleptonica-dev \ - tesseract-ocr-eng \ - tesseract-ocr-script-frak \ - tesseract-ocr + libpango1.0-dev \ + libicu-dev \ + autotools-dev \ + automake \ + libcurl4-nss-dev \ + libarchive-dev # Install Python deps for install via pip deps: From d194816f51f97d7498439e6bf344b64afc60545b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Feb 2024 10:32:46 +0100 Subject: [PATCH 05/13] CI: update (make install-tesseract) make deps-ubuntu no longer fetches Tesseract via PPA, so we need to make install-tesseract also, drop unsupported Python 3.6 --- .circleci/config.yml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5afab93..89bf3fa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,15 +15,10 @@ jobs: steps: - checkout - run: sudo make deps-ubuntu - - when: - condition: - equal: [ '3.6', << parameters.python-version >> ] - steps: - # speed-up build time for end-of-life Python by holding at latest binary: - - run: pip install --prefer-binary -U opencv-python-headless numpy + - run: make install-tesseract install-tesserocr - run: make install - # PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root - - run: sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p` + - run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata + - run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata - run: ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata - run: ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata - run: make test-cli @@ -36,4 +31,4 @@ workflows: - build-python: matrix: parameters: - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] + python-version: ['3.7', '3.8', '3.9', '3.10'] From c9f2f748eecbfd7a0f26f0a54d3f3cc9f5b50082 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Feb 2024 10:41:33 +0100 Subject: [PATCH 06/13] makefile: fix subrepo dependencies --- Makefile | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 8a22a0c..75745c1 100644 --- a/Makefile +++ b/Makefile @@ -105,20 +105,24 @@ docker: --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG) . -install-tesserocr: - cd repo/tesserocr; $(PIP) install . +install-tesserocr: repo/tesserocr + $(PIP) install ./$< -install-tesseract: - cd repo/tesseract; ./autogen.sh - mkdir -p $(CURDIR)/build_tesseract - cd $(CURDIR)/build_tesseract && $(CURDIR)/repo/tesseract/configure \ +install-tesseract: repo/tesseract + cd $<; ./autogen.sh + mkdir -p build_tesseract + cd build_tesseract && $(CURDIR)/repo/tesseract/configure \ --prefix=$(TESSERACT_PREFIX) \ --disable-openmp \ --disable-shared \ - 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC' ;\ - cd $(CURDIR)/build_tesseract && $(MAKE) install + 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC' && \ + $(MAKE) install if [[ "$(TESSERACT_PREFIX)" = "/usr"* ]];then ldconfig ;fi +repo/tesserocr repo/tesseract: + git submodule sync $@ + git submodule update --init $@ + # Install this package install: deps $(PIP) install . From 3bdfeb73f51afa51f298b9659c859c8394a051a1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Feb 2024 10:55:14 +0100 Subject: [PATCH 07/13] CI: checkout with submodules (since normal Circleci `checkout` creates empty submodule directories) --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 89bf3fa..71bed66 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,6 +14,7 @@ jobs: - image: cimg/python:<< parameters.python-version >> steps: - checkout + - run: git submodule sync && git submodule update --init - run: sudo make deps-ubuntu - run: make install-tesseract install-tesserocr - run: make install From 36486bb8f0fd18ca992066531aa3c717905b78a1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:10:10 +0100 Subject: [PATCH 08/13] CI: install-tesseract needs sudo --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 71bed66..7822c9f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -16,7 +16,8 @@ jobs: - checkout - run: git submodule sync && git submodule update --init - run: sudo make deps-ubuntu - - run: make install-tesseract install-tesserocr + - run: sudo make install-tesseract + - run: make install-tesserocr - run: make install - run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata - run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata From 55ade98e80304cae55a66914ee0a9475c4f8a316 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:33:43 +0100 Subject: [PATCH 09/13] CI: try unprivileged tessdata using VIRTUAL_ENV from PYENV_ROOT --- .circleci/config.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7822c9f..825e5d3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -12,11 +12,14 @@ jobs: type: string docker: - image: cimg/python:<< parameters.python-version >> + environment: + # cimg/python uses pyenv instead of venv + VIRTUAL_ENV: $PYENV_ROOT steps: - checkout - run: git submodule sync && git submodule update --init - run: sudo make deps-ubuntu - - run: sudo make install-tesseract + - run: make install-tesseract - run: make install-tesserocr - run: make install - run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata From 09686cd94683733ab6054ebbbacc129631b9f8e3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:38:52 +0100 Subject: [PATCH 10/13] CI: fix envvar syntax --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 825e5d3..be6f770 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,7 +14,7 @@ jobs: - image: cimg/python:<< parameters.python-version >> environment: # cimg/python uses pyenv instead of venv - VIRTUAL_ENV: $PYENV_ROOT + VIRTUAL_ENV: ${PYENV_ROOT} steps: - checkout - run: git submodule sync && git submodule update --init From dbfbea706f803b95327a0c266272ae075f07bd14 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Feb 2024 12:43:27 +0100 Subject: [PATCH 11/13] install-tesseract: make rules reentrant, add training utils --- Makefile | 72 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 75745c1..3589e85 100644 --- a/Makefile +++ b/Makefile @@ -28,40 +28,35 @@ PYTEST_ARGS = # Docker container tag DOCKER_TAG = 'ocrd/tesserocr' -# BEGIN-EVAL makefile-parser --make-help Makefile - help: @echo "" @echo " Targets" @echo "" - @echo " deps-ubuntu Dependencies for deployment in an ubuntu/debian linux" - @echo " (lib*-dev merely for building tesserocr with pip)" - @echo " (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0," - @echo " which is unsupported. Add the tesseract-ocr PPA" - @echo " from Alexander Pozdnyakov which provides 4.1.0." - @echo " See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr" - @echo " for details.)" - @echo " deps Install Python deps for install via pip" - @echo " deps-test Install Python deps for test via pip" - @echo " docker Build docker image" - @echo " install Install this package" - @echo " test Run unit tests" - @echo " coverage Run unit tests and determine test coverage" - @echo " test-cli Test the command line tools" - @echo " test/assets Setup test assets" - @echo " repo/assets Clone OCR-D/assets to ./repo/assets" - @echo " assets-clean Remove symlinks in test/assets" + @echo " deps-ubuntu Install system dependencies in an Ubuntu/Debian Linux" + @echo " install-tesseract Compile and install Tesseract" + @echo " install-tesseract-training Compile and install training utilities for Tesseract" + @echo " install-tesserocr Compile and install Tesserocr" + @echo " deps Install Python dependencies for install via pip" + @echo " install Install this package via pip" + @echo " deps-test Install Python deps for test via pip" + @echo " test Run unit tests" + @echo " coverage Run unit tests and determine test coverage" + @echo " test-cli Test the command line tools" + @echo " test/assets Setup test assets" + @echo " repo/assets Clone OCR-D/assets to ./repo/assets" + @echo " repo/tesseract Checkout Tesseract ./repo/tesseract" + @echo " repo/tesserocr Checkout Tesserocr to ./repo/tesserocr" + @echo " docker Build docker image" + @echo " assets-clean Remove symlinks in test/assets" @echo "" @echo " Variables" @echo "" - @echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)'" - @echo " DOCKER_TAG Docker container tag" - @echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default)" - -# END-EVAL + @echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]" + @echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]" + @echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]" -# Dependencies for deployment in an ubuntu/debian linux -# (lib*-dev merely for building tesserocr with pip) +# Dependencies for deployment in an Ubuntu/Debian Linux +# (lib*-dev merely for building Tesseract and tesserocr from sources) deps-ubuntu: apt-get update && apt-get install -y --no-install-recommends \ apt-utils \ @@ -108,16 +103,27 @@ docker: install-tesserocr: repo/tesserocr $(PIP) install ./$< -install-tesseract: repo/tesseract - cd $<; ./autogen.sh - mkdir -p build_tesseract - cd build_tesseract && $(CURDIR)/repo/tesseract/configure \ +install-tesseract: $(TESSERACT_PREFIX)/bin/tesseract + +install-tesseract-training: $(TESSERACT_PREFIX)/bin/lstmtraining + +$(TESSERACT_PREFIX)/bin/tesseract: build_tesseract/Makefile + $(MAKE) -C build_tesseract install + if [[ "$(TESSERACT_PREFIX)" = "/usr"* ]]; then ldconfig; fi + +$(TESSERACT_PREFIX)/bin/lstmtraining: build_tesseract/Makefile + $(MAKE) -C build_tesseract training-install + +build_tesseract/Makefile: repo/tesseract/Makefile.in + mkdir -p $(@D) + cd $(@D) && $(CURDIR)/repo/tesseract/configure \ --prefix=$(TESSERACT_PREFIX) \ --disable-openmp \ --disable-shared \ - 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC' && \ - $(MAKE) install - if [[ "$(TESSERACT_PREFIX)" = "/usr"* ]];then ldconfig ;fi + 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC' + +repo/tesseract/Makefile.in: repo/tesseract + cd $<; ./autogen.sh repo/tesserocr repo/tesseract: git submodule sync $@ From 21f2ad370f4ce1bf09906d9770d4ff80c9ddd015 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Feb 2024 12:59:35 +0100 Subject: [PATCH 12/13] docker: make tesseract-training, too and alias tessdata to /models for easier persistence --- Dockerfile | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 971c556..f1f0b31 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,9 +13,6 @@ LABEL \ ENV PYTHONIOENCODING utf8 -# set TESSDATA_PREFIX -ENV TESSDATA_PREFIX /usr/local/share/tessdata - # set frontend non-interactive to silence interactive tzdata config ARG DEBIAN_FRONTEND=noninteractive @@ -31,6 +28,8 @@ RUN dpkg-reconfigure -f noninteractive tzdata # so let XDG_DATA_HOME coincide with fixed system location # (can still be overridden by derived stages) ENV XDG_DATA_HOME /usr/local/share +ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources +ENV TESSDATA_PREFIX $XDG_DATA_HOME/tessdata WORKDIR /build COPY setup.py . @@ -42,18 +41,28 @@ COPY ocrd_tesserocr ./ocrd_tesserocr COPY repo/tesserocr ./repo/tesserocr COPY repo/tesseract ./repo/tesseract COPY Makefile . -RUN make deps-ubuntu deps install-tesseract install-tesserocr install \ +RUN make deps-ubuntu deps install-tesseract install-tesseract-training install-tesserocr install \ && rm -rf /build \ && apt-get -y remove --auto-remove g++ libtesseract-dev make -# PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root -# next line causes failure because tesseract-ocr-eng not existing. Not sure if needed, so skipping -# RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p` RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata +# as discussed in ocrd_all#378, we do not want to manage more than one resource location +# to mount for model persistence; +# with named volumes, the preinstalled models will be copied to the host and complemented +# by downloaded models; +# tessdata is the only problematic module location +RUN mkdir -p $XDG_CONFIG_HOME +RUN mv $TESSDATA_PREFIX $XDG_CONFIG_HOME/ocrd-tesserocr-recognize +RUN ln -s $XDG_CONFIG_HOME/ocrd-tesserocr-recognize $TESSDATA_PREFIX +# finally, alias/symlink all ocrd-resources to /models for shorter mount commands +RUN mv $XDG_CONFIG_HOME /models && ln -s /models $XDG_CONFIG_HOME + + +# finally, alias/symlink all ocrd-resources to /models for shorter mount commands WORKDIR /data VOLUME /data From dab081e1c27067e50787128cbd302c35b6693760 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Feb 2024 13:20:11 +0100 Subject: [PATCH 13/13] CI: add CD via Dockerhub --- .circleci/config.yml | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index be6f770..1b5dcc6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ orbs: jobs: - build-python: + test-python: parameters: python-version: type: string @@ -30,10 +30,32 @@ jobs: - run: make coverage - codecov/upload + deploy-docker: + docker: + - image: circleci/buildpack-deps:stretch + environment: + DOCKER_TAG: ocrd/tesserocr + steps: + - checkout + - setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/ + docker_layer_caching: true + - run: make docker DOCKER_TAG=$DOCKER_TAG + - run: + name: Login to Docker Hub + command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin + - run: docker push $DOCKER_TAG + + workflows: build: jobs: - - build-python: + - test-python: matrix: parameters: python-version: ['3.7', '3.8', '3.9', '3.10'] + deploy: + jobs: + - deploy-docker: + filters: + branches: + only: master