diff --git a/.circleci/config.yml b/.circleci/config.yml
index 5afab93..1b5dcc6 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -6,34 +6,56 @@ orbs:
 
 jobs:
 
-  build-python:
+  test-python:
     parameters:
       python-version:
         type: string
     docker:
       - image: cimg/python:<< parameters.python-version >>
+    environment:
+      # cimg/python uses pyenv instead of venv
+      VIRTUAL_ENV: ${PYENV_ROOT}
     steps:
       - checkout
+      - run: git submodule sync && git submodule update --init
       - run: sudo make deps-ubuntu
-      - when:
-          condition:
-            equal: [ '3.6', << parameters.python-version >> ]
-          steps:
-            # speed-up build time for end-of-life Python by holding at latest binary:
-            - run: pip install --prefer-binary -U opencv-python-headless numpy
+      - run: make install-tesseract
+      - run: make install-tesserocr
       - run: make install
-      # PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root
-      - run: sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`
+      - run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
+      - run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
       - run: ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
       - run: ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
       - run: make test-cli
       - run: make coverage
       - codecov/upload
 
+  deploy-docker:
+    docker:
+      - image: circleci/buildpack-deps:stretch
+    environment:
+      DOCKER_TAG: ocrd/tesserocr
+    steps:
+      - checkout
+      - setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/
+          docker_layer_caching: true
+      - run: make docker DOCKER_TAG=$DOCKER_TAG
+      - run:
+          name: Login to Docker Hub
+          command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin
+      - run: docker push $DOCKER_TAG
+
+
 workflows:
   build:
     jobs:
-      - build-python:
+      - test-python:
           matrix:
             parameters:
-              python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
+              python-version: ['3.7', '3.8', '3.9', '3.10']
+  deploy:
+    jobs:
+      - deploy-docker:
+          filters:
+            branches:
+              only: master
diff --git a/.dockerignore b/.dockerignore
index 27c70c6..b4a4830 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -5,6 +5,8 @@
 !requirements_test.txt
 !LICENSE
 !README.md
+!repo/tesserocr
+!repo/tesseract
 # avoid .git and __pycache__ etc:
 !ocrd_tesserocr/**/*.py
 
diff --git a/.gitignore b/.gitignore
index 38b47c8..c84bd90 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@ test-workspace
 /.coverage
 /htmlcov
 /.cache
+build_tesseract
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..cfc2d01
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "repo/tesserocr"]
+	path = repo/tesserocr
+	url = https://github.com/sirfz/tesserocr/
+[submodule "repo/tesseract"]
+	path = repo/tesseract
+	url = https://github.com/tesseract-ocr/tesseract
diff --git a/Dockerfile b/Dockerfile
index d818bc3..f1f0b31 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,8 @@
-FROM ocrd/core
+FROM ocrd/core:v2.62.0 AS base
+# set proper locales
+ENV LANG C.UTF-8
+ENV LC_ALL C.UTF-8
+# install ocrd-tesserocr (until here commands for installing tesseract-ocr)
 ARG VCS_REF
 ARG BUILD_DATE
 LABEL \
@@ -7,32 +11,56 @@ LABEL \
     org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \
     org.label-schema.build-date=$BUILD_DATE
 
-ENV DEBIAN_FRONTEND noninteractive
 ENV PYTHONIOENCODING utf8
 
+# set frontend non-interactive to silence interactive tzdata config
+ARG DEBIAN_FRONTEND=noninteractive
+
+# set proper date and timezone in container
+RUN echo "Europe/Berlin" > /etc/timezone
+RUN ln -sf /usr/share/zoneinfo/Europe/Berlin /etc/localtime
+RUN dpkg-reconfigure -f noninteractive tzdata
+
+# diagnostic output - check timezone settings
+# RUN cat /etc/timezone
+
 # avoid HOME/.local/share (hard to predict USER here)
 # so let XDG_DATA_HOME coincide with fixed system location
 # (can still be overridden by derived stages)
 ENV XDG_DATA_HOME /usr/local/share
+ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
+ENV TESSDATA_PREFIX $XDG_DATA_HOME/tessdata
 
-WORKDIR /build-ocrd
+WORKDIR /build
 COPY setup.py .
 COPY ocrd_tesserocr/ocrd-tool.json .
 COPY README.md .
 COPY requirements.txt .
 COPY requirements_test.txt .
 COPY ocrd_tesserocr ./ocrd_tesserocr
+COPY repo/tesserocr ./repo/tesserocr
+COPY repo/tesseract ./repo/tesseract
 COPY Makefile .
-RUN make deps-ubuntu && \
-    apt-get install -y --no-install-recommends \
-    g++ \
-    && make deps install \
-    && rm -rf /build-ocrd \
+RUN make deps-ubuntu deps install-tesseract install-tesseract-training install-tesserocr install \
+    && rm -rf /build \
     && apt-get -y remove --auto-remove g++ libtesseract-dev make
-# PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root
-RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`
+
 RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
 RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
+RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
+RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata
+RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
+
+# as discussed in ocrd_all#378, we do not want to manage more than one resource location
+# to mount for model persistence;
+# with named volumes, the preinstalled models will be copied to the host and complemented
+# by downloaded models;
+# tessdata is the only problematic module location
+RUN mkdir -p $XDG_CONFIG_HOME
+RUN mv $TESSDATA_PREFIX $XDG_CONFIG_HOME/ocrd-tesserocr-recognize
+RUN ln -s $XDG_CONFIG_HOME/ocrd-tesserocr-recognize $TESSDATA_PREFIX
+# finally, alias/symlink all ocrd-resources to /models for shorter mount commands
+RUN mv $XDG_CONFIG_HOME /models && ln -s /models $XDG_CONFIG_HOME
 
 WORKDIR /data
 VOLUME /data
diff --git a/Makefile b/Makefile
index e1648ae..3589e85 100644
--- a/Makefile
+++ b/Makefile
@@ -7,8 +7,20 @@ LOG_LEVEL = INFO
 PYTHONIOENCODING=utf8
 LC_ALL = C.UTF-8
 LANG = C.UTF-8
-export
+ifdef VIRTUAL_ENV
+  TESSERACT_PREFIX = $(VIRTUAL_ENV)
+else
+  TESSERACT_PREFIX = /usr/local
+endif
+
+ifeq ($(PKG_CONFIG_PATH),)
+PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig
+else
+PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig:$(PKG_CONFIG_PATH)
+endif
+export PKG_CONFIG_PATH
 
+export
 # pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)'
 PYTEST_ARGS =
 
@@ -16,58 +28,61 @@ PYTEST_ARGS =
 # Docker container tag
 DOCKER_TAG = 'ocrd/tesserocr'
 
-# BEGIN-EVAL makefile-parser --make-help Makefile
-
 help:
 	@echo ""
 	@echo " Targets"
 	@echo ""
-	@echo " deps-ubuntu Dependencies for deployment in an ubuntu/debian linux"
-	@echo " (lib*-dev merely for building tesserocr with pip)"
-	@echo " (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,"
-	@echo " which is unsupported. Add the tesseract-ocr PPA"
-	@echo " from Alexander Pozdnyakov which provides 4.1.0."
-	@echo " See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr"
-	@echo " for details.)"
-	@echo " deps Install Python deps for install via pip"
-	@echo " deps-test Install Python deps for test via pip"
-	@echo " docker Build docker image"
-	@echo " install Install this package"
-	@echo " test Run unit tests"
-	@echo " coverage Run unit tests and determine test coverage"
-	@echo " test-cli Test the command line tools"
-	@echo " test/assets Setup test assets"
-	@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
-	@echo " assets-clean Remove symlinks in test/assets"
+	@echo " deps-ubuntu Install system dependencies in an Ubuntu/Debian Linux"
+	@echo " install-tesseract Compile and install Tesseract"
+	@echo " install-tesseract-training Compile and install training utilities for Tesseract"
+	@echo " install-tesserocr Compile and install Tesserocr"
+	@echo " deps Install Python dependencies for install via pip"
+	@echo " install Install this package via pip"
+	@echo " deps-test Install Python deps for test via pip"
+	@echo " test Run unit tests"
+	@echo " coverage Run unit tests and determine test coverage"
+	@echo " test-cli Test the command line tools"
+	@echo " test/assets Setup test assets"
+	@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
+	@echo " repo/tesseract Checkout Tesseract to ./repo/tesseract"
+	@echo " repo/tesserocr Checkout Tesserocr to ./repo/tesserocr"
+	@echo " docker Build docker image"
+	@echo " assets-clean Remove symlinks in test/assets"
 	@echo ""
 	@echo " Variables"
 	@echo ""
-	@echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)'"
-	@echo " DOCKER_TAG Docker container tag"
-	@echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default)"
-
-# END-EVAL
-
-# Dependencies for deployment in an ubuntu/debian linux
-# (lib*-dev merely for building tesserocr with pip)
-# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,
-#  which is unsupported. Add the tesseract-ocr PPA
-#  from Alexander Pozdnyakov which provides 4.1.0.
-#  See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr
-#  for details.)
+	@echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]"
+	@echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]"
+	@echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]"
+
+# Dependencies for deployment in an Ubuntu/Debian Linux
+# (lib*-dev merely for building Tesseract and tesserocr from sources)
 deps-ubuntu:
-	apt-get install -y --no-install-recommends software-properties-common
-	-add-apt-repository -u -y ppa:alex-p/tesseract-ocr
-	apt-get install -y \
+	apt-get update && apt-get install -y --no-install-recommends \
+		apt-utils \
+		build-essential \
 		g++ \
 		git \
 		python3 \
 		python3-pip \
-		libtesseract-dev \
+		libjpeg-dev \
+		libgif-dev \
+		libwebp-dev \
+		libopenjp2-7-dev \
+		libpng-dev \
+		libtiff-dev \
+		libtool \
+		pkg-config \
+		tzdata \
+		xzgv \
+		zlib1g-dev \
 		libleptonica-dev \
-		tesseract-ocr-eng \
-		tesseract-ocr-script-frak \
-		tesseract-ocr
+		libpango1.0-dev \
+		libicu-dev \
+		autotools-dev \
+		automake \
+		libcurl4-nss-dev \
+		libarchive-dev
 
 # Install Python deps for install via pip
 deps:
@@ -85,6 +100,46 @@ docker:
 	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
 	-t $(DOCKER_TAG) .
 
+.PHONY: install-tesserocr install-tesseract install-tesseract-training
+
+# Compile and install Tesserocr
+install-tesserocr: repo/tesserocr
+	$(PIP) install ./$<
+
+# Compile and install Tesseract
+install-tesseract: $(TESSERACT_PREFIX)/bin/tesseract
+
+# Compile and install training utilities for Tesseract
+install-tesseract-training: $(TESSERACT_PREFIX)/bin/lstmtraining
+
+# Build and install the Tesseract binary; run ldconfig when the prefix starts with /usr
+# (POSIX "case" instead of "[[ ]]", so the recipe does not depend on SHELL being bash)
+$(TESSERACT_PREFIX)/bin/tesseract: build_tesseract/Makefile
+	$(MAKE) -C build_tesseract install
+	case "$(TESSERACT_PREFIX)" in /usr*) ldconfig ;; esac
+
+$(TESSERACT_PREFIX)/bin/lstmtraining: build_tesseract/Makefile
+	$(MAKE) -C build_tesseract training-install
+
+# Configure Tesseract out-of-tree in build_tesseract (no shared libraries, no OpenMP)
+build_tesseract/Makefile: repo/tesseract/Makefile.in
+	mkdir -p $(@D)
+	cd $(@D) && $(CURDIR)/repo/tesseract/configure \
+		--prefix=$(TESSERACT_PREFIX) \
+		--disable-openmp \
+		--disable-shared \
+		'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC'
+
+# Run autogen.sh in the checkout; the checkout is an order-only prerequisite,
+# so a newer directory timestamp alone does not re-trigger autogen
+repo/tesseract/Makefile.in: | repo/tesseract
+	cd $(@D) && ./autogen.sh
+
+# Checkout the Tesserocr and Tesseract submodules
+repo/tesserocr repo/tesseract:
+	git submodule sync $@
+	git submodule update --init $@
+
 # Install this package
 install: deps
 	$(PIP) install .
@@ -135,6 +190,15 @@ repo/assets:
 	mkdir -p $(dir $@)
 	git clone https://github.com/OCR-D/assets "$@"
 
+.PHONY: clean tesseract-clean
+# Remove symlinks in test/assets and the Tesseract build tree
+clean: assets-clean tesseract-clean
+
+# Remove the out-of-tree build; distclean in the source tree is best-effort (hence "-"),
+# because this Makefile runs configure in build_tesseract, not in repo/tesseract
+tesseract-clean:
+	rm -rf $(CURDIR)/build_tesseract
+	-$(MAKE) -C repo/tesseract distclean
 
 .PHONY: assets-clean
 # Remove symlinks in test/assets
diff --git a/repo/tesseract b/repo/tesseract
new file mode 160000
index 0000000..8ee020e
--- /dev/null
+++ b/repo/tesseract
@@ -0,0 +1 @@
+Subproject commit 8ee020e14cf5be4e3f0e9beb09b6b050a1871854
diff --git a/repo/tesserocr b/repo/tesserocr
new file mode 160000
index 0000000..1f960e9
--- /dev/null
+++ b/repo/tesserocr
@@ -0,0 +1 @@
+Subproject commit 1f960e9e0714dcd5ebdcf86248269efb70ccca5b