From f4d72c1591c531189d81c8f89a59fa8f2ab527c5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Feb 2024 12:07:18 +0100 Subject: [PATCH 01/10] test: also set up logging system during tests --- .gitmodules | 3 +++ repo/assets | 1 + test/conftest.py | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) create mode 120000 repo/assets diff --git a/.gitmodules b/.gitmodules index cfc2d01..f9b32eb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "repo/tesseract"] path = repo/tesseract url = https://github.com/tesseract-ocr/tesseract +[submodule "repo/assets"] + path = repo/assets + url = https://github.com/OCR-D/assets diff --git a/repo/assets b/repo/assets new file mode 120000 index 0000000..c410393 --- /dev/null +++ b/repo/assets @@ -0,0 +1 @@ +/home/xbert/unsortiert/arbeit/heyer/daten/assets/ \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index 23c78bc..d2de7c6 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,5 +1,5 @@ from ocrd.resolver import Resolver -from ocrd_utils import pushd_popd +from ocrd_utils import pushd_popd, initLogging from pytest import fixture from test.assets import assets as assets @@ -9,11 +9,13 @@ @fixture def workspace_kant_binarized(): + initLogging() with pushd_popd(tempdir=True) as tempdir: yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir) @fixture def workspace_herold_small(): + initLogging() with pushd_popd(tempdir=True) as tempdir: yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir) From c1e54af9f13b0ce7ffda6334c9c6202cc15bb2f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Feb 2024 12:08:14 +0100 Subject: [PATCH 02/10] test: download files of data assets --- test/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index d2de7c6..54cba29 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -11,11 +11,11 @@ def workspace_kant_binarized(): initLogging() with pushd_popd(tempdir=True) as tempdir: - yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir) + yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir, download=True) @fixture def workspace_herold_small(): initLogging() with pushd_popd(tempdir=True) as tempdir: - yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir) + yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir, download=True) From 8c9426a2a618fb98ad45bbdc92e3015c7c38b115 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Feb 2024 12:16:11 +0100 Subject: [PATCH 03/10] =?UTF-8?q?explify=20dependencies=20install=20?= =?UTF-8?q?=E2=86=90=20install-tesserocr=20=E2=86=90=20install-tesseract?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index e34f4f0..4dd1b7a 100644 --- a/Makefile +++ b/Makefile @@ -36,8 +36,8 @@ help: @echo " install-tesseract Compile and install Tesseract" @echo " install-tesseract-training Compile and install training utilities for Tesseract" @echo " install-tesserocr Compile and install Tesserocr" - @echo " deps Install Python dependencies for install via pip" - @echo " install Install this package via pip" + @echo " deps Install Tesseract/Tesserocr and all Python dependencies" + @echo " install Install this package with all dependencies and download minimal models" @echo " deps-test Install Python deps for test via pip" @echo " test Run unit tests" @echo " coverage Run unit tests and determine test coverage" @@ -47,13 +47,16 @@ help: @echo " repo/tesseract Checkout Tesseract ./repo/tesseract" @echo " repo/tesserocr Checkout Tesserocr to ./repo/tesserocr" @echo " docker Build docker image" - @echo " assets-clean Remove symlinks in test/assets" + @echo " clean Remove temporary files" + @echo " clean-assets Remove only test/assets" + @echo " clean-tesseract Remove only build_tesseract" @echo "" @echo " Variables" @echo "" - @echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]" - @echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]" - @echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]" + @echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]" + @echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]" + @echo ' TESSERACT_CONFIG command line options for Tesseract `configure` [$(TESSERACT_CONFIG)]' + @echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]" # Dependencies for deployment in an Ubuntu/Debian Linux # (lib*-dev merely for building Tesseract and tesserocr from sources) @@ -85,8 +88,7 @@ deps-ubuntu: libarchive-dev # Install Python deps for install via pip -deps: - $(PIP) install -U pip +deps: install-tesserocr $(PIP) install -r requirements.txt # Install Python deps for test via pip @@ -100,7 +102,7 @@ docker: repo/tesseract repo/tesserocr --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG) . -install-tesserocr: repo/tesserocr +install-tesserocr: repo/tesserocr install-tesseract $(PIP) install ./$< install-tesseract: $(TESSERACT_PREFIX)/bin/tesseract @@ -149,18 +151,16 @@ coverage: coverage html # Test the command line tools -test-cli: test/assets - $(PIP) install -e . +test-cli: test/assets deps-test rm -rfv test/workspace cp -rv test/assets/kant_aufklaerung_1784 test/workspace - ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata - ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata cd test/workspace/data && \ ocrd-tesserocr-segment-region -l DEBUG -I OCR-D-IMG -O OCR-D-SEG-REGION && \ ocrd-tesserocr-segment-line -l DEBUG -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE && \ ocrd-tesserocr-recognize -l DEBUG -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR -P model deu .PHONY: test test-cli install deps deps-ubuntu deps-test help +.PHONY: install-tesseract install-tesserocr install-tesseract-training # # Assets From 58d1513bab76ff08df87877267696705c2ecea62 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Feb 2024 12:17:45 +0100 Subject: [PATCH 04/10] also install minimal needed models --- Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4dd1b7a..7acab42 100644 --- a/Makefile +++ b/Makefile @@ -94,6 +94,8 @@ deps: install-tesserocr # Install Python deps for test via pip deps-test: $(PIP) install -r requirements_test.txt + ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata + ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata # Build docker image docker: repo/tesseract repo/tesserocr @@ -133,7 +135,10 @@ repo/tesserocr repo/tesseract: # Install this package install: deps - $(PIP) install . + $(PIP) install $(PIP_OPTIONS) . + ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata + ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata + ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata # Run unit tests test: test/assets deps-test From 2643d9d61a7b9c0e414feb13623cc199381e4b87 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Feb 2024 12:18:09 +0100 Subject: [PATCH 05/10] make Tesseract build configurable --- Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 7acab42..aa34e99 100644 --- a/Makefile +++ b/Makefile @@ -118,13 +118,12 @@ $(TESSERACT_PREFIX)/bin/tesseract: build_tesseract/Makefile $(TESSERACT_PREFIX)/bin/lstmtraining: build_tesseract/Makefile $(MAKE) -C build_tesseract training-install +TESSERACT_CONFIG ?= --disable-openmp --disable-shared CXXFLAGS="-g -O2 -fPIC -fno-math-errno -Wall -Wextra -Wpedantic" build_tesseract/Makefile: repo/tesseract/Makefile.in mkdir -p $(@D) cd $(@D) && $(CURDIR)/repo/tesseract/configure \ --prefix=$(TESSERACT_PREFIX) \ - --disable-openmp \ - --disable-shared \ - 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC' + $(TESSERACT_CONFIG) repo/tesseract/Makefile.in: repo/tesseract cd $<; ./autogen.sh From 349dd36f24b8d5f750455f41fc2d282c25b48580 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Feb 2024 12:19:10 +0100 Subject: [PATCH 06/10] =?UTF-8?q?add=20repo/assets=20as=20proper=20submodu?= =?UTF-8?q?le,=20rename=20*-clean=20=E2=86=92=20clean-*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 - Makefile | 22 +++++++--------------- repo/assets | 2 +- 3 files changed, 8 insertions(+), 17 deletions(-) mode change 120000 => 160000 repo/assets diff --git a/.gitignore b/.gitignore index c84bd90..06c4feb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ venv3 __pycache__ *.pyc *.egg-info -repo/assets test/assets .tox /build diff --git a/Makefile b/Makefile index aa34e99..5e68e46 100644 --- a/Makefile +++ b/Makefile @@ -128,7 +128,7 @@ build_tesseract/Makefile: repo/tesseract/Makefile.in repo/tesseract/Makefile.in: repo/tesseract cd $<; ./autogen.sh -repo/tesserocr repo/tesseract: +repo/tesserocr repo/tesseract repo/assets: git submodule sync $@ git submodule update --init $@ @@ -176,21 +176,13 @@ test/assets: repo/assets mkdir -p $@ cp -r -t $@ repo/assets/data/* -# Clone OCR-D/assets to ./repo/assets -# FIXME does not work if already checked out -# FIXME should be a proper (VCed) submodule -repo/assets: - mkdir -p $(dir $@) - git clone https://github.com/OCR-D/assets "$@" - .PHONY: clean -clean: assets-clean tesseract-clean +clean: clean-assets clean-tesseract -tesseract-clean: - rm -rf $(CURDIR)/build_tesseract +clean-tesseract: + $(RM) -rf $(CURDIR)/build_tesseract cd repo/tesseract; make distclean -.PHONY: assets-clean -# Remove symlinks in test/assets -assets-clean: - rm -rf test/assets +.PHONY: clean-assets +clean-assets: + $(RM) -rf test/assets diff --git a/repo/assets b/repo/assets deleted file mode 120000 index c410393..0000000 --- a/repo/assets +++ /dev/null @@ -1 +0,0 @@ -/home/xbert/unsortiert/arbeit/heyer/daten/assets/ \ No newline at end of file diff --git a/repo/assets b/repo/assets new file mode 160000 index 0000000..05568aa --- /dev/null +++ b/repo/assets @@ -0,0 +1 @@ +Subproject commit 05568aaa2dc20678bf87ffec77f3baf2924d7c24 From f75161c15980187b901d993fcead58868cb608d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Feb 2024 12:19:42 +0100 Subject: [PATCH 07/10] CI: add make test --- .circleci/config.yml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 15da3fc..2dd45dc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -19,15 +19,11 @@ jobs: - checkout - run: git submodule sync && git submodule update --init - run: sudo make deps-ubuntu - - run: make install-tesseract - - run: make install-tesserocr - run: make install - - run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata - - run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata - - run: ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata - - run: ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata + - run: make deps-test + - run: make test - run: make test-cli - - run: make deps-test coverage + - run: make coverage - codecov/upload deploy-docker: From 42362473b77d5cff5243381a22d97782529aba90 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Feb 2024 12:55:41 +0100 Subject: [PATCH 08/10] simplify dockerfile --- Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index f1f0b31..bf1a204 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,15 +41,12 @@ COPY ocrd_tesserocr ./ocrd_tesserocr COPY repo/tesserocr ./repo/tesserocr COPY repo/tesseract ./repo/tesseract COPY Makefile . -RUN make deps-ubuntu deps install-tesseract install-tesseract-training install-tesserocr install \ +RUN make deps-ubuntu deps install install-tesseract-training \ && rm -rf /build \ && apt-get -y remove --auto-remove g++ libtesseract-dev make RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata -RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata -RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata -RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata # as discussed in ocrd_all#378, we do not want to manage more than one resource location # to mount for model persistence; From 651885c4c94e9dfa9c55a4574a27224e413d411e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Feb 2024 13:56:24 +0100 Subject: [PATCH 09/10] update/improve readme --- README.md | 93 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 49b6fab..7075135 100644 --- a/README.md +++ b/README.md @@ -33,50 +33,55 @@ To run with docker: docker run -v path/to/workspaces:/data ocrd/tesserocr ocrd-tesserocrd-crop ... -### From PyPI and PPA +### From PyPI and Tesseract provided by system -This is the best option if you want to use the stable, released version. +If your operating system / distribution already provides Tesseract 4.1 +or newer, then just install its development package: ---- + # on Debian / Ubuntu: + sudo apt install libtesseract-dev -**NOTE** - -ocrd_tesserocr requires **Tesseract >= 4.1.0**. The Tesseract packages -bundled with **Ubuntu < 19.10** are too old. If you are on Ubuntu 18.04 LTS, -please use [Alexander Pozdnyakov's PPA](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr) repository, +Otherwise, recent Tesseract packages for Ubuntu are available via PPA +[alex-p](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr-devel), which has up-to-date builds of Tesseract and its dependencies: -```sh -sudo add-apt-repository ppa:alex-p/tesseract-ocr -sudo apt-get update -``` ---- + # on Debian / Ubuntu + sudo add-apt-repository ppa:alex-p/tesseract-ocr + sudo apt-get update + sudo apt install libtesseract-dev + +Once Tesseract is available, just install ocrd_tesserocr from PyPI server: + + + pip install ocrd_tesserocr + +We strongly recommend setting up a +[venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) first. -```sh -sudo apt-get install python3 python3-pip libtesseract-dev libleptonica-dev tesseract-ocr wget -pip install ocrd_tesserocr -``` ### From git -Use this option if you want to change the source code or install the latest, unpublished changes. +Use this option if there is no suitable prebuilt version of Tesseract available +on your system, or you want to change the source code or install the latest, unpublished changes. + + git clone https://github.com/OCR-D/ocrd_tesserocr + cd ocrd_tesserocr + # install Tesseract: + sudo make deps-ubuntu # system dependencies just for the build + make deps + # install tesserocr and ocrd_tesserocr: + make install -We strongly recommend to use [venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). +We strongly recommend setting up a +[venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) first. -```sh -git clone https://github.com/OCR-D/ocrd_tesserocr -cd ocrd_tesserocr -# install Tesseract: -sudo make deps-ubuntu # or manually from git or via ocrd_all -# install tesserocr and ocrd_tesserocr: -make deps # or pip install -r requirements -make install # or pip install . -``` ## Models -Tesseract comes with synthetically trained models for languages (`tesseract-ocr-{eng,deu,frk,...}` or scripts (`tesseract-ocr-script-{latn,frak,...}`). In addition, various models [trained](https://github.com/tesseract-ocr/tesstrain) on scan data are available from the community. +Tesseract comes with synthetically trained models for languages (`tesseract-ocr-{eng,deu,frk,...}` +or scripts (`tesseract-ocr-script-{latn,frak,...}`). In addition, various models +[trained](https://github.com/tesseract-ocr/tesstrain) on scan data are available from the community. Since all OCR-D processors must resolve file/data resources in a [standardized way](https://ocr-d.de/en/spec/cli#processor-resources), @@ -89,6 +94,10 @@ The `module` location is determined by the underlying Tesseract installation Other resource locations (data/system/cwd) will be ignored, and should not be used when installing models with the **Resource Manager** (`ocrd resmgr download`). +To see the `module` resource location of your installation: + + ocrd-tesserocr-recognize -D + For a full description of available commands for resource management, see: ocrd resmgr --help @@ -96,18 +105,24 @@ For a full description of available commands for resource management, see: ocrd resmgr download --help ocrd resmgr list-installed --help -(In previous versions, the resource locations of standalone Tesseract and the OCR-D wrapper were different. - If you already have models under `$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize`, - usually `~/.local/share/ocrd-resources/ocrd-tesserocr-recognize`, then consider moving them - to the new default under `ocrd-tesserocr-recognize -D`, - usually `/usr/share/tesseract-ocr/4.00/tessdata`, _or_ alternatively overriding the module directory - by setting `TESSDATA_PREFIX=$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize` in the environment.) +> **Note**: +> (In previous versions, the resource locations of standalone Tesseract and the OCR-D wrapper were different. +> If you already have models under `$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize`, +> usually `~/.local/share/ocrd-resources/ocrd-tesserocr-recognize`, then consider moving them +> to the new default under `ocrd-tesserocr-recognize -D`, +> usually `/usr/share/tesseract-ocr/4.00/tessdata`, _or_ alternatively overriding the module directory +> by setting `TESSDATA_PREFIX=$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize` in the environment.) Cf. [OCR-D model guide](https://ocr-d.de/en/models). Models always use the filename suffix `.traineddata`, but are just loaded by their basename. You will need **at least** `eng` and `osd` installed (even for segmentation and deskewing), -probably also `Latin` and `Fraktur` etc. +probably also `Latin` and `Fraktur` etc. So to get minimal models, do: + + ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata + ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata + +(This will already be installed if using the Docker or git installation option.) As of v0.13.1, you can configure `ocrd-tesserocr-recognize` to select models **dynamically** segment by segment, either via custom conditions on the PAGE-XML annotation (presented as XPath rules), @@ -202,9 +217,9 @@ shrinking to the convex hull of all its symbol outlines. ## Testing -```sh -make test -``` + + make test + This downloads some test data from https://github.com/OCR-D/assets under `repo/assets`, and runs some basic test of the Python API as well as the CLIs. From d5724ce0f8441d6db6601690123b7bf9e99eaf7c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 5 Mar 2024 21:50:59 +0100 Subject: [PATCH 10/10] update tesseract/tesserocr to most recent --- repo/tesseract | 2 +- repo/tesserocr | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/repo/tesseract b/repo/tesseract index 8ee020e..bd3c1a2 160000 --- a/repo/tesseract +++ b/repo/tesseract @@ -1 +1 @@ -Subproject commit 8ee020e14cf5be4e3f0e9beb09b6b050a1871854 +Subproject commit bd3c1a2ae059547bb24f5cbd8950e496fec082d4 diff --git a/repo/tesserocr b/repo/tesserocr index 1f960e9..c4307f0 160000 --- a/repo/tesserocr +++ b/repo/tesserocr @@ -1 +1 @@ -Subproject commit 1f960e9e0714dcd5ebdcf86248269efb70ccca5b +Subproject commit c4307f0e499422c70e4684caf24e047eb75c2938