diff --git a/.circleci/config.yml b/.circleci/config.yml index 15da3fc..2dd45dc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -19,15 +19,11 @@ jobs: - checkout - run: git submodule sync && git submodule update --init - run: sudo make deps-ubuntu - - run: make install-tesseract - - run: make install-tesserocr - run: make install - - run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata - - run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata - - run: ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata - - run: ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata + - run: make deps-test + - run: make test - run: make test-cli - - run: make deps-test coverage + - run: make coverage - codecov/upload deploy-docker: diff --git a/.gitignore b/.gitignore index c84bd90..06c4feb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ venv3 __pycache__ *.pyc *.egg-info -repo/assets test/assets .tox /build diff --git a/.gitmodules b/.gitmodules index cfc2d01..f9b32eb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "repo/tesseract"] path = repo/tesseract url = https://github.com/tesseract-ocr/tesseract +[submodule "repo/assets"] + path = repo/assets + url = https://github.com/OCR-D/assets diff --git a/Dockerfile b/Dockerfile index f1f0b31..bf1a204 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,15 +41,12 @@ COPY ocrd_tesserocr ./ocrd_tesserocr COPY repo/tesserocr ./repo/tesserocr COPY repo/tesseract ./repo/tesseract COPY Makefile . 
-RUN make deps-ubuntu deps install-tesseract install-tesseract-training install-tesserocr install \ +RUN make deps-ubuntu deps install install-tesseract-training \ && rm -rf /build \ && apt-get -y remove --auto-remove g++ libtesseract-dev make RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata -RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata -RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata -RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata # as discussed in ocrd_all#378, we do not want to manage more than one resource location # to mount for model persistence; diff --git a/Makefile b/Makefile index e34f4f0..5e68e46 100644 --- a/Makefile +++ b/Makefile @@ -36,8 +36,8 @@ help: @echo " install-tesseract Compile and install Tesseract" @echo " install-tesseract-training Compile and install training utilities for Tesseract" @echo " install-tesserocr Compile and install Tesserocr" - @echo " deps Install Python dependencies for install via pip" - @echo " install Install this package via pip" + @echo " deps Install Tesseract/Tesserocr and all Python dependencies" + @echo " install Install this package with all dependencies and download minimal models" @echo " deps-test Install Python deps for test via pip" @echo " test Run unit tests" @echo " coverage Run unit tests and determine test coverage" @@ -47,13 +47,16 @@ help: @echo " repo/tesseract Checkout Tesseract ./repo/tesseract" @echo " repo/tesserocr Checkout Tesserocr to ./repo/tesserocr" @echo " docker Build docker image" - @echo " assets-clean Remove symlinks in test/assets" + @echo " clean Remove temporary files" + @echo " clean-assets Remove only test/assets" + @echo " clean-tesseract Remove only build_tesseract" @echo "" @echo " Variables" @echo "" - @echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. 
[$(PYTEST_ARGS)]" - @echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]" - @echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]" + @echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]" + @echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]" + @echo ' TESSERACT_CONFIG command line options for Tesseract `configure` [$(TESSERACT_CONFIG)]' + @echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]" # Dependencies for deployment in an Ubuntu/Debian Linux # (lib*-dev merely for building Tesseract and tesserocr from sources) @@ -85,13 +88,14 @@ deps-ubuntu: libarchive-dev # Install Python deps for install via pip -deps: - $(PIP) install -U pip +deps: install-tesserocr $(PIP) install -r requirements.txt # Install Python deps for test via pip deps-test: $(PIP) install -r requirements_test.txt + ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata + ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata # Build docker image docker: repo/tesseract repo/tesserocr @@ -100,7 +104,7 @@ docker: repo/tesseract repo/tesserocr --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG) . 
-install-tesserocr: repo/tesserocr +install-tesserocr: repo/tesserocr install-tesseract $(PIP) install ./$< install-tesseract: $(TESSERACT_PREFIX)/bin/tesseract @@ -114,24 +118,26 @@ $(TESSERACT_PREFIX)/bin/tesseract: build_tesseract/Makefile $(TESSERACT_PREFIX)/bin/lstmtraining: build_tesseract/Makefile $(MAKE) -C build_tesseract training-install +TESSERACT_CONFIG ?= --disable-openmp --disable-shared CXXFLAGS="-g -O2 -fPIC -fno-math-errno -Wall -Wextra -Wpedantic" build_tesseract/Makefile: repo/tesseract/Makefile.in mkdir -p $(@D) cd $(@D) && $(CURDIR)/repo/tesseract/configure \ --prefix=$(TESSERACT_PREFIX) \ - --disable-openmp \ - --disable-shared \ - 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC' + $(TESSERACT_CONFIG) repo/tesseract/Makefile.in: repo/tesseract cd $<; ./autogen.sh -repo/tesserocr repo/tesseract: +repo/tesserocr repo/tesseract repo/assets: git submodule sync $@ git submodule update --init $@ # Install this package install: deps - $(PIP) install . + $(PIP) install $(PIP_OPTIONS) . + ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata + ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata + ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata # Run unit tests test: test/assets deps-test @@ -149,18 +155,16 @@ coverage: coverage html # Test the command line tools -test-cli: test/assets - $(PIP) install -e . 
+test-cli: test/assets deps-test rm -rfv test/workspace cp -rv test/assets/kant_aufklaerung_1784 test/workspace - ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata - ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata cd test/workspace/data && \ ocrd-tesserocr-segment-region -l DEBUG -I OCR-D-IMG -O OCR-D-SEG-REGION && \ ocrd-tesserocr-segment-line -l DEBUG -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE && \ ocrd-tesserocr-recognize -l DEBUG -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR -P model deu .PHONY: test test-cli install deps deps-ubuntu deps-test help +.PHONY: install-tesseract install-tesserocr install-tesseract-training # # Assets @@ -172,21 +176,13 @@ test/assets: repo/assets mkdir -p $@ cp -r -t $@ repo/assets/data/* -# Clone OCR-D/assets to ./repo/assets -# FIXME does not work if already checked out -# FIXME should be a proper (VCed) submodule -repo/assets: - mkdir -p $(dir $@) - git clone https://github.com/OCR-D/assets "$@" - .PHONY: clean -clean: assets-clean tesseract-clean +clean: clean-assets clean-tesseract -tesseract-clean: - rm -rf $(CURDIR)/build_tesseract +clean-tesseract: + $(RM) -rf $(CURDIR)/build_tesseract cd repo/tesseract; make distclean -.PHONY: assets-clean -# Remove symlinks in test/assets -assets-clean: - rm -rf test/assets +.PHONY: clean-assets +clean-assets: + $(RM) -rf test/assets diff --git a/README.md b/README.md index 49b6fab..7075135 100644 --- a/README.md +++ b/README.md @@ -33,50 +33,55 @@ To run with docker: docker run -v path/to/workspaces:/data ocrd/tesserocr ocrd-tesserocrd-crop ... -### From PyPI and PPA +### From PyPI and Tesseract provided by system -This is the best option if you want to use the stable, released version. +If your operating system / distribution already provides Tesseract 4.1 +or newer, then just install its development package: ---- + # on Debian / Ubuntu: + sudo apt install libtesseract-dev -**NOTE** - -ocrd_tesserocr requires **Tesseract >= 4.1.0**. 
The Tesseract packages -bundled with **Ubuntu < 19.10** are too old. If you are on Ubuntu 18.04 LTS, -please use [Alexander Pozdnyakov's PPA](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr) repository, +Otherwise, recent Tesseract packages for Ubuntu are available via PPA +[alex-p](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr-devel), which has up-to-date builds of Tesseract and its dependencies: -```sh -sudo add-apt-repository ppa:alex-p/tesseract-ocr -sudo apt-get update -``` ---- + # on Debian / Ubuntu + sudo add-apt-repository ppa:alex-p/tesseract-ocr + sudo apt-get update + sudo apt install libtesseract-dev + +Once Tesseract is available, just install ocrd_tesserocr from PyPI server: + + + pip install ocrd_tesserocr + +We strongly recommend setting up a +[venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) first. -```sh -sudo apt-get install python3 python3-pip libtesseract-dev libleptonica-dev tesseract-ocr wget -pip install ocrd_tesserocr -``` ### From git -Use this option if you want to change the source code or install the latest, unpublished changes. +Use this option if there is no suitable prebuilt version of Tesseract available +on your system, or you want to change the source code or install the latest, unpublished changes. + + git clone https://github.com/OCR-D/ocrd_tesserocr + cd ocrd_tesserocr + # install Tesseract: + sudo make deps-ubuntu # system dependencies just for the build + make deps + # install tesserocr and ocrd_tesserocr: + make install -We strongly recommend to use [venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). +We strongly recommend setting up a +[venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) first. 
-```sh -git clone https://github.com/OCR-D/ocrd_tesserocr -cd ocrd_tesserocr -# install Tesseract: -sudo make deps-ubuntu # or manually from git or via ocrd_all -# install tesserocr and ocrd_tesserocr: -make deps # or pip install -r requirements -make install # or pip install . -``` ## Models -Tesseract comes with synthetically trained models for languages (`tesseract-ocr-{eng,deu,frk,...}` or scripts (`tesseract-ocr-script-{latn,frak,...}`). In addition, various models [trained](https://github.com/tesseract-ocr/tesstrain) on scan data are available from the community. +Tesseract comes with synthetically trained models for languages (`tesseract-ocr-{eng,deu,frk,...}`) +or scripts (`tesseract-ocr-script-{latn,frak,...}`). In addition, various models +[trained](https://github.com/tesseract-ocr/tesstrain) on scan data are available from the community. Since all OCR-D processors must resolve file/data resources in a [standardized way](https://ocr-d.de/en/spec/cli#processor-resources), @@ -89,6 +94,10 @@ The `module` location is determined by the underlying Tesseract installation Other resource locations (data/system/cwd) will be ignored, and should not be used when installing models with the **Resource Manager** (`ocrd resmgr download`). +To see the `module` resource location of your installation: + + ocrd-tesserocr-recognize -D + For a full description of available commands for resource management, see: ocrd resmgr --help @@ -96,18 +105,24 @@ For a full description of available commands for resource management, see: ocrd resmgr download --help ocrd resmgr list-installed --help -(In previous versions, the resource locations of standalone Tesseract and the OCR-D wrapper were different. 
- If you already have models under `$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize`, - usually `~/.local/share/ocrd-resources/ocrd-tesserocr-recognize`, then consider moving them - to the new default under `ocrd-tesserocr-recognize -D`, - usually `/usr/share/tesseract-ocr/4.00/tessdata`, _or_ alternatively overriding the module directory - by setting `TESSDATA_PREFIX=$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize` in the environment.) +> **Note**: +> (In previous versions, the resource locations of standalone Tesseract and the OCR-D wrapper were different. +> If you already have models under `$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize`, +> usually `~/.local/share/ocrd-resources/ocrd-tesserocr-recognize`, then consider moving them +> to the new default under `ocrd-tesserocr-recognize -D`, +> usually `/usr/share/tesseract-ocr/4.00/tessdata`, _or_ alternatively overriding the module directory +> by setting `TESSDATA_PREFIX=$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize` in the environment.) Cf. [OCR-D model guide](https://ocr-d.de/en/models). Models always use the filename suffix `.traineddata`, but are just loaded by their basename. You will need **at least** `eng` and `osd` installed (even for segmentation and deskewing), -probably also `Latin` and `Fraktur` etc. +probably also `Latin` and `Fraktur` etc. So to get minimal models, do: + + ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata + ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata + +(These will already be installed if using the Docker or git installation option.) As of v0.13.1, you can configure `ocrd-tesserocr-recognize` to select models **dynamically** segment by segment, either via custom conditions on the PAGE-XML annotation (presented as XPath rules), @@ -202,9 +217,9 @@ shrinking to the convex hull of all its symbol outlines. 
## Testing -```sh -make test -``` + + make test + This downloads some test data from https://github.com/OCR-D/assets under `repo/assets`, and runs some basic test of the Python API as well as the CLIs. diff --git a/repo/assets b/repo/assets new file mode 160000 index 0000000..05568aa --- /dev/null +++ b/repo/assets @@ -0,0 +1 @@ +Subproject commit 05568aaa2dc20678bf87ffec77f3baf2924d7c24 diff --git a/repo/tesseract b/repo/tesseract index 8ee020e..bd3c1a2 160000 --- a/repo/tesseract +++ b/repo/tesseract @@ -1 +1 @@ -Subproject commit 8ee020e14cf5be4e3f0e9beb09b6b050a1871854 +Subproject commit bd3c1a2ae059547bb24f5cbd8950e496fec082d4 diff --git a/repo/tesserocr b/repo/tesserocr index 1f960e9..c4307f0 160000 --- a/repo/tesserocr +++ b/repo/tesserocr @@ -1 +1 @@ -Subproject commit 1f960e9e0714dcd5ebdcf86248269efb70ccca5b +Subproject commit c4307f0e499422c70e4684caf24e047eb75c2938 diff --git a/test/conftest.py b/test/conftest.py index 23c78bc..54cba29 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,5 +1,5 @@ from ocrd.resolver import Resolver -from ocrd_utils import pushd_popd +from ocrd_utils import pushd_popd, initLogging from pytest import fixture from test.assets import assets as assets @@ -9,11 +9,13 @@ @fixture def workspace_kant_binarized(): + initLogging() with pushd_popd(tempdir=True) as tempdir: - yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir) + yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir, download=True) @fixture def workspace_herold_small(): + initLogging() with pushd_popd(tempdir=True) as tempdir: - yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir) + yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir, download=True)