OCR-D · kba · Mar 7, 2024 · Feb 22, 2024 · Feb 22, 2024 · Feb 22, 2024
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -19,15 +19,11 @@ jobs:
       - checkout
       - run: git submodule sync && git submodule update --init
       - run: sudo make deps-ubuntu
-      - run: make install-tesseract
-      - run: make install-tesserocr
       - run: make install
-      - run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
-      - run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
-      - run: ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
-      - run: ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
+      - run: make deps-test
+      - run: make test
       - run: make test-cli
-      - run: make deps-test coverage
+      - run: make coverage
       - codecov/upload
 
   deploy-docker:

diff --git a/.gitignore b/.gitignore
@@ -3,7 +3,6 @@ venv3
 __pycache__
 *.pyc
 *.egg-info
-repo/assets
 test/assets
 .tox
 /build

diff --git a/.gitmodules b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "repo/tesseract"]
 	path = repo/tesseract
 	url = https://github.com/tesseract-ocr/tesseract
+[submodule "repo/assets"]
+	path = repo/assets
+	url = https://github.com/OCR-D/assets
diff --git a/Dockerfile b/Dockerfile
@@ -41,15 +41,12 @@ COPY ocrd_tesserocr ./ocrd_tesserocr
 COPY repo/tesserocr ./repo/tesserocr
 COPY repo/tesseract ./repo/tesseract
 COPY Makefile .
-RUN make deps-ubuntu deps install-tesseract install-tesseract-training install-tesserocr install \
+RUN make deps-ubuntu deps install install-tesseract-training \
     && rm -rf /build \
     && apt-get -y remove --auto-remove g++ libtesseract-dev make
 
 RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
 RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
-RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
-RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata
-RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
 
 # as discussed in ocrd_all#378, we do not want to manage more than one resource location
 # to mount for model persistence; 

diff --git a/Makefile b/Makefile
@@ -36,8 +36,8 @@ help:
 	@echo "    install-tesseract Compile and install Tesseract"
 	@echo "    install-tesseract-training Compile and install training utilities for Tesseract"
 	@echo "    install-tesserocr Compile and install Tesserocr"
-	@echo "    deps              Install Python dependencies for install via pip"
-	@echo "    install           Install this package via pip"
+	@echo "    deps              Install Tesseract/Tesserocr and all Python dependencies"
+	@echo "    install           Install this package with all dependencies and download minimal models"
 	@echo "    deps-test         Install Python deps for test via pip"
 	@echo "    test              Run unit tests"
 	@echo "    coverage          Run unit tests and determine test coverage"
@@ -47,13 +47,16 @@ help:
 	@echo "    repo/tesseract    Checkout Tesseract ./repo/tesseract"
 	@echo "    repo/tesserocr    Checkout Tesserocr to ./repo/tesserocr"
 	@echo "    docker            Build docker image"
-	@echo "    assets-clean      Remove symlinks in test/assets"
+	@echo "    clean             Remove temporary files"
+	@echo "    clean-assets      Remove only test/assets"
+	@echo "    clean-tesseract   Remove only build_tesseract"
 	@echo ""
 	@echo "  Variables"
 	@echo ""
-	@echo "    PYTEST_ARGS     pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]"
-	@echo "    DOCKER_TAG      Docker container tag [$(DOCKER_TAG)]"
-	@echo "    TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]"
+	@echo "    PYTEST_ARGS       pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]"
+	@echo "    DOCKER_TAG        Docker container tag [$(DOCKER_TAG)]"
+	@echo '    TESSERACT_CONFIG  command line options for Tesseract `configure` [$(TESSERACT_CONFIG)]'
+	@echo "    TESSDATA_PREFIX   search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]"
 
 # Dependencies for deployment in an Ubuntu/Debian Linux
 # (lib*-dev merely for building Tesseract and tesserocr from sources)
@@ -85,13 +88,14 @@ deps-ubuntu:
 		libarchive-dev
 
-# Install Python deps for install via pip
+# Build/install Tesseract and tesserocr, then install Python deps via pip
-deps:
-	$(PIP) install -U pip
+deps: install-tesserocr
 	$(PIP) install -r requirements.txt
 
 # Install Python deps for test via pip
 deps-test:
 	$(PIP) install -r requirements_test.txt
+	ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
+	ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
 
 # Build docker image
 docker: repo/tesseract repo/tesserocr
@@ -100,7 +104,7 @@ docker: repo/tesseract repo/tesserocr
 	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
 	-t $(DOCKER_TAG) .
 
-install-tesserocr: repo/tesserocr
+install-tesserocr: repo/tesserocr install-tesseract
 	$(PIP) install ./$<
 
 install-tesseract: $(TESSERACT_PREFIX)/bin/tesseract
@@ -114,24 +118,26 @@ $(TESSERACT_PREFIX)/bin/tesseract: build_tesseract/Makefile
 $(TESSERACT_PREFIX)/bin/lstmtraining: build_tesseract/Makefile
 	$(MAKE) -C build_tesseract training-install
 
+TESSERACT_CONFIG ?= --disable-openmp --disable-shared CXXFLAGS="-g -O2 -fPIC -fno-math-errno -Wall -Wextra -Wpedantic"
 build_tesseract/Makefile: repo/tesseract/Makefile.in
 	mkdir -p $(@D)
 	cd $(@D) && $(CURDIR)/repo/tesseract/configure \
 				--prefix=$(TESSERACT_PREFIX) \
-				--disable-openmp \
-				--disable-shared \
-				'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC'
+				$(TESSERACT_CONFIG)
 
 repo/tesseract/Makefile.in: repo/tesseract
 	cd $<; ./autogen.sh
 
-repo/tesserocr repo/tesseract:
+repo/tesserocr repo/tesseract repo/assets:
 	git submodule sync $@
 	git submodule update --init $@
 
 # Install this package
 install: deps
-	$(PIP) install .
+	$(PIP) install $(PIP_OPTIONS) .
+	ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
+	ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
+	ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata
 
 # Run unit tests
 test: test/assets deps-test
@@ -149,18 +155,16 @@ coverage:
 	coverage html
 
 # Test the command line tools
-test-cli: test/assets
-	$(PIP) install -e .
+test-cli: test/assets deps-test
 	rm -rfv test/workspace
 	cp -rv test/assets/kant_aufklaerung_1784 test/workspace
-	ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
-	ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
 	cd test/workspace/data && \
 		ocrd-tesserocr-segment-region -l DEBUG -I OCR-D-IMG -O OCR-D-SEG-REGION && \
 		ocrd-tesserocr-segment-line   -l DEBUG -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE && \
 		ocrd-tesserocr-recognize      -l DEBUG -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR -P model deu
 
 .PHONY: test test-cli install deps deps-ubuntu deps-test help
+.PHONY: install-tesseract install-tesserocr install-tesseract-training coverage docker
 
 #
 # Assets
@@ -172,21 +176,17 @@ test/assets: repo/assets
 	mkdir -p $@
 	cp -r -t $@ repo/assets/data/*
 
-# Clone OCR-D/assets to ./repo/assets
-# FIXME does not work if already checked out
-# FIXME should be a proper (VCed) submodule
-repo/assets:
-	mkdir -p $(dir $@)
-	git clone https://github.com/OCR-D/assets "$@"
-
 .PHONY: clean
-clean: assets-clean tesseract-clean
+# Remove temporary files: the test assets copy and the Tesseract build tree
+clean: clean-assets clean-tesseract
 
-tesseract-clean:
-	rm -rf $(CURDIR)/build_tesseract
-	cd repo/tesseract; make distclean
+.PHONY: clean-tesseract
+# Remove build_tesseract, then run distclean in the Tesseract checkout (via $(MAKE) so flags propagate)
+clean-tesseract:
+	$(RM) -rf $(CURDIR)/build_tesseract
+	cd repo/tesseract && $(MAKE) distclean
 
-.PHONY: assets-clean
-# Remove symlinks in test/assets
-assets-clean:
-	rm -rf test/assets
+.PHONY: clean-assets
+# Remove only the copy of the test data under test/assets
+clean-assets:
+	$(RM) -rf test/assets
diff --git a/README.md b/README.md
@@ -33,50 +33,55 @@ To run with docker:
-    docker run -v path/to/workspaces:/data ocrd/tesserocr ocrd-tesserocrd-crop ...
+    docker run -v path/to/workspaces:/data ocrd/tesserocr ocrd-tesserocr-crop ...
 
 
-### From PyPI and PPA
+### From PyPI and Tesseract provided by system
 
-This is the best option if you want to use the stable, released version.
+If your operating system / distribution already provides Tesseract 4.1
+or newer, then just install its development package:
 
----
+    # on Debian / Ubuntu:
+    sudo apt install libtesseract-dev
 
-**NOTE**
-
-ocrd_tesserocr requires **Tesseract >= 4.1.0**. The Tesseract packages
-bundled with **Ubuntu < 19.10** are too old. If you are on Ubuntu 18.04 LTS,
-please use [Alexander Pozdnyakov's PPA](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr) repository,
+Otherwise, recent Tesseract packages for Ubuntu are available via PPA
+[alex-p](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr-devel),
 which has up-to-date builds of Tesseract and its dependencies:
 
-```sh
-sudo add-apt-repository ppa:alex-p/tesseract-ocr
-sudo apt-get update
-```
 
----
+    # on Debian / Ubuntu
+    sudo add-apt-repository ppa:alex-p/tesseract-ocr
+    sudo apt-get update
+    sudo apt install libtesseract-dev
+
+Once Tesseract is available, just install ocrd_tesserocr from PyPI server:
+
+
+    pip install ocrd_tesserocr
+
+We strongly recommend setting up a
+[venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) first.
 
-```sh
-sudo apt-get install python3 python3-pip libtesseract-dev libleptonica-dev tesseract-ocr wget
-pip install ocrd_tesserocr
-```
 
 ### From git
 
-Use this option if you want to change the source code or install the latest, unpublished changes.
+Use this option if there is no suitable prebuilt version of Tesseract available
+on your system, or you want to change the source code or install the latest, unpublished changes.
+
+    git clone https://github.com/OCR-D/ocrd_tesserocr
+    cd ocrd_tesserocr
+    # install system dependencies, then Tesseract and tesserocr:
+    sudo make deps-ubuntu # system dependencies just for the build
+    make deps
+    # install ocrd_tesserocr and download the minimal models:
+    make install
 
-We strongly recommend to use [venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
+We strongly recommend setting up a
+[venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) first.
 
-```sh
-git clone https://github.com/OCR-D/ocrd_tesserocr
-cd ocrd_tesserocr
-# install Tesseract:
-sudo make deps-ubuntu # or manually from git or via ocrd_all
-# install tesserocr and ocrd_tesserocr:
-make deps        # or pip install -r requirements
-make install     # or pip install .
-```
 
 ## Models
 
-Tesseract comes with synthetically trained models for languages (`tesseract-ocr-{eng,deu,frk,...}` or scripts (`tesseract-ocr-script-{latn,frak,...}`). In addition, various models [trained](https://github.com/tesseract-ocr/tesstrain) on scan data are available from the community.
+Tesseract comes with synthetically trained models for languages (`tesseract-ocr-{eng,deu,frk,...}`)
+or scripts (`tesseract-ocr-script-{latn,frak,...}`). In addition, various models
+[trained](https://github.com/tesseract-ocr/tesstrain) on scan data are available from the community.
 
 Since all OCR-D processors must resolve file/data resources
 in a [standardized way](https://ocr-d.de/en/spec/cli#processor-resources),
@@ -89,25 +94,35 @@ The `module` location is determined by the underlying Tesseract installation
 Other resource locations (data/system/cwd) will be ignored, and should not be used
 when installing models with the **Resource Manager** (`ocrd resmgr download`).
 
+To see the `module` resource location of your installation:
+
+    ocrd-tesserocr-recognize -D
+
 For a full description of available commands for resource management, see:
 
     ocrd resmgr --help
     ocrd resmgr list-available --help
     ocrd resmgr download --help
     ocrd resmgr list-installed --help
 
-(In previous versions, the resource locations of standalone Tesseract and the OCR-D wrapper were different.
- If you already have models under `$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize`,
- usually `~/.local/share/ocrd-resources/ocrd-tesserocr-recognize`, then consider moving them
- to the new default under `ocrd-tesserocr-recognize -D`,
- usually `/usr/share/tesseract-ocr/4.00/tessdata`, _or_ alternatively overriding the module directory
- by setting `TESSDATA_PREFIX=$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize` in the environment.)
+> **Note**: 
+> (In previous versions, the resource locations of standalone Tesseract and the OCR-D wrapper were different.
+> If you already have models under `$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize`,
+> usually `~/.local/share/ocrd-resources/ocrd-tesserocr-recognize`, then consider moving them
+> to the new default under `ocrd-tesserocr-recognize -D`,
+> usually `/usr/share/tesseract-ocr/4.00/tessdata`, _or_ alternatively overriding the module directory
+> by setting `TESSDATA_PREFIX=$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize` in the environment.)
 
 Cf. [OCR-D model guide](https://ocr-d.de/en/models).
 
 Models always use the filename suffix `.traineddata`, but are just loaded by their basename.
 You will need **at least** `eng` and `osd` installed (even for segmentation and deskewing),
-probably also `Latin` and `Fraktur` etc.
+probably also `Latin` and `Fraktur` etc. So to get minimal models, do:
+
+	ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
+	ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
+
+(These will already be installed if you use the Docker or git installation option.)
 
 As of v0.13.1, you can configure `ocrd-tesserocr-recognize` to select models **dynamically** segment by segment,
 either via custom conditions on the PAGE-XML annotation (presented as XPath rules),
@@ -202,9 +217,9 @@ shrinking to the convex hull of all its symbol outlines.
 
 ## Testing
 
-```sh
-make test
-```
+
+    make test
+
 
 This downloads some test data from https://github.com/OCR-D/assets under `repo/assets`,
- and runs some basic test of the Python API as well as the CLIs.
+ and runs some basic tests of the Python API as well as the CLIs.

diff --git a/repo/assets b/repo/assets
diff --git a/repo/tesseract b/repo/tesseract
diff --git a/repo/tesserocr b/repo/tesserocr
diff --git a/test/conftest.py b/test/conftest.py
@@ -1,5 +1,5 @@
 from ocrd.resolver import Resolver
-from ocrd_utils import pushd_popd
+from ocrd_utils import pushd_popd, initLogging
 from pytest import fixture
 
 from test.assets import assets as assets
@@ -9,11 +9,13 @@
 
 @fixture
 def workspace_kant_binarized():
+    initLogging()
     with pushd_popd(tempdir=True) as tempdir:
-        yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir)
+        yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir, download=True)
 
 @fixture
 def workspace_herold_small():
+    initLogging()
     with pushd_popd(tempdir=True) as tempdir:
-        yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir)
+        yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir, download=True)
+2 −2		.github/workflows/autotools-macos.yml
+1 −1		.github/workflows/autotools-openmp.yml
+2 −3		.github/workflows/autotools.yml
+1 −1		.github/workflows/cmake-win64.yml
+1 −2		.github/workflows/cmake.yml
+1 −1		.github/workflows/codeql-analysis.yml
+1 −3		.github/workflows/msys2.yml
+7 −6		.github/workflows/unittest-disablelegacy.yml
+1 −1		.github/workflows/unittest-macos.yml
+1 −1		.github/workflows/unittest.yml
+1 −1		.github/workflows/vcpkg.yml
+2 −9		appveyor.yml
+1 −1		include/tesseract/baseapi.h
+6 −0		include/tesseract/renderer.h
+65 −38		src/api/pdfrenderer.cpp
+1 −1		src/ccmain/control.cpp
+1 −1		src/ccstruct/blobs.cpp
+1 −1		src/classify/shapetable.h
+1 −1		src/textord/colpartitionset.cpp
+1 −1		src/textord/colpartitionset.h
+1 −1		src/textord/edgblob.cpp
+1 −1		src/textord/strokewidth.h
+1 −1		src/textord/tablefind.cpp
+1 −1		src/textord/tablefind.h
+1 −1		src/textord/textlineprojection.cpp
+24 −17		src/training/unicharset/normstrngs.cpp
+1 −1		src/wordrec/associate.h
+1 −1		src/wordrec/tface.cpp
+2 −2		src/wordrec/wordrec.h
+1 −1		unittest/unicharcompress_test.cc