Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,11 @@ jobs:
- checkout
- run: git submodule sync && git submodule update --init
- run: sudo make deps-ubuntu
- run: make install-tesseract
- run: make install-tesserocr
- run: make install
- run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
- run: make deps-test
- run: make test
- run: make test-cli
- run: make deps-test coverage
- run: make coverage
- codecov/upload

deploy-docker:
Expand Down
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ venv3
__pycache__
*.pyc
*.egg-info
repo/assets
test/assets
.tox
/build
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
[submodule "repo/tesseract"]
path = repo/tesseract
url = https://github.com/tesseract-ocr/tesseract
[submodule "repo/assets"]
path = repo/assets
url = https://github.com/OCR-D/assets
5 changes: 1 addition & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,12 @@ COPY ocrd_tesserocr ./ocrd_tesserocr
COPY repo/tesserocr ./repo/tesserocr
COPY repo/tesseract ./repo/tesseract
COPY Makefile .
RUN make deps-ubuntu deps install-tesseract install-tesseract-training install-tesserocr install \
RUN make deps-ubuntu deps install install-tesseract-training \
&& rm -rf /build \
&& apt-get -y remove --auto-remove g++ libtesseract-dev make

RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata

# as discussed in ocrd_all#378, we do not want to manage more than one resource location
# to mount for model persistence;
Expand Down
60 changes: 28 additions & 32 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ help:
@echo " install-tesseract Compile and install Tesseract"
@echo " install-tesseract-training Compile and install training utilities for Tesseract"
@echo " install-tesserocr Compile and install Tesserocr"
@echo " deps Install Python dependencies for install via pip"
@echo " install Install this package via pip"
@echo " deps Install Tesseract/Tesserocr and all Python dependencies"
@echo " install Install this package with all dependencies and download minimal models"
@echo " deps-test Install Python deps for test via pip"
@echo " test Run unit tests"
@echo " coverage Run unit tests and determine test coverage"
Expand All @@ -47,13 +47,16 @@ help:
@echo " repo/tesseract Checkout Tesseract ./repo/tesseract"
@echo " repo/tesserocr Checkout Tesserocr to ./repo/tesserocr"
@echo " docker Build docker image"
@echo " assets-clean Remove symlinks in test/assets"
@echo " clean Remove temporary files"
@echo " clean-assets Remove only test/assets"
@echo " clean-tesseract Remove only build_tesseract"
@echo ""
@echo " Variables"
@echo ""
@echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]"
@echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]"
@echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]"
@echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]"
@echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]"
@echo ' TESSERACT_CONFIG command line options for Tesseract `configure` [$(TESSERACT_CONFIG)]'
@echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]"

# Dependencies for deployment in an Ubuntu/Debian Linux
# (lib*-dev merely for building Tesseract and tesserocr from sources)
Expand Down Expand Up @@ -85,13 +88,14 @@ deps-ubuntu:
libarchive-dev

# Install Python deps for install via pip
deps:
$(PIP) install -U pip
deps: install-tesserocr
$(PIP) install -r requirements.txt

# Install Python deps for test via pip
deps-test:
$(PIP) install -r requirements_test.txt
ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata

# Build docker image
docker: repo/tesseract repo/tesserocr
Expand All @@ -100,7 +104,7 @@ docker: repo/tesseract repo/tesserocr
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-t $(DOCKER_TAG) .

install-tesserocr: repo/tesserocr
install-tesserocr: repo/tesserocr install-tesseract
$(PIP) install ./$<

install-tesseract: $(TESSERACT_PREFIX)/bin/tesseract
Expand All @@ -114,24 +118,26 @@ $(TESSERACT_PREFIX)/bin/tesseract: build_tesseract/Makefile
$(TESSERACT_PREFIX)/bin/lstmtraining: build_tesseract/Makefile
$(MAKE) -C build_tesseract training-install

TESSERACT_CONFIG ?= --disable-openmp --disable-shared CXXFLAGS="-g -O2 -fPIC -fno-math-errno -Wall -Wextra -Wpedantic"
build_tesseract/Makefile: repo/tesseract/Makefile.in
mkdir -p $(@D)
cd $(@D) && $(CURDIR)/repo/tesseract/configure \
--prefix=$(TESSERACT_PREFIX) \
--disable-openmp \
--disable-shared \
'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC'
$(TESSERACT_CONFIG)

repo/tesseract/Makefile.in: repo/tesseract
cd $<; ./autogen.sh

repo/tesserocr repo/tesseract:
repo/tesserocr repo/tesseract repo/assets:
git submodule sync $@
git submodule update --init $@

# Install this package
install: deps
$(PIP) install .
$(PIP) install $(PIP_OPTIONS) .
ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata

# Run unit tests
test: test/assets deps-test
Expand All @@ -149,18 +155,16 @@ coverage:
coverage html

# Test the command line tools
test-cli: test/assets
$(PIP) install -e .
test-cli: test/assets deps-test
rm -rfv test/workspace
cp -rv test/assets/kant_aufklaerung_1784 test/workspace
ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
cd test/workspace/data && \
ocrd-tesserocr-segment-region -l DEBUG -I OCR-D-IMG -O OCR-D-SEG-REGION && \
ocrd-tesserocr-segment-line -l DEBUG -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE && \
ocrd-tesserocr-recognize -l DEBUG -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR -P model deu

.PHONY: test test-cli install deps deps-ubuntu deps-test help
.PHONY: install-tesseract install-tesserocr install-tesseract-training

#
# Assets
Expand All @@ -172,21 +176,13 @@ test/assets: repo/assets
mkdir -p $@
cp -r -t $@ repo/assets/data/*

# Clone OCR-D/assets to ./repo/assets
# FIXME does not work if already checked out
# FIXME should be a proper (VCed) submodule
repo/assets:
mkdir -p $(dir $@)
git clone https://github.com/OCR-D/assets "$@"

.PHONY: clean
clean: assets-clean tesseract-clean
clean: clean-assets clean-tesseract

tesseract-clean:
rm -rf $(CURDIR)/build_tesseract
clean-tesseract:
$(RM) -rf $(CURDIR)/build_tesseract
cd repo/tesseract; make distclean

.PHONY: assets-clean
# Remove symlinks in test/assets
assets-clean:
rm -rf test/assets
.PHONY: clean-assets
clean-assets:
$(RM) -rf test/assets
93 changes: 54 additions & 39 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,50 +33,55 @@ To run with docker:
docker run -v path/to/workspaces:/data ocrd/tesserocr ocrd-tesserocr-crop ...


### From PyPI and PPA
### From PyPI and Tesseract provided by system

This is the best option if you want to use the stable, released version.
If your operating system / distribution already provides Tesseract 4.1
or newer, then just install its development package:

---
# on Debian / Ubuntu:
sudo apt install libtesseract-dev

**NOTE**

ocrd_tesserocr requires **Tesseract >= 4.1.0**. The Tesseract packages
bundled with **Ubuntu < 19.10** are too old. If you are on Ubuntu 18.04 LTS,
please use [Alexander Pozdnyakov's PPA](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr) repository,
Otherwise, recent Tesseract packages for Ubuntu are available via PPA
[alex-p](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr-devel),
which has up-to-date builds of Tesseract and its dependencies:

```sh
sudo add-apt-repository ppa:alex-p/tesseract-ocr
sudo apt-get update
```

---
# on Debian / Ubuntu
sudo add-apt-repository ppa:alex-p/tesseract-ocr
sudo apt-get update
sudo apt install libtesseract-dev

Once Tesseract is available, just install ocrd_tesserocr from PyPI server:


pip install ocrd_tesserocr

We strongly recommend setting up a
[venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) first.

```sh
sudo apt-get install python3 python3-pip libtesseract-dev libleptonica-dev tesseract-ocr wget
pip install ocrd_tesserocr
```

### From git

Use this option if you want to change the source code or install the latest, unpublished changes.
Use this option if there is no suitable prebuilt version of Tesseract available
on your system, or you want to change the source code or install the latest, unpublished changes.

git clone https://github.com/OCR-D/ocrd_tesserocr
cd ocrd_tesserocr
# install Tesseract:
sudo make deps-ubuntu # system dependencies just for the build
make deps
# install tesserocr and ocrd_tesserocr:
make install

We strongly recommend using a [venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
We strongly recommend setting up a
[venv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) first.

```sh
git clone https://github.com/OCR-D/ocrd_tesserocr
cd ocrd_tesserocr
# install Tesseract:
sudo make deps-ubuntu # or manually from git or via ocrd_all
# install tesserocr and ocrd_tesserocr:
make deps # or pip install -r requirements.txt
make install # or pip install .
```

## Models

Tesseract comes with synthetically trained models for languages (`tesseract-ocr-{eng,deu,frk,...}`) or scripts (`tesseract-ocr-script-{latn,frak,...}`). In addition, various models [trained](https://github.com/tesseract-ocr/tesstrain) on scan data are available from the community.
Tesseract comes with synthetically trained models for languages (`tesseract-ocr-{eng,deu,frk,...}`)
or scripts (`tesseract-ocr-script-{latn,frak,...}`). In addition, various models
[trained](https://github.com/tesseract-ocr/tesstrain) on scan data are available from the community.

Since all OCR-D processors must resolve file/data resources
in a [standardized way](https://ocr-d.de/en/spec/cli#processor-resources),
Expand All @@ -89,25 +94,35 @@ The `module` location is determined by the underlying Tesseract installation
Other resource locations (data/system/cwd) will be ignored, and should not be used
when installing models with the **Resource Manager** (`ocrd resmgr download`).

To see the `module` resource location of your installation:

ocrd-tesserocr-recognize -D

For a full description of available commands for resource management, see:

ocrd resmgr --help
ocrd resmgr list-available --help
ocrd resmgr download --help
ocrd resmgr list-installed --help

(In previous versions, the resource locations of standalone Tesseract and the OCR-D wrapper were different.
If you already have models under `$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize`,
usually `~/.local/share/ocrd-resources/ocrd-tesserocr-recognize`, then consider moving them
to the new default under `ocrd-tesserocr-recognize -D`,
usually `/usr/share/tesseract-ocr/4.00/tessdata`, _or_ alternatively overriding the module directory
by setting `TESSDATA_PREFIX=$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize` in the environment.)
> **Note**:
> (In previous versions, the resource locations of standalone Tesseract and the OCR-D wrapper were different.
> If you already have models under `$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize`,
> usually `~/.local/share/ocrd-resources/ocrd-tesserocr-recognize`, then consider moving them
> to the new default under `ocrd-tesserocr-recognize -D`,
> usually `/usr/share/tesseract-ocr/4.00/tessdata`, _or_ alternatively overriding the module directory
> by setting `TESSDATA_PREFIX=$XDG_DATA_HOME/ocrd-resources/ocrd-tesserocr-recognize` in the environment.)

Cf. [OCR-D model guide](https://ocr-d.de/en/models).

Models always use the filename suffix `.traineddata`, but are just loaded by their basename.
You will need **at least** `eng` and `osd` installed (even for segmentation and deskewing),
probably also `Latin` and `Fraktur` etc.
probably also `Latin` and `Fraktur` etc. So to get minimal models, do:

ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata

(These models will already be installed if you use the Docker or git installation option.)

As of v0.13.1, you can configure `ocrd-tesserocr-recognize` to select models **dynamically** segment by segment,
either via custom conditions on the PAGE-XML annotation (presented as XPath rules),
Expand Down Expand Up @@ -202,9 +217,9 @@ shrinking to the convex hull of all its symbol outlines.

## Testing

```sh
make test
```

make test


This downloads some test data from https://github.com/OCR-D/assets under `repo/assets`,
and runs some basic tests of the Python API as well as the CLIs.
Expand Down
1 change: 1 addition & 0 deletions repo/assets
Submodule assets added at 05568a
2 changes: 1 addition & 1 deletion repo/tesserocr
Submodule tesserocr updated 2 files
+9 −1 setup.py
+1 −0 tests/test_api.py
8 changes: 5 additions & 3 deletions test/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from ocrd.resolver import Resolver
from ocrd_utils import pushd_popd
from ocrd_utils import pushd_popd, initLogging
from pytest import fixture

from test.assets import assets as assets
Expand All @@ -9,11 +9,13 @@

@fixture
def workspace_kant_binarized():
initLogging()
with pushd_popd(tempdir=True) as tempdir:
yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir)
yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir, download=True)

@fixture
def workspace_herold_small():
initLogging()
with pushd_popd(tempdir=True) as tempdir:
yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir)
yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir, download=True)