Skip to content
Merged
18 changes: 9 additions & 9 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@ jobs:
type: string
docker:
- image: cimg/python:<< parameters.python-version >>
environment:
# cimg/python uses pyenv instead of venv
VIRTUAL_ENV: ${PYENV_ROOT}
steps:
- checkout
- run: git submodule sync && git submodule update --init
- run: sudo make deps-ubuntu
- when:
condition:
equal: [ '3.6', << parameters.python-version >> ]
steps:
# speed-up build time for end-of-life Python by holding at latest binary:
- run: pip install --prefer-binary -U opencv-python-headless numpy
- run: make install-tesseract
- run: make install-tesserocr
- run: make install
# PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root
- run: sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`
- run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
- run: make test-cli
Expand All @@ -36,4 +36,4 @@ workflows:
- build-python:
matrix:
parameters:
python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
python-version: ['3.7', '3.8', '3.9', '3.10']
2 changes: 2 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
!requirements_test.txt
!LICENSE
!README.md
!repo/tesserocr
!repo/tesseract

# avoid .git and __pycache__ etc:
!ocrd_tesserocr/**/*.py
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ test-workspace
/.coverage
/htmlcov
/.cache
build_tesseract
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[submodule "repo/tesserocr"]
path = repo/tesserocr
url = https://github.com/sirfz/tesserocr/
[submodule "repo/tesseract"]
path = repo/tesseract
url = https://github.com/tesseract-ocr/tesseract
39 changes: 30 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
FROM ocrd/core
FROM ocrd/core:v2.62.0 AS base
# set proper locales
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
# install ocrd-tesserocr (until here commands for installing tesseract-ocr)
ARG VCS_REF
ARG BUILD_DATE
LABEL \
Expand All @@ -7,32 +11,49 @@ LABEL \
org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \
org.label-schema.build-date=$BUILD_DATE

ENV DEBIAN_FRONTEND noninteractive
ENV PYTHONIOENCODING utf8

# set TESSDATA_PREFIX
ENV TESSDATA_PREFIX /usr/local/share/tessdata

# set frontend non-interactive to silence interactive tzdata config
ARG DEBIAN_FRONTEND=noninteractive

# set proper date and timezone in container
RUN echo "Europe/Berlin" > /etc/timezone
RUN ln -sf /usr/share/zoneinfo/Europe/Berlin /etc/localtime
RUN dpkg-reconfigure -f noninteractive tzdata

# diagnostic output - check timezone settings
# RUN cat /etc/timezone

# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share

WORKDIR /build-ocrd
WORKDIR /build
COPY setup.py .
COPY ocrd_tesserocr/ocrd-tool.json .
COPY README.md .
COPY requirements.txt .
COPY requirements_test.txt .
COPY ocrd_tesserocr ./ocrd_tesserocr
COPY repo/tesserocr ./repo/tesserocr
COPY repo/tesseract ./repo/tesseract
COPY Makefile .
RUN make deps-ubuntu && \
Comment thread
bertsky marked this conversation as resolved.
apt-get install -y --no-install-recommends \
g++ \
&& make deps install \
&& rm -rf /build-ocrd \
RUN make deps-ubuntu deps install-tesseract install-tesserocr install \
&& rm -rf /build \
&& apt-get -y remove --auto-remove g++ libtesseract-dev make

# PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root
RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`
# The next line fails because the tesseract-ocr-eng package is not installed here; it is unclear whether it is still needed, so it is skipped.
# RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`
RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
Comment thread
bertsky marked this conversation as resolved.
RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata

WORKDIR /data
VOLUME /data
70 changes: 57 additions & 13 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,20 @@ LOG_LEVEL = INFO
PYTHONIOENCODING=utf8
LC_ALL = C.UTF-8
LANG = C.UTF-8
export
# Installation prefix for the Tesseract build: the active virtual
# environment if VIRTUAL_ENV is set, otherwise /usr/local.
ifndef VIRTUAL_ENV
TESSERACT_PREFIX = /usr/local
else
TESSERACT_PREFIX = $(VIRTUAL_ENV)
endif

# Put the prefix's pkg-config directory first on PKG_CONFIG_PATH,
# keeping whatever search path the caller already had.
ifneq ($(PKG_CONFIG_PATH),)
PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig:$(PKG_CONFIG_PATH)
else
PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig
endif
export PKG_CONFIG_PATH

export

# pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)'
PYTEST_ARGS =
Expand Down Expand Up @@ -50,24 +62,32 @@ help:

# Dependencies for deployment in an ubuntu/debian linux
# (lib*-dev merely for building tesserocr with pip)
# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,
# which is unsupported. Add the tesseract-ocr PPA
# from Alexander Pozdnyakov which provides 4.1.0.
# See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr
# for details.)
deps-ubuntu:
apt-get install -y --no-install-recommends software-properties-common
-add-apt-repository -u -y ppa:alex-p/tesseract-ocr
apt-get install -y \
apt-get update && apt-get install -y --no-install-recommends \
apt-utils \
build-essential \
g++ \
git \
python3 \
python3-pip \
libtesseract-dev \
libjpeg-dev \
libgif-dev \
libwebp-dev \
libopenjp2-7-dev \
libpng-dev \
libtiff-dev \
libtool \
pkg-config \
tzdata \
xzgv \
zlib1g-dev \
libleptonica-dev \
tesseract-ocr-eng \
tesseract-ocr-script-frak \
tesseract-ocr
libpango1.0-dev \
libicu-dev \
autotools-dev \
automake \
libcurl4-nss-dev \
libarchive-dev

# Install Python deps for install via pip
deps:
Expand All @@ -85,6 +105,24 @@ docker:
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-t $(DOCKER_TAG) .

# Install the tesserocr Python bindings from the repo/tesserocr submodule
# checkout with pip. Declared phony because it names a command, not a file:
# a stray file called "install-tesserocr" must not disable the target.
.PHONY: install-tesserocr
install-tesserocr: repo/tesserocr
	$(PIP) install ./$<

# Build Tesseract from the repo/tesseract submodule checkout and install it
# under $(TESSERACT_PREFIX). The build is configured out-of-tree in
# build_tesseract, as a static (--disable-shared), non-OpenMP library.
.PHONY: install-tesseract
install-tesseract: repo/tesseract
	# use && so autogen.sh is never run from the wrong directory if cd fails
	cd $< && ./autogen.sh
	mkdir -p build_tesseract
	cd build_tesseract && $(CURDIR)/repo/tesseract/configure \
		--prefix=$(TESSERACT_PREFIX) \
		--disable-openmp \
		--disable-shared \
		'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC' && \
		$(MAKE) install
	# refresh the linker cache only for prefixes starting with /usr
	# (POSIX case instead of [[ ]], so this also works when SHELL is not bash)
	case "$(TESSERACT_PREFIX)" in /usr*) ldconfig ;; esac

# Fetch a submodule checkout on demand: sync its URL from .gitmodules,
# then initialise and update it. $@ is whichever of the two paths is built.
# NOTE(review): make only runs this when the directory does not exist yet;
# presumably an empty, uninitialised submodule directory would be treated
# as up to date -- confirm.
repo/tesserocr repo/tesseract:
	git submodule sync $@ && \
		git submodule update --init $@

# Install this package
install: deps
$(PIP) install .
Expand Down Expand Up @@ -135,6 +173,12 @@ repo/assets:
mkdir -p $(dir $@)
git clone https://github.com/OCR-D/assets "$@"

# Remove generated artefacts: runs assets-clean and tesseract-clean.
.PHONY: clean
clean: assets-clean tesseract-clean

# Remove the out-of-tree Tesseract build directory, then run distclean
# inside the repo/tesseract submodule checkout.
.PHONY: tesseract-clean
tesseract-clean:
	rm -rf $(CURDIR)/build_tesseract
	# $(MAKE) -C keeps make's flags/jobserver and fails cleanly if the
	# directory is missing, instead of running distclean in this directory
	$(MAKE) -C repo/tesseract distclean

.PHONY: assets-clean
# Remove symlinks in test/assets
Expand Down
1 change: 1 addition & 0 deletions repo/tesseract
Submodule tesseract added at 8ee020
1 change: 1 addition & 0 deletions repo/tesserocr
Submodule tesserocr added at 1f960e