Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 35 additions & 74 deletions ocrd_segment/repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from scipy.ndimage import filters, morphology
import cv2
import numpy as np
from shapely.geometry import Polygon, LineString
from shapely.geometry import asPolygon, Polygon, LineString

from ocrd import Processor
from ocrd_utils import (
Expand All @@ -23,8 +23,6 @@
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
CoordsType,
LabelType, LabelsType,
MetadataItemType,
to_xml
)
from ocrd_models.ocrd_page_generateds import (
Expand All @@ -36,6 +34,7 @@
UnorderedGroupIndexedType,
ReadingOrderType
)
from ocrd_validators.page_validator import PageValidator
from .config import OCRD_TOOL

TOOL = 'ocrd-segment-repair'
Expand Down Expand Up @@ -68,23 +67,17 @@ def process(self):
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
self.add_metadata(pcgts)
page = pcgts.get_Page()
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))


#
# validate segmentation (warn of children extending beyond their parents)
#
self.validate_coords(page, page_id)
report = PageValidator.validate(ocrd_page=pcgts,
page_textequiv_consistency='off',
check_baseline=False)
if not report.is_valid:
LOG.warning(report.to_xml())

#
# sanitize region segmentation (shrink to hull of lines)
Expand Down Expand Up @@ -240,11 +233,13 @@ def sanitize_page(self, page, page_id):
LOG.warning('Ignoring contour %d too small (%d/%d) in region "%s"',
i, area, total_area, region.id)
continue
# simplify shape:
# simplify shape (until valid):
# can produce invalid (self-intersecting) polygons:
#polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
polygon = contour[:, 0, ::] # already ordered x,y
polygon = Polygon(polygon).simplify(1).exterior.coords
polygon = Polygon(polygon).simplify(1)
polygon = make_valid(polygon)
polygon = polygon.exterior.coords[:-1] # keep open
if len(polygon) < 4:
LOG.warning('Ignoring contour %d less than 4 points in region "%s"',
i, region.id)
Expand All @@ -259,61 +254,6 @@ def sanitize_page(self, page, page_id):
LOG.info('Using new coordinates for region "%s"', region.id)
region.get_Coords().points = points_from_polygon(region_polygon)

def validate_coords(self, page, page_id):
    """Warn about segments that extend beyond their parent segment.

    Checks, in this order:

    * if the page has a Border: every top-level region against the Border,
    * every text line against its text region,
    * every baseline (if present) against the polygon of its text line,
    * every word against its text line,
    * every glyph against its word.

    Each violation is logged as a warning.

    Args:
        page: the PAGE page object providing the ``get_*`` accessors
            used below.
        page_id: identifier of the page, used in the log messages only.

    Returns:
        True if no violation was found, False otherwise.
    """
    consistent = True
    text_regions = page.get_TextRegion()
    if page.get_Border():
        non_text_getters = (
            page.get_AdvertRegion,
            page.get_ChartRegion,
            page.get_ChemRegion,
            page.get_GraphicRegion,
            page.get_ImageRegion,
            page.get_LineDrawingRegion,
            page.get_MathsRegion,
            page.get_MusicRegion,
            page.get_NoiseRegion,
            page.get_SeparatorRegion,
            page.get_TableRegion,
            page.get_UnknownRegion,
        )
        non_text_regions = [segment
                            for getter in non_text_getters
                            for segment in getter()]
        for segment in text_regions + non_text_regions:
            if _child_within_parent(segment, page.get_Border()):
                continue
            LOG.warning('Region "%s" extends beyond Border of page "%s"',
                        segment.id, page_id)
            consistent = False
    for region in text_regions:
        for line in region.get_TextLine():
            if not _child_within_parent(line, region):
                LOG.warning('Line "%s" extends beyond region "%s" on page "%s"',
                            line.id, region.id, page_id)
                consistent = False
            if line.get_Baseline():
                # the baseline is an open path, so compare it as a LineString
                base_geom = LineString(polygon_from_points(line.get_Baseline().points))
                line_geom = Polygon(polygon_from_points(line.get_Coords().points))
                if not base_geom.within(line_geom):
                    LOG.warning('Baseline extends beyond line "%s" in region "%s" on page "%s"',
                                line.id, region.id, page_id)
                    consistent = False
            for word in line.get_Word():
                if not _child_within_parent(word, line):
                    LOG.warning('Word "%s" extends beyond line "%s" in region "%s" on page "%s"',
                                word.id, line.id, region.id, page_id)
                    consistent = False
                for glyph in word.get_Glyph():
                    if not _child_within_parent(glyph, word):
                        LOG.warning('Glyph "%s" extends beyond word "%s" in line "%s" of region "%s" on page "%s"',
                                    glyph.id, word.id, line.id, region.id, page_id)
                        consistent = False
    return consistent

def _child_within_parent(child, parent):
    """Tell whether the outline of ``child`` lies within that of ``parent``.

    Both arguments must provide ``get_Coords().points``; each outline is
    turned into a shapely Polygon and compared with ``Polygon.within``.
    """
    child_outline, parent_outline = (
        Polygon(polygon_from_points(segment.get_Coords().points))
        for segment in (child, parent))
    return child_outline.within(parent_outline)

def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_merging):
wait_for_deletion = list()
reading_order = dict()
Expand Down Expand Up @@ -354,7 +294,13 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
# and use-cases in the future
superpoly = Polygon(polygon_from_points(superreg.get_Coords().points))
superpoly = superpoly.union(poly)
superreg.get_Coords().points = points_from_polygon(superpoly.exterior.coords)
if superpoly.type == 'MultiPolygon':
superpoly = superpoly.convex_hull
if superpoly.minimum_clearance < 1.0:
superpoly = asPolygon(np.round(superpoly.exterior.coords))
superpoly = make_valid(superpoly)
superpoly = superpoly.exterior.coords[:-1] # keep open
superreg.get_Coords().points = points_from_polygon(superpoly)
# FIXME should we merge/mix attributes and features?
if region.get_orientation() != superreg.get_orientation():
LOG.warning('Merging region "%s" with orientation %f into "%s" with %f',
Expand Down Expand Up @@ -399,3 +345,18 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
if region.parent_object_:
# remove in-place
region.parent_object_.get_TextRegion().remove(region)

def make_valid(polygon):
    """Try to make a shapely.geometry.Polygon valid by rearrangement and simplification.

    Two strategies are applied in sequence:

    1. While the polygon is neither valid nor becomes valid under maximal
       simplification, rebuild it from the same ring with another starting
       point (the outcome of simplification depends on where the ring
       starts).
    2. While the polygon is still invalid, simplify it with an increasing
       tolerance.

    Args:
        polygon: the shapely.geometry.Polygon to repair.

    Returns:
        The polygon itself if it already is valid, otherwise the rearranged
        and/or simplified polygon. If no tolerance up to (roughly) the
        polygon's area yields a valid shape, the last attempt is returned
        as is, so callers needing a guarantee should check ``is_valid``.
    """
    # Take the ring once, without its closing point (which merely repeats
    # the first one). Re-reading the coordinates of the already rotated
    # polygon would add a duplicate vertex per iteration and accumulate the
    # rotations, so that some starting points would never be tried.
    points = list(polygon.exterior.coords)[:-1]
    for split in range(1, len(points)):
        if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
            break
        # simplification may not be possible (at all) due to ordering
        # in that case, try another starting point
        polygon = Polygon(points[-split:] + points[:-split])
    # Round the upper bound up, so that at least tolerance 1 gets tried
    # even for polygons with a (near) zero area.
    max_tolerance = int(polygon.area + 1.5)
    for tolerance in range(1, max_tolerance + 1):
        if polygon.is_valid:
            break
        # simplification may require a larger tolerance
        polygon = polygon.simplify(tolerance)
    return polygon
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ocrd >= 2.13.1
Comment thread
bertsky marked this conversation as resolved.
Outdated
shapely
shapely >= 1.7.1
scikit-image
numpy