Skip to content

Commit

Permalink
Merge pull request #67 from OCR-D/project-parent
Browse files Browse the repository at this point in the history
improve repair and project processors
  • Loading branch information
bertsky authored Mar 9, 2024
2 parents 3993139 + 3d9e0d6 commit d0d20dc
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 61 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## [Unreleased]

### Fixed

* repair/project: adapt to Shapely deprecations
* repair/project: more robust `join_polygons`, `make_intersection`, `make_valid`

### Changed

* :fire: require Shapely 2
* project: clip coords to parent's parent instead of parent
* repair (`sanitize`): shrink before attempting repair (hierarchical consistency)

## [0.1.22] - 2023-06-29

### Added
Expand Down
74 changes: 44 additions & 30 deletions ocrd_segment/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree
from shapely.geometry import Polygon, LineString
from shapely.geometry.polygon import orient
from shapely import set_precision
from shapely.ops import unary_union, nearest_points

from ocrd import Processor
Expand Down Expand Up @@ -120,22 +122,22 @@ def process(self):
content=to_xml(pcgts))

def _process_segment(self, segment, constituents, page_id):
"""Shrink segment outline to become the minimal convex hull of its constituent segments."""
"""Overwrite segment outline to become the minimal convex hull of its constituent segments."""
LOG = getLogger('processor.ProjectHull')
polygons = [make_valid(Polygon(polygon_from_points(constituent.get_Coords().points)))
for constituent in constituents]
polygon = join_polygons(polygons).buffer(self.parameter['padding']).exterior.coords[:-1]
# make sure the segment still fits into its parent's parent
if isinstance(segment, PageType):
oldborder = segment.Border
segment.Border = None # ensure interim parent is the page frame itself
# make sure the segment still fits into its own parent
polygon2 = polygon_for_parent(polygon, segment)
if polygon2 is None:
# ensure interim parent is the page frame itself
parent = PageType(**segment.__dict__)
parent.Border = None
else:
parent = segment.parent_object_
polygon = polygon_for_parent(polygon, parent)
if polygon is None:
LOG.info('Ignoring extant segment: %s', segment.id)
if isinstance(segment, PageType):
segment.Border = oldborder
else:
polygon = polygon2
points = points_from_polygon(polygon)
coords = CoordsType(points=points)
LOG.debug('Using new coordinates from %d constituents for segment "%s"',
Expand All @@ -152,11 +154,13 @@ def pairwise(iterable):

def join_polygons(polygons, scale=20):
"""construct concave hull (alpha shape) from input polygons by connecting their pairwise nearest points"""
# ensure input polygons are simply typed
polygons = list(itertools.chain.from_iterable([
poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection']
else [poly]
for poly in polygons]))
# ensure input polygons are simply typed and all oriented equally
polygons = [orient(poly)
for poly in itertools.chain.from_iterable(
[poly.geoms
if poly.geom_type in ['MultiPolygon', 'GeometryCollection']
else [poly]
for poly in polygons])]
npoly = len(polygons)
if npoly == 1:
return polygons[0]
Expand All @@ -175,16 +179,18 @@ def join_polygons(polygons, scale=20):
prevp = polygons[prevp]
nextp = polygons[nextp]
nearest = nearest_points(prevp, nextp)
bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1)
bridgep = orient(LineString(nearest).buffer(max(1, scale/5), resolution=1), -1)
polygons.append(bridgep)
jointp = unary_union(polygons)
assert jointp.type == 'Polygon', jointp.wkt
if jointp.minimum_clearance < 1.0:
# follow-up calculations will necessarily be integer;
# so anticipate rounding here and then ensure validity
jointp = Polygon(np.round(jointp.exterior.coords))
jointp = make_valid(jointp)
return jointp
assert jointp.geom_type == 'Polygon', jointp.wkt
# follow-up calculations will necessarily be integer;
# so anticipate rounding here and then ensure validity
jointp2 = set_precision(jointp, 1.0)
if jointp2.geom_type != 'Polygon' or not jointp2.is_valid:
jointp2 = Polygon(np.round(jointp.exterior.coords))
jointp2 = make_valid(jointp2)
assert jointp2.geom_type == 'Polygon', jointp2.wkt
return jointp2

def polygon_for_parent(polygon, parent):
"""Clip polygon to parent polygon range.
Expand Down Expand Up @@ -227,30 +233,38 @@ def make_intersection(poly1, poly2):
# post-process
if interp.is_empty or interp.area == 0.0:
return None
if interp.type == 'GeometryCollection':
if interp.geom_type == 'GeometryCollection':
# heterogeneous result: filter zero-area shapes (LineString, Point)
interp = unary_union([geom for geom in interp.geoms if geom.area > 0])
if interp.type == 'MultiPolygon':
if interp.geom_type == 'MultiPolygon':
# homogeneous result: construct convex hull to connect
interp = join_polygons(interp.geoms)
if interp.minimum_clearance < 1.0:
# follow-up calculations will necessarily be integer;
# so anticipate rounding here and then ensure validity
interp = Polygon(np.round(interp.exterior.coords))
interp = make_valid(interp)
# follow-up calculations will necessarily be integer;
# so anticipate rounding here and then ensure validity
interp = set_precision(interp, 1.0)
return interp

def make_valid(polygon):
"""Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement."""
points = list(polygon.exterior.coords)
# try by re-arranging points
for split in range(1, len(points)):
if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
break
# simplification may not be possible (at all) due to ordering
# in that case, try another starting point
polygon = Polygon(points[-split:]+points[:-split])
for tolerance in range(int(polygon.area)):
# try by simplification
for tolerance in range(int(polygon.area + 1.5)):
if polygon.is_valid:
break
# simplification may require a larger tolerance
polygon = polygon.simplify(tolerance + 1)
# try by enlarging
for tolerance in range(1, int(polygon.area + 2.5)):
if polygon.is_valid:
break
# enlargement may require a larger tolerance
polygon = polygon.buffer(tolerance)
assert polygon.is_valid, polygon.wkt
return polygon
45 changes: 15 additions & 30 deletions ocrd_segment/repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
PageValidator
)
from .config import OCRD_TOOL
from .project import join_polygons
from .project import join_polygons, make_valid

TOOL = 'ocrd-segment-repair'

Expand Down Expand Up @@ -115,6 +115,14 @@ def process(self):
pcgts.set_pcGtsId(file_id)
page = pcgts.get_Page()

# shrink/expand text regions to the hull of their text lines
if sanitize:
page_image, page_coords, _ = self.workspace.image_from_page(
page, page_id,
feature_selector='binarized',
feature_filter='clipped')
shrink_regions(page_image, page_coords, page, page_id,
padding=self.parameter['sanitize_padding'])
#
# validate segmentation (warn of children extending beyond their parents)
#
Expand Down Expand Up @@ -180,14 +188,6 @@ def process(self):
# delete/merge/split redundant text regions (or its text lines)
if plausibilize:
self.plausibilize_page(page, page_id)
# shrink/expand text regions to the hull of their text lines
if sanitize:
page_image, page_coords, _ = self.workspace.image_from_page(
page, page_id,
feature_selector='binarized',
feature_filter='clipped')
shrink_regions(page_image, page_coords, page, page_id,
padding=self.parameter['sanitize_padding'])

self.workspace.add_file(
ID=file_id,
Expand Down Expand Up @@ -482,7 +482,7 @@ def _plausibilize_segments(segpolys, rogroup, marked_for_deletion, marked_for_me
_tag_name(otherseg), otherseg.id)
otherpoly = make_valid(Polygon(polygon_from_points(otherseg.get_Coords().points)))
poly = poly.difference(otherpoly)
if poly.type == 'MultiPolygon':
if poly.geom_type == 'MultiPolygon':
poly = join_polygons(poly.geoms)
if poly.minimum_clearance < 1.0:
poly = Polygon(np.round(poly.exterior.coords))
Expand Down Expand Up @@ -556,8 +556,8 @@ def shrink_regions(page_image, page_coords, page, page_id, padding=0):
continue
# pick contour and convert to absolute:
region_polygon = join_polygons([make_valid(Polygon(contour[:, 0, ::]))
for contour in contours
if len(contour) >= 3], scale=scale)
for area, contour in zip(areas, contours)
if len(contour) >= 3 and area > 0], scale=scale)
if padding:
region_polygon = region_polygon.buffer(padding)
region_polygon = coordinates_for_segment(region_polygon.exterior.coords[:-1], page_image, page_coords)
Expand Down Expand Up @@ -599,7 +599,7 @@ def simplify(segment, tolerance=0):

def merge_poly(poly1, poly2):
poly = poly1.union(poly2)
if poly.type == 'MultiPolygon':
if poly.geom_type == 'MultiPolygon':
#poly = poly.convex_hull
poly = join_polygons(poly.geoms)
if poly.minimum_clearance < 1.0:
Expand All @@ -611,10 +611,10 @@ def clip_poly(poly1, poly2):
poly = poly1.intersection(poly2)
if poly.is_empty or poly.area == 0.0:
return None
if poly.type == 'GeometryCollection':
if poly.geom_type == 'GeometryCollection':
# heterogeneous result: filter zero-area shapes (LineString, Point)
poly = unary_union([geom for geom in poly.geoms if geom.area > 0])
if poly.type == 'MultiPolygon':
if poly.geom_type == 'MultiPolygon':
# homogeneous result: construct convex hull to connect
#poly = poly.convex_hull
poly = join_polygons(poly.geoms)
Expand Down Expand Up @@ -719,20 +719,5 @@ def ensure_valid(element):
points = points_from_polygon(polygon)
coords.set_points(points)

def make_valid(polygon):
"""Ensures shapely.geometry.Polygon object is valid by repeated simplification"""
for split in range(1, len(polygon.exterior.coords)-1):
if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
break
# simplification may not be possible (at all) due to ordering
# in that case, try another starting point
polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split])
for tolerance in range(1, int(polygon.area)):
if polygon.is_valid:
break
# simplification may require a larger tolerance
polygon = polygon.simplify(tolerance)
return polygon

def _tag_name(element):
return element.__class__.__name__[0:-4]
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ocrd >= 2.20.0
shapely >= 1.7.1
shapely >= 2.0
scikit-image
numpy
xlsxwriter
Expand Down

0 comments on commit d0d20dc

Please sign in to comment.