forked from OCR-D/ocrd_tesserocr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
segment_word.py
41 lines (32 loc) · 1.83 KB
/
segment_word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from __future__ import absolute_import
from ocrd_utils import getLogger
from ocrd_validators import ParameterValidator
from .config import OCRD_TOOL
from .recognize import TesserocrRecognize
# Key into OCRD_TOOL['tools'] for this processor's own tool description;
# used below as the default ``ocrd_tool`` keyword argument.
TOOL = 'ocrd-tesserocr-segment-word'
# Key into OCRD_TOOL['tools'] for the tool description that the rewritten
# parameters are validated against.
BASE_TOOL = 'ocrd-tesserocr-recognize'
class TesserocrSegmentWord(TesserocrRecognize):
    """Word segmentation processor built on top of ``TesserocrRecognize``.

    The class defines no processing logic of its own: its constructor only
    rewrites the parameters so that the inherited implementation runs with
    ``segmentation_level`` and ``textequiv_level`` both set to ``"word"``.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and translate the parameters.

        Args:
            *args: Passed through unchanged to ``TesserocrRecognize.__init__``.
            **kwargs: Passed through to ``TesserocrRecognize.__init__``.
                ``ocrd_tool`` defaults to the ``OCRD_TOOL['tools'][TOOL]``
                entry when the caller does not supply it.

        Raises:
            AssertionError: If the rewritten parameters do not validate
                against the ``OCRD_TOOL['tools'][BASE_TOOL]`` description.
        """
        kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL])
        super().__init__(*args, **kwargs)
        # The base constructor does not always leave a ``parameter``
        # attribute behind; only rewrite and validate when it did.
        if hasattr(self, 'parameter'):
            # Rename this tool's ``overwrite_words`` switch to the name the
            # base tool's parameter description uses.
            self.parameter['overwrite_segments'] = self.parameter['overwrite_words']
            del self.parameter['overwrite_words']
            self.parameter['segmentation_level'] = "word"
            self.parameter['textequiv_level'] = "word"
            # add default params
            # The validator is invoked as a plain statement rather than inside
            # an ``assert``: assert statements are removed under ``python -O``,
            # which would silently skip the validation call altogether.
            report = ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter)
            if not report.is_valid:
                # Same exception type the former ``assert`` produced.
                raise AssertionError(
                    'Invalid parameters for %s: %s' % (BASE_TOOL, report))
            self.logger = getLogger('processor.TesserocrSegmentWord')
# Replace the docstring reported for ``process`` with a description of the
# word-segmentation use case.
# NOTE(review): ``process`` is not defined on TesserocrSegmentWord in this
# file, so this attribute access resolves to the function inherited from
# TesserocrRecognize and the assignment changes that shared function's
# docstring as well -- confirm this is intended.
TesserocrSegmentWord.process.__doc__ = """Performs word segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the textline level,
and remove any existing Word elements.
Set up Tesseract to detect words, and add each one to the line
at the detected coordinates.
If ``shrink_polygons``, then during segmentation (on any level), query Tesseract
for all symbols/glyphs of each segment and calculate the convex hull for them.
Annotate the resulting polygon instead of the coarse bounding box.
(This is more precise and helps avoid overlaps between neighbours, especially
when not segmenting all levels at once.)
Produce a new output file by serialising the resulting hierarchy.
"""