From 83baf9cc50917b7de9609a42376d99f8af24344e Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 13 Sep 2024 01:04:29 +0200
Subject: [PATCH 1/6] split up prediction to avoid overly large batches
 (causing OOM)

---
 ocrd_calamari/recognize.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 1ab11f5..35b5efb 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -43,6 +43,16 @@
 TOOL = "ocrd-calamari-recognize"
 
+BATCH_SIZE = 64
+if not hasattr(itertools, 'batched'):
+    def batched(iterable, n):
+        # batched('ABCDEFG', 3) → ABC DEF G
+        if n < 1:
+            raise ValueError('n must be at least one')
+        iterator = iter(iterable)
+        while batch := tuple(itertools.islice(iterator, n)):
+            yield batch
+    itertools.batched = batched
 
 class CalamariRecognize(Processor):
     def __init__(self, *args, **kwargs):
@@ -166,9 +176,11 @@ def process(self):
                     line_image_np = np.array(line_image, dtype=np.uint8)
                 line_images_np.append(line_image_np)
                 line_coordss.append(line_coords)
-            raw_results_all = self.predictor.predict_raw(
-                line_images_np, progress_bar=False
-            )
+
+            # avoid too large a batch size (causing OOM on CPU or GPU)
+            fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
+            raw_results_all = itertools.chain.from_iterable(
+                map(fun, itertools.batched(line_images_np, BATCH_SIZE)))
 
             for line, line_coords, raw_results in zip(
                 textlines, line_coordss, raw_results_all

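The chunking pattern introduced by this patch is easy to check in isolation: chaining the per-batch results back together yields exactly what one oversized predict_raw call would return, only with bounded peak memory. A minimal sketch with a dummy stand-in for the predictor (predict_raw below is illustrative, not Calamari's API):

    import itertools

    def batched(iterable, n):
        # backport of Python 3.12 itertools.batched, as in the hunk above
        if n < 1:
            raise ValueError('n must be at least one')
        iterator = iter(iterable)
        while batch := tuple(itertools.islice(iterator, n)):
            yield batch

    def predict_raw(images):
        # dummy predictor standing in for MultiPredictor.predict_raw
        return [2 * image for image in images]

    images = list(range(10))
    results = itertools.chain.from_iterable(
        map(predict_raw, batched(images, 4)))
    # same output as one big call, but at most 4 items per inference batch
    assert list(results) == predict_raw(images)
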
From 64b80a3a95cca313d2dc1f014b76cc69527f182e Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 18 Sep 2024 13:34:09 +0000
Subject: [PATCH 2/6] aggregate all lines instead of per region to better
 utilise batched predictor

---
 ocrd_calamari/recognize.py | 360 +++++++++++++++++++------------------
 1 file changed, 181 insertions(+), 179 deletions(-)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 35b5efb..293188a 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -73,7 +73,7 @@ def setup(self):
         """
         resolved = self.resolve_resource(self.parameter["checkpoint_dir"])
         checkpoints = glob("%s/*.ckpt.json" % resolved)
-        self.predictor = MultiPredictor(checkpoints=checkpoints)
+        self.predictor = MultiPredictor(checkpoints=checkpoints, batch_size=BATCH_SIZE)
 
         self.network_input_channels = self.predictor.predictors[
             0
@@ -119,6 +119,7 @@ def process(self):
             page, page_id, feature_selector=self.features
         )
 
+        lines = []
         for region in page.get_AllRegions(classes=["Text"]):
             region_image, region_coords = self.workspace.image_from_segment(
                 region, page_image, page_coords, feature_selector=self.features
@@ -130,8 +131,6 @@ def process(self):
                 len(textlines),
                 region.id,
             )
-            line_images_np = []
-            line_coordss = []
             for line in textlines:
                 log.debug(
                     "Recognizing line '%s' in region '%s'", line.id, region.id
@@ -171,195 +170,198 @@ def process(self):
                         line.id,
                         region.id,
                     )
-                    line_image_np = np.array([[0]], dtype=np.uint8)
-                else:
-                    line_image_np = np.array(line_image, dtype=np.uint8)
-                line_images_np.append(line_image_np)
-                line_coordss.append(line_coords)
-
-            # avoid too large a batch size (causing OOM on CPU or GPU)
-            fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
-            raw_results_all = itertools.chain.from_iterable(
-                map(fun, itertools.batched(line_images_np, BATCH_SIZE)))
-
-            for line, line_coords, raw_results in zip(
-                textlines, line_coordss, raw_results_all
-            ):
-                for i, p in enumerate(raw_results):
-                    p.prediction.id = "fold_{}".format(i)
-
-                prediction = self.voter.vote_prediction_result(raw_results)
-                prediction.id = "voted"
-
-                # Build line text on our own
-                #
-                # Calamari does whitespace post-processing on prediction.sentence,
-                # while it does not do the same on prediction.positions. Do it on
-                # our own to have consistency.
-                #
-                # XXX Check Calamari's built-in post-processing on
-                # prediction.sentence
-
-                def _sort_chars(p):
-                    """Filter and sort chars of prediction p"""
-                    chars = p.chars
-                    chars = [
-                        c for c in chars if c.char
-                    ]  # XXX Note that omission probabilities are not normalized?!
-                    chars = [
-                        c
-                        for c in chars
-                        if c.probability >= self.parameter["glyph_conf_cutoff"]
-                    ]
-                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
-                    return chars
-
-                def _drop_leading_spaces(positions):
-                    return list(
-                        itertools.dropwhile(
-                            lambda p: _sort_chars(p)[0].char == " ", positions
-                        )
+                    continue
+                lines.append((line, line_coords, np.array(line_image, dtype=np.uint8)))
+
+        if len(lines):
+            lines, coords, images = zip(*lines)
+        else:
+            log.warning("No text lines on page '%s'", page_id)
+            lines, coords, images = [], [], []
+
+        # not exposed in MultiPredictor yet, cf. calamari#361:
+        # raw_results_all = self.predictor.predict_raw(images, progress_bar=False, batch_size=BATCH_SIZE)
+        # avoid too large a batch size (causing OOM on CPU or GPU)
+        fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
+        results = itertools.chain.from_iterable(
+            map(fun, itertools.batched(images, BATCH_SIZE)))
+
+        for line, line_coords, raw_results in zip(lines, coords, results):
+            for i, p in enumerate(raw_results):
+                p.prediction.id = "fold_{}".format(i)
+
+            prediction = self.voter.vote_prediction_result(raw_results)
+            prediction.id = "voted"
+
+            # Build line text on our own
+            #
+            # Calamari does whitespace post-processing on prediction.sentence,
+            # while it does not do the same on prediction.positions. Do it on
+            # our own to have consistency.
+            #
+            # XXX Check Calamari's built-in post-processing on
+            # prediction.sentence
+
+            def _sort_chars(p):
+                """Filter and sort chars of prediction p"""
+                chars = p.chars
+                chars = [
+                    c for c in chars if c.char
+                ]  # XXX Note that omission probabilities are not normalized?!
+                chars = [
+                    c
+                    for c in chars
+                    if c.probability >= self.parameter["glyph_conf_cutoff"]
+                ]
+                chars = sorted(chars, key=lambda k: k.probability, reverse=True)
+                return chars
+
+            def _drop_leading_spaces(positions):
+                return list(
+                    itertools.dropwhile(
+                        lambda p: _sort_chars(p)[0].char == " ", positions
                     )
+                )
 
-                def _drop_trailing_spaces(positions):
-                    return list(reversed(_drop_leading_spaces(reversed(positions))))
-
-                def _drop_double_spaces(positions):
-                    def _drop_double_spaces_generator(positions):
-                        last_was_space = False
-                        for p in positions:
-                            if p.chars[0].char == " ":
-                                if not last_was_space:
-                                    yield p
-                                last_was_space = True
-                            else:
-                                yield p
-                                last_was_space = False
-
-                    return list(_drop_double_spaces_generator(positions))
-
-                positions = prediction.positions
-                positions = _drop_leading_spaces(positions)
-                positions = _drop_trailing_spaces(positions)
-                positions = _drop_double_spaces(positions)
-                positions = list(positions)
-
-                line_text = "".join(_sort_chars(p)[0].char for p in positions)
-                if line_text != prediction.sentence:
-                    log.warning(
-                        f"Our own line text is not the same as Calamari's:"
-                        f"'{line_text}' != '{prediction.sentence}'"
-                    )
+            def _drop_trailing_spaces(positions):
+                return list(reversed(_drop_leading_spaces(reversed(positions))))
 
-                # Delete existing results
-                if line.get_TextEquiv():
-                    log.warning("Line '%s' already contained text results", line.id)
-                    line.set_TextEquiv([])
-                if line.get_Word():
-                    log.warning(
-                        "Line '%s' already contained word segmentation", line.id
-                    )
-                    line.set_Word([])
+            def _drop_double_spaces(positions):
+                def _drop_double_spaces_generator(positions):
+                    last_was_space = False
+                    for p in positions:
+                        if p.chars[0].char == " ":
+                            if not last_was_space:
+                                yield p
+                            last_was_space = True
+                        else:
+                            yield p
+                            last_was_space = False
+
+                return list(_drop_double_spaces_generator(positions))
+
+            positions = prediction.positions
+            positions = _drop_leading_spaces(positions)
+            positions = _drop_trailing_spaces(positions)
+            positions = _drop_double_spaces(positions)
+            positions = list(positions)
+
+            line_text = "".join(_sort_chars(p)[0].char for p in positions)
+            if line_text != prediction.sentence:
+                log.warning(
+                    f"Our own line text is not the same as Calamari's:"
+                    f"'{line_text}' != '{prediction.sentence}'"
                 )
 
-                # Save line results
-                line_conf = prediction.avg_char_probability
-                line.set_TextEquiv(
-                    [TextEquivType(Unicode=line_text, conf=line_conf)]
                 )
 
+            # Delete existing results
+            if line.get_TextEquiv():
+                log.warning("Line '%s' already contained text results", line.id)
+                line.set_TextEquiv([])
+            if line.get_Word():
+                log.warning(
+                    "Line '%s' already contained word segmentation", line.id
                 )
+                line.set_Word([])
 
-                # Save word results
-                #
-                # Calamari OCR does not provide word positions, so we infer word
-                # positions from a. text segmentation and b. the glyph positions.
-                # This is necessary because the PAGE XML format enforces a strict
-                # hierarchy of lines > words > glyphs.
-
-                def _words(s):
-                    """Split words based on spaces and include spaces as 'words'"""
-                    spaces = None
-                    word = ""
-                    for c in s:
-                        if c == " " and spaces is True:
-                            word += c
-                        elif c != " " and spaces is False:
-                            word += c
-                        else:
-                            if word:
-                                yield word
-                            word = c
-                            spaces = c == " "
-                    yield word
-
-                if self.parameter["textequiv_level"] in ["word", "glyph"]:
-                    word_no = 0
-                    i = 0
-
-                    for word_text in _words(line_text):
-                        word_length = len(word_text)
-                        if not all(c == " " for c in word_text):
-                            word_positions = positions[i : i + word_length]
-                            word_start = word_positions[0].global_start
-                            word_end = word_positions[-1].global_end
-
-                            polygon = polygon_from_x0y0x1y1(
-                                [word_start, 0, word_end, line_image.height]
-                            )
-                            points = points_from_polygon(
-                                coordinates_for_segment(polygon, None, line_coords)
-                            )
-                            # XXX Crop to line polygon?
+            # Save line results
+            line_conf = prediction.avg_char_probability
+            line.set_TextEquiv(
+                [TextEquivType(Unicode=line_text, conf=line_conf)]
+            )
 
-                            word = WordType(
-                                id="%s_word%04d" % (line.id, word_no),
-                                Coords=CoordsType(points),
-                            )
-                            word.add_TextEquiv(TextEquivType(Unicode=word_text))
-
-                            if self.parameter["textequiv_level"] == "glyph":
-                                for glyph_no, p in enumerate(word_positions):
-                                    glyph_start = p.global_start
-                                    glyph_end = p.global_end
-
-                                    polygon = polygon_from_x0y0x1y1(
-                                        [
-                                            glyph_start,
-                                            0,
-                                            glyph_end,
-                                            line_image.height,
-                                        ]
-                                    )
-                                    points = points_from_polygon(
-                                        coordinates_for_segment(
-                                            polygon, None, line_coords
-                                        )
-                                    )
+            # Save word results
+            #
+            # Calamari OCR does not provide word positions, so we infer word
+            # positions from a. text segmentation and b. the glyph positions.
+            # This is necessary because the PAGE XML format enforces a strict
+            # hierarchy of lines > words > glyphs.
+
+            def _words(s):
+                """Split words based on spaces and include spaces as 'words'"""
+                spaces = None
+                word = ""
+                for c in s:
+                    if c == " " and spaces is True:
+                        word += c
+                    elif c != " " and spaces is False:
+                        word += c
+                    else:
+                        if word:
+                            yield word
+                        word = c
+                        spaces = c == " "
+                yield word
+
+            if self.parameter["textequiv_level"] in ["word", "glyph"]:
+                word_no = 0
+                i = 0
+
+                for word_text in _words(line_text):
+                    word_length = len(word_text)
+                    if not all(c == " " for c in word_text):
+                        word_positions = positions[i : i + word_length]
+                        word_start = word_positions[0].global_start
+                        word_end = word_positions[-1].global_end
+
+                        polygon = polygon_from_x0y0x1y1(
+                            [word_start, 0, word_end, line_image.height]
+                        )
+                        points = points_from_polygon(
+                            coordinates_for_segment(polygon, None, line_coords)
+                        )
+                        # XXX Crop to line polygon?
 
-                            glyph = GlyphType(
-                                id="%s_glyph%04d" % (word.id, glyph_no),
-                                Coords=CoordsType(points),
+                        word = WordType(
+                            id="%s_word%04d" % (line.id, word_no),
+                            Coords=CoordsType(points),
+                        )
+                        word.add_TextEquiv(TextEquivType(Unicode=word_text))
+
+                        if self.parameter["textequiv_level"] == "glyph":
+                            for glyph_no, p in enumerate(word_positions):
+                                glyph_start = p.global_start
+                                glyph_end = p.global_end
+
+                                polygon = polygon_from_x0y0x1y1(
+                                    [
+                                        glyph_start,
+                                        0,
+                                        glyph_end,
+                                        line_image.height,
+                                    ]
+                                )
+                                points = points_from_polygon(
+                                    coordinates_for_segment(
+                                        polygon, None, line_coords
                                     )
+                                )
+
+                                glyph = GlyphType(
+                                    id="%s_glyph%04d" % (word.id, glyph_no),
+                                    Coords=CoordsType(points),
+                                )
 
-                                    # Add predictions (= TextEquivs)
-                                    char_index_start = 1
-                                    # Index must start with 1, see
-                                    # https://ocr-d.github.io/page#multiple-textequivs
-                                    for char_index, char in enumerate(
-                                        _sort_chars(p), start=char_index_start
-                                    ):
-                                        glyph.add_TextEquiv(
-                                            TextEquivType(
-                                                Unicode=char.char,
-                                                index=char_index,
-                                                conf=char.probability,
-                                            )
+                                # Add predictions (= TextEquivs)
+                                char_index_start = 1
+                                # Index must start with 1, see
+                                # https://ocr-d.github.io/page#multiple-textequivs
+                                for char_index, char in enumerate(
+                                    _sort_chars(p), start=char_index_start
+                                ):
+                                    glyph.add_TextEquiv(
+                                        TextEquivType(
+                                            Unicode=char.char,
+                                            index=char_index,
+                                            conf=char.probability,
                                         )
+                                    )
 
-                                    word.add_Glyph(glyph)
+                                word.add_Glyph(glyph)
 
-                            line.add_Word(word)
-                            word_no += 1
+                        line.add_Word(word)
+                        word_no += 1
 
-                        i += word_length
+                    i += word_length
 
         _page_update_higher_textequiv_levels("line", pcgts)

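Patch 2 moves the word segmentation wholesale; the key invariant sits in the _words helper, which yields alternating word and whitespace tokens whose lengths sum to the original string length, keeping the positions[i : i + word_length] slices aligned. A quick standalone check (the function body is copied verbatim from the hunk above):

    def _words(s):
        """Split words based on spaces and include spaces as 'words'"""
        spaces = None
        word = ""
        for c in s:
            if c == " " and spaces is True:
                word += c
            elif c != " " and spaces is False:
                word += c
            else:
                if word:
                    yield word
                word = c
                spaces = c == " "
        yield word

    # words and whitespace runs alternate, and their lengths add up,
    # so glyph positions can be sliced per token without gaps
    assert list(_words("foo  bar")) == ["foo", "  ", "bar"]
    assert sum(len(w) for w in _words("foo  bar")) == len("foo  bar")
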
From ac04adc4d8bb2918fa61db8f5bd1f12f61a0f4f8 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 18 Sep 2024 13:35:13 +0000
Subject: [PATCH 3/6] require Calamari v1.0.7, no more need for model fixup

---
 Makefile         | 2 --
 requirements.txt | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index f3164dc..cfa7cd8 100644
--- a/Makefile
+++ b/Makefile
@@ -40,8 +40,6 @@ install:
 
 $(MODEL):
 	ocrd resmgr download ocrd-calamari-recognize $@
-	# Workaround, see #91 https://github.com/OCR-D/ocrd_calamari/issues/91
-	fix-calamari1-model ~/.local/share/ocrd-resources/ocrd-calamari-recognize/$@
 
 # Download example data (for the README)
 example: $(EXAMPLE)

diff --git a/requirements.txt b/requirements.txt
index 5eebd46..3a96004 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 tensorflow >= 2.5.0, < 2.16
 numpy
-calamari-ocr == 1.0.*, >= 1.0.6
+calamari-ocr == 1.0.*, >= 1.0.7
 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
 click
 ocrd >= 2.54.0

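The combined version specifier keeps the dependency inside the 1.0 series while raising the floor to the fixed release. If in doubt what it admits, it can be evaluated with the packaging library (not a dependency of this repo, used here only for illustration):

    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet("==1.0.*,>=1.0.7")
    assert "1.0.7" in spec and "1.0.12" in spec
    assert "1.0.6" not in spec  # would still need the model fixup
    assert "1.1.0" not in spec  # outside the pinned 1.0 series
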
""" + log = getLogger("processor.CalamariRecognize") + devices = tensorflow_config.list_physical_devices("GPU") + for device in devices: + log.info("using GPU device %s", device) + tensorflow_config.experimental.set_memory_growth(device, True) resolved = self.resolve_resource(self.parameter["checkpoint_dir"]) checkpoints = glob("%s/*.ckpt.json" % resolved) self.predictor = MultiPredictor(checkpoints=checkpoints, batch_size=BATCH_SIZE) + log.info("loaded model %s", resolved) self.network_input_channels = self.predictor.predictors[ 0 From 45e20b150fd1605d95268ebf400c35989789f145 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:05:04 +0200 Subject: [PATCH 5/6] batching: split up batches if too long img widths --- ocrd_calamari/recognize.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 3707358..509841f 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -45,15 +45,19 @@ TOOL = "ocrd-calamari-recognize" BATCH_SIZE = 64 -if not hasattr(itertools, 'batched'): - def batched(iterable, n): - # batched('ABCDEFG', 3) → ABC DEF G - if n < 1: - raise ValueError('n must be at least one') - iterator = iter(iterable) - while batch := tuple(itertools.islice(iterator, n)): +def batched(iterable, n): + # batched('ABCDEFG', 3) → ABC DEF G + if n < 1: + raise ValueError('n must be at least one') + iterator = iter(iterable) + while batch := tuple(itertools.islice(iterator, n)): + # implement poor man's batch bucketing to avoid OOM: + maxlen = max(image.shape[1] for image in batch) + if maxlen * n > 32000 and n > 1: + yield from batched(batch, n//2) + else: yield batch - itertools.batched = batched +itertools.batched = batched class CalamariRecognize(Processor): def __init__(self, *args, **kwargs): From 842bd9287ed0673a76f4cb22a98675236adb9909 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 27 Sep 2024 15:27:00 +0200 Subject: [PATCH 6/6] avoid overriding itertools.batched, if any --- ocrd_calamari/recognize.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 509841f..edf69b9 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -45,7 +45,8 @@ TOOL = "ocrd-calamari-recognize" BATCH_SIZE = 64 -def batched(iterable, n): + +def batched_length_limited(iterable, n, limit=32000): # batched('ABCDEFG', 3) → ABC DEF G if n < 1: raise ValueError('n must be at least one') @@ -53,11 +54,10 @@ def batched(iterable, n): while batch := tuple(itertools.islice(iterator, n)): # implement poor man's batch bucketing to avoid OOM: maxlen = max(image.shape[1] for image in batch) - if maxlen * n > 32000 and n > 1: - yield from batched(batch, n//2) + if maxlen * n > limit and n > 1: + yield from batched_length_limited(batch, n//2) else: yield batch -itertools.batched = batched class CalamariRecognize(Processor): def __init__(self, *args, **kwargs): @@ -195,7 +195,7 @@ def process(self): # avoid too large a batch size (causing OOM on CPU or GPU) fun = lambda x: self.predictor.predict_raw(x, progress_bar=False) results = itertools.chain.from_iterable( - map(fun, itertools.batched(images, BATCH_SIZE))) + map(fun, batched_length_limited(images, BATCH_SIZE))) for line, line_coords, raw_results in zip(lines, coords, results): for i, p in enumerate(raw_results):