From 83baf9cc50917b7de9609a42376d99f8af24344e Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 13 Sep 2024 01:04:29 +0200
Subject: [PATCH 1/6] split up prediction to avoid overly large batches
 (causing OOM)

---
 ocrd_calamari/recognize.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 1ab11f5..35b5efb 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -43,6 +43,16 @@
 TOOL = "ocrd-calamari-recognize"
 
+BATCH_SIZE = 64
+if not hasattr(itertools, 'batched'):
+    def batched(iterable, n):
+        # batched('ABCDEFG', 3) → ABC DEF G
+        if n < 1:
+            raise ValueError('n must be at least one')
+        iterator = iter(iterable)
+        while batch := tuple(itertools.islice(iterator, n)):
+            yield batch
+    itertools.batched = batched
 
 class CalamariRecognize(Processor):
     def __init__(self, *args, **kwargs):
@@ -166,9 +176,11 @@ def process(self):
                     line_image_np = np.array(line_image, dtype=np.uint8)
                 line_images_np.append(line_image_np)
                 line_coordss.append(line_coords)
-            raw_results_all = self.predictor.predict_raw(
-                line_images_np, progress_bar=False
-            )
+
+            # avoid too large a batch size (causing OOM on CPU or GPU)
+            fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
+            raw_results_all = itertools.chain.from_iterable(
+                map(fun, itertools.batched(line_images_np, BATCH_SIZE)))
 
             for line, line_coords, raw_results in zip(
                 textlines, line_coordss, raw_results_all

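The chunking pattern introduced by this patch is easy to check in isolation: chaining the per-batch results back together yields exactly what one oversized predict_raw call would return, only with bounded peak memory. A minimal sketch with a dummy stand-in for the predictor (predict_raw below is illustrative, not Calamari's API):

    import itertools

    def batched(iterable, n):
        # backport of Python 3.12 itertools.batched, as in the hunk above
        if n < 1:
            raise ValueError('n must be at least one')
        iterator = iter(iterable)
        while batch := tuple(itertools.islice(iterator, n)):
            yield batch

    def predict_raw(images):
        # dummy predictor standing in for MultiPredictor.predict_raw
        return [2 * image for image in images]

    images = list(range(10))
    results = itertools.chain.from_iterable(
        map(predict_raw, batched(images, 4)))
    # same output as one big call, but at most 4 items per inference batch
    assert list(results) == predict_raw(images)
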
From 64b80a3a95cca313d2dc1f014b76cc69527f182e Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 18 Sep 2024 13:34:09 +0000
Subject: [PATCH 2/6] aggregate all lines instead of per region to better
 utilise batched predictor

---
 ocrd_calamari/recognize.py | 360 +++++++++++++++++++------------------
 1 file changed, 181 insertions(+), 179 deletions(-)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 35b5efb..293188a 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -73,7 +73,7 @@ def setup(self):
         """
         resolved = self.resolve_resource(self.parameter["checkpoint_dir"])
         checkpoints = glob("%s/*.ckpt.json" % resolved)
-        self.predictor = MultiPredictor(checkpoints=checkpoints)
+        self.predictor = MultiPredictor(checkpoints=checkpoints, batch_size=BATCH_SIZE)
 
         self.network_input_channels = self.predictor.predictors[
             0
@@ -119,6 +119,7 @@ def process(self):
             page, page_id, feature_selector=self.features
         )
 
+        lines = []
         for region in page.get_AllRegions(classes=["Text"]):
             region_image, region_coords = self.workspace.image_from_segment(
                 region, page_image, page_coords, feature_selector=self.features
@@ -130,8 +131,6 @@ def process(self):
                 len(textlines),
                 region.id,
             )
-            line_images_np = []
-            line_coordss = []
             for line in textlines:
                 log.debug(
                     "Recognizing line '%s' in region '%s'", line.id, region.id
@@ -171,195 +170,198 @@ def process(self):
                         line.id,
                         region.id,
                     )
-                    line_image_np = np.array([[0]], dtype=np.uint8)
-                else:
-                    line_image_np = np.array(line_image, dtype=np.uint8)
-                line_images_np.append(line_image_np)
-                line_coordss.append(line_coords)
-
-            # avoid too large a batch size (causing OOM on CPU or GPU)
-            fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
-            raw_results_all = itertools.chain.from_iterable(
-                map(fun, itertools.batched(line_images_np, BATCH_SIZE)))
-
-            for line, line_coords, raw_results in zip(
-                textlines, line_coordss, raw_results_all
-            ):
-                for i, p in enumerate(raw_results):
-                    p.prediction.id = "fold_{}".format(i)
-
-                prediction = self.voter.vote_prediction_result(raw_results)
-                prediction.id = "voted"
-
-                # Build line text on our own
-                #
-                # Calamari does whitespace post-processing on prediction.sentence,
-                # while it does not do the same on prediction.positions. Do it on
-                # our own to have consistency.
-                #
-                # XXX Check Calamari's built-in post-processing on
-                # prediction.sentence
-
-                def _sort_chars(p):
-                    """Filter and sort chars of prediction p"""
-                    chars = p.chars
-                    chars = [
-                        c for c in chars if c.char
-                    ]  # XXX Note that omission probabilities are not normalized?!
-                    chars = [
-                        c
-                        for c in chars
-                        if c.probability >= self.parameter["glyph_conf_cutoff"]
-                    ]
-                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
-                    return chars
-
-                def _drop_leading_spaces(positions):
-                    return list(
-                        itertools.dropwhile(
-                            lambda p: _sort_chars(p)[0].char == " ", positions
-                        )
+                    continue
+                lines.append((line, line_coords, np.array(line_image, dtype=np.uint8)))
+
+        if len(lines):
+            lines, coords, images = zip(*lines)
+        else:
+            log.warning("No text lines on page '%s'", page_id)
+            lines, coords, images = [], [], []
+
+        # not exposed in MultiPredictor yet, cf. calamari#361:
+        # raw_results_all = self.predictor.predict_raw(images, progress_bar=False, batch_size=BATCH_SIZE)
+        # avoid too large a batch size (causing OOM on CPU or GPU)
+        fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
+        results = itertools.chain.from_iterable(
+            map(fun, itertools.batched(images, BATCH_SIZE)))
+
+        for line, line_coords, raw_results in zip(lines, coords, results):
+            for i, p in enumerate(raw_results):
+                p.prediction.id = "fold_{}".format(i)
+
+            prediction = self.voter.vote_prediction_result(raw_results)
+            prediction.id = "voted"
+
+            # Build line text on our own
+            #
+            # Calamari does whitespace post-processing on prediction.sentence,
+            # while it does not do the same on prediction.positions. Do it on
+            # our own to have consistency.
+            #
+            # XXX Check Calamari's built-in post-processing on
+            # prediction.sentence
+
+            def _sort_chars(p):
+                """Filter and sort chars of prediction p"""
+                chars = p.chars
+                chars = [
+                    c for c in chars if c.char
+                ]  # XXX Note that omission probabilities are not normalized?!
+                chars = [
+                    c
+                    for c in chars
+                    if c.probability >= self.parameter["glyph_conf_cutoff"]
+                ]
+                chars = sorted(chars, key=lambda k: k.probability, reverse=True)
+                return chars
+
+            def _drop_leading_spaces(positions):
+                return list(
+                    itertools.dropwhile(
+                        lambda p: _sort_chars(p)[0].char == " ", positions
                     )
+                )
 
-                def _drop_trailing_spaces(positions):
-                    return list(reversed(_drop_leading_spaces(reversed(positions))))
-
-                def _drop_double_spaces(positions):
-                    def _drop_double_spaces_generator(positions):
-                        last_was_space = False
-                        for p in positions:
-                            if p.chars[0].char == " ":
-                                if not last_was_space:
-                                    yield p
-                                last_was_space = True
-                            else:
-                                yield p
-                                last_was_space = False
-
-                    return list(_drop_double_spaces_generator(positions))
-
-                positions = prediction.positions
-                positions = _drop_leading_spaces(positions)
-                positions = _drop_trailing_spaces(positions)
-                positions = _drop_double_spaces(positions)
-                positions = list(positions)
-
-                line_text = "".join(_sort_chars(p)[0].char for p in positions)
-                if line_text != prediction.sentence:
-                    log.warning(
-                        f"Our own line text is not the same as Calamari's:"
-                        f"'{line_text}' != '{prediction.sentence}'"
-                    )
+            def _drop_trailing_spaces(positions):
+                return list(reversed(_drop_leading_spaces(reversed(positions))))
 
-                # Delete existing results
-                if line.get_TextEquiv():
-                    log.warning("Line '%s' already contained text results", line.id)
-                    line.set_TextEquiv([])
-                if line.get_Word():
-                    log.warning(
-                        "Line '%s' already contained word segmentation", line.id
-                    )
-                    line.set_Word([])
+            def _drop_double_spaces(positions):
+                def _drop_double_spaces_generator(positions):
+                    last_was_space = False
+                    for p in positions:
+                        if p.chars[0].char == " ":
+                            if not last_was_space:
+                                yield p
+                            last_was_space = True
+                        else:
+                            yield p
+                            last_was_space = False
+
+                return list(_drop_double_spaces_generator(positions))
+
+            positions = prediction.positions
+            positions = _drop_leading_spaces(positions)
+            positions = _drop_trailing_spaces(positions)
+            positions = _drop_double_spaces(positions)
+            positions = list(positions)
+
+            line_text = "".join(_sort_chars(p)[0].char for p in positions)
+            if line_text != prediction.sentence:
+                log.warning(
+                    f"Our own line text is not the same as Calamari's:"
+                    f"'{line_text}' != '{prediction.sentence}'"
                 )
 
-                # Save line results
-                line_conf = prediction.avg_char_probability
-                line.set_TextEquiv(
-                    [TextEquivType(Unicode=line_text, conf=line_conf)]
                 )
 
+            # Delete existing results
+            if line.get_TextEquiv():
+                log.warning("Line '%s' already contained text results", line.id)
+                line.set_TextEquiv([])
+            if line.get_Word():
+                log.warning(
+                    "Line '%s' already contained word segmentation", line.id
                 )
+                line.set_Word([])
 
-                # Save word results
-                #
-                # Calamari OCR does not provide word positions, so we infer word
-                # positions from a. text segmentation and b. the glyph positions.
-                # This is necessary because the PAGE XML format enforces a strict
-                # hierarchy of lines > words > glyphs.
-
-                def _words(s):
-                    """Split words based on spaces and include spaces as 'words'"""
-                    spaces = None
-                    word = ""
-                    for c in s:
-                        if c == " " and spaces is True:
-                            word += c
-                        elif c != " " and spaces is False:
-                            word += c
-                        else:
-                            if word:
-                                yield word
-                            word = c
-                            spaces = c == " "
-                    yield word
-
-                if self.parameter["textequiv_level"] in ["word", "glyph"]:
-                    word_no = 0
-                    i = 0
-
-                    for word_text in _words(line_text):
-                        word_length = len(word_text)
-                        if not all(c == " " for c in word_text):
-                            word_positions = positions[i : i + word_length]
-                            word_start = word_positions[0].global_start
-                            word_end = word_positions[-1].global_end
-
-                            polygon = polygon_from_x0y0x1y1(
-                                [word_start, 0, word_end, line_image.height]
-                            )
-                            points = points_from_polygon(
-                                coordinates_for_segment(polygon, None, line_coords)
-                            )
-                            # XXX Crop to line polygon?
+            # Save line results
+            line_conf = prediction.avg_char_probability
+            line.set_TextEquiv(
+                [TextEquivType(Unicode=line_text, conf=line_conf)]
+            )
 
-                            word = WordType(
-                                id="%s_word%04d" % (line.id, word_no),
-                                Coords=CoordsType(points),
-                            )
-                            word.add_TextEquiv(TextEquivType(Unicode=word_text))
-
-                            if self.parameter["textequiv_level"] == "glyph":
-                                for glyph_no, p in enumerate(word_positions):
-                                    glyph_start = p.global_start
-                                    glyph_end = p.global_end
-
-                                    polygon = polygon_from_x0y0x1y1(
-                                        [
-                                            glyph_start,
-                                            0,
-                                            glyph_end,
-                                            line_image.height,
-                                        ]
-                                    )
-                                    points = points_from_polygon(
-                                        coordinates_for_segment(
-                                            polygon, None, line_coords
-                                        )
-                                    )
+            # Save word results
+            #
+            # Calamari OCR does not provide word positions, so we infer word
+            # positions from a. text segmentation and b. the glyph positions.
+            # This is necessary because the PAGE XML format enforces a strict
+            # hierarchy of lines > words > glyphs.
+
+            def _words(s):
+                """Split words based on spaces and include spaces as 'words'"""
+                spaces = None
+                word = ""
+                for c in s:
+                    if c == " " and spaces is True:
+                        word += c
+                    elif c != " " and spaces is False:
+                        word += c
+                    else:
+                        if word:
+                            yield word
+                        word = c
+                        spaces = c == " "
+                yield word
+
+            if self.parameter["textequiv_level"] in ["word", "glyph"]:
+                word_no = 0
+                i = 0
+
+                for word_text in _words(line_text):
+                    word_length = len(word_text)
+                    if not all(c == " " for c in word_text):
+                        word_positions = positions[i : i + word_length]
+                        word_start = word_positions[0].global_start
+                        word_end = word_positions[-1].global_end
+
+                        polygon = polygon_from_x0y0x1y1(
+                            [word_start, 0, word_end, line_image.height]
+                        )
+                        points = points_from_polygon(
+                            coordinates_for_segment(polygon, None, line_coords)
+                        )
+                        # XXX Crop to line polygon?
 
-                            glyph = GlyphType(
-                                id="%s_glyph%04d" % (word.id, glyph_no),
-                                Coords=CoordsType(points),
+                        word = WordType(
+                            id="%s_word%04d" % (line.id, word_no),
+                            Coords=CoordsType(points),
+                        )
+                        word.add_TextEquiv(TextEquivType(Unicode=word_text))
+
+                        if self.parameter["textequiv_level"] == "glyph":
+                            for glyph_no, p in enumerate(word_positions):
+                                glyph_start = p.global_start
+                                glyph_end = p.global_end
+
+                                polygon = polygon_from_x0y0x1y1(
+                                    [
+                                        glyph_start,
+                                        0,
+                                        glyph_end,
+                                        line_image.height,
+                                    ]
+                                )
+                                points = points_from_polygon(
+                                    coordinates_for_segment(
+                                        polygon, None, line_coords
                                     )
+                                )
+
+                                glyph = GlyphType(
+                                    id="%s_glyph%04d" % (word.id, glyph_no),
+                                    Coords=CoordsType(points),
+                                )
 
-                                    # Add predictions (= TextEquivs)
-                                    char_index_start = 1
-                                    # Index must start with 1, see
-                                    # https://ocr-d.github.io/page#multiple-textequivs
-                                    for char_index, char in enumerate(
-                                        _sort_chars(p), start=char_index_start
-                                    ):
-                                        glyph.add_TextEquiv(
-                                            TextEquivType(
-                                                Unicode=char.char,
-                                                index=char_index,
-                                                conf=char.probability,
-                                            )
+                                # Add predictions (= TextEquivs)
+                                char_index_start = 1
+                                # Index must start with 1, see
+                                # https://ocr-d.github.io/page#multiple-textequivs
+                                for char_index, char in enumerate(
+                                    _sort_chars(p), start=char_index_start
+                                ):
+                                    glyph.add_TextEquiv(
+                                        TextEquivType(
+                                            Unicode=char.char,
+                                            index=char_index,
+                                            conf=char.probability,
                                         )
+                                    )
 
-                                    word.add_Glyph(glyph)
+                                word.add_Glyph(glyph)
 
-                            line.add_Word(word)
-                            word_no += 1
+                        line.add_Word(word)
+                        word_no += 1
 
-                        i += word_length
+                    i += word_length
 
         _page_update_higher_textequiv_levels("line", pcgts)

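Patch 2 moves the word segmentation wholesale; the key invariant sits in the _words helper, which yields alternating word and whitespace tokens whose lengths sum to the original string length, keeping the positions[i : i + word_length] slices aligned. A quick standalone check (the function body is copied verbatim from the hunk above):

    def _words(s):
        """Split words based on spaces and include spaces as 'words'"""
        spaces = None
        word = ""
        for c in s:
            if c == " " and spaces is True:
                word += c
            elif c != " " and spaces is False:
                word += c
            else:
                if word:
                    yield word
                word = c
                spaces = c == " "
        yield word

    # words and whitespace runs alternate, and their lengths add up,
    # so glyph positions can be sliced per token without gaps
    assert list(_words("foo  bar")) == ["foo", "  ", "bar"]
    assert sum(len(w) for w in _words("foo  bar")) == len("foo  bar")
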
From ac04adc4d8bb2918fa61db8f5bd1f12f61a0f4f8 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 18 Sep 2024 13:35:13 +0000
Subject: [PATCH 3/6] require Calamari v1.0.7, no more need for model fixup

---
 Makefile         | 2 --
 requirements.txt | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index f3164dc..cfa7cd8 100644
--- a/Makefile
+++ b/Makefile
@@ -40,8 +40,6 @@ install:
 
 $(MODEL):
 	ocrd resmgr download ocrd-calamari-recognize $@
-	# Workaround, see #91 https://github.com/OCR-D/ocrd_calamari/issues/91
-	fix-calamari1-model ~/.local/share/ocrd-resources/ocrd-calamari-recognize/$@
 
 # Download example data (for the README)
 example: $(EXAMPLE)

diff --git a/requirements.txt b/requirements.txt
index 5eebd46..3a96004 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 tensorflow >= 2.5.0, < 2.16
 numpy
-calamari-ocr == 1.0.*, >= 1.0.6
+calamari-ocr == 1.0.*, >= 1.0.7
 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
 click
 ocrd >= 2.54.0

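The combined version specifier keeps the dependency inside the 1.0 series while raising the floor to the fixed release. If in doubt what it admits, it can be evaluated with the packaging library (not a dependency of this repo, used here only for illustration):

    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet("==1.0.*,>=1.0.7")
    assert "1.0.7" in spec and "1.0.12" in spec
    assert "1.0.6" not in spec  # would still need the model fixup
    assert "1.1.0" not in spec  # outside the pinned 1.0 series
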
""" + log = getLogger("processor.CalamariRecognize") + devices = tensorflow_config.list_physical_devices("GPU") + for device in devices: + log.info("using GPU device %s", device) + tensorflow_config.experimental.set_memory_growth(device, True) resolved = self.resolve_resource(self.parameter["checkpoint_dir"]) checkpoints = glob("%s/*.ckpt.json" % resolved) self.predictor = MultiPredictor(checkpoints=checkpoints, batch_size=BATCH_SIZE) + log.info("loaded model %s", resolved) self.network_input_channels = self.predictor.predictors[ 0 From 45e20b150fd1605d95268ebf400c35989789f145 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:05:04 +0200 Subject: [PATCH 5/6] batching: split up batches if too long img widths --- ocrd_calamari/recognize.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 3707358..509841f 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -45,15 +45,19 @@ TOOL = "ocrd-calamari-recognize" BATCH_SIZE = 64 -if not hasattr(itertools, 'batched'): - def batched(iterable, n): - # batched('ABCDEFG', 3) → ABC DEF G - if n < 1: - raise ValueError('n must be at least one') - iterator = iter(iterable) - while batch := tuple(itertools.islice(iterator, n)): +def batched(iterable, n): + # batched('ABCDEFG', 3) → ABC DEF G + if n < 1: + raise ValueError('n must be at least one') + iterator = iter(iterable) + while batch := tuple(itertools.islice(iterator, n)): + # implement poor man's batch bucketing to avoid OOM: + maxlen = max(image.shape[1] for image in batch) + if maxlen * n > 32000 and n > 1: + yield from batched(batch, n//2) + else: yield batch - itertools.batched = batched +itertools.batched = batched class CalamariRecognize(Processor): def __init__(self, *args, **kwargs): From 842bd9287ed0673a76f4cb22a98675236adb9909 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 27 Sep 2024 15:27:00 +0200 Subject: [PATCH 6/6] avoid overriding itertools.batched, if any --- ocrd_calamari/recognize.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 509841f..edf69b9 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -45,7 +45,8 @@ TOOL = "ocrd-calamari-recognize" BATCH_SIZE = 64 -def batched(iterable, n): + +def batched_length_limited(iterable, n, limit=32000): # batched('ABCDEFG', 3) → ABC DEF G if n < 1: raise ValueError('n must be at least one') @@ -53,11 +54,10 @@ def batched(iterable, n): while batch := tuple(itertools.islice(iterator, n)): # implement poor man's batch bucketing to avoid OOM: maxlen = max(image.shape[1] for image in batch) - if maxlen * n > 32000 and n > 1: - yield from batched(batch, n//2) + if maxlen * n > limit and n > 1: + yield from batched_length_limited(batch, n//2) else: yield batch -itertools.batched = batched class CalamariRecognize(Processor): def __init__(self, *args, **kwargs): @@ -195,7 +195,7 @@ def process(self): # avoid too large a batch size (causing OOM on CPU or GPU) fun = lambda x: self.predictor.predict_raw(x, progress_bar=False) results = itertools.chain.from_iterable( - map(fun, itertools.batched(images, BATCH_SIZE))) + map(fun, batched_length_limited(images, BATCH_SIZE))) for line, line_coords, raw_results in zip(lines, coords, results): for i, p in enumerate(raw_results):