feat: optionally split alignment visualization (#110)

* feat: optionally split alignment visualization
jitsi · Feb 2, 2025 · 2a036a1 · 2a036a1
1 parent 4113e3f
commit 2a036a1
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 4 deletions.
diff --git a/src/jiwer/alignment.py b/src/jiwer/alignment.py
@@ -21,7 +21,7 @@
 pairs.
 """
 
-from typing import List, Union
+from typing import List, Union, Optional
 
 from jiwer.process import CharacterOutput, WordOutput, AlignmentChunk
 
@@ -32,6 +32,7 @@ def visualize_alignment(
     output: Union[WordOutput, CharacterOutput],
     show_measures: bool = True,
     skip_correct: bool = True,
+    line_width: Optional[int] = None,
 ) -> str:
     """
     Visualize the output of [jiwer.process_words][process.process_words] and
@@ -45,6 +46,7 @@ def visualize_alignment(
         show_measures: If enabled, the visualization will include measures like the WER
                        or CER
         skip_correct: If enabled, the visualization will exclude correct reference and hypothesis pairs
+        line_width: If set, try, at best effort, to spit sentences into multiple lines if they exceed the width.
 
     Returns:
         (str): The visualization as a string
@@ -95,6 +97,18 @@ def visualize_alignment(
         HYP: quite * bit of an even longest sentence here
                    D         I    I       S             I
         ```
+
+        When setting `line_width=80`, the following output will be split into multiple lines:
+
+        ```txt
+        sentence 1
+        REF: This is a very  long sentence that is *** much longer than the previous one
+        HYP: This is a very loong sentence that is not much longer than the previous one
+                                S                    I
+        REF: or the one before that
+        HYP: or *** one before that
+                  D
+        ```
     """
     references = output.references
     hypothesis = output.hypotheses
@@ -110,7 +124,7 @@ def visualize_alignment(
 
         final_str += f"sentence {idx+1}\n"
         final_str += _construct_comparison_string(
-            gt, hp, chunks, include_space_seperator=not is_cer
+            gt, hp, chunks, include_space_seperator=not is_cer, line_width=line_width
         )
         final_str += "\n"
 
@@ -140,10 +154,12 @@ def _construct_comparison_string(
     hypothesis: List[str],
     ops: List[AlignmentChunk],
     include_space_seperator: bool = False,
+    line_width: Optional[int] = None,
 ) -> str:
     ref_str = "REF: "
     hyp_str = "HYP: "
     op_str = "     "
+    agg_str = ""  # aggregate string for max_chars split
 
     for op in ops:
         if op.type == "equal" or op.type == "substitute":
@@ -165,6 +181,19 @@ def _construct_comparison_string(
         for rf, hp, c in zip(ref, hyp, op_chars):
             str_len = max(len(rf), len(hp), len(c))
 
+            if line_width is not None:
+                if len(ref_str) + str_len > line_width:
+                    # aggregate the strings
+                    if include_space_seperator:
+                        agg_str += f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n\n"
+                    else:
+                        agg_str += f"{ref_str}\n{hyp_str}\n{op_str}\n\n"
+
+                    # reset the strings
+                    ref_str = "REF: "
+                    hyp_str = "HYP: "
+                    op_str = "     "
+
             if rf == "*":
                 rf = "".join(["*"] * str_len)
             elif hp == "*":
@@ -181,6 +210,6 @@ def _construct_comparison_string(
 
     if include_space_seperator:
         # remove last space
-        return f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n"
+        return agg_str + f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n"
     else:
-        return f"{ref_str}\n{hyp_str}\n{op_str}\n"
+        return agg_str + f"{ref_str}\n{hyp_str}\n{op_str}\n"
diff --git a/tests/test_alignment.py b/tests/test_alignment.py
@@ -1,5 +1,6 @@
 import unittest
 import jiwer
+from jiwer import visualize_alignment
 
 
 class TestAlignmentVisualizationWords(unittest.TestCase):
@@ -143,6 +144,25 @@ def test_empty_ref_with_hyp_deletion(self):
         )
         self.assertEqual(alignment, correct_alignment)
 
+    def test_line_width(self):
+        correct = """sentence 1
+REF: this sentence could be
+HYP: this sentence  will be
+                       S   
+
+REF: split ** **
+HYP: split by ai
+            I  I
+"""
+        alignment = visualize_alignment(
+            jiwer.process_words(
+                "this sentence could be split", "this sentence will be split by ai"
+            ),
+            line_width=30,
+            show_measures=False,
+        )
+        self.assertEqual(correct, alignment)
+
 
 class TestAlignmentVisualizationCharacters(unittest.TestCase):
     def test_insertion(self):
@@ -263,3 +283,22 @@ def test_empty_ref_with_hyp_deletion(self):
             show_measures=False,
         )
         self.assertEqual(alignment, correct_alignment)
+
+    def test_line_width(self):
+        correct = """sentence 1
+REF: this sentence could be sp
+HYP: this sentence will* be sp
+                   SSS D      
+
+REF: lit******
+HYP: lit by ai
+        IIIIII
+"""
+        alignment = visualize_alignment(
+            jiwer.process_characters(
+                "this sentence could be split", "this sentence will be split by ai"
+            ),
+            line_width=30,
+            show_measures=False,
+        )
+        self.assertEqual(correct, alignment)