diff --git a/src/jiwer/alignment.py b/src/jiwer/alignment.py index b22f789..665d55a 100644 --- a/src/jiwer/alignment.py +++ b/src/jiwer/alignment.py @@ -21,7 +21,7 @@ pairs. """ -from typing import List, Union +from typing import List, Union, Optional from jiwer.process import CharacterOutput, WordOutput, AlignmentChunk @@ -32,6 +32,7 @@ def visualize_alignment( output: Union[WordOutput, CharacterOutput], show_measures: bool = True, skip_correct: bool = True, + line_width: Optional[int] = None, ) -> str: """ Visualize the output of [jiwer.process_words][process.process_words] and @@ -45,6 +46,7 @@ def visualize_alignment( show_measures: If enabled, the visualization will include measures like the WER or CER skip_correct: If enabled, the visualization will exclude correct reference and hypothesis pairs + line_width: If set, try, at best effort, to spit sentences into multiple lines if they exceed the width. Returns: (str): The visualization as a string @@ -95,6 +97,18 @@ def visualize_alignment( HYP: quite * bit of an even longest sentence here D I I S I ``` + + When setting `line_width=80`, the following output will be split into multiple lines: + + ```txt + sentence 1 + REF: This is a very long sentence that is *** much longer than the previous one + HYP: This is a very loong sentence that is not much longer than the previous one + S I + REF: or the one before that + HYP: or *** one before that + D + ``` """ references = output.references hypothesis = output.hypotheses @@ -110,7 +124,7 @@ def visualize_alignment( final_str += f"sentence {idx+1}\n" final_str += _construct_comparison_string( - gt, hp, chunks, include_space_seperator=not is_cer + gt, hp, chunks, include_space_seperator=not is_cer, line_width=line_width ) final_str += "\n" @@ -140,10 +154,12 @@ def _construct_comparison_string( hypothesis: List[str], ops: List[AlignmentChunk], include_space_seperator: bool = False, + line_width: Optional[int] = None, ) -> str: ref_str = "REF: " hyp_str = "HYP: " op_str = " " + agg_str = "" # aggregate string for max_chars split for op in ops: if op.type == "equal" or op.type == "substitute": @@ -165,6 +181,19 @@ def _construct_comparison_string( for rf, hp, c in zip(ref, hyp, op_chars): str_len = max(len(rf), len(hp), len(c)) + if line_width is not None: + if len(ref_str) + str_len > line_width: + # aggregate the strings + if include_space_seperator: + agg_str += f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n\n" + else: + agg_str += f"{ref_str}\n{hyp_str}\n{op_str}\n\n" + + # reset the strings + ref_str = "REF: " + hyp_str = "HYP: " + op_str = " " + if rf == "*": rf = "".join(["*"] * str_len) elif hp == "*": @@ -181,6 +210,6 @@ def _construct_comparison_string( if include_space_seperator: # remove last space - return f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n" + return agg_str + f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n" else: - return f"{ref_str}\n{hyp_str}\n{op_str}\n" + return agg_str + f"{ref_str}\n{hyp_str}\n{op_str}\n" diff --git a/tests/test_alignment.py b/tests/test_alignment.py index c3a56c7..5a865da 100644 --- a/tests/test_alignment.py +++ b/tests/test_alignment.py @@ -1,5 +1,6 @@ import unittest import jiwer +from jiwer import visualize_alignment class TestAlignmentVisualizationWords(unittest.TestCase): @@ -143,6 +144,25 @@ def test_empty_ref_with_hyp_deletion(self): ) self.assertEqual(alignment, correct_alignment) + def test_line_width(self): + correct = """sentence 1 +REF: this sentence could be +HYP: this sentence will be + S + +REF: split ** ** +HYP: split by ai + I I +""" + alignment = visualize_alignment( + jiwer.process_words( + "this sentence could be split", "this sentence will be split by ai" + ), + line_width=30, + show_measures=False, + ) + self.assertEqual(correct, alignment) + class TestAlignmentVisualizationCharacters(unittest.TestCase): def test_insertion(self): @@ -263,3 +283,22 @@ def test_empty_ref_with_hyp_deletion(self): show_measures=False, ) self.assertEqual(alignment, correct_alignment) + + def test_line_width(self): + correct = """sentence 1 +REF: this sentence could be sp +HYP: this sentence will* be sp + SSS D + +REF: lit****** +HYP: lit by ai + IIIIII +""" + alignment = visualize_alignment( + jiwer.process_characters( + "this sentence could be split", "this sentence will be split by ai" + ), + line_width=30, + show_measures=False, + ) + self.assertEqual(correct, alignment)