Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add alignment split lines #89

Closed
wants to merge 3 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions jiwer/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def visualize_alignment(
output: Union[WordOutput, CharacterOutput],
show_measures: bool = True,
skip_correct: bool = True,
max_chars: int = None,
) -> str:
"""
Visualize the output of [jiwer.process_words][process.process_words] and
Expand All @@ -46,6 +47,7 @@ def visualize_alignment(
show_measures: If enabled, the visualization will include measures like the WER
or CER
skip_correct: If enabled, the visualization will exclude correct reference and hypothesis pairs
max_chars: If set, split the aligned strings into multiple lines when they exceed this length

Returns:
(str): The visualization as a string
Expand Down Expand Up @@ -96,6 +98,19 @@ def visualize_alignment(
HYP: quite * bit of an even longest sentence here
D I I S I
```

When setting `max_chars=80`, the output will be split into multiple lines:

```txt
sentence 1
REF: This is a very long sentence that is *** much longer than the previous one
HYP: This is a very loong sentence that is not much longer than the previous one
S I

REF: or the one before that
HYP: or *** one before that
D
```
"""
references = output.references
hypothesis = output.hypotheses
Expand All @@ -109,7 +124,7 @@ def visualize_alignment(

final_str += f"sentence {idx+1}\n"
final_str += _construct_comparison_string(
gt, hp, chunks, include_space_seperator=not is_cer
gt, hp, chunks, include_space_seperator=not is_cer, max_chars=max_chars,
)
final_str += "\n"

Expand Down Expand Up @@ -139,10 +154,12 @@ def _construct_comparison_string(
hypothesis: List[str],
ops: List[AlignmentChunk],
include_space_seperator: bool = False,
max_chars: int = None,
) -> str:
ref_str = "REF: "
hyp_str = "HYP: "
op_str = " "
agg_str = "" # aggregate string for max_chars split

for op in ops:
if op.type == "equal" or op.type == "substitute":
Expand All @@ -163,6 +180,18 @@ def _construct_comparison_string(
op_chars = [op_char for _ in range(len(ref))]
for rf, hp, c in zip(ref, hyp, op_chars):
str_len = max(len(rf), len(hp), len(c))
if max_chars is not None:
if len(ref_str) + str_len > max_chars:
# aggregate the strings
if include_space_seperator:
agg_str += f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n\n"
else:
agg_str += f"{ref_str}\n{hyp_str}\n{op_str}\n\n"

# reset the strings
ref_str = "REF: "
hyp_str = "HYP: "
op_str = " "

if rf == "*":
rf = "".join(["*"] * str_len)
Expand All @@ -180,6 +209,6 @@ def _construct_comparison_string(

if include_space_seperator:
# remove last space
return f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n"
return agg_str + f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n"
else:
return f"{ref_str}\n{hyp_str}\n{op_str}\n"
return agg_str + f"{ref_str}\n{hyp_str}\n{op_str}\n"