metrics.py
from typing import Callable


def edit_distance(prediction_tokens: list[str], reference_tokens: list[str], ignore_tokens: list[str] = []) -> int:
    """Standard dynamic programming edit distance, extended with a list of tokens to ignore.

    Args:
        prediction_tokens: A tokenized predicted sentence.
        reference_tokens: A tokenized reference sentence.
        ignore_tokens: Tokens whose insertions and substitutions are not counted.

    Returns:
        Edit distance between the predicted sentence and the reference sentence,
        ignoring insertions and substitutions of tokens in ignore_tokens.
    """
    # dp[i][j] holds the edit distance between the first i prediction tokens
    # and the first j reference tokens.
    dp = [[0] * (len(reference_tokens) + 1) for _ in range(len(prediction_tokens) + 1)]
    # Deleting i prediction tokens always costs i.
    for i in range(len(prediction_tokens) + 1):
        dp[i][0] = i
    # Inserting a reference token is free if it is in ignore_tokens.
    for j in range(1, len(reference_tokens) + 1):
        dp[0][j] = dp[0][j - 1] + (1 if reference_tokens[j - 1] not in ignore_tokens else 0)
    for i in range(1, len(prediction_tokens) + 1):
        for j in range(1, len(reference_tokens) + 1):
            dp[i][j] = min(
                # Deletion of a prediction token.
                dp[i - 1][j] + 1,
                # Insertion of a reference token (free if ignored).
                dp[i][j - 1] + (1 if reference_tokens[j - 1] not in ignore_tokens else 0),
                # Match, or substitution (free if the reference token is ignored).
                dp[i - 1][j - 1] + (1 if prediction_tokens[i - 1] != reference_tokens[j - 1]
                                    and reference_tokens[j - 1] not in ignore_tokens else 0),
            )
    return dp[-1][-1]


def wer_n(prediction: str, target: str, ignore_tokens: list[str], tokenizer: Callable[[str], list[str]] = str.split) -> float:
    """Word error rate between prediction and target, normalised by the number of target tokens."""
    target_tokens = tokenizer(target)
    return edit_distance(tokenizer(prediction), target_tokens, ignore_tokens) / len(target_tokens)
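
A minimal usage sketch, assuming this file is importable as `metrics`; the example sentences and the `"<unk>"` ignore token are illustrative placeholders, not part of the original source:

```python
from metrics import edit_distance, wer_n

# Hypothetical example: treat "<unk>" as an ignorable filler token.
prediction = "the cat sat on mat"
reference = "the <unk> cat sat on the mat"

# Without ignore_tokens, both missing reference tokens count as insertions.
print(edit_distance(prediction.split(), reference.split()))             # 2
# With ignore_tokens, inserting "<unk>" is free; only the missing "the" counts.
print(edit_distance(prediction.split(), reference.split(), ["<unk>"]))  # 1
# wer_n normalises by the number of target tokens (7 here).
print(wer_n(prediction, reference, ["<unk>"]))                          # 1/7 ≈ 0.1429
```

Note that only tokens listed in `ignore_tokens` on the *reference* side are free; deletions of prediction tokens are always charged.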