Skip to content

Commit

Permalink
fix(signwriting_tokenizer): fix #4
Browse files Browse the repository at this point in the history
  • Loading branch information
AmitMY committed Mar 13, 2024
1 parent 0fdc36f commit 77816fe
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 4 deletions.
8 changes: 4 additions & 4 deletions signwriting/tokenizer/signwriting_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@ def text_to_tokens(self, text: str, box_position=False) -> List[str]:

def tokens_to_text(self, tokens: List[str]) -> str:
tokenized = " ".join(tokens)
tokenized = re.sub(r'p(\d*) p(\d*)', r'\1x\2', tokenized)
tokenized = re.sub(r'c(\d)\d? r(.)', r'\1\2', tokenized)
tokenized = re.sub(r'c(\d)\d?', r'\1 0', tokenized)
tokenized = re.sub(r'r(.)', r'0\1', tokenized)
tokenized = re.sub(r' p(\d*) p(\d*)', r'\1x\2', tokenized)
tokenized = re.sub(r' c(\d)\d? r(.)', r'\1\2', tokenized)
tokenized = re.sub(r' c(\d)\d?', r'\1 0', tokenized)
tokenized = re.sub(r' r(.)', r'0\1', tokenized)

tokenized = tokenized.replace(' ', '')
tokenized = re.sub(r'(\d)([MBLR])', r'\1 \2', tokenized)
Expand Down
7 changes: 7 additions & 0 deletions signwriting/tokenizer/test_signwriting_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ def test_normalizer_creates_space(self):
normalized = normalize_signwriting(fsw_1 + fsw_2)
self.assertEqual(f"{fsw_1} {fsw_2}", normalized)

def test_normalization_is_identity_regression_4(self):
    # Regression test for
    # https://github.com/sign-language-processing/signwriting/issues/4
    # Each of these FSW strings is expected to come back from
    # normalize_signwriting exactly as it went in.
    unchanged_signs = (
        "M511x510S2c734490x490",
        "M510x518S2c105490x483",
    )
    for fsw in unchanged_signs:
        self.assertEqual(fsw, normalize_signwriting(fsw))


# Run this module's test cases when the file is executed directly as a script.
if __name__ == '__main__':
unittest.main()

0 comments on commit 77816fe

Please sign in to comment.