From d2503b53bf1eb7bbaff28b3bd3a668948f49f327 Mon Sep 17 00:00:00 2001
From: Amit Moryossef
Date: Tue, 9 Jan 2024 06:58:44 +0100
Subject: [PATCH] fix(tokenizer): do not treat sign prefix as a sign

---
 .../tokenizer/signwriting_tokenizer.py      |  6 ++++--
 .../tokenizer/test_signwriting_tokenizer.py | 22 ++++++++++++++++------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/signwriting/tokenizer/signwriting_tokenizer.py b/signwriting/tokenizer/signwriting_tokenizer.py
index c1511d0..19e7a34 100644
--- a/signwriting/tokenizer/signwriting_tokenizer.py
+++ b/signwriting/tokenizer/signwriting_tokenizer.py
@@ -51,8 +51,10 @@ def tokenize_symbol(symbol: SignSymbol, box_position=False):
             yield "p" + str(symbol["position"][1])
 
     def text_to_tokens(self, text: str, box_position=False) -> List[str]:
-        text = re.sub(r'([MLBR])', r' \1', text).strip() # add spaces
-        text = re.sub(r' +', r' ', text) # remove consecutive spaces
+        text = re.sub(r'([MLBR])', r' \1', text).strip()  # add spaces
+        text = re.sub(r'\bA\w*\b', '', text)  # remove sign prefix
+        text = re.sub(r' +', r' ', text)  # remove consecutive spaces
+        text = text.strip()
         signs = [fsw_to_sign(f) for f in text.split(" ")]
         for sign in signs:
             yield from SignWritingTokenizer.tokenize_symbol(sign["box"], box_position=box_position)
diff --git a/signwriting/tokenizer/test_signwriting_tokenizer.py b/signwriting/tokenizer/test_signwriting_tokenizer.py
index f818840..3349a8b 100644
--- a/signwriting/tokenizer/test_signwriting_tokenizer.py
+++ b/signwriting/tokenizer/test_signwriting_tokenizer.py
@@ -25,20 +25,30 @@ def test_tokenization_multiple_signs(self):
         fsw = 'M123x456S1f720487x492 M124x456S1f210488x493'
 
         tokens = list(tokenizer.text_to_tokens(fsw, box_position=True))
-        self.assertEqual(tokens, [
+        self.assertEqual([
             'M', 'p123', 'p456', 'S1f7', 'c2', 'r0', 'p487', 'p492', 'M', 'p124', 'p456', 'S1f2', 'c1', 'r0', 'p488', 'p493'
-        ])
+        ], tokens)
 
     def test_tokenization_multiple_signs_no_space(self):
         tokenizer = SignWritingTokenizer()
 
         fsw = 'M123x456S1f720487x492M124x456S1f210488x493'
 
         tokens = list(tokenizer.text_to_tokens(fsw, box_position=True))
-        self.assertEqual(tokens, [
-            'M', 'p123', 'p456', 'S1f7', 'c2', 'r0', 'p487', 'p492', 'M', 'p124', 'p456', 'S1f2', 'c1', 'r0', 'p488',
-            'p493'
-        ])
+        self.assertEqual([
+            'M', 'p123', 'p456', 'S1f7', 'c2', 'r0', 'p487', 'p492',
+            'M', 'p124', 'p456', 'S1f2', 'c1', 'r0', 'p488', 'p493'
+        ], tokens)
+
+    def test_tokenization_with_a(self):
+        tokenizer = SignWritingTokenizer()
+
+        fsw = "AS1f720M123x456S1f720487x492"
+
+        tokens = list(tokenizer.text_to_tokens(fsw, box_position=True))
+        self.assertEqual([
+            'M', 'p123', 'p456', 'S1f7', 'c2', 'r0', 'p487', 'p492',
+        ], tokens)
 
     def test_not_failing_for_r_box(self):
         tokenizer = SignWritingTokenizer()