diff --git a/signwriting/tokenizer/signwriting_tokenizer.py b/signwriting/tokenizer/signwriting_tokenizer.py index 22eabaf..c1511d0 100644 --- a/signwriting/tokenizer/signwriting_tokenizer.py +++ b/signwriting/tokenizer/signwriting_tokenizer.py @@ -51,6 +51,8 @@ def tokenize_symbol(symbol: SignSymbol, box_position=False): yield "p" + str(symbol["position"][1]) def text_to_tokens(self, text: str, box_position=False) -> List[str]: + text = re.sub(r'([MLBR])', r' \1', text).strip() # add spaces + text = re.sub(r' +', r' ', text) # remove consecutive spaces signs = [fsw_to_sign(f) for f in text.split(" ")] for sign in signs: yield from SignWritingTokenizer.tokenize_symbol(sign["box"], box_position=box_position) diff --git a/signwriting/tokenizer/test_signwriting_tokenizer.py b/signwriting/tokenizer/test_signwriting_tokenizer.py index 315974b..f818840 100644 --- a/signwriting/tokenizer/test_signwriting_tokenizer.py +++ b/signwriting/tokenizer/test_signwriting_tokenizer.py @@ -25,6 +25,16 @@ def test_tokenization_multiple_signs(self): fsw = 'M123x456S1f720487x492 M124x456S1f210488x493' tokens = list(tokenizer.text_to_tokens(fsw, box_position=True)) + self.assertEqual(tokens, [ + 'M', 'p123', 'p456', 'S1f7', 'c2', 'r0', 'p487', 'p492', + 'M', 'p124', 'p456', 'S1f2', 'c1', 'r0', 'p488', 'p493' + ]) + + def test_tokenization_multiple_signs_no_space(self): + tokenizer = SignWritingTokenizer() + + fsw = 'M123x456S1f720487x492M124x456S1f210488x493' + tokens = list(tokenizer.text_to_tokens(fsw, box_position=True)) self.assertEqual(tokens, [ 'M', 'p123', 'p456', 'S1f7', 'c2', 'r0', 'p487', 'p492', 'M', 'p124', 'p456', 'S1f2', 'c1', 'r0', 'p488', 'p493'