Skip to content

Commit

Permalink
fix(tokenizer): do not treat sign prefix as a sign
Browse files Browse the repository at this point in the history
  • Loading branch information
AmitMY committed Jan 9, 2024
1 parent 953dd21 commit d2503b5
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 8 deletions.
6 changes: 4 additions & 2 deletions signwriting/tokenizer/signwriting_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,10 @@ def tokenize_symbol(symbol: SignSymbol, box_position=False):
yield "p" + str(symbol["position"][1])

def text_to_tokens(self, text: str, box_position=False) -> List[str]:
text = re.sub(r'([MLBR])', r' \1', text).strip() # add spaces
text = re.sub(r' +', r' ', text) # remove consecutive spaces
text = re.sub(r'([MLBR])', r' \1', text).strip() # add spaces
text = re.sub(r'\bA\w*\b', '', text) # remove sign prefix
text = re.sub(r' +', r' ', text) # remove consecutive spaces
text = text.strip()
signs = [fsw_to_sign(f) for f in text.split(" ")]
for sign in signs:
yield from SignWritingTokenizer.tokenize_symbol(sign["box"], box_position=box_position)
Expand Down
22 changes: 16 additions & 6 deletions signwriting/tokenizer/test_signwriting_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,30 @@ def test_tokenization_multiple_signs(self):

fsw = 'M123x456S1f720487x492 M124x456S1f210488x493'
tokens = list(tokenizer.text_to_tokens(fsw, box_position=True))
self.assertEqual(tokens, [
self.assertEqual([
'M', 'p123', 'p456', 'S1f7', 'c2', 'r0', 'p487', 'p492',
'M', 'p124', 'p456', 'S1f2', 'c1', 'r0', 'p488', 'p493'
])
], tokens)

def test_tokenization_multiple_signs_no_space(self):
    """Signs concatenated without a space are still split into two token runs."""
    tokenizer = SignWritingTokenizer()

    # Two FSW signs back to back, with no whitespace between them.
    fsw = 'M123x456S1f720487x492M124x456S1f210488x493'
    tokens = list(tokenizer.text_to_tokens(fsw, box_position=True))
    # Single assertion, expected value first; the earlier (tokens, expected)
    # form checked exactly the same thing and was redundant.
    self.assertEqual([
        'M', 'p123', 'p456', 'S1f7', 'c2', 'r0', 'p487', 'p492',
        'M', 'p124', 'p456', 'S1f2', 'c1', 'r0', 'p488', 'p493'
    ], tokens)

def test_tokenization_with_a(self):
    """A leading ``A...`` sign prefix contributes no tokens of its own."""
    # "AS1f720" precedes the M box; only the box and its symbol are expected.
    prefixed_fsw = "AS1f720M123x456S1f720487x492"
    expected = [
        'M', 'p123', 'p456', 'S1f7', 'c2', 'r0', 'p487', 'p492',
    ]

    actual = list(SignWritingTokenizer().text_to_tokens(prefixed_fsw, box_position=True))
    self.assertEqual(expected, actual)

def test_not_failing_for_r_box(self):
tokenizer = SignWritingTokenizer()
Expand Down

0 comments on commit d2503b5

Please sign in to comment.