Skip to content

Commit

Permalink
feat(tokenizer): add fsw normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
AmitMY committed Jan 25, 2024
1 parent d2503b5 commit 086a631
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 0 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ tokenizer.tokenize(fsw, bos=False, eos=False)
# [6, 932, 932, 255, 678, 660, 919, 924, 255, 678, 660, 919, 924]
```

4. Alternatively, to remove the 'A' prefix information and separate signs with spaces, use:

```python
from signwriting.tokenizer import normalize_signwriting

normalize_signwriting(fsw)
```

### `signwriting.visualizer`

This module is used to visualize SignWriting strings as images.
Expand Down
1 change: 1 addition & 0 deletions signwriting/tokenizer/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from signwriting.tokenizer.signwriting_tokenizer import SignWritingTokenizer
from signwriting.tokenizer.signwriting_normalizer import normalize_signwriting
14 changes: 14 additions & 0 deletions signwriting/tokenizer/signwriting_normalizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from functools import lru_cache

from signwriting.tokenizer.signwriting_tokenizer import SignWritingTokenizer


@lru_cache(maxsize=None)
def get_tokenizer():
    """Return the shared ``SignWritingTokenizer``, building it on first use.

    The function takes no arguments, so the unbounded ``lru_cache`` holds a
    single entry: the tokenizer is constructed once and that same object is
    handed back on every later call.
    """
    shared_tokenizer = SignWritingTokenizer()
    return shared_tokenizer


def normalize_signwriting(fsw: str) -> str:
    """Normalize an FSW string by round-tripping it through the tokenizer.

    The input is tokenized with ``box_position=True`` and the resulting
    tokens are converted straight back to text; whatever the tokenizer does
    not carry through the round trip is absent from the result.

    NOTE(review): the package tests expect this to drop the leading 'A...'
    segment and to put a single space between consecutive signs -- confirm
    against ``SignWritingTokenizer`` if that behaviour changes.

    Args:
        fsw: The SignWriting (FSW) text to normalize.

    Returns:
        The text rebuilt from the tokenizer's tokens.
    """
    shared = get_tokenizer()
    token_stream = shared.text_to_tokens(fsw, box_position=True)
    # Materialize the tokens before handing them to the detokenizer.
    return shared.tokens_to_text(list(token_stream))
28 changes: 28 additions & 0 deletions signwriting/tokenizer/test_signwriting_normalizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import unittest

from signwriting.tokenizer import normalize_signwriting


class NormalizeCase(unittest.TestCase):
    """Checks for ``normalize_signwriting`` on FSW strings."""

    def test_normalizer_same_sign(self):
        """A single sign without an 'A' segment comes back unchanged."""
        fsw = 'M123x456S1f720487x492S1f720487x492'
        self.assertEqual(fsw, normalize_signwriting(fsw))

    def test_normalizer_removes_a(self):
        """The leading 'A...' segment is dropped; the 'M...' part remains."""
        a_segment = 'AS16d10S22b03S20500S15a28S31400'
        m_segment = 'M536x550S15a28485x523S16d10519x484S22b03507x508S20500498x532S31400482x482'
        result = normalize_signwriting(a_segment + m_segment)
        self.assertEqual(m_segment, result)

    def test_normalizer_creates_space(self):
        """Two concatenated signs come back separated by a single space."""
        first_sign = 'M536x550S15a28485x523S16d10519x484S22b03507x508S20500498x532S31400482x482'
        second_sign = 'M123x456S15a28485x523S16d10519x484S22b03507x508S20500498x532S31400482x482'

        expected = ' '.join((first_sign, second_sign))
        self.assertEqual(expected, normalize_signwriting(first_sign + second_sign))


# Run the test cases in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 086a631

Please sign in to comment.