diff --git a/README.md b/README.md
index 184cc62..70d92bd 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,14 @@ tokenizer.tokenize(fsw, bos=False, eos=False)
 # [6, 932, 932, 255, 678, 660, 919, 924, 255, 678, 660, 919, 924]
 ```
 
+4. Or, to remove the 'A' information and separate signs by spaces, we can use:
+
+```python
+from signwriting.tokenizer import normalize_signwriting
+
+normalize_signwriting(fsw)
+```
+
 ### `signwriting.visualizer`
 
 This module is used to visualize SignWriting strings as images.
diff --git a/signwriting/tokenizer/__init__.py b/signwriting/tokenizer/__init__.py
index d951466..8b8197e 100644
--- a/signwriting/tokenizer/__init__.py
+++ b/signwriting/tokenizer/__init__.py
@@ -1 +1,2 @@
 from signwriting.tokenizer.signwriting_tokenizer import SignWritingTokenizer
+from signwriting.tokenizer.signwriting_normalizer import normalize_signwriting
diff --git a/signwriting/tokenizer/signwriting_normalizer.py b/signwriting/tokenizer/signwriting_normalizer.py
new file mode 100644
index 0000000..9619ba0
--- /dev/null
+++ b/signwriting/tokenizer/signwriting_normalizer.py
@@ -0,0 +1,20 @@
+from functools import lru_cache
+
+from signwriting.tokenizer.signwriting_tokenizer import SignWritingTokenizer
+
+
+@lru_cache(maxsize=None)
+def get_tokenizer():
+    """Return a shared SignWritingTokenizer, created once and then cached."""
+    return SignWritingTokenizer()
+
+
+def normalize_signwriting(fsw: str) -> str:
+    """Normalize an FSW string by round-tripping it through the tokenizer.
+
+    Per the accompanying tests, a leading 'A...' prefix is dropped and
+    consecutive signs come back separated by a single space.
+    """
+    tokenizer = get_tokenizer()
+    tokens = list(tokenizer.text_to_tokens(fsw, box_position=True))
+    return tokenizer.tokens_to_text(tokens)
diff --git a/signwriting/tokenizer/test_signwriting_normalizer.py b/signwriting/tokenizer/test_signwriting_normalizer.py
new file mode 100644
index 0000000..b5f98ca
--- /dev/null
+++ b/signwriting/tokenizer/test_signwriting_normalizer.py
@@ -0,0 +1,28 @@
+import unittest
+
+from signwriting.tokenizer import normalize_signwriting
+
+
+class NormalizeCase(unittest.TestCase):
+
+    def test_normalizer_same_sign(self):
+        fsw = 'M123x456S1f720487x492S1f720487x492'
+        normalized = normalize_signwriting(fsw)
+        self.assertEqual(fsw, normalized)
+
+    def test_normalizer_removes_a(self):
+        a_info = 'AS16d10S22b03S20500S15a28S31400'
+        m_info = 'M536x550S15a28485x523S16d10519x484S22b03507x508S20500498x532S31400482x482'
+        normalized = normalize_signwriting(a_info + m_info)
+        self.assertEqual(m_info, normalized)
+
+    def test_normalizer_creates_space(self):
+        fsw_1 = 'M536x550S15a28485x523S16d10519x484S22b03507x508S20500498x532S31400482x482'
+        fsw_2 = 'M123x456S15a28485x523S16d10519x484S22b03507x508S20500498x532S31400482x482'
+
+        normalized = normalize_signwriting(fsw_1 + fsw_2)
+        self.assertEqual(f"{fsw_1} {fsw_2}", normalized)
+
+
+if __name__ == '__main__':
+    unittest.main()