-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathencoder.py
97 lines (83 loc) · 3.7 KB
/
encoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from typing import List, Dict, Tuple
# from https://github.com/OpenPecha/CommonSpell/blob/main/src/CommonSpell/encoder.py
GAP_ELEMENT = ''


class Encoder(object):
    """
    Simple encoder class. An encoder just assigns integers to tokens (in our case strings).
    This significantly speeds up operations and allows working at the token level instead of
    character level.

    Code layout:
      * codes below ``shift`` are never used (by default this skips the ASCII control codes);
      * the 10 codes ``shift`` .. ``shift + 9`` are reserved: they are only ever emitted as
        the first character of a 2-character encoding (see ``encode_str``);
      * ``shift + 10`` is the code of GAP_ELEMENT;
      * every new element gets the next free code after that.
    """

    # Number of codes reserved just above `shift` for the first character of a
    # 2-character encoding.
    _RESERVED = 10
    # Codes at or above this value are split on two characters when
    # `split_code_for_2_bytes` is set.
    _SPLIT_BASE = 65536

    def __init__(self, allow_decode=False, shift=32, split_code_for_2_bytes=False):
        """
        allow_decode: when True, the encoder also keeps the reverse (code -> element) table
            so that decode(), decode_string() and elements() can be used.
        shift: gives the minimum code value. 32 is the default to avoid ASCII control codes.
            We also add 10 to this value to account for the possibility of splitting codes
            on 2 characters.
        split_code_for_2_bytes: in some cases encoding high code values as characters won't
            work for diff_match_patch, making it raise an exception. This happens mostly on
            Windows, see
            https://github.com/JoshData/fast_diff_match_patch/blob/5f7b0143353e7419e9f4d6253ac9d206a1369683/interface.cpp#L180
        """
        self.allow_decode = allow_decode
        self.shift = shift
        self.split_code_for_2_bytes = split_code_for_2_bytes
        # Build the tables through reset() so that a fresh encoder and a reset encoder are
        # guaranteed to be in the same state (GAP_ELEMENT encoded as shift + 10 in both).
        self.reset()

    def has(self, element: str) -> bool:
        """Return True if `element` already has a code."""
        return element in self.elementToCode

    def hasCode(self, code: int) -> bool:
        """Return True if `code` has been assigned to an element (reserved codes excluded)."""
        return self.shift + self._RESERVED <= code <= self.last

    def encode(self, element: str) -> int:
        """Return the code of `element`, assigning the next free code on first sight."""
        code = self.elementToCode.get(element)
        if code is None:
            self.last += 1
            code = self.last
            self.elementToCode[element] = code
            if self.allow_decode:
                # codeToElement is indexed by code, so appending keeps it aligned with `last`
                self.codeToElement.append(element)
            # if self.last >= 65536, things can be complicated on Windows
        return code

    def encode_str(self, element: str) -> Tuple[str, int]:
        """
        Encode `element` as a short string.

        Returns a tuple (encoded string, number of characters in it). The string is a single
        character, except when split_code_for_2_bytes is set and the code is 65536 or more,
        in which case it is two characters.
        """
        code: int = self.encode(element)
        # A code of 65536 or more may not be usable as a single character by
        # diff_match_patch (see __init__), so it is optionally split on two characters.
        if not self.split_code_for_2_bytes or code < self._SPLIT_BASE:
            return chr(code), 1
        # first_char_code will be in the range self.shift -> self.shift+10
        # (at least in the likely situations, i.e. as long as code < 10 * 65536)
        first_char_code = (code // self._SPLIT_BASE) + self.shift
        # the second char code should start after self.shift + 10
        second_char_code = (code % self._SPLIT_BASE) + self.shift + self._RESERVED
        return chr(first_char_code) + chr(second_char_code), 2

    def encode_list(self, element_list: List[str]) -> str:
        """Encode every element of `element_list` and concatenate the results."""
        return "".join(self.encode_str(element)[0] for element in element_list)

    def decode(self, code: int) -> str:
        """
        Return the element encoded as `code`.

        Raises KeyError if the encoder was created without allow_decode, or if no element
        has been assigned this code (this includes the reserved codes below shift + 10).
        """
        if not self.allow_decode:
            raise KeyError('encoder does not allow decoding')
        # codeToElement is a list: an explicit range check is needed, otherwise an unknown
        # code raises IndexError, a reserved code yields None and a negative one wraps around.
        if not self.hasCode(code):
            raise KeyError(
                'there is no elements in the encoder encoded as %d' % code)
        return self.codeToElement[code]

    def decode_string(self, s: str) -> str:
        """
        Decode a string produced by encode_str() / encode_list() back into the concatenation
        of the original elements.

        When split_code_for_2_bytes is set, a character in the reserved range
        (shift < ord(c) < shift + 10) is read as the first half of a 2-character code.

        Raises KeyError for unknown codes or for a 2-character code cut at the end of `s`.
        """
        first_code = self.shift + self._RESERVED
        decoded = []
        chars = iter(s)
        for c in chars:
            code = ord(c)
            if self.split_code_for_2_bytes and self.shift < code < first_code:
                # inverse of the split done in encode_str
                second = next(chars, None)
                if second is None:
                    raise KeyError('truncated 2-character code at the end of the string')
                code = (code - self.shift) * self._SPLIT_BASE + (ord(second) - first_code)
            decoded.append(self.decode(code))
        return "".join(decoded)

    def elements(self) -> List[str]:
        """
        Return all the encoded elements (GAP_ELEMENT included), ordered by code.

        Raises KeyError if the encoder was created without allow_decode.
        """
        if not self.allow_decode:
            raise KeyError('encoder does not allow decoding')
        # skip the None placeholders that occupy the unused / reserved codes
        return self.codeToElement[self.shift + self._RESERVED:]

    def len(self) -> int:
        """Return the size of the code space, i.e. the highest assigned code + 1."""
        return self.last + 1

    def reset(self):
        """Forget every element; only GAP_ELEMENT stays, encoded as shift + 10."""
        first_code = self.shift + self._RESERVED
        self.elementToCode: Dict[str, int] = {GAP_ELEMENT: first_code}
        if self.allow_decode:
            # indices below first_code are placeholders so that list index == code
            self.codeToElement: List[str] = [None] * first_code
            self.codeToElement.append(GAP_ELEMENT)
        self.last: int = first_code