from abc import ABC
from typing import List, Tuple, Dict
from util import *
from references import *
from collections import defaultdict
from dataclasses import dataclass
import regex as re
sample_text1 = "Hello, 🌍! 你好!"
# https://github.com/openai/tiktoken/blob/9d01e5670ff50eb74cdb96406c7f3d9add0ae2f8/tiktoken_ext/openai_public.py#L23
GPT2_TOKENIZER_REGEX = \
r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
def tokenization_unit():
note("This unit was inspired by Andrej Karpathy's video on tokenization; check it out!")
see("https://www.youtube.com/watch?v=zduSFxRajkE")
intro_tokenizer()
examples()
character_tokenizer()
byte_tokenizer()
word_tokenizer()
bpe_tokenizer()
@dataclass(frozen=True)
class BPETokenizerParams:
"""All you need to specify a BPETokenizer."""
vocab: Dict[int, bytes] # index -> bytes
merges: Dict[Tuple[int, int], int] # (index1, index2) -> new_index
class Tokenizer(ABC):
"""Abstract interface for a tokenizer."""
def encode(self, text: str) -> List[int]:
raise NotImplementedError
def decode(self, indices: List[int]) -> str:
raise NotImplementedError
class CharacterTokenizer(Tokenizer):
"""Represent a text as a sequence of Unicode code points."""
def encode(self, text: str) -> List[int]:
return list(map(ord, text))
def decode(self, indices: List[int]) -> str:
return "".join(map(chr, indices))
class ByteTokenizer(Tokenizer):
"""Represent a text as a sequence of bytes."""
def encode(self, text: str) -> List[int]:
text_bytes = text.encode("utf-8")
indices = list(map(int, text_bytes))
return indices
def decode(self, indices: List[int]) -> str:
text_bytes = bytes(indices)
return text_bytes.decode("utf-8")
def merge(indices: List[int], pair: Tuple[int, int], new_index: int) -> List[int]:
"""Return `indices`, but with all instances of `pair` replaced with `new_index`."""
new_indices = []
i = 0
while i < len(indices):
if i + 1 < len(indices) and indices[i] == pair[0] and indices[i + 1] == pair[1]:
new_indices.append(new_index)
i += 2
else:
new_indices.append(indices[i])
i += 1
return new_indices
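# A small sanity check (added as a sketch): merging the pair (6, 7) into a new token 99
# replaces each non-overlapping occurrence, scanning left to right.
assert merge([5, 6, 7, 6, 7, 8], (6, 7), 99) == [5, 99, 99, 8]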
class BPETokenizer(Tokenizer):
"""BPE tokenizer given a set of merges and a vocabulary."""
def __init__(self, params: BPETokenizerParams):
self.params = params
def encode(self, text: str) -> List[int]:
indices = list(map(int, text.encode("utf-8")))
# Note: this is a very slow implementation
for pair, new_index in self.params.merges.items():
indices = merge(indices, pair, new_index)
return indices
def decode(self, indices: List[int]) -> str:
bytes_list = list(map(self.params.vocab.get, indices))
text = b"".join(bytes_list).decode("utf-8")
return text
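# A quick check (added as a sketch, `_byte_only_params` is an illustrative name): with the
# base byte vocabulary and no merges, BPETokenizer reduces to the byte tokenizer.
_byte_only_params = BPETokenizerParams(vocab={x: bytes([x]) for x in range(256)}, merges={})
assert BPETokenizer(_byte_only_params).encode("hi") == ByteTokenizer().encode("hi")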
def get_compression_ratio(text: str, indices: List[int]) -> float:
"""Given `text` that has been tokenized into `indices`, ."""
num_bytes = len(bytes(text, encoding="utf-8")) # Original
num_tokens = len(indices) # Tokenized
return num_bytes / num_tokens
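# Quick check (added as a sketch): "Hello!" is 6 UTF-8 bytes, so tokenizing it into
# 2 tokens gives 3 bytes/token; the particular token ids don't matter for the ratio.
assert get_compression_ratio("Hello!", [1, 2]) == 3.0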
def get_gpt2_tokenizer():
# Code: https://github.com/openai/tiktoken
import tiktoken
# You can use cl100k_base for the GPT-3.5-turbo or GPT-4 tokenizer
return tiktoken.get_encoding("gpt2")
def intro_tokenizer():
note("Raw text generally represented as Unicode strings.")
text = sample_text1
note("A language model places a probability distribution over "
"sequences of tokens (usually represented by integer indices).")
indices = [15496, 11, 995, 0]
note("So we need a procedure that *encodes* text into tokens.")
note("We also need a procedure that *decodes* tokens back into text.")
note("A Tokenizer is a class that implements the encode and decode methods.")
see(Tokenizer)
note("The number of possible indices is the *vocabulary size*.")
def examples():
note("Play with this interactive site to get a feel for how tokenizers work:")
see("https://tiktokenizer.vercel.app/?encoder=gpt2")
note("## Observations")
note("- A word and its preceding space are part of the same token (e.g., ' world').")
note("- A word at the beginning and in the middle are represented differently (e.g., 'hello hello').")
note("- Some long words are one token (e.g., ' SolidGoldMagikarp').")
note("- Numbers are tokenized into every few digits.")
note("Here's the GPT-2 tokenizer from OpenAI (tiktoken) in action.")
tokenizer = get_gpt2_tokenizer()
text = sample_text1
note("Check that encode() and decode() roundtrip:")
indices = tokenizer.encode(text)
reconstructed_text = tokenizer.decode(indices)
assert text == reconstructed_text
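def check_prefix_space_tokens():
    # Added as a sketch (not part of the original walkthrough): empirically check one of
    # the observations above: "hello" at the start of a string and " hello" after a space
    # map to different GPT-2 tokens.
    tokenizer = get_gpt2_tokenizer()
    indices = tokenizer.encode("hello hello")
    assert indices[0] != indices[-1]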
def character_tokenizer():
note("## Character-based tokenization")
note("A Unicode string is a sequence of Unicode characters.")
note("Each character can be converted into a code point (integer) via `ord`.")
assert ord("a") == 97
assert ord("🌍") == 127757
note("It can be converted back via `chr`.")
assert chr(97) == "a"
assert chr(127757) == "🌍"
note("Now let's build a `Tokenizer` and make sure it round-trips:")
tokenizer = CharacterTokenizer()
text = sample_text1
indices = tokenizer.encode(text)
reconstructed_text = tokenizer.decode(indices)
assert text == reconstructed_text
note("There are approximately 150K Unicode characters.")
see("https://en.wikipedia.org/wiki/List_of_Unicode_characters")
vocabulary_size = max(indices) + 1 # This is a lower bound
note("Problem 1: this is a very large vocabulary.")
note("Problem 2: many characters are quite rare (e.g., π), "
"which is inefficient use of the vocabulary.")
compression_ratio = get_compression_ratio(text, indices)
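# A small check (added as a sketch): with one token per character, the compression ratio
# equals bytes per character: 1.0 for pure ASCII, larger once non-ASCII appears.
assert get_compression_ratio("hello", CharacterTokenizer().encode("hello")) == 1.0
assert get_compression_ratio("你好", CharacterTokenizer().encode("你好")) == 3.0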
def byte_tokenizer():
note("## Byte-based tokenization")
note("Unicode text can be represented as a sequence of bytes, "
"which can be represented by integers between 0 and 255.")
note("The most common Unicode encoding is UTF-8."), see("https://en.wikipedia.org/wiki/UTF-8")
note("Some Unicode characters are represented by one byte:")
assert bytes("a", encoding="utf-8") == b"a"
note("Others take multiple bytes:")
assert bytes("🌍", encoding="utf-8") == b"\xf0\x9f\x8c\x8d"
note("Now let's build a `Tokenizer` and make sure it round-trips:")
tokenizer = ByteTokenizer()
text = sample_text1
indices = tokenizer.encode(text)
reconstructed_text = tokenizer.decode(indices)
assert text == reconstructed_text
note("The vocabulary is nice and small: a byte can represent 256 values.")
vocabulary_size = 256
note("What about the compression rate?")
compression_ratio = get_compression_ratio(text, indices)
assert compression_ratio == 1
note("The compression ratio is terrible, which means the sequences will be too long.")
note("Given that the context length of a Transformer is limited "
"(since attention is quadratic), this is not looking great...")
note("There are some papers that use bytes directly, "
"but they require architectural changes and "
"have not been scaled to the largest models yet.")
megabyte_paper = Spec(name="MegaByte", url="https://arxiv.org/pdf/2305.07185.pdf")
byt5_paper = Spec(name="ByT5", url="https://arxiv.org/pdf/2105.13626.pdf")
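# A small check (added as a sketch) of how many bytes UTF-8 uses per character: 1 for
# ASCII, up to 4 for characters like emoji, which is part of why byte sequences get long.
assert len("a".encode("utf-8")) == 1
assert len("你".encode("utf-8")) == 3
assert len("🌍".encode("utf-8")) == 4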
def word_tokenizer():
note("## Word-based tokenization")
note("Another approach (closer to what was done classically in NLP) is to split text into words.")
text = "Hello, supercalifragilisticexpialidocious!"
text_segments = re.findall(r"\w+|.", text)
note("This regular expression keeps all alphanumeric characters together (words).")
note("Here is a fancier version:")
text_segments = re.findall(GPT2_TOKENIZER_REGEX, text)
note("To turn this into a `Tokenizer`, we need to map these segments into integers.")
note("Then, we can build a mapping from each segment into an integer.")
note("But there are problems:")
note("- The number of words is huge (like for Unicode characters).")
note("- Many words are rare and the model won't learn much about them.")
note("- We need a fixed vocabulary size.")
note("New words we haven't seen during training get a special UNK token, "
"which is ugly and can mess up perplexity calculations.")
vocabulary_size = "Number of distinct segments in the training data"
compression_ratio = get_compression_ratio(text, text_segments)
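# A minimal sketch added to this file (not the original lecture code): a word-level
# tokenizer built on the regex above. The vocabulary is fixed from the training text,
# and any unseen segment maps to a single UNK index (the problem the notes above warn
# about). The class name and UNK handling here are illustrative choices.
class WordTokenizer(Tokenizer):
    """Map regex segments seen during training to integers; unseen segments become UNK."""
    def __init__(self, training_text: str, unk_token: str = "<UNK>"):
        segments = re.findall(GPT2_TOKENIZER_REGEX, training_text)
        self.segment_to_index = {segment: i for i, segment in enumerate(sorted(set(segments)))}
        self.unk_index = len(self.segment_to_index)  # Reserve one index for unknown segments
        self.index_to_segment = {i: segment for segment, i in self.segment_to_index.items()}
        self.index_to_segment[self.unk_index] = unk_token
    def encode(self, text: str) -> List[int]:
        return [self.segment_to_index.get(segment, self.unk_index)
                for segment in re.findall(GPT2_TOKENIZER_REGEX, text)]
    def decode(self, indices: List[int]) -> str:
        # Note: decoding is lossy for UNK tokens.
        return "".join(self.index_to_segment[i] for i in indices)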
def bpe_tokenizer():
note("## Byte Pair Encoding (BPE)"), see("https://en.wikipedia.org/wiki/Byte_pair_encoding")
note("The BPE algorithm was introduced by Philip Gage in 1994 for data compression.") # http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM
note("It was adapted to NLP for neural machine translation (Sennrich 2015).") # https://arxiv.org/pdf/2305.07185.pdf
note("(Previously, papers had been using word-based tokenization.)")
note("BPE was then used by the GPT-2 paper (Radford 2019)."), see(gpt_2)
note("The basic idea of BPE is to *train* the tokenizer on text "
"to automatically determine the vocabulary.")
note("Intuition: common sequences of characters are represented by a single token, "
"rare sequences are represented by many tokens.")
note("The GPT-2 paper used word-based tokenization to "
"break up the text into inital segments and "
"run the original BPE algorithm on each segment.")
note("The basic idea is to start with the byte-based tokenization and perform merges.")
note("Basic idea: start with each byte as a token, "
"and successively merge the most common pair of adjacent tokens.")
text = "the cat in the hat"
params = train_bpe(text, num_merges=3)
note("Now, given a new text, we can encode it.")
tokenizer = BPETokenizer(params)
text = "the quick brown fox"
indices = tokenizer.encode(text)
reconstructed_text = tokenizer.decode(indices)
assert text == reconstructed_text
note("In Assignment 1, you will go beyond this in the following ways:")
note("- encode() currently loops over all merges. Only loop over merges that matter.")
note("- Detect and preserve special tokens (e.g., <|endoftext|>).")
note("- Use pre-tokenization (e.g., the GPT-2 tokenizer regex."), see(GPT2_TOKENIZER_REGEX)
note("- Try to make the implementation as fast as possible.")
def train_bpe(text: str, num_merges: int) -> BPETokenizerParams:
note("Start with the list of bytes of `text`.")
indices = list(map(int, text.encode("utf-8")))
# index1, index2 => merged index
merges: Dict[Tuple[int, int], int] = {}
# index -> bytes
vocab: Dict[int, bytes] = {
x: bytes([x]) for x in range(256)
}
for i in range(num_merges):
note("Count the number of occurrences of each pair of tokens")
counts = defaultdict(int)
for pair in zip(indices, indices[1:]): # For each adjacent pair
counts[pair] += 1
note("Find the most common pair.")
pair = max(counts, key=counts.get)
note("Merge that pair.")
new_index = 256 + i
merges[pair] = new_index
vocab[new_index] = vocab[pair[0]] + vocab[pair[1]]
note(f"Merge {vocab[pair[0]]} {vocab[pair[1]]} -> {vocab[new_index]}")
indices = merge(indices, pair, new_index)
note(f"Text: {list(map(vocab.get, indices))}")
return BPETokenizerParams(vocab=vocab, merges=merges)
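def train_bpe_example():
    # A tiny worked example added as a sketch, using the classic string from the Wikipedia
    # article on byte pair encoding: the most frequent adjacent pair in "aaabdaaabac" is
    # b"aa", so it becomes the first merged token (index 256).
    params = train_bpe("aaabdaaabac", num_merges=1)
    assert params.vocab[256] == b"aa"
    tokenizer = BPETokenizer(params)
    assert tokenizer.decode(tokenizer.encode("aaabdaaabac")) == "aaabdaaabac"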