-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathlecture_01-content.js
283 lines (283 loc) · 84.5 KB
/
lecture_01-content.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
// --- prelude section ---------------------------------------------------------
// Auto-generated render calls (this file appears to be generated from
// lecture_01.py). Each addText(trace, text, style) call appends one line of
// lecture content; `trace` records the producing call stack in lecture_01.py
// (function name / source file / line number) — presumably so the viewer can
// link rendered text back to its source; confirm against the viewer code.
// The third argument is a CSS-like style object ({} = default styling).
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 15}, {"name": "prelude", "filename": "lecture_01.py", "lineno": 26}], "## CS336: Language Models From Scratch (Spring 2024)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 15}, {"name": "prelude", "filename": "lecture_01.py", "lineno": 28}], "What on earth is this program doing?", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 15}, {"name": "prelude", "filename": "lecture_01.py", "lineno": 30}], "This is an *executable lecture*, a program whose execution delivers the content of a lecture.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 15}, {"name": "prelude", "filename": "lecture_01.py", "lineno": 33}], "Executable lectures make it possible to:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 15}, {"name": "prelude", "filename": "lecture_01.py", "lineno": 34}], "- view and run code (since everything is code!),", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 15}, {"name": "prelude", "filename": "lecture_01.py", "lineno": 35}], "- see the hierarchical structure of the lecture (e.g., we're in the prelude), and", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 15}, {"name": "prelude", "filename": "lecture_01.py", "lineno": 36}], "- jump to definitions and concepts.", {})
// NOTE(review): this gray ModelSpec dump shares lineno 36 with the previous
// line and looks out of place in the prelude — possibly stray generator
// output; verify against lecture_01.py.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 15}, {"name": "prelude", "filename": "lecture_01.py", "lineno": 36}], "ModelSpec(name='GPT-3', author=None, organization='OpenAI', date='2020-06-11', url='https://arxiv.org/pdf/2005.14165.pdf', description=None, references=None, data=DataSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, num_tokens=300000000000.0, vocabulary_size=None), architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description='Same as GPT-2, but alternating sparse and dense attention layers', references=None, num_parameters=175000000000.0, num_layers=96, dim_model=12288, num_heads=96, dim_head=128), training=TrainingSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, context_length=None, batch_size_tokens=3200000.0, learning_rate=6e-05, weight_decay=None, optimizer=None, hardware='V100s', num_epochs=None, num_flops=None))", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 15}, {"name": "prelude", "filename": "lecture_01.py", "lineno": 38}], "It is an experiment. Let's see how it goes!", {})
// --- course_logistics section -------------------------------------------------
// Auto-generated render calls for the course-logistics slides. Each
// addText(trace, text, style) call appends one line of lecture content;
// gray-styled lines are reference URLs.
// NOTE(review): this file looks machine-generated from lecture_01.py — the
// typo fix below should also be applied upstream or it will be regenerated.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 42}], "https://stanford-cs336.github.io/spring2024/", {"color": "gray"})
// Fixed user-facing typo: "an order magnitude" -> "an order of magnitude".
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 44}], "This is a 5-unit class. You will write a lot of code (an order of magnitude more than your average AI course).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 46}], "This is the first time we're teaching this class. Please be patient with us and give us feedback!", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 49}], "## Cluster", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 50}], "Thanks to Together AI for providing compute.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 51}], "Here's the guide on how to use the cluster: https://docs.google.com/document/d/1yLhnbclhOOL5_OBI_jBlhNh9xr3xRNTCgL5B-g-qQF4/edit", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 53}], "Start your assignments early, since the cluster will fill up close to the deadline!", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 55}], "There was a lot of interest in the class, so unfortunately we couldn't enroll everyone.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 57}], "We will make all the assignments and lecture materials available online, so feel free to follow on your own.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 16}, {"name": "course_logistics", "filename": "lecture_01.py", "lineno": 59}], "We plan to offer this class again next year.", {})
// --- why_this_course section ---------------------------------------------------
// Auto-generated render calls motivating the course. Each addText(trace, text,
// style) call appends one line of lecture content; gray-styled lines are
// reference URLs, and addImage inserts a figure by path with a style object.
// NOTE(review): this file looks machine-generated from lecture_01.py — the
// typo fixes below should also be applied upstream or they will be regenerated.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 63}], "Philosophy: understanding via building", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 65}], "## Why you should take this course", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 66}], "You have an obsessive need to understand how things work.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 67}], "You want to build up your research engineering muscles.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 69}], "## Why you should not take this course", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 70}], "You actually want to get research done this quarter. (Talk to your advisor.)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 72}], "You are interested in learning about the hottest new techniques in AI, e.g., diffusion, multimodality, long context, etc. (You could take a seminar class for that.)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 75}], "You want to get good results on your own application domain. (You could just prompt GPT-4/Claude/Gemini.)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 77}], "You need to build a language model for your own application. (You could fine-tune an existing model using standard packages.)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 80}], "## Why this class exists", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 82}], "Problem: researchers are becoming disconnected.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 83}], "10 years ago, researchers would implement and train their own models.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 84}], "5 years ago, researchers would download a model (e.g., BERT) and fine-tune it.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 85}], "Today, researchers just prompt GPT-4.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 87}], "Moving up levels of abstractions boosts productivity, but", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 88}], "- These abstractions are leaky (contrast with operating systems).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 89}], "- There is still fundamental research to be done at the lower levels.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 91}], "## The landscape", {})
// Fixed user-facing typo: "and then are" -> "and then there are".
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 93}], "There are language models...and then there are (large) language models.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 95}], "GPT-4 supposedly has 1.8T parameters.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 95}], "https://www.hpcwire.com/2024/03/19/the-generative-ai-future-is-now-nvidias-huang-says", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 96}], "GPT-4 supposedly cost $100M to train.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 96}], "https://www.wired.com/story/openai-ceo-sam-altman-the-age-of-giant-ai-models-is-already-over/", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 97}], "The GPT-4 technical report discloses no details.", {})
addImage([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 98}], "images/gpt_4_section_2.png", {"width": "100.0%"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 98}], "https://arxiv.org/pdf/2303.08774.pdf", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 100}], "## So what are we doing in this class?", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 102}], "We are obviously not building GPT-4 (or anything close).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 103}], "But we hope to impart some of the skills and mindset, so that if you had the resources, at least you'd know where to start.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 106}], "Key question: what can you learn at small scale that generalizes to large scale?", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 108}], "There are three types of knowledge:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 109}], "- Mechanics: how things work (what a Transformer is, how FSDP works)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 110}], "- Mindset: squeezing performance, thinking about scale (scaling laws)", {})
// Fixed user-facing typo: "hyperparmeters" -> "hyperparameters".
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 111}], "- Intuitions: how to set hyperparameters, process data, to get good performance", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 113}], "We can teach mechanics and mindset (do transfer).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 114}], "We cannot teach intuitions (do not necessarily transfer across scales).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 116}], "You can tell a lot of stories about why something will work.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 117}], "Reality: Some design decisions are simply not justifiable.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 118}], "Example: Noam Shazeer paper that introduced SwiGLU (see last sentence of conclusion):", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 119}], "https://arxiv.org/pdf/2002.05202.pdf", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 121}], "## How to learn the material", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 122}], "- Implementing everything yourself (without looking!).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 123}], "- Read the major language modeling papers to glean insights.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 124}], "- Understand internals deeply by reading through code and thinking about what's happening.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 17}, {"name": "why_this_course", "filename": "lecture_01.py", "lineno": 125}], "- Run experiments to get intuition for what settings lead to what behavior.", {})
// --- examples section ----------------------------------------------------------
// Auto-generated render calls showing sample model outputs. Lines styled with
// {"font-family": "monospace", "white-space": "pre"} are verbatim model
// responses (do not edit — they are quoted output, typos included); calls
// whose text is "" render blank separator lines. Gray lines are serialized
// ModelSpec records (Python repr strings embedded as display text).
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 129}], "Here are some examples of language models in action.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 135}], "ModelSpec(name='GPT-4', author=None, organization='OpenAI', date='2023-03-15', url='https://arxiv.org/pdf/2303.08774.pdf', description=['No details on the data or model architecture.'], references=None, data=None, architecture=None, training=None)", {"color": "gray"})
// GPT-4 sample response (verbatim, preformatted).
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 136}], "### GPT-4 response", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "Sure, kiddo! You know how you learned to speak and understand words, right? It's kind of like that. Imagine you're showing your toy robot how to talk and understand human language. At first, your robot doesn't understand anything, so you need to start teaching him.", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "1. First, you need to gather a lot of books, songs, and stories. This is the data that our robot will use to learn how to talk.", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "2. Then, the robot starts its learning process. Let's say he reads one sentence from a book: \"The cat is black\". It keeps this sentence in his memory and uses it to understand that \"The cat is black.\" is a proper, complete idea.", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "3. Then he sees another sentence: \"The black cat is lazy.\" He becomes smarter and realizes that \"cat\" and \"black\" can be used in different ways in a sentence.", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "4. After reading lots of sentences like these, your robot friend can now understand and create similar sentences on its own.", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "5. You then correct and guide your robot if it makes mistakes. It's okay if he messes up because just like you, he is learning too.", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 137}], "Basically, a language model is like a robot who reads and understands a lot of sentences, remembers the patterns in these sentences, and then uses this knowledge to make its own sentences or understand new ones. Just like you learned to talk and read by listening to people around you, reading books, and practicing a lot!", {"font-family": "monospace", "white-space": "pre"})
// Mixtral sample response (verbatim, preformatted; \u escapes are Spanish punctuation/accents).
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 143}], "ModelSpec(name='Mixtral', author=None, organization='Mistral', date='2024-01-08', url='https://arxiv.org/pdf/2401.04088.pdf', description=None, references=None, data=None, architecture=None, training=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 144}], "### Mixtral response", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], " Sure, I'd be happy to explain that in a simple way!", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "Imagine you're learning a new language, let's say Spanish. At first, you don't know any words or how to form sentences. But then, you start learning new words one by one, like \"hola\" (which means \"hello\") and \"gracias\" (which means \"thank you\").", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "As you learn more and more words, you start to see how they can fit together to make sentences. For example, you might learn that you can say \"hola, \u00bfc\u00f3mo est\u00e1s?\" to ask someone \"hello, how are you?\"", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "A language model is like a big, super-smart computer that's learning a language in the same way you did. It starts out not knowing any words or how they fit together. But then, it starts to learn from lots and lots of examples of language use.", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "These examples might be sentences or paragraphs from books, articles, or other sources. The computer looks at each example and tries to figure out which words are used most frequently, and how they tend to fit together.", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "Over time, the computer builds up a huge database of language patterns and uses that information to generate new sentences that it hasn't seen before. It can even use what it knows to make educated guesses about what word should come next in a sentence, or to fill in missing words in a sentence it's never seen before.", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 145}], "So, building a language model is like helping a computer learn a new language by giving it lots and lots of examples to learn from. And just like you, the computer gets better at understanding and using the language the more it practices!", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 147}], "They can mostly follow instructions, generate fluent and semantically relevant text.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 18}, {"name": "examples", "filename": "lecture_01.py", "lineno": 148}], "How do they work? How can we build one ourselves?", {})
// --- brief_history section -------------------------------------------------------
// Auto-generated render calls: a timeline of language-modeling milestones.
// Gray-styled lines are serialized ModelSpec/MethodSpec/DataSpec records
// (Python repr strings embedded as display text — do not hand-edit); the
// plain lines are the one-sentence milestone descriptions they accompany.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 152}], "Language model to measure the entropy of English", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 152}], "MethodSpec(name='Prediction and Entropy of Printed English', author=None, organization=None, date='1950-09-15', url='https://www.princeton.edu/~wbialek/rome/refs/shannon_51.pdf', description=None, references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 153}], "Lots of work on n-gram language models (for machine translation, speech recognition)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 154}], "First neural language modeling", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 154}], "MethodSpec(name='A Neural Probabilistic Language Model', author=None, organization=None, date='2003-02-01', url='https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf', description=None, references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 155}], "Sequence-to-sequence modeling (for machine translation)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 155}], "MethodSpec(name='Sequence to Sequence Learning with Neural Networks', author=None, organization=None, date='2014-09-10', url='https://arxiv.org/pdf/1409.3215.pdf', description=None, references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 156}], "Introduced attention mechanism (for machine translation)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 156}], "MethodSpec(name='Neural Machine Translation by Jointly Learning to Align and Translate', author=None, organization=None, date='2014-09-01', url='https://arxiv.org/pdf/1409.0473.pdf', description=None, references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 157}], "Introduced the Transformer architecture (for machine translation)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 157}], "MethodSpec(name='Transformer architecture', author=None, organization=None, date='2017-06-12', url='https://arxiv.org/pdf/1706.03762.pdf', description=['Introduces the encoder-decoder Transformer architecture', 'Scaled dot-product attention, multi-headed attention, sinusoidal positional embeddings', 'Training with warmup learning rate', 'Applied to machine translation'], references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 159}], "OpenAI's GPT-2 (1.5B): zero-shot learning, staged release", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 159}], "ModelSpec(name='GPT-2', author=None, organization='OpenAI', date='2019-02-14', url='https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf', description=None, references=None, data=None, architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, num_parameters=1500000000.0, num_layers=None, dim_model=None, num_heads=None, dim_head=None), training=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 160}], "Google's T5 (11B): cast everything as text-to-text", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 160}], "ModelSpec(name='T5', author=None, organization='Google', date='2019-10-23', url='https://arxiv.org/pdf/1910.10683.pdf', description='Encoder-decoder, frames tasks as text-to-text', references=None, data=DataSpec(name='Colossal Cleaned Common Crawl (C4)', author=None, organization=None, date=None, url=None, description='Filtering (Section 2.2)', references=None, num_tokens=None, vocabulary_size=None), architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description='Remove bias from feedforward layers', references=None, num_parameters=11000000000.0, num_layers=None, dim_model=None, num_heads=None, dim_head=None), training=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 161}], "Kaplan's scaling laws", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 161}], "MethodSpec(name='Scaling Laws for Neural Language Models (Kaplan+ 2020)', author=None, organization='OpenAI', date='2020-01-23', url='https://arxiv.org/pdf/2001.08361.pdf', description=['Vary model size, dataset size, compute; get power laws', 'Larger models require fewer tokens'], references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 162}], "OpenAI's GPT-3 (175B): in-context learning, closed", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 162}], "ModelSpec(name='GPT-3', author=None, organization='OpenAI', date='2020-06-11', url='https://arxiv.org/pdf/2005.14165.pdf', description=None, references=None, data=DataSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, num_tokens=300000000000.0, vocabulary_size=None), architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description='Same as GPT-2, but alternating sparse and dense attention layers', references=None, num_parameters=175000000000.0, num_layers=96, dim_model=12288, num_heads=96, dim_head=128), training=TrainingSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, context_length=None, batch_size_tokens=3200000.0, learning_rate=6e-05, weight_decay=None, optimizer=None, hardware='V100s', num_epochs=None, num_flops=None))", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 164}], "EleutherAI's open datasets (The Pile) and models (GPT-J)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 164}], "DataSpec(name='The Pile', author=None, organization='EleutherAI', date='2020-12-31', url='https://arxiv.org/pdf/2101.00027.pdf', description='825GB text, 22 diverse subsets (CommonCrawl, PubMed, ArXiv, GitHub, StackExchange, USPTO, OpenWebText2, Books3, etc.)', references=None, num_tokens=None, vocabulary_size=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 164}], "ModelSpec(name='GPT-J', author=None, organization='EleutherAI', date='2021-06-04', url='https://arankomatsuzaki.wordpress.com/2021/06/04/gpt-j/', description=None, references=None, data=None, architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description='Attention and feedforward layers put in parallel', references=None, num_parameters=6700000000.0, num_layers=None, dim_model=None, num_heads=None, dim_head=None), training=TrainingSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, context_length=None, batch_size_tokens=None, learning_rate=None, weight_decay=None, optimizer=None, hardware='v3 256 TPUs (5.4 PFLOPs) for 5 weeks', num_epochs=None, num_flops=None))", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 166}], "Meta's OPT (175B): GPT-3 replication, lots of hardware issues", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 166}], "ModelSpec(name='OPT', author=None, organization='Meta', date='2022-05-03', url='https://arxiv.org/pdf/2205.01068.pdf', description=None, references=None, data=DataSpec(name=None, author=None, organization=None, date=None, url=None, description='The Pile, PushShift.io Reddit, deduplication', references=None, num_tokens=None, vocabulary_size=None), architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, num_parameters=175000000000.0, num_layers=None, dim_model=None, num_heads=None, dim_head=None), training=TrainingSpec(name=None, author=None, organization=None, date=None, url=None, description='FSDP with Megatron-LM, fp16 with loss scaling', references=None, context_length=None, batch_size_tokens=None, learning_rate=None, weight_decay=None, optimizer=None, hardware='992 A100 80GB for 2 months, lots of hardware failures', num_epochs=None, num_flops=None))", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 167}], "Hugging Face / BigScience's BLOOM: focused on data sourcing", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 167}], "ModelSpec(name='BLOOM', author=None, organization='BigScience', date='2022-11-09', url=None, description=None, references=None, data=DataSpec(name='ROOTS', author=None, organization=None, date=None, url=None, description=None, references=None, num_tokens=None, vocabulary_size=None), architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description='AliBi positional embeddings, embedding LayerNorm', references=None, num_parameters=176000000000.0, num_layers=None, dim_model=None, num_heads=None, dim_head=None), training=TrainingSpec(name=None, author=None, organization=None, date=None, url=None, description='ZeRO stage 1', references=None, context_length=None, batch_size_tokens=None, learning_rate=None, weight_decay=None, optimizer=None, hardware='48x8 A100s on Jean Zay supercomputer for 3.5 months', num_epochs=None, num_flops=None))", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 168}], "Google's PaLM (540B): massive scale, undertrained", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 168}], "ModelSpec(name='PaLM', author=None, organization='Google', date='2022-04-05', url='https://arxiv.org/pdf/2204.02311.pdf', description=None, references=None, data=DataSpec(name=None, author=None, organization=None, date=None, url=None, description='Social media conversations, webpages, books, GitHub, Wikipedia, news', references=None, num_tokens=None, vocabulary_size=None), architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description='SwiGLU, parallelize attention and feedforward layers, multi-query attention, RoPE, remove biases', references=None, num_parameters=540350000000.0, num_layers=118, dim_model=18432, num_heads=48, dim_head=None), training=TrainingSpec(name=None, author=None, organization=None, date=None, url=None, description='Introduced the term model FLOPs utilization (MFU) metric (observed tokens/sec / theoretical max tokens/sec)', references=None, context_length=None, batch_size_tokens=None, learning_rate=None, weight_decay=None, optimizer='Adafactor without factorization', hardware='6144 TPUv4, 46.2% MFU', num_epochs=None, num_flops=None))", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 169}], "DeepMind's Chinchilla (70B): compute-optimal scaling laws", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 169}], "ModelSpec(name='Chincilla', author=None, organization='DeepMind', date='2022-03-29', url='https://arxiv.org/pdf/2203.15556.pdf', description=['Introduced the rigorous analysis scaling laws for language models', 'Key improvement over Kaplan: tune learning rate for the compute budget', 'Approach 1: for each model size, train with 4 learning rates, vary number of training tokens, fit lower envelope', 'Approach 2 (IsoFLOP): for each model size, train with 9 training budgets, take last point', 'Approach 3: fit parametric function L(N, D) = E + A/N^alpha + B/D^beta to data collected from approaches 1 and 2', 'Conclusion: model and data should scale up at same rate', 'Table 3: extrapolate to 10 trillion parameters'], references=None, data=DataSpec(name='MassiveText, different data distribution', author=None, organization=None, date=None, url=None, description=None, references=None, num_tokens=1500000000000.0, vocabulary_size=None), architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, num_parameters=70000000000.0, num_layers=None, dim_model=None, num_heads=None, dim_head=None), training=TrainingSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, context_length=None, batch_size_tokens=None, learning_rate=None, weight_decay=None, optimizer=None, hardware=None, num_epochs=None, num_flops=None))", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 171}], "Meta's LLaMA (7B, .., 65B): overtrained, optimized the 7B", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 171}], "ModelSpec(name='LLaMA', author=None, organization='Meta', date='2023-02-27', url='https://arxiv.org/pdf/2302.13971.pdf', description=['Train only on open data (detailed recipe that is replicated by RedPajama)', 'Optimize for fast inference at 7B'], references=None, data=DataSpec(name=None, author=None, organization=None, date=None, url=None, description='CommonCrawl, C4, GitHub, Wikipedia, Books, ArXiv, StackExchange', references=None, num_tokens=1400000000000.0, vocabulary_size=None), architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description='Pre-norm, SwiGLU, RoPE', references=None, num_parameters=65000000000.0, num_layers=None, dim_model=None, num_heads=None, dim_head=None), training=TrainingSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, context_length=None, batch_size_tokens=None, learning_rate=None, weight_decay=None, optimizer=None, hardware='2048 A100 80GB for 21 days', num_epochs=None, num_flops=None))", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 172}], "Mistral (7B): overtrained, very good 7B model", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 172}], "ModelSpec(name='Mistral-7B', author=None, organization='Mistral', date='2023-10-10', url='https://arxiv.org/pdf/2310.06825.pdf', description=None, references=None, data=DataSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, num_tokens=None, vocabulary_size=None), architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description='GQA, sliding window attention', references=None, num_parameters=7000000000.0, num_layers=None, dim_model=None, num_heads=None, dim_head=None), training=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 173}], "Many other open models: Yi, DeepSeek, Qwen, StableLM, OLMo, Gemma, etc.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 174}], "Mixture of experts: Mistral's Mixtral, xAI's Grok, Databricks's DBRX", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 176}], "Frontier models:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 177}], "- OpenAI's GPT-4", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 177}], "ModelSpec(name='GPT-4', author=None, organization='OpenAI', date='2023-03-15', url='https://arxiv.org/pdf/2303.08774.pdf', description=['No details on the data or model architecture.'], references=None, data=None, architecture=None, training=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 178}], "- Anthropic's Claude 3", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 179}], "- Google's Gemini", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 181}], "Ecosystem graphs tracks latest models", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 182}], "https://crfm.stanford.edu/ecosystem-graphs/index.html?mode=table", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 184}], "Summary", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 185}], "- Interplay between open and closed models", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 19}, {"name": "brief_history", "filename": "lecture_01.py", "lineno": 186}], "- Emphasis on number of parameters, then compute-optimal, then overtrained", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 190}], "## Philosophy", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 191}], "Key: it's all about *efficiency*", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 193}], "Resources: data + hardware (compute, memory, communication)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 194}], "How do you train the best model given these a fixed set of resources?", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 195}], "Example: given a Common Crawl dump and 16 H100s for 2 weeks, what should we do?", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 197}], "Design decisions: data, tokenization, model architecture, training, alignment", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 198}], "## Pipeline (stylized)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 200}], "Data", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 201}, {"name": "get_raw_data", "filename": "lecture_01.py", "lineno": 232}], "Data does not just fall from the sky.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 201}, {"name": "get_raw_data", "filename": "lecture_01.py", "lineno": 233}], "Sources: webpages scraped from the Internet, books, arXiv papers, GitHub code, etc.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 201}, {"name": "get_raw_data", "filename": "lecture_01.py", "lineno": 234}], "Appeal to fair use to train on copyright data?", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 201}, {"name": "get_raw_data", "filename": "lecture_01.py", "lineno": 234}], "https://arxiv.org/pdf/2303.15715.pdf", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 201}, {"name": "get_raw_data", "filename": "lecture_01.py", "lineno": 235}], "Might have to license data (e.g., Google with Reddit data)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 201}, {"name": "get_raw_data", "filename": "lecture_01.py", "lineno": 235}], "https://www.reuters.com/technology/reddit-ai-content-licensing-deal-with-google-sources-say-2024-02-22/", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 201}, {"name": "get_raw_data", "filename": "lecture_01.py", "lineno": 236}], "Formats: HTML, PDF, directories (not text!)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 202}, {"name": "process_data", "filename": "lecture_01.py", "lineno": 244}], "Preprocess the raw data", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 202}, {"name": "process_data", "filename": "lecture_01.py", "lineno": 245}], "- Filtering: keep data of high quality, remove harmful content", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 202}, {"name": "process_data", "filename": "lecture_01.py", "lineno": 246}], "- Deduplication: don't waste time training, avoid memorization", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 202}, {"name": "process_data", "filename": "lecture_01.py", "lineno": 247}], "- Conversion: project HTML to text (preserve content, structure)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 204}], "Pretraining", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 205}, {"name": "train_tokenizer", "filename": "lecture_01.py", "lineno": 254}], "Tokenizers convert text into sequences of integers (tokens)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 205}, {"name": "train_tokenizer", "filename": "lecture_01.py", "lineno": 255}], "Balance tradeoff between vocabulary size and compression ratio", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 205}, {"name": "train_tokenizer", "filename": "lecture_01.py", "lineno": 256}], "This course: Byte-Pair Encoding (BPE) tokenizer", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 205}, {"name": "train_tokenizer", "filename": "lecture_01.py", "lineno": 256}], "<function train_bpe at 0x14da93e25240>", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 206}, {"name": "__init__", "filename": "lecture_01.py", "lineno": 264}], "Original Transformer", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 206}, {"name": "__init__", "filename": "lecture_01.py", "lineno": 264}], "MethodSpec(name='Transformer architecture', author=None, organization=None, date='2017-06-12', url='https://arxiv.org/pdf/1706.03762.pdf', description=['Introduces the encoder-decoder Transformer architecture', 'Scaled dot-product attention, multi-headed attention, sinusoidal positional embeddings', 'Training with warmup learning rate', 'Applied to machine translation'], references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 206}, {"name": "__init__", "filename": "lecture_01.py", "lineno": 265}], "Many variants exist that improve on the original (e.g., post-norm, SwiGLU, RMSNorm, parallel layers, RoPE, GQA)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 207}, {"name": "pretrain", "filename": "lecture_01.py", "lineno": 273}], "Specify the optimizer (e.g., AdamW)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 207}, {"name": "pretrain", "filename": "lecture_01.py", "lineno": 273}], "MethodSpec(name='AdamW', author=None, organization=None, date='2017-11-14', url='https://arxiv.org/pdf/1711.05101.pdf', description=['Improves Adam by decoupling weight decay'], references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 207}, {"name": "pretrain", "filename": "lecture_01.py", "lineno": 274}], "Specify the learning rate schedule (e.g., cosine)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 207}, {"name": "pretrain", "filename": "lecture_01.py", "lineno": 275}], "Set other hyperparameters (batch size, number of heads, hidden dimension)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 209}], "Alignment", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 210}, {"name": "get_instruction_data", "filename": "lecture_01.py", "lineno": 285}], "Instruction data: (prompt, response) pairs", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 210}, {"name": "get_instruction_data", "filename": "lecture_01.py", "lineno": 286}], "Intuition: base model already has the skills, just need few examples to surface them.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 210}, {"name": "get_instruction_data", "filename": "lecture_01.py", "lineno": 287}], "MethodSpec(name='LIMA: Less Is More for Alignment', author=None, organization=None, date='2023-05-18', url='https://arxiv.org/pdf/2305.11206.pdf', description=None, references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 211}, {"name": "instruction_tune", "filename": "lecture_01.py", "lineno": 294}], "Given (prompt, response) pairs, we perform supervised learning.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 211}, {"name": "instruction_tune", "filename": "lecture_01.py", "lineno": 295}], "Specifically, fine-tune `model` to maximize p(response | prompt).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 212}, {"name": "generate_preference_data", "filename": "lecture_01.py", "lineno": 307}], "Now we have a preliminary instruction following `model`.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 212}, {"name": "generate_preference_data", "filename": "lecture_01.py", "lineno": 308}], "Data: generate multiple responses using `model` (e.g., [A, B]) to a given prompt.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 212}, {"name": "generate_preference_data", "filename": "lecture_01.py", "lineno": 309}], "User provides preferences (e.g., A < B or A > B).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 213}, {"name": "preference_tune", "filename": "lecture_01.py", "lineno": 322}], "Given (prompt, response1, response2, preference) tuples, tune the model.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 213}, {"name": "preference_tune", "filename": "lecture_01.py", "lineno": 323}], "Traditionally: Proximal Policy Optimization (PPO) from reinforcement learning", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 213}, {"name": "preference_tune", "filename": "lecture_01.py", "lineno": 324}], "MethodSpec(name='Training language models to follow instructions with human feedback', author=None, organization=None, date=None, url='https://arxiv.org/pdf/2203.02155.pdf', description=None, references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 213}, {"name": "preference_tune", "filename": "lecture_01.py", "lineno": 325}], "Recently, effective and simpler approach: Direct Policy Optimization (DPO)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 213}, {"name": "preference_tune", "filename": "lecture_01.py", "lineno": 326}], "MethodSpec(name='Direct Preference Optimization: Your Language Model is Secretly a Reward Model', author=None, organization=None, date='2023-05-29', url='https://arxiv.org/pdf/2305.18290.pdf', description=None, references=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 215}], "## On efficiency as a unifying perspective", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 217}], "Today, we are hardware-bound, so design decisions will reflect squeezing the most out of given hardware.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 219}], "- Data processing: avoid wasting precious compute updating on bad / irrelevant data", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 220}], "- Tokenization: working with raw bytes is elegant, but compute-inefficient with today's model architectures", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 222}], "- Model architecture: many changes motivated by keeping GPUs humming along", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 223}], "- Training: we can get away with a single epoch!", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 224}], "- Scaling laws: use less compute on smaller models to do hyperparameter tuning", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 225}], "- Alignment: if tune model more to desired use cases, require smaller base models", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 20}, {"name": "course_components", "filename": "lecture_01.py", "lineno": 227}], "Tomorrow, we might become data-bound...", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 17}], "This unit was inspired by Andrej Karpathy's video on tokenization; check it out!", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 18}], "https://www.youtube.com/watch?v=zduSFxRajkE", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 20}, {"name": "intro_tokenizer", "filename": "tokenization.py", "lineno": 111}], "Raw text generally represented as Unicode strings.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 20}, {"name": "intro_tokenizer", "filename": "tokenization.py", "lineno": 114}], "A language model places a probability distribution over sequences of tokens (usually represented by integer indices).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 20}, {"name": "intro_tokenizer", "filename": "tokenization.py", "lineno": 118}], "So we need a procedure that *encodes* text into tokens.", {})
// Auto-generated lecture transcript: each addText(stack, text, options) call renders one
// line of lecture content. The first argument appears to be the Python call-stack
// provenance of the line (name/filename/lineno per frame) — presumably used for
// source linking; confirm against addText's definition. The third argument carries
// display options (e.g. {"color": "gray"} for output/verbatim lines).
// Section: what a Tokenizer is (encode/decode, vocabulary size) + tiktokenizer examples.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 20}, {"name": "intro_tokenizer", "filename": "tokenization.py", "lineno": 119}], "We also need a procedure that *decodes* tokens back into text.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 20}, {"name": "intro_tokenizer", "filename": "tokenization.py", "lineno": 120}], "A Tokenizer is a class that implements the encode and decode methods.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 20}, {"name": "intro_tokenizer", "filename": "tokenization.py", "lineno": 121}], "<class 'tokenization.Tokenizer'>", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 20}, {"name": "intro_tokenizer", "filename": "tokenization.py", "lineno": 122}], "The number of possible indices is the *vocabulary size*.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 21}, {"name": "examples", "filename": "tokenization.py", "lineno": 126}], "Play with this interactive site to get a feel for how tokenizers work:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 21}, {"name": "examples", "filename": "tokenization.py", "lineno": 127}], "https://tiktokenizer.vercel.app/?encoder=gpt2", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 21}, {"name": "examples", "filename": "tokenization.py", "lineno": 129}], "## Observations", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 21}, {"name": "examples", "filename": "tokenization.py", "lineno": 130}], "- A word and its preceding space are part of the same token (e.g., ' world').", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 21}, {"name": "examples", "filename": "tokenization.py", "lineno": 131}], "- A word at the beginning and in the middle are represented differently (e.g., 'hello hello').", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 21}, {"name": "examples", "filename": "tokenization.py", "lineno": 132}], "- Some long words are one token (e.g., ' SolidGoldMagikarp').", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 21}, {"name": "examples", "filename": "tokenization.py", "lineno": 133}], "- Numbers are tokenized into every few digits.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 21}, {"name": "examples", "filename": "tokenization.py", "lineno": 135}], "Here's the GPT-2 tokenizer from OpenAI (tiktoken) in action.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 21}, {"name": "examples", "filename": "tokenization.py", "lineno": 139}], "Check that encode() and decode() roundtrip:", {})
// Section: character-based tokenization (Unicode code points via ord/chr) and its
// drawbacks (large vocabulary, many rare characters). Gray lines are verbatim output.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 22}, {"name": "character_tokenizer", "filename": "tokenization.py", "lineno": 146}], "## Character-based tokenization", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 22}, {"name": "character_tokenizer", "filename": "tokenization.py", "lineno": 148}], "A Unicode string is a sequence of Unicode characters.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 22}, {"name": "character_tokenizer", "filename": "tokenization.py", "lineno": 149}], "Each character can be converted into a code point (integer) via `ord`.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 22}, {"name": "character_tokenizer", "filename": "tokenization.py", "lineno": 152}], "It can be converted back via `chr`.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 22}, {"name": "character_tokenizer", "filename": "tokenization.py", "lineno": 156}], "Now let's build a `Tokenizer` and make sure it round-trips:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 22}, {"name": "character_tokenizer", "filename": "tokenization.py", "lineno": 163}], "There are approximately 150K Unicode characters.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 22}, {"name": "character_tokenizer", "filename": "tokenization.py", "lineno": 164}], "https://en.wikipedia.org/wiki/List_of_Unicode_characters", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 22}, {"name": "character_tokenizer", "filename": "tokenization.py", "lineno": 166}], "Problem 1: this is a very large vocabulary.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 22}, {"name": "character_tokenizer", "filename": "tokenization.py", "lineno": 167}], "Problem 2: many characters are quite rare (e.g., \ud83c\udf0d), which is inefficient use of the vocabulary.", {})
// Section: byte-based tokenization (UTF-8 bytes, vocabulary of 256) and why its
// poor compression ratio makes sequences too long for a quadratic-attention Transformer.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 173}], "## Byte-based tokenization", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 175}], "Unicode text can be represented as a sequence of bytes, which can be represented by integers between 0 and 255.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 177}], "The most common Unicode encoding is UTF-8.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 177}], "https://en.wikipedia.org/wiki/UTF-8", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 179}], "Some Unicode characters are represented by one byte:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 181}], "Others take multiple bytes:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 184}], "Now let's build a `Tokenizer` and make sure it round-trips:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 191}], "The vocabulary is nice and small: a byte can represent 256 values.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 193}], "What about the compression rate?", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 196}], "The compression ratio is terrible, which means the sequences will be too long.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 197}], "Given that the context length of a Transformer is limited (since attention is quadratic), this is not looking great...", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 23}, {"name": "byte_tokenizer", "filename": "tokenization.py", "lineno": 200}], "There are some papers that use bytes directly, but they require architectural changes and have not been scaled to the largest models yet.", {})
// Section: word-based tokenization — splitting text into word segments via regex
// and mapping each segment to an integer.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 208}], "## Word-based tokenization", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 210}], "Another approach (closer to what was done classically in NLP) is to split text into words.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 214}], "This regular expression keeps all alphanumeric characters together (words).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 216}], "Here is a fancier version:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 219}], "To turn this into a `Tokenizer`, we need to map these segments into integers.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 220}], "Then, we can build a mapping from each segment into an integer.", {})
// Fix: typo in rendered lecture text — "ther" -> "there".
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 222}], "But there are problems:", {})
// Section: drawbacks of word-based tokenization (huge vocabulary, rare words, UNK),
// then the history and intuition of Byte Pair Encoding (Gage 1994, Sennrich 2015, GPT-2).
// The long gray ModelSpec line is verbatim program output for the GPT-2 paper metadata.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 223}], "- The number of words is huge (like for Unicode characters).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 224}], "- Many words are rare and the model won't learn much about them.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 225}], "- We need a fixed vocabulary size.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 24}, {"name": "word_tokenizer", "filename": "tokenization.py", "lineno": 227}], "New words we haven't seen during training get a special UNK token, which is ugly and can mess up perplexity calculations.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 235}], "## Byte Pair Encoding (BPE)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 235}], "https://en.wikipedia.org/wiki/Byte_pair_encoding", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 237}], "The BPE algorithm was introduced by Philip Gage in 1994 for data compression.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 238}], "It was adapted to NLP for neural machine translation (Sennrich 2015).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 239}], "(Previously, papers had been using word-based tokenization.)", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 240}], "BPE was then used by the GPT-2 paper (Radford 2019).", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 240}], "ModelSpec(name='GPT-2', author=None, organization='OpenAI', date='2019-02-14', url='https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf', description=None, references=None, data=None, architecture=ArchitectureSpec(name=None, author=None, organization=None, date=None, url=None, description=None, references=None, num_parameters=1500000000.0, num_layers=None, dim_model=None, num_heads=None, dim_head=None), training=None)", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 242}], "The basic idea of BPE is to *train* the tokenizer on text to automatically determine the vocabulary.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 244}], "Intuition: common sequences of characters are represented by a single token, rare sequences are represented by many tokens.", {})
// Fix: typo in rendered lecture text — "inital" -> "initial".
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 247}], "The GPT-2 paper used word-based tokenization to break up the text into initial segments and run the original BPE algorithm on each segment.", {})
// Section: BPE training trace on "the cat in the hat" — three iterations of
// (count pairs, find most common pair, merge), producing merges th -> the -> 'the '.
// The repeated lineno values (285/290/293/298/301) come from the same train_bpe
// loop body executing once per merge.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 250}], "The basic idea is to start with the byte-based tokenization and perform merges.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 252}], "Basic idea: start with each byte as a token, and successively merge the most common pair of adjacent tokens.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 273}], "Start with the list of bytes of `text`.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 285}], "Count the number of occurrences of each pair of tokens", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 290}], "Find the most common pair.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 293}], "Merge that pair.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 298}], "Merge b't' b'h' -> b'th'", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 301}], "Text: [b'th', b'e', b' ', b'c', b'a', b't', b' ', b'i', b'n', b' ', b'th', b'e', b' ', b'h', b'a', b't']", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 285}], "Count the number of occurrences of each pair of tokens", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 290}], "Find the most common pair.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 293}], "Merge that pair.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 298}], "Merge b'th' b'e' -> b'the'", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 301}], "Text: [b'the', b' ', b'c', b'a', b't', b' ', b'i', b'n', b' ', b'the', b' ', b'h', b'a', b't']", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 285}], "Count the number of occurrences of each pair of tokens", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 290}], "Find the most common pair.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 293}], "Merge that pair.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 298}], "Merge b'the' b' ' -> b'the '", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 256}, {"name": "train_bpe", "filename": "tokenization.py", "lineno": 301}], "Text: [b'the ', b'c', b'a', b't', b' ', b'i', b'n', b' ', b'the ', b'h', b'a', b't']", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 258}], "Now, given a new text, we can encode it.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 265}], "In Assignment 1, you will go beyond this in the following ways:", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 266}], "- encode() currently loops over all merges. Only loop over merges that matter.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 267}], "- Detect and preserve special tokens (e.g., <|endoftext|>).", {})
// Fix: unbalanced parenthesis in rendered lecture text — close the "(e.g., ..." group.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 268}], "- Use pre-tokenization (e.g., the GPT-2 tokenizer regex).", {})
// Section close: the GPT-2 pre-tokenization regex (verbatim output, gray) and the
// Assignment 1 repository link. The regex string's backslashes are double-escaped
// because it is displayed as source text, not executed here.
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 268}], "'(?:[sdmt]|ll|ve|re)| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", {"color": "gray"})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 21}, {"name": "tokenization_unit", "filename": "tokenization.py", "lineno": 25}, {"name": "bpe_tokenizer", "filename": "tokenization.py", "lineno": 269}], "- Try to make the implementation as fast as possible.", {})
addText([{"name": "lecture_01", "filename": "lecture_01.py", "lineno": 22}, {"name": "assignment1_overview", "filename": "lecture_01.py", "lineno": 330}], "https://github.com/stanford-cs336/spring2024-assignment1-basics", {})