lecture_06-content.js
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 20}, {"name": "announcements", "filename": "lecture_06.py", "lineno": 56}], "Assignment 1 is due on [Monday April 16] + 3 late days.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 20}, {"name": "announcements", "filename": "lecture_06.py", "lineno": 57}], "Assignment 1 leaderboard", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 20}, {"name": "announcements", "filename": "lecture_06.py", "lineno": 57}], "https://github.com/stanford-cs336/spring2024-assignment1-basics-leaderboard", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 20}, {"name": "announcements", "filename": "lecture_06.py", "lineno": 58}], "Assignment 2 is out", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 20}, {"name": "announcements", "filename": "lecture_06.py", "lineno": 58}], "https://github.com/stanford-cs336/spring2024-assignment2-systems", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 22}], "Last lecture: high-level overview of GPUs and performance", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 22}], "<function lecture_05 at 0x1458dc5fb520>", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 23}], "This lecture: benchmarking/profiling + write kernels", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 62}], "## Hardware", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 63}], "https://miro.medium.com/v2/resize:fit:2000/format:webp/1*6xoBKi5kL2dZpivFe1-zgw.jpeg", {"width": "100%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 64}], "Compute: streaming multiprocessors (SMs) [A100: 108]", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 65}], "Memory:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 66}], "- DRAM [A100: 80GB] - big, slow", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 67}], "- L2 cache [A100: 40MB]", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 68}], "- L1 cache [A100: 192KB per SM] - small, fast", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 70}], "You can look at the specs on your actual GPU.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 71}, {"name": "print_gpu_specs", "filename": "lecture_06.py", "lineno": 779}], "8 devices", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 71}, {"name": "print_gpu_specs", "filename": "lecture_06.py", "lineno": 782}], "0: _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81230MB, multi_processor_count=132)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 71}, {"name": "print_gpu_specs", "filename": "lecture_06.py", "lineno": 782}], "1: _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81230MB, multi_processor_count=132)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 71}, {"name": "print_gpu_specs", "filename": "lecture_06.py", "lineno": 782}], "2: _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81230MB, multi_processor_count=132)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 71}, {"name": "print_gpu_specs", "filename": "lecture_06.py", "lineno": 782}], "3: _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81230MB, multi_processor_count=132)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 71}, {"name": "print_gpu_specs", "filename": "lecture_06.py", "lineno": 782}], "4: _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81230MB, multi_processor_count=132)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 71}, {"name": "print_gpu_specs", "filename": "lecture_06.py", "lineno": 782}], "5: _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81230MB, multi_processor_count=132)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 71}, {"name": "print_gpu_specs", "filename": "lecture_06.py", "lineno": 782}], "6: _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81230MB, multi_processor_count=132)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 71}, {"name": "print_gpu_specs", "filename": "lecture_06.py", "lineno": 782}], "7: _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81230MB, multi_processor_count=132)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 73}], "Basic structure: run f(i) for all i = 0, ..., N-1", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 75}], "## Execution model", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 76}], "https://docs.nvidia.com/cuda/parallel-thread-execution/_images/grid-with-CTAs.png", {"width": "50.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 77}], "- *Thread*: process individual index (i.e., f(i))", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 78}], "- *Thread block* (a.k.a. concurrent thread arrays): scheduled on a single SM", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 79}], "- *Grid*: collection of thread blocks", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 81}], "Why thread blocks? Shared memory.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 82}], "- Intuition: group f(i)'s that read similar data together", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 83}], "- Threads within a thread block have shared memory (as fast as L1 cache) [A100: 164KB]", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 84}], "- Can synchronize threads (for reading/writing) within a block (but not across blocks)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 86}], "### Hardware and execution interact.", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 87}], "https://developer-blogs.nvidia.com/wp-content/uploads/2019/06/pasted-image-0.png", {"width": "25.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 88}], "Thread blocks scheduled onto SMs in waves.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 89}], "Problem: last wave has fewer thread blocks, leaving some SMs idle (low occupancy).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 90}], "Wave quantization: make number of thread blocks divide # SMs.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 91}], "Rule of thumb: number of thread blocks should be >= 4x # SMs", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 92}], "Challenge: some aspects of hardware are hidden from the execution model (e.g., scheduling, # SMs).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 94}], "### Arithmetic intensity: # FLOPs / # bytes", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 95}], "- If high, operation is compute-bound (good)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 96}], "- If low, operation is memory-bound (bad)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 25}, {"name": "review_of_gpus", "filename": "lecture_06.py", "lineno": 97}], "General rule: matrix multiplication is compute-bound, everything else is memory-bound", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 103}], "IMPORTANT: benchmark/profile your code!", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 105}], "You can read spec sheets (marketing material) and papers, but performance depends on your library version, your hardware, your workload, so there is no substitute for benchmarking/profiling your code.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 109}], "Example computation: running forward/backward passes on an MLP.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 166}], "Benchmarking measures the wall-clock time of performing some operation.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 168}], "It only gives you end-to-end time, not where time is spent (profiling).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 170}], "It is still useful for:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 171}], "- comparing different implementations (which is faster?), and", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 172}], "- understanding how performance scales (e.g., with dimension).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 174}], "Let's define a convenient function for benchmarking an arbitrary function.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 175}], "sleep: [50.1, 50.1, 50.1] (mean 50.1 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 177}], "### Benchmarking matrix multiplication", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 178}], "First, let us benchmark matrix multiplication of square matrices.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 180}], "matmul(dim=1024): [0.1, 0.1, 0.1] (mean 0.1 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 180}], "matmul(dim=2048): [0.4, 0.4, 0.4] (mean 0.4 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 180}], "matmul(dim=4096): [2.7, 2.7, 2.7] (mean 2.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 180}], "matmul(dim=8192): [21.2, 21.2, 21.3] (mean 21.3 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 180}], "matmul(dim=16384): [163.2, 163.2, 163.2] (mean 163.2 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 181}], "Times scale cubicly with dimension.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 183}], "Let us benchmark our MLP!", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 189}], "run_mlp: [1.0, 1.0, 1.1] (mean 1.0 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 191}], "Scale the number of steps.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 193}], "run_mlp(2x num_steps): [3.4, 3.4, 3.4] (mean 3.4 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 193}], "run_mlp(3x num_steps): [7.3, 7.3, 7.4] (mean 7.4 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 193}], "run_mlp(4x num_steps): [12.8, 12.8, 12.8] (mean 12.8 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 193}], "run_mlp(5x num_steps): [19.7, 19.7, 19.8] (mean 19.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 195}], "Scale the number of layers.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 197}], "run_mlp(2x num_layers): [1.7, 1.8, 1.8] (mean 1.8 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 197}], "run_mlp(3x num_layers): [2.5, 2.5, 2.5] (mean 2.5 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 197}], "run_mlp(4x num_layers): [3.3, 3.3, 3.6] (mean 3.4 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 197}], "run_mlp(5x num_layers): [4.0, 4.1, 4.1] (mean 4.1 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 199}], "Scale the batch size.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 201}], "run_mlp(2x batch_size): [1.0, 1.0, 1.0] (mean 1.0 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 201}], "run_mlp(3x batch_size): [1.0, 1.0, 1.0] (mean 1.0 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 201}], "run_mlp(4x batch_size): [1.0, 1.0, 1.0] (mean 1.0 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 201}], "run_mlp(5x batch_size): [1.0, 1.0, 1.1] (mean 1.0 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 203}], "Scale the dimension.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 205}], "run_mlp(2x dim): [1.0, 1.0, 1.0] (mean 1.0 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 205}], "run_mlp(3x dim): [1.0, 1.0, 1.0] (mean 1.0 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 205}], "run_mlp(4x dim): [1.0, 1.0, 1.0] (mean 1.0 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 205}], "run_mlp(5x dim): [1.2, 1.2, 1.2] (mean 1.2 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 207}], "The timings are not always predictable due to the non-homogenous nature of CUDA kernels, hardware, etc.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 209}], "You can also use `torch.utils.benchmark`, which provides more amenities.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 209}], "https://pytorch.org/tutorials/recipes/recipes/benchmark.html", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 112}, {"name": "benchmarking", "filename": "lecture_06.py", "lineno": 210}], "We did not use this to make benchmarking more transparent.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 237}], "While benchmarking looks at end-to-end time, profiling looks at where time is spent.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 238}], "Obvious: profiling helps you understand where time is being spent.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 239}], "Deeper: profiling helps you understand (what is being called).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 241}], "PyTorch has a nice built-in profiler", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 241}], "https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 243}], "Let's profile some code to see what is going on under the hood.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 244}], "## sleep", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 244}], "------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 244}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 244}], "------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 244}], " cudaDeviceSynchronize 100.00% 17.000us 100.00% 17.000us 8.500us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 244}], "------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 244}], "Self CPU time total: 17.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 244}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 246}], "Let's start with some basic operations.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], "## add", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], " aten::add 97.63% 1.316ms 99.33% 1.339ms 1.339ms 16.000us 100.00% 16.000us 16.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add... 0.00% 0.000us 0.00% 0.000us 0.000us 16.000us 100.00% 16.000us 16.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], " cudaLaunchKernel 1.71% 23.000us 1.71% 23.000us 23.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], " cudaDeviceSynchronize 0.67% 9.000us 0.67% 9.000us 4.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], "Self CPU time total: 1.348ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], "Self CUDA time total: 16.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 247}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], "## matmul", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], " aten::matmul 0.28% 5.000us 84.60% 1.527ms 1.527ms 0.000us 0.00% 341.000us 341.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], " aten::mm 83.27% 1.503ms 84.32% 1.522ms 1.522ms 341.000us 100.00% 341.000us 341.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], "void cutlass::Kernel<cutlass_80_simt_sgemm_256x128_8x4_nn_align1>(cutlass_80_... 0.00% 0.000us 0.00% 0.000us 0.000us 341.000us 100.00% 341.000us 341.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], " cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags 0.06% 1.000us 0.06% 1.000us 1.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], " cudaFuncSetAttribute 0.06% 1.000us 0.06% 1.000us 1.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], " cudaLaunchKernel 0.94% 17.000us 0.94% 17.000us 17.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], " cudaDeviceSynchronize 15.40% 278.000us 15.40% 278.000us 139.000us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], "Self CPU time total: 1.805ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], "Self CUDA time total: 341.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 248}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], "## matmul(dim=128)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], " aten::matmul 0.31% 5.000us 99.51% 1.616ms 1.616ms 0.000us 0.00% 5.000us 5.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], " aten::mm 86.58% 1.406ms 99.20% 1.611ms 1.611ms 5.000us 100.00% 5.000us 5.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], "sm80_xmma_gemm_f32f32_f32f32_f32_nn_n_tilesize32x32x8_stage3_warpsize1x2x1_ff... 0.00% 0.000us 0.00% 0.000us 0.000us 5.000us 100.00% 5.000us 5.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], " cudaFuncGetAttributes 0.25% 4.000us 0.25% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], " cudaLaunchKernelExC 12.38% 201.000us 12.38% 201.000us 201.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], " cudaDeviceSynchronize 0.49% 8.000us 0.49% 8.000us 4.000us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], "Self CPU time total: 1.624ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], "Self CUDA time total: 5.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 249}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 251}], "Observations", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 252}], "- You can see what CUDA kernels are actually being called.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 253}], "- Different CUDA kernels are invoked depending on the tensor dimensions.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 255}], "Name of CUDA kernel tells us something about the implementation.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 256}], "Example: cutlass_80_simt_sgemm_256x128_8x4_nn_align1", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 257}], "- cutlass: NVIDIA's CUDA library for linear algebra", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 258}], "- 256x128: tile size", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 260}], "Let's now look at some composite operations.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "## cdist", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], " aten::cdist 0.96% 25.000us 89.97% 2.341ms 2.341ms 0.000us 0.00% 440.000us 440.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], " aten::_euclidean_dist 1.73% 45.000us 88.47% 2.302ms 2.302ms 0.000us 0.00% 440.000us 440.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], " aten::matmul 0.08% 2.000us 2.04% 53.000us 53.000us 0.000us 0.00% 349.000us 349.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], " aten::mm 1.58% 41.000us 1.96% 51.000us 51.000us 349.000us 79.32% 349.000us 349.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "sm80_xmma_gemm_f32f32_f32f32_f32_tn_n_tilesize128x128x8_stage3_warpsize2x2x1_... 0.00% 0.000us 0.00% 0.000us 0.000us 349.000us 79.32% 349.000us 349.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], " aten::cat 0.81% 21.000us 1.19% 31.000us 15.500us 30.000us 6.82% 30.000us 15.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "void at::native::(anonymous namespace)::CatArrayBatchedCopy_aligned16_contig<... 0.00% 0.000us 0.00% 0.000us 0.000us 30.000us 6.82% 30.000us 15.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], " aten::pow 70.79% 1.842ms 78.21% 2.035ms 1.018ms 21.000us 4.77% 21.000us 10.500us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous name... 0.00% 0.000us 0.00% 0.000us 0.000us 21.000us 4.77% 21.000us 10.500us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], " aten::sum 1.11% 29.000us 1.58% 41.000us 20.500us 14.000us 3.18% 14.000us 7.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "Self CPU time total: 2.602ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "Self CUDA time total: 440.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 261}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], "## gelu", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], " aten::add 45.71% 176.000us 92.99% 358.000us 358.000us 18.000us 66.67% 18.000us 18.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add... 0.00% 0.000us 0.00% 0.000us 0.000us 18.000us 66.67% 18.000us 18.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], " aten::gelu 3.64% 14.000us 5.19% 20.000us 20.000us 9.000us 33.33% 9.000us 9.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], "void at::native::vectorized_elementwise_kernel<4, at::native::GeluCUDAKernelI... 0.00% 0.000us 0.00% 0.000us 0.000us 9.000us 33.33% 9.000us 9.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], " cudaLaunchKernel 48.83% 188.000us 48.83% 188.000us 94.000us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], " cudaDeviceSynchronize 1.82% 7.000us 1.82% 7.000us 3.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], "Self CPU time total: 385.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], "Self CUDA time total: 27.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 262}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], "## softmax", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], " aten::add 89.37% 1.799ms 98.36% 1.980ms 1.980ms 18.000us 60.00% 18.000us 18.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add... 0.00% 0.000us 0.00% 0.000us 0.000us 18.000us 60.00% 18.000us 18.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], " aten::softmax 0.20% 4.000us 1.29% 26.000us 26.000us 0.000us 0.00% 12.000us 12.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], " aten::_softmax 0.70% 14.000us 1.09% 22.000us 22.000us 12.000us 40.00% 12.000us 12.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], "void at::native::(anonymous namespace)::cunn_SoftMaxForward<4, float, float, ... 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 40.00% 12.000us 12.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], " cudaLaunchKernel 9.39% 189.000us 9.39% 189.000us 94.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], " cudaDeviceSynchronize 0.35% 7.000us 0.35% 7.000us 3.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], "Self CPU time total: 2.013ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], "Self CUDA time total: 30.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 263}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 265}], "Now let's profile our MLP.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 266}], "We will also visualize our stack trace using a flame graph, which reveals where time is being spent.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "## mlp", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], " autograd::engine::evaluate_function: AddmmBackward0 1.48% 1.044ms 10.94% 7.712ms 60.250us 0.000us 0.00% 45.980ms 359.219us 128 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], " AddmmBackward0 1.22% 858.000us 6.67% 4.703ms 36.742us 0.000us 0.00% 44.702ms 349.234us 128 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], " aten::mm 3.18% 2.243ms 4.80% 3.385ms 13.327us 44.702ms 61.12% 44.702ms 175.992us 254 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], " aten::addmm 3.69% 2.603ms 4.97% 3.504ms 27.375us 23.428ms 32.03% 23.428ms 183.031us 128 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "sm80_xmma_gemm_f32f32_f32f32_f32_tn_n_tilesize128x128x8_stage3_warpsize2x2x1_... 0.00% 0.000us 0.00% 0.000us 0.000us 23.428ms 32.03% 23.428ms 183.031us 128 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "void cutlass::Kernel<cutlass_80_simt_sgemm_256x128_8x4_nt_align1>(cutlass_80_... 0.00% 0.000us 0.00% 0.000us 0.000us 22.077ms 30.19% 22.077ms 172.477us 128 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], " aten::linear 0.69% 485.000us 5.89% 4.153ms 32.445us 0.000us 0.00% 21.781ms 170.164us 128 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "void cutlass::Kernel<cutlass_80_simt_sgemm_256x128_8x4_nn_align1>(cutlass_80_... 0.00% 0.000us 0.00% 0.000us 0.000us 21.683ms 29.65% 21.683ms 172.087us 126 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], " autograd::engine::evaluate_function: torch::autograd::AccumulateGrad 1.00% 705.000us 3.70% 2.610ms 10.195us 0.000us 0.00% 1.815ms 7.090us 256 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], " torch::autograd::AccumulateGrad 0.46% 327.000us 2.70% 1.905ms 7.441us 0.000us 0.00% 1.815ms 7.090us 256 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "Self CPU time total: 70.482ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "Self CUDA time total: 73.138ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "", {"font-family": "monospace", "white-space": "pre"})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 113}, {"name": "profiling", "filename": "lecture_06.py", "lineno": 267}], "var/stacks_mlp.svg", {"width": "100%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 26}, {"name": "benchmarking_and_profiling", "filename": "lecture_06.py", "lineno": 115}], "Every time you make a change, benchmark/profile!", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 311}], "Horace He's blog post", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 311}], "https://horace.io/brrr_intro.html", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 313}], "Analogy: warehouse : DRAM :: factory : SRAM", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 314}], "https://horace.io/img/perf_intro/factory_bandwidth.png", {"width": "30.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 316}], "Each operation needs to read/compute/write:", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 317}], "https://horace.io/img/perf_intro/multi_operators.png", {"width": "30.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 319}], "If we *fuse* the operations, only need to read/write once:", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 320}], "https://horace.io/img/perf_intro/operator_fusion.png", {"width": "30.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 322}], "To see the effect of fusion, let's consider the GeLU activation function.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 323}], "https://pytorch.org/docs/stable/generated/torch.nn.GELU.html", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 325}], "Let's consider two ways to compute GeLU:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 328}], "1. The default PyTorch implementation (fused):", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 331}], "2. We can also write our own by hand (not fused):", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 340}], "Let's benchmark.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 341}], "manual_gelu: [7.7, 7.7, 7.7] (mean 7.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 342}], "pytorch_gelu: [0.7, 0.7, 0.7] (mean 0.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 343}], "The fused version is significantly faster.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 345}], "Let's look under the hood.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "## manual_gelu", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], " aten::mul 18.77% 1.815ms 22.39% 2.165ms 360.833us 5.222ms 68.11% 5.222ms 870.333us 6 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor<f... 0.00% 0.000us 0.00% 0.000us 0.000us 3.113ms 40.60% 3.113ms 1.038ms 3 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor<f... 0.00% 0.000us 0.00% 0.000us 0.000us 2.109ms 27.51% 2.109ms 703.000us 3 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], " aten::add 0.17% 16.000us 0.26% 25.000us 12.500us 1.741ms 22.71% 1.741ms 870.500us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add... 0.00% 0.000us 0.00% 0.000us 0.000us 1.037ms 13.53% 1.037ms 1.037ms 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], " aten::tanh 0.07% 7.000us 0.12% 12.000us 12.000us 704.000us 9.18% 704.000us 704.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "void at::native::vectorized_elementwise_kernel<4, at::native::tanh_kernel_cud... 0.00% 0.000us 0.00% 0.000us 0.000us 704.000us 9.18% 704.000us 704.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSe... 0.00% 0.000us 0.00% 0.000us 0.000us 704.000us 9.18% 704.000us 704.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], " cudaLaunchKernel 3.76% 364.000us 3.76% 364.000us 40.444us 0.000us 0.00% 0.000us 0.000us 9 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], " cudaDeviceSynchronize 77.23% 7.468ms 77.23% 7.468ms 3.734ms 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "Self CPU time total: 9.670ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "Self CUDA time total: 7.667ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 346}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], "## pytorch_gelu", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], " aten::gelu 66.83% 1.884ms 77.69% 2.190ms 2.190ms 703.000us 100.00% 703.000us 703.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], "void at::native::vectorized_elementwise_kernel<4, at::native::GeluCUDAKernelI... 0.00% 0.000us 0.00% 0.000us 0.000us 703.000us 100.00% 703.000us 703.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], " cudaLaunchKernel 10.85% 306.000us 10.85% 306.000us 306.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], " cudaDeviceSynchronize 22.31% 629.000us 22.31% 629.000us 314.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], "Self CPU time total: 2.819ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], "Self CUDA time total: 703.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 347}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 28}, {"name": "kernel_fusion_motivation", "filename": "lecture_06.py", "lineno": 348}], "The PyTorch just calls one kernel whereas the others are atomic (remember the warehouse/factory) ", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 352}], "Now let's open the box to understand what's going on inside a CUDA kernel by writing our own.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 354}], "Let's write the GeLU function in CUDA.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 373}], "CUDA is an extension of C/C++ with APIs for managing GPUs.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 375}], "Simplified picture: write f(i), CUDA kernel computes f(i) for all i.", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 377}], "https://docs.nvidia.com/cuda/parallel-thread-execution/_images/grid-with-CTAs.png", {"width": "50.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 378}], "Grid: collection of thread blocks: numBlocks = (2, 4), blockDim = (1, 8)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 379}], "Thread block: collection of threads: blockIdx = (0, 1)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 380}], "Thread: single unit of operation: threadIdx = (0, 3).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 382}], "You write code that a thread execute, using (blockIdx, blockDim, threadIdx) to determine what to do.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 384}], "Set CUDA_LAUNCH_BLOCKING so that if there are errors, CUDA will tell you what went wrong.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 387}], "The `load_inline` function makes it convenient to write CUDA code and bind it to a Python module for immediate use.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 355}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 396}], "Compile the CUDA code and bind it to a Python module.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 357}], "Check correctness of our implementation.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 360}], "Benchmark our CUDA version.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 361}], "pytorch_gelu: [0.7, 0.7, 0.7] (mean 0.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 362}], "manual_gelu: [7.7, 7.7, 7.7] (mean 7.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 363}], "cuda_gelu: [1.6, 1.6, 1.6] (mean 1.6 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], "## cuda_gelu", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], "------------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], "------------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], " gelu_kernel(float*, float*, int) 0.00% 0.000us 0.00% 0.000us 0.000us 1.666ms 100.00% 1.666ms 1.666ms 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], " aten::empty_like 0.14% 5.000us 49.31% 1.796ms 1.796ms 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], " aten::empty_strided 49.18% 1.791ms 49.18% 1.791ms 1.791ms 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], " cudaLaunchKernel 6.45% 235.000us 6.45% 235.000us 235.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], " cudaDeviceSynchronize 44.23% 1.611ms 44.23% 1.611ms 805.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], "------------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], "Self CPU time total: 3.642ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], "Self CUDA time total: 1.666ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 364}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 365}], "Our CUDA implementation is faster than manual, but not as good as PyTorch.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 367}], "Elementwise operations are easy in CUDA (though you can still be smarter).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 368}], "But most interesting operations (e.g., matmul, softmax, RMSNorm) require reading multiple values.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 29}, {"name": "cuda_kernels", "filename": "lecture_06.py", "lineno": 369}], "For that, you have to think about managing shared memory, etc.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 418}], "Developed by OpenAI in 2021", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 419}], "https://openai.com/research/triton", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 421}], "Make GPU programming more accessible", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 422}], "- Write in Python", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 423}], "- Think about thread blocks rather than threads", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 425}], "What does Triton offer?", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 426}], " CUDA Triton", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 427}], "- Memory coalescing (transfer from DRAM) manual automatic", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 428}], "- Shared memory management manual automatic", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 429}], "- Scheduling within SMs manual automatic", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 430}], "- Scheduling across SMs manual manual", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 413}, {"name": "triton_introduction", "filename": "lecture_06.py", "lineno": 432}], "Compiler does more work, can actually outperform PyTorch implementations!", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 436}], "One big advantage of Triton is that you can step through the Python code.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 438}], "Let's step through a Triton kernel.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 512}], "PTX (parallel thread execution) is like an assembly language for GPUs.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 514}], "We can see the PTX code generated by Triton.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 515}], "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 517}, {"name": "print_ptx", "filename": "lecture_06.py", "lineno": 541}], "Let's go poke around at the PTX code.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 517}, {"name": "print_ptx", "filename": "lecture_06.py", "lineno": 542}], "https://github.com/stanford-cs336/spring2024-lectures/blob/main/var/triton_gelu-ptx.txt", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 519}], "Observations:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 520}], "- ld.global.* and st.global.* reads and writes from global memory", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 521}], "- %ctaid.x is block index, %tid.x is thread index", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 522}], "- %f* are floating point registers, %r* are integer registers", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 523}], "- One thread processes 8 elements at the same time (thread coarsening)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 525}], "We can compare this to the CUDA code we wrote earlier:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 526}], "https://github.com/stanford-cs336/spring2024-lectures/blob/main/var/cuda_gelu-ptx.txt", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 442}, {"name": "print_ptx_main", "filename": "lecture_06.py", "lineno": 527}], "To get this, you have to look at the nvcc command that's printed out, add `-ptx -o var/cuda_gelu-ptx.txt` and rerun it.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 444}], "Check that it's correct.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 447}], "Let's now benchmark it compared to the PyTorch and CUDA implementations.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 448}], "Remember to set TRITON_INTERPRET=0 for good performance.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 449}], "manual_gelu: [7.7, 7.7, 7.7] (mean 7.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 450}], "pytorch_gelu: [0.7, 0.7, 0.7] (mean 0.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 373}], "CUDA is an extension of C/C++ with APIs for managing GPUs.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 375}], "Simplified picture: write f(i), CUDA kernel computes f(i) for all i.", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 377}], "https://docs.nvidia.com/cuda/parallel-thread-execution/_images/grid-with-CTAs.png", {"width": "50.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 378}], "Grid: collection of thread blocks: numBlocks = (2, 4), blockDim = (1, 8)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 379}], "Thread block: collection of threads: blockIdx = (0, 1)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 380}], "Thread: single unit of operation: threadIdx = (0, 3).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 382}], "You write code that a thread execute, using (blockIdx, blockDim, threadIdx) to determine what to do.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 384}], "Set CUDA_LAUNCH_BLOCKING so that if there are errors, CUDA will tell you what went wrong.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 387}], "The `load_inline` function makes it convenient to write CUDA code and bind it to a Python module for immediate use.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 396}], "Compile the CUDA code and bind it to a Python module.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 451}], "cuda_gelu: [1.6, 1.6, 1.6] (mean 1.6 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 452}], "triton_gelu: [0.9, 0.9, 0.9] (mean 0.9 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], "## triton_gelu", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], "------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], "------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], " triton_gelu_kernel_0d1d2de 0.00% 0.000us 0.00% 0.000us 0.000us 706.000us 100.00% 706.000us 706.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], " aten::empty_like 0.32% 4.000us 27.72% 346.000us 346.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], " aten::empty_strided 27.40% 342.000us 27.40% 342.000us 342.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], " aten::is_pinned 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], " cuLaunchKernel 19.63% 245.000us 19.63% 245.000us 245.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], " cudaDeviceSynchronize 52.64% 657.000us 52.64% 657.000us 328.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], "------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], "Self CPU time total: 1.248ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], "Self CUDA time total: 706.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 454}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 456}], "Our Triton implementation (triton_gelu):", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 457}], "- is almost as good as the PyTorch implementation (pytorch_gelu).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 458}], "- is actually slower than our naive CUDA implementation (cuda_gelu).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 460}], "Triton operates on blocks, CUDA operates on threads.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 461}], "Blocks allows Triton compiler to do other optimizations (e.g., thread coarsening).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 30}, {"name": "triton_kernels", "filename": "lecture_06.py", "lineno": 414}, {"name": "triton_gelu_main", "filename": "lecture_06.py", "lineno": 463}], "Everything is way faster than the manual implementation (manual_gelu).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 546}], "So far, we have seen three ways to write GeLU:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 547}], "- Use the default PyTorch function", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 548}], "- Write it in Python", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 548}], "<function manual_gelu at 0x1458db360ca0>", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 549}], "- Write it in CUDA", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 549}], "<function create_cuda_gelu at 0x1458db360160>", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 550}], "- Write it in Triton", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 550}], "<function triton_gelu at 0x1458db3603a0>", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 552}], "- Write it in Python and compile it into Triton", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 555}], "Check correctness of our implementation.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 558}], "Let's benchmark and profile it!", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 559}], "manual_gelu: [7.7, 7.7, 7.7] (mean 7.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 560}], "pytorch_gelu: [0.7, 0.7, 0.7] (mean 0.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 373}], "CUDA is an extension of C/C++ with APIs for managing GPUs.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 375}], "Simplified picture: write f(i), CUDA kernel computes f(i) for all i.", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 377}], "https://docs.nvidia.com/cuda/parallel-thread-execution/_images/grid-with-CTAs.png", {"width": "50.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 378}], "Grid: collection of thread blocks: numBlocks = (2, 4), blockDim = (1, 8)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 379}], "Thread block: collection of threads: blockIdx = (0, 1)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 380}], "Thread: single unit of operation: threadIdx = (0, 3).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 382}], "You write code that a thread execute, using (blockIdx, blockDim, threadIdx) to determine what to do.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 384}], "Set CUDA_LAUNCH_BLOCKING so that if there are errors, CUDA will tell you what went wrong.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 387}], "The `load_inline` function makes it convenient to write CUDA code and bind it to a Python module for immediate use.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}, {"name": "create_cuda_gelu", "filename": "lecture_06.py", "lineno": 396}], "Compile the CUDA code and bind it to a Python module.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 561}], "cuda_gelu: [1.6, 1.6, 1.6] (mean 1.6 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 562}], "triton_gelu: [0.9, 0.9, 0.9] (mean 0.9 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 563}], "compiled_gelu: [0.8, 0.9, 0.9] (mean 0.9 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 565}], "Let's look under the hood", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], "## compiled_gelu", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], "----------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], "----------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], " Torch-Compiled Region 49.36% 2.408ms 87.47% 4.267ms 4.267ms 0.000us 0.00% 706.000us 706.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], " triton_poi_fused_add_mul_tanh_0 0.80% 39.000us 37.90% 1.849ms 1.849ms 706.000us 100.00% 706.000us 706.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], " triton__0d1d2 0.00% 0.000us 0.00% 0.000us 0.000us 706.000us 100.00% 706.000us 706.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], " TorchDynamo Cache Lookup 0.43% 21.000us 0.43% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], " aten::empty 0.21% 10.000us 0.21% 10.000us 10.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], " cuLaunchKernel 37.11% 1.810ms 37.11% 1.810ms 1.810ms 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], " cudaDeviceSynchronize 12.10% 590.000us 12.10% 590.000us 295.000us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], "----------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], "Self CPU time total: 4.878ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], "Self CUDA time total: 706.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 31}, {"name": "pytorch_compilation", "filename": "lecture_06.py", "lineno": 566}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 570}], "So far, we've looked at elementwise operations in Triton (e.g., GeLU).", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 571}], "Now let us look at operations that aggregate over multiple values.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 573}], "We will roughly follow the Triton fused softmax tutorial:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 573}], "https://triton-lang.org/main/getting-started/tutorials/02-fused-softmax.html", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 575}], "Recall the softmax operation is used in attention and generating probabilities.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 576}], "Normalize each row of a matrix:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 577}], "[A1 A2 A3] => [A1/A A2/A A3/A]", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 578}], "[B1 B2 B3] => [B1/B B2/B B3/B]", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 580}], "Let's first start with the naive implementation and keep track of reads/writes.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 587}], "Now let us write the Triton kernel.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 591}], "Check our implementations are correct.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 597}], "Now let's benchmark everything.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 598}], "manual_softmax: [3.2, 3.2, 3.2] (mean 3.2 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 599}], "compiled_softmax: [1.1, 1.1, 1.1] (mean 1.1 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 600}], "pytorch_softmax: [1.3, 1.3, 1.3] (mean 1.3 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 601}], "triton_softmax: [0.9, 0.9, 1.0] (mean 0.9 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 603}], "Look under the hood using the profiler.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "## manual_softmax", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], " aten::div 0.26% 10.000us 0.39% 15.000us 15.000us 921.000us 28.33% 921.000us 921.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocas... 0.00% 0.000us 0.00% 0.000us 0.000us 921.000us 28.33% 921.000us 921.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], " aten::sub 0.36% 14.000us 0.57% 22.000us 22.000us 892.000us 27.44% 892.000us 892.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocas... 0.00% 0.000us 0.00% 0.000us 0.000us 892.000us 27.44% 892.000us 892.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], " aten::exp 0.21% 8.000us 0.34% 13.000us 13.000us 704.000us 21.65% 704.000us 704.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda... 0.00% 0.000us 0.00% 0.000us 0.000us 704.000us 21.65% 704.000us 704.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], " aten::max 9.67% 371.000us 17.95% 689.000us 689.000us 388.000us 11.93% 388.000us 388.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "void at::native::reduce_kernel<512, 1, at::native::ReduceOp<float, at::native... 0.00% 0.000us 0.00% 0.000us 0.000us 388.000us 11.93% 388.000us 388.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], " aten::sum 0.36% 14.000us 0.50% 19.000us 19.000us 346.000us 10.64% 346.000us 346.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "void at::native::reduce_kernel<512, 1, at::native::ReduceOp<float, at::native... 0.00% 0.000us 0.00% 0.000us 0.000us 346.000us 10.64% 346.000us 346.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "Self CPU time total: 3.838ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "Self CUDA time total: 3.251ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 604}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], "## compiled_softmax", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], "------------------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], "------------------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], " Torch-Compiled Region 60.69% 1.831ms 71.03% 2.143ms 2.143ms 0.000us 0.00% 956.000us 956.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], " triton_red_fused_div_exp_max_sub_sum_0 1.23% 37.000us 9.98% 301.000us 301.000us 956.000us 100.00% 956.000us 956.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], " triton__0d1d2de3de 0.00% 0.000us 0.00% 0.000us 0.000us 956.000us 100.00% 956.000us 956.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], " TorchDynamo Cache Lookup 0.56% 17.000us 0.56% 17.000us 17.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], " aten::empty 0.36% 11.000us 0.36% 11.000us 11.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], " cuLaunchKernel 8.75% 264.000us 8.75% 264.000us 264.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], " cudaDeviceSynchronize 28.41% 857.000us 28.41% 857.000us 428.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], "------------------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], "Self CPU time total: 3.017ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], "Self CUDA time total: 956.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 605}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], "## pytorch_softmax", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], " aten::softmax 0.16% 5.000us 60.58% 1.901ms 1.901ms 0.000us 0.00% 1.305ms 1.305ms 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], " aten::_softmax 54.14% 1.699ms 60.42% 1.896ms 1.896ms 1.305ms 100.00% 1.305ms 1.305ms 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], "void at::native::(anonymous namespace)::cunn_SoftMaxForward<4, float, float, ... 0.00% 0.000us 0.00% 0.000us 0.000us 1.305ms 100.00% 1.305ms 1.305ms 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], " cudaLaunchKernel 6.28% 197.000us 6.28% 197.000us 197.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], " cudaDeviceSynchronize 39.42% 1.237ms 39.42% 1.237ms 618.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], "-------------------------------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], "Self CPU time total: 3.138ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], "Self CUDA time total: 1.305ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 606}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], "## triton_softmax", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], "--------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], " Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], "--------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], " triton_softmax_kernel_0d1d2de3de4de 0.00% 0.000us 0.00% 0.000us 0.000us 707.000us 100.00% 707.000us 707.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], " aten::empty_like 0.20% 5.000us 66.80% 1.712ms 1.712ms 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], " aten::empty_strided 66.60% 1.707ms 66.60% 1.707ms 1.707ms 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], " aten::is_pinned 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], " cuLaunchKernel 7.57% 194.000us 7.57% 194.000us 194.000us 0.000us 0.00% 0.000us 0.000us 1 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], " cudaDeviceSynchronize 25.63% 657.000us 25.63% 657.000us 328.500us 0.000us 0.00% 0.000us 0.000us 2 ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], "--------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], "Self CPU time total: 2.563ms", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], "Self CUDA time total: 707.000us", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 607}], "", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 609}, {"name": "print_ptx", "filename": "lecture_06.py", "lineno": 541}], "Let's go poke around at the PTX code.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 609}, {"name": "print_ptx", "filename": "lecture_06.py", "lineno": 542}], "https://github.com/stanford-cs336/spring2024-lectures/blob/main/var/triton_softmax-ptx.txt", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 611}], "Observations:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 34}, {"name": "triton_softmax_main", "filename": "lecture_06.py", "lineno": 612}], "- Triton outperforms everything!", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 684}], "Matrix multipliction is perhaps the most optimized algorithm ever.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 686}], "If you write matrix multiplication in CUDA, there's all sorts of crazy things you have to do.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 687}], "https://github.com/openai/blocksparse/blob/master/src/matmul_op_gpu.cu", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 689}], "It's much easier in Triton.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 690}], "https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 692}], " k j ", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 693}], " [ A1 A2 A3 ] [ B1 B2 B3 ] [ C1 C2 C3 ]", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 694}], "i [ A4 A5 A6 ] * k [ B4 B5 B6 ] = [ C4 C5 C6 ]", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 695}], " [ A7 A8 A9 ] [ B7 B8 B9 ] [ C7 C8 C9 ]", {"font-family": "monospace", "white-space": "pre"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 697}], "Naively: need MKN reads, MN writes", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 699}], "Computing C4 and C5 both need A4, A5, A6.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 700}], "Can we read A4, A5, A6 from DRAM once to compute both?", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 701}], "Answer: yes, using shared memory!", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 703}], "## Tiling (leveraging shared memory)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 705}], "Recall that shared memory is:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 706}], "- fast (10x faster) and small(~100KB)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 707}], "- shared between all the threads in a block.", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 708}], "https://miro.medium.com/v2/resize:fit:2000/format:webp/1*6xoBKi5kL2dZpivFe1-zgw.jpeg", {"width": "100.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 710}], "Trivial: for small matrices, load all of A and B into shared memory, then could compute C.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 711}], "Now we get MK + KN reads, MN writes", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 713}], "But what if we have big matrices...", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 715}], "https://www.researchgate.net/profile/Axel-Huebl/publication/320499173/figure/fig1/AS:614298980196359@1523471698396/Performance-critical-A-B-part-of-the-GEMM-using-a-tiling-strategy-A-thread-iterates.png", {"width": "50.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 716}], "Key idea: divide the matrix into blocks.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 717}], "For each block of A and block of B:", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 718}], "- load into shared memory,", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 719}], "- do mini-matrix multiplication,", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 720}], "- write the partial sum.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 722}], "Animation of tiled matrix multiplication", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 722}], "https://youtu.be/aMvCEEBIBto", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 724}], "## Leveraging L2 cache", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 726}], "Two ways of computing 9 elements of a matrix:", {})
addImage([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 727}], "https://triton-lang.org/main/_images/grouped_vs_row_major_ordering.png", {"width": "50.0%"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 728}], "1. Loads 9 + 81 = 90 blocks", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 729}], "1. Loads 27 + 27 = 54 blocks", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 731}], "Process the blocks in an order that minimizes the reads.", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 733}], "Why write your own kernel for matrix multiplication (e.g., A @ B)?", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 734}], "Answer: fusion with another operation (e.g., gelu(A @ B))", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 736}], "Let's try it!", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 737}], "pytorch_matmul: [163.2, 163.2, 163.2] (mean 163.2 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 35}, {"name": "triton_matmul_main", "filename": "lecture_06.py", "lineno": 738}], "triton_matmul: [77.6, 77.7, 77.8] (mean 77.7 ms)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 37}], "## Summary", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 39}], "Gap between the programming model (PyTorch, Triton, PTX) and hardware => performance mysteries", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 41}], "Benchmarking for understanding scaling", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 42}], "Profiling for understanding internals of PyTorch functions (bottoms out with kernels)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 43}], "Looking at PTX assembly to understand internals of CUDA kernels", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 45}], "5 ways to write a function: manual, PyTorch, compiled, CUDA, Triton", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 46}], "GeLU (element-wise), softmax (row-wise), matmul (complex aggregation)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 48}], "Key principle: organize computation to minimize reads/writes", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 49}], "Key ideas: kernel fusion (warehouse/factory analogy), tiling (shared memory)", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 50}], "Automatic compilers (Triton, torch.compile) will get better over time", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 745}], "https://horace.io/brrr_intro.html", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 747}], "CUDA MODE Lecture 1: how to profile CUDA kernels in PyTorch", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 748}], "https://www.youtube.com/watch?v=LuhJEEJQgUM", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 749}], "CUDA MODE Lecture 2: Chapters 1-3 of PPMP book", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 750}], "https://www.youtube.com/watch?v=NQ-0D5Ti2dc", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 751}], "CUDA MODE Lecture 3: Getting started with CUDA for Python Programmers", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 752}], "https://www.youtube.com/watch?v=4sgKnKbR-WE", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 753}], "CUDA MODE Lecture 4: Compute and memory basics", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 754}], "https://www.youtube.com/watch?v=lTmYrKwjSOU", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 755}], "CUDA MODE Lecture 8: CUDA performance checklist", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 756}], "https://www.youtube.com/watch?v=SGhfUhlowB4", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 758}], "HetSys Course: Lecture 1: Programming heterogenous computing systems with GPUs", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 759}], "https://www.youtube.com/watch?v=8JGo2zylE80", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 760}], "HetSys Course: Lecture 2: SIMD processing and GPUs", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 761}], "https://www.youtube.com/watch?v=x1MA4MtO4Tc", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 762}], "HetSys Course: Lecture 3: GPU Software Hierarchy", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 763}], "https://www.youtube.com/watch?v=KGZ00J5MJz0", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 764}], "HetSys Course: Lecture 4: GPU Memory Hierarchy", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 765}], "https://www.youtube.com/watch?v=ZQKMZIP3Fzg", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 766}], "HetSys Course: Lecture 5: GPU performance considerations", {})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 767}], "https://www.youtube.com/watch?v=ODeprwr3Jho", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 769}], "https://jonathan-hui.medium.com/ai-chips-a100-gpu-with-nvidia-ampere-architecture-3034ed685e6e", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 770}], "https://docs.nvidia.com/deeplearning/performance/dl-performance-gpu-background/index.html", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 771}], "https://github.com/srush/gpu-puzzles", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 772}], "https://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf", {"color": "gray"})
addText([{"name": "lecture_06", "filename": "lecture_06.py", "lineno": 52}, {"name": "further_reading", "filename": "lecture_06.py", "lineno": 773}], "https://towardsdatascience.com/how-pytorch-2-0-accelerates-deep-learning-with-operator-fusion-and-cpu-gpu-code-generation-35132a85bd26", {"color": "gray"})