import collections
import gc
import io
import unittest
import torch
import torch.nn as nn
import torch.optim
import torch.utils.data
from torch.testing._internal.common_utils import (
TestCase, run_tests, TEST_WITH_ASAN, IS_WINDOWS, TemporaryFileName)
import torch.autograd.profiler as profiler
from torch.autograd.profiler import profile
from torch.autograd import kineto_available
try:
import psutil
HAS_PSUTIL = True
except ImportError:
HAS_PSUTIL = False
import pickle


@unittest.skipIf(not HAS_PSUTIL, "Requires psutil to run")
@unittest.skipIf(TEST_WITH_ASAN, "Cannot test with ASAN")
@unittest.skipIf(IS_WINDOWS, "Test is flaky on Windows")
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
class TestProfilerCUDA(TestCase):
def test_mem_leak(self):
"""Checks that there's no memory leak when using profiler with CUDA
"""
t = torch.rand(1, 1).cuda()
p = psutil.Process()
last_rss = collections.deque(maxlen=5)
for outer_idx in range(10):
with profile(use_cuda=True):
for _ in range(1024):
t = torch.mm(t, t)
gc.collect()
torch.cuda.empty_cache()
last_rss.append(p.memory_info().rss)

        # when CUDA events were leaking, memory usage grew by ~7 MB between
        # the profiler invocations above
is_increasing = all(
[last_rss[idx] > last_rss[idx - 1] for idx in range(1, len(last_rss))])
max_diff = -1
for idx in range(1, len(last_rss)):
max_diff = max(max_diff, last_rss[idx] - last_rss[idx - 1])
self.assertTrue(not (is_increasing and max_diff > 100 * 1024),
msg='memory usage is increasing, {}'.format(str(last_rss)))


class TestProfiler(TestCase):
def test_source(self):
"""Checks that source code attribution works for eager, TS and autograd mode
"""
# avoid automatic inlining
prev_opt = torch._C._get_graph_executor_optimize()
torch._C._set_graph_executor_optimize(False)

        @torch.jit.script
def ts_method_2(x, y):
return torch.matmul(x, y)

        @torch.jit.script
def ts_method_1(x, y, z):
a = x + z
w = ts_method_2(x, y) + a
return w.sum()

        class DummyModule(nn.Module):
def __init__(self):
super(DummyModule, self).__init__()
self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=False)
def forward(self, x):
return self.conv(x)

        mod = DummyModule()
with profile(with_stack=True, use_kineto=kineto_available()) as p:
x = torch.randn(10, 10, requires_grad=True)
y = torch.randn(10, 10, requires_grad=True)
z = x + y
w = ts_method_1(x, y, z)
v = 2 * w
v.backward()
a = torch.randn(2, 3, 2, 2, requires_grad=True)
b = mod(a)
c = b.sum()
c.backward()
print(p.key_averages(
group_by_stack_n=5).table(
sort_by="self_cpu_time_total", row_limit=-1))
for e in p.function_events:
if "aten::add" in e.name or "AddBackward" in e.name:
self.assertTrue(any(["test_profiler" in entry for entry in e.stack]))
self.assertTrue(any([(
"test_source" in entry or
"ts_method_1" in entry or
"ts_method_2" in entry) for entry in e.stack]))
torch._C._set_graph_executor_optimize(prev_opt)

    def payload(self):
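        """Simple mixed CPU/CUDA workload reused by the Kineto tests below."""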
x = torch.randn(10, 10).cuda()
y = torch.randn(10, 10).cuda()
z = torch.mm(x, y)
z = z + y
z = z.cpu()

    @unittest.skipIf(not kineto_available(), "Kineto is required")
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
def test_kineto(self):
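        """Checks that CUDA GEMM and memcpy events are captured when profiling with Kineto."""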
with profile(use_cuda=True, use_kineto=True):
self.payload()
# rerun to avoid initial start overhead
with profile(use_cuda=True, use_kineto=True) as p:
self.payload()
print(p.key_averages().table(
sort_by="self_cuda_time_total", row_limit=-1))
found_gemm = False
found_memcpy = False
for e in p.function_events:
if "gemm" in e.name:
found_gemm = True
if "Memcpy" in e.name or "memcpy" in e.name:
found_memcpy = True
self.assertTrue(found_gemm)
self.assertTrue(found_memcpy)
# p.export_chrome_trace("/tmp/test_trace.json")

    def test_high_level_trace(self):
"""Checks that python side high level events are recorded.
"""

        class RepeatedDataset(torch.utils.data.Dataset):
def __init__(self, N, D_in, D_out):
self.N = N
self.x = torch.randn(N, D_in)
self.y = torch.randn(N, D_out)
def __len__(self):
return self.N
def __getitem__(self, idx):
return self.x, self.y

        class TwoLayerNet(torch.nn.Module):
def __init__(self, D_in, H, D_out):
super(TwoLayerNet, self).__init__()
self.linear1 = torch.nn.Linear(D_in, H)
self.linear2 = torch.nn.Linear(H, D_out)
def forward(self, x):
h_relu = self.linear1(x).clamp(min=0)
y_pred = self.linear2(h_relu)
return y_pred

        class CustomSGD(torch.optim.SGD):
def __init__(self, *args, **kwargs):
super(CustomSGD, self).__init__(*args, **kwargs)

        def train():
for _, data in enumerate(dataloader):
x, y = data[0], data[1]
y_pred = model(x)
loss = criterion(y_pred, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()

        N, D_in, H, D_out = 8, 10, 5, 2
model = TwoLayerNet(D_in, H, D_out)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
ds = RepeatedDataset(N, D_in, D_out)
dataloader = torch.utils.data.DataLoader(ds, batch_size=1)
try:
train()
except Exception:
            self.fail("Expected no exception without profiling.")

        # Create multiple instances; expect each function to be hooked only once.
        # Nested wrappers (repeated patching) would make the following test fail.
optimizer_duplicate = torch.optim.SGD(model.parameters(), lr=1e-4)
dataloader_duplicate = torch.utils.data.DataLoader(ds, batch_size=1)
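
        # Helper: assert that each expected high-level event name was recorded
        # exactly the expected number of times.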
def judge(expected_event_count, prof):
actual_event_count = {}
for e in prof.function_events:
if "#" in e.name:
key = e.name
if key in expected_event_count.keys():
actual_event_count[key] = actual_event_count.setdefault(key, 0) + 1
for key, count in expected_event_count.items():
self.assertTrue((key in actual_event_count.keys()) and (count == actual_event_count[key]))

        with profile() as prof:
train()
expected_event_count = {
# "+1" because the final iteration will enter __next__ but skip the loop body.
"enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__": (N + 1),
"Optimizer.step#SGD.step": N,
"Optimizer.zero_grad#SGD.zero_grad": N
}
judge(expected_event_count, prof)

        # Test pickling/unpickling; this is expected to work with multiprocessing.
optimizer = pickle.loads(pickle.dumps(optimizer))
with profile() as prof:
train()
judge(expected_event_count, prof)

        # Test with a customized optimizer.
optimizer = CustomSGD(model.parameters(), lr=1e-4)
with profile() as prof:
train()
expected_event_count = {
"enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__": (N + 1),
"Optimizer.step#CustomSGD.step": N,
"Optimizer.zero_grad#CustomSGD.zero_grad": N
}
judge(expected_event_count, prof)

    def test_flops(self):
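        """Checks that a FLOPS column appears in the profiler table when with_flops=True."""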
model = torch.nn.Sequential(
nn.Conv2d(16, 33, 18),
nn.ReLU(),
nn.Linear(243, 243),
nn.ReLU(),
)
inputs = torch.randn(40, 16, 18, 260)
with profiler.profile(record_shapes=True, with_flops=True) as prof:
model(inputs)
profiler_output = prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10)
print(profiler_output)
self.assertIn("FLOPS", profiler_output)

    @unittest.skipIf(not kineto_available(), "Kineto is required")
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
def test_kineto_profiler_api(self):
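        """Checks the torch.profiler API: scheduled tracing with an on_trace_ready callback,
        and profiling without a schedule."""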
called_num = [0]
with profile(use_cuda=True, use_kineto=True):
self.payload()

        def trace_handler(p):
print(p.key_averages().table(
sort_by="self_cuda_time_total", row_limit=-1))
# p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
called_num[0] += 1

        with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA],
schedule=torch.profiler.schedule(
wait=1,
warmup=1,
active=2),
on_trace_ready=trace_handler
) as p:
for idx in range(8):
self.payload()
p.step()
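
        # wait=1 + warmup=1 + active=2 is a 4-step cycle, so 8 steps complete two
        # active windows and trace_handler should have fired twice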
self.assertEqual(called_num[0], 2)

        # case without a schedule
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA]
) as p:
self.payload()
self.payload()
print(p.key_averages().table(
sort_by="self_cuda_time_total", row_limit=-1))

    def test_export_stacks(self):
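        """Checks that export_stacks writes non-empty records ending in an integer value."""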
with profile(with_stack=True, use_kineto=kineto_available()) as p:
x = torch.randn(10, 10)
y = torch.randn(10, 10)
z = torch.mm(x, y)
z = z + y
with TemporaryFileName(mode="w+") as fname:
p.export_stacks(fname)
with io.open(fname, 'r') as f:
lines = f.readlines()
assert len(lines) > 0, "Empty stacks file"
for line in lines:
is_int = False
try:
assert int(line.split(" ")[-1]) > 0, "Invalid stacks record"
is_int = True
except ValueError:
pass
assert is_int, "Invalid stacks record"


if __name__ == '__main__':
run_tests()