const Cuda = @import("cudaz");
const CuDevice = Cuda.Device;
const CuCompile = Cuda.Compile;
const CuLaunchConfig = Cuda.LaunchConfig;
const std = @import("std");
const time = std.time;
test "Setup" {
const device = try CuDevice.default();
defer device.free();
}
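// Allocate: reserves 2^30 f32 elements on the device (about 4 GiB, assuming `alloc`
// takes an element count) and releases them when the test scope exits.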
test "Allocate" {
const device = try CuDevice.default();
defer device.free();
const slice = try device.alloc(f32, 1024 * 1024 * 1024);
defer slice.free();
}
test "host_to_device" {
const device = try CuDevice.default();
defer device.free();
const slice = try device.htodCopy(f32, &[_]f32{ 1.2, 3.4, 8.9 });
defer slice.free();
}
test "device_to_host" {
const device = try CuDevice.default();
defer device.free();
var float_arr = [_]f32{ 1.2, 3.4, 8.9 };
const slice = try device.htodCopy(f32, &float_arr);
var arr = try CuDevice.syncReclaim(f32, std.testing.allocator, slice);
defer arr.deinit();
try std.testing.expect(std.mem.eql(f32, &float_arr, arr.items));
}
const increment_kernel =
    \\extern "C" __global__ void increment(float *out)
    \\{
    \\    int i = blockIdx.x * blockDim.x + threadIdx.x;
    \\    out[i] = out[i] + 1;
    \\}
;
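// inc_file: compiles the kernel source above to PTX at test time (via CuCompile.cudaText),
// loads it, and launches a single block of three threads, one per element of `data`,
// so each value comes back incremented by exactly 1.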
test "inc_file" {
const device = try CuDevice.default();
defer device.free();
const data = [_]f32{ 1.2, 2.8, 0.123 };
const cu_slice = try device.htodCopy(f32, &data);
defer cu_slice.free();
const ptx = try CuCompile.cudaText(increment_kernel, .{}, std.testing.allocator);
defer std.testing.allocator.free(ptx);
const module = try CuDevice.loadPtxText(ptx);
const function = try module.getFunc("increment");
try function.run(.{&cu_slice.device_ptr}, CuLaunchConfig{ .block_dim = .{ 3, 1, 1 }, .grid_dim = .{ 1, 1, 1 }, .shared_mem_bytes = 0 });
const incremented_arr = try CuDevice.syncReclaim(f32, std.testing.allocator, cu_slice);
defer incremented_arr.deinit();
for (incremented_arr.items, data) |id, d| {
std.debug.assert(id - d == 1);
}
}
// Passing `n` as a comptime-known value (comptime_int) caused a segmentation fault: the value
// is folded away at compile time, so there is no runtime `n` for CUDA to fetch.
// Declaring it as `const` still segfaults; it must be a runtime `var` so the kernel gets a real address.
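// A hedged sketch of the failure mode described above (hypothetical call shapes, not re-verified):
//
//     const n = float_arr.len; // comptime-known; kernel launch segfaults
//     try func.run(.{ &dest_slice.device_ptr, &src_slice.device_ptr, &n }, cfg);
//
// whereas a runtime `var`, as used in the test below, has a real address to pass:
//
//     var n = float_arr.len;
//     try func.run(.{ &dest_slice.device_ptr, &src_slice.device_ptr, &n }, cfg);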
test "ptx_sin_file" {
// CuDevice Initialization
var device = try CuDevice.default();
defer device.free();
//Load module from ptx
const module = try CuDevice.loadPtx(.{ .raw_path = "cuda/sin.ptx" });
const func = try module.getFunc("sin_kernel");
//init variables
var float_arr = [_]f32{ 1.57, 0.5236, 0, 1.05 };
const src_slice = try device.htodCopy(f32, &float_arr);
var dest_slice = try src_slice.clone();
// Launch config
const cfg = CuLaunchConfig.for_num_elems(float_arr.len);
var n = float_arr.len;
// Run the func
try func.run(.{ &dest_slice.device_ptr, &src_slice.device_ptr, &n }, cfg);
const sin_d = try CuDevice.syncReclaim(f32, std.testing.allocator, dest_slice);
defer sin_d.deinit();
for (0..float_arr.len) |index| {
try std.testing.expect(std.math.approxEqAbs(f32, @sin(float_arr[index]), sin_d.items[index], std.math.floatEps(f32)));
}
}
test "compile_ptx" {
const file = try std.fs.cwd().openFile("cuda/sin.cu", .{});
const ptx_data = try CuCompile.cudaFile(file, .{ .use_fast_math = true }, std.testing.allocator);
std.testing.allocator.free(ptx_data);
}
const cuda_src =
    \\extern "C" __global__ void matmul(float* A, float* B, float* C, const int N) {
    \\    int ROW = blockIdx.y * blockDim.y + threadIdx.y;
    \\    int COL = blockIdx.x * blockDim.x + threadIdx.x;
    \\
    \\    float tmpSum = 0;
    \\
    \\    if (ROW < N && COL < N) {
    \\        // each thread computes one element of C
    \\        for (int i = 0; i < N; i++) {
    \\            tmpSum += A[ROW * N + i] * B[i * N + COL];
    \\        }
    \\        C[ROW * N + COL] = tmpSum;
    \\    }
    \\}
;
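// matmul_kernel: each thread computes one element of C. With N = 2 and the single
// 2x2 block in the launch config below, there is exactly one thread per output element.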
test "matmul_kernel" {
const ptx_data = try Cuda.Compile.cudaText(cuda_src, .{}, std.testing.allocator);
defer std.testing.allocator.free(ptx_data);
var device = try CuDevice.default();
defer device.free();
const module = try CuDevice.loadPtxText(ptx_data);
const func = try module.getFunc("matmul");
const a = [_]f32{ 1.0, 2.0, 3.0, 4.0 };
const a_slice = try device.htodCopy(f32, &a);
const b_slice = try a_slice.clone();
var c_slice = try a_slice.clone();
const cfg = Cuda.LaunchConfig{ .block_dim = .{ 2, 2, 1 }, .grid_dim = .{ 1, 1, 1 }, .shared_mem_bytes = 0 };
var n: i32 = 2;
try func.run(.{ &a_slice.device_ptr, &b_slice.device_ptr, &c_slice.device_ptr, &n }, cfg);
const arr = try CuDevice.syncReclaim(f32, std.testing.allocator, c_slice);
defer arr.deinit();
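    // Expected result for A = B = [[1, 2], [3, 4]] (row-major): A*B = [[7, 10], [15, 22]].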
    for (arr.items, [_]f32{ 7.0, 10.0, 15.0, 22.0 }) |c, o| {
        try std.testing.expect(std.math.approxEqAbs(f32, c, o, std.math.floatEps(f32)));
    }
}
// test "runtime_init" {
// const device = try CuDevice.default();
// defer device.free();
// const slice = try device.allocR(Cuda.DType.f16, 10);
// slice.free();
// }