-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstreams_cuda.h
143 lines (117 loc) · 5.04 KB
/
streams_cuda.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
// Compute bound kernel: performs multiply-add operations
template <typename _TYPE_>
__global__ void cuda_kernel(_TYPE_ *data, const int offset, const int dataSize)
{
int idx = offset + blockIdx.x * blockDim.x + threadIdx.x;
if (idx < dataSize)
{
_TYPE_ a = sqrt(data[idx]);
_TYPE_ b = cos(a);
_TYPE_ c = sin(b);
data[idx] = c * a * b * data[idx];
}
}
template <typename _TYPE_>
void streams_cuda(const int MemSizeArraysMB, const int N_imposed = -1)
// Showcase the use of streams to overlap H2D /D2H transfers with host/device kernels
{
// Timers
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
std::cout << "\n ||| CUDA STREAMS, dtype = " << typeid(_TYPE_).name() << " |||\n";
// If N_imposed is provided, we scale the problem accordingly
// then we also assume that the user wants to test the kernel.
// We set input[i]=i and check the validity of the result to detect any indexing error
// This fails however for large value of dataSize because of round-off error
// In non-test mode, we simply impose input[i]=1 and check that output[i]=2*radius+1
// dimension problem
const int dataSize = MB * MemSizeArraysMB / sizeof(_TYPE_);
const int dataBytes = dataSize * sizeof(_TYPE_);
// Allocate GPU memory
_TYPE_ *data, *h_data;
// host mem Must be pinned
cudaMallocHost((void **)&h_data, dataBytes);
cudaMalloc((void **)&data, dataBytes);
// Dimension block and grid
int NBLOCKS = (dataSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
std::cout << "\ndataSize, NBLOCKS, BLOCK_SIZE= " << dataSize << " " << NBLOCKS << " " << BLOCK_SIZE << "\n";
// Naive implem
cudaEventRecord(start);
cudaMemcpy(data, h_data, dataBytes, cudaMemcpyHostToDevice);
cuda_kernel<<<NBLOCKS, BLOCK_SIZE>>>(data, 0, dataSize);
cudaMemcpy(h_data, data, dataBytes, cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms_naive;
cudaEventElapsedTime(&ms_naive, start, stop);
std::cout << "cuda naive time: " << ms_naive << "\n";
//try several nStreams
for (int nStreams = 20; nStreams <= 20; nStreams += 1)
{
const int streamSize = dataSize / nStreams;
int streamBytes = streamSize * sizeof(_TYPE_);
cudaStream_t streams[nStreams];
int NBLOCKSstream = (streamSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
for (size_t i = 0; i < nStreams; i++)
{
cudaStreamCreate(&streams[i]);
}
cudaEventRecord(start);
for (size_t i = 0; i < nStreams; i++)
{
const int offset = i * streamSize;
if (offset + streamSize >= dataSize)
{
streamBytes = (dataSize - offset) * sizeof(_TYPE_);
}
// you have to use &. data[offset] is the offset-th element, & data[offset] is the adress of the offset-th element
cudaMemcpyAsync(&data[offset], &h_data[offset], streamBytes, cudaMemcpyHostToDevice, streams[i]);
// 0 shared memory allocated
cuda_kernel<<<NBLOCKSstream, BLOCK_SIZE, 0, streams[i]>>>(data, offset, dataSize);
cudaMemcpyAsync(&h_data[offset], &data[offset], streamBytes, cudaMemcpyDeviceToHost, streams[i]);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms_stream;
cudaEventElapsedTime(&ms_stream, start, stop);
std::cout << "cuda nStreams" << nStreams << " time: " << ms_stream << "\n";
for (size_t i = 0; i < nStreams; i++)
{
cudaStreamDestroy(streams[i]);
};
}
//Only two streams, sharing chunks of ndiv-th of the work
const int ndiv=20;
const int streamSize = dataSize / ndiv;
int streamBytes = streamSize * sizeof(_TYPE_);
cudaStream_t streams[2];
int NBLOCKSstream = (streamSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
cudaStreamCreate(&streams[0]);
cudaStreamCreate(&streams[1]);
cudaEventRecord(start);
for (size_t i = 0; i < ndiv; i++)
{
int stream_id = i%2;
const int offset = i * streamSize;
if (offset + streamSize >= dataSize)
{
streamBytes = (dataSize - offset) * sizeof(_TYPE_);
}
// you have to use &. data[offset] is the offset-th element, & data[offset] is the adress of the offset-th element
cudaMemcpyAsync(&data[offset], &h_data[offset], streamBytes, cudaMemcpyHostToDevice, streams[stream_id]);
// 0 shared memory allocated
cuda_kernel<<<NBLOCKSstream, BLOCK_SIZE, 0, streams[stream_id]>>>(data, offset, dataSize);
cudaMemcpyAsync(&h_data[offset], &data[offset], streamBytes, cudaMemcpyDeviceToHost, streams[stream_id]);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms_stream_2;
cudaEventElapsedTime(&ms_stream_2, start, stop);
std::cout << "cuda two streams, ndiv:" << ndiv << " time: " << ms_stream_2 << "\n";
cudaStreamDestroy(streams[0]);
cudaStreamDestroy(streams[1]);
// Free memory
cudaFreeHost(h_data);
cudaFree(data);
}