// streams_kokkos.h
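// NOTE: this header relies on includes and aliases defined elsewhere in the
// project (not shown here): View, HostView, HostPinnedView, policy_t, Device,
// HostPinned and the MB constant, plus <Kokkos_Core.hpp>, <cuda_runtime.h>,
// <iostream>, <typeinfo> and <vector>. As a rough, hypothetical sketch only
// (not the project's actual definitions), they could look like:
//
//   #include <Kokkos_Core.hpp>
//   #include <cuda_runtime.h>
//   #include <iostream>
//   #include <typeinfo>
//   #include <vector>
//
//   constexpr int MB = 1024 * 1024;                  // bytes per MiB
//   using Device     = Kokkos::Cuda;                 // device execution space
//   using HostPinned = Kokkos::CudaHostPinnedSpace;  // pinned host memory space
//   using policy_t   = Kokkos::RangePolicy<Device>;
//   template <class T> using View           = Kokkos::View<T *, Kokkos::CudaSpace>;
//   template <class T> using HostView       = Kokkos::View<T *, Kokkos::HostSpace>;
//   template <class T> using HostPinnedView = Kokkos::View<T *, HostPinned>;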
template <typename _TYPE_>
void kokkos_kernel(const View<_TYPE_> &data, const policy_t &policy)
{
    // Arbitrary per-element arithmetic. The policy decides which indices of
    // `data` are processed, so the same kernel serves both the full-array
    // launches and the chunked, per-instance launches below.
    Kokkos::parallel_for("kokkos_kernel", policy, KOKKOS_LAMBDA(const int idx) {
        const _TYPE_ buff = data(idx);
        const _TYPE_ a = sqrt(buff);
        const _TYPE_ b = cos(a);
        const _TYPE_ c = sin(b);
        data(idx) = c * a * b * buff;
    });
}
template <typename _TYPE_>
void streams_kokkos(const int MemSizeArraysMB)
{
    // CUDA event timers
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    std::cout << "\n ||| KOKKOS Streams (ExecSpaces), dtype = " << typeid(_TYPE_).name() << " |||\n";

    // Problem size (number of elements)
    const int dataSize = MB * MemSizeArraysMB / sizeof(_TYPE_);

    // Allocate Views on the GPU and on the CPU (pageable and pinned)
    auto data = View<_TYPE_>("data", dataSize);
    auto h_data = HostView<_TYPE_>("h_data", dataSize);
    auto hp_data = HostPinnedView<_TYPE_>("hp_data", dataSize);
    // -- Naive implementation: pageable host memory
    auto policy = policy_t(0, dataSize);
    cudaEventRecord(start);
    Kokkos::deep_copy(data, h_data);     // H2D
    kokkos_kernel<_TYPE_>(data, policy); // compute on the full array
    Kokkos::deep_copy(h_data, data);     // D2H
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms_naive_host;
    cudaEventElapsedTime(&ms_naive_host, start, stop);
    std::cout << "kokkos default host: " << ms_naive_host << "\n";
    // -- Naive implementation: pinned host memory
    cudaEventRecord(start);
    Kokkos::deep_copy(data, hp_data);    // H2D from pinned memory
    kokkos_kernel<_TYPE_>(data, policy);
    Kokkos::deep_copy(hp_data, data);    // D2H to pinned memory
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms_naive_host_pinned;
    cudaEventElapsedTime(&ms_naive_host_pinned, start, stop);
    std::cout << "kokkos pinned host: " << ms_naive_host_pinned << "\n";
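    // The blocking deep_copy calls above fence regardless of the host memory
    // kind, but transfers from pinned (page-locked) memory are typically
    // faster because the driver can DMA directly instead of staging the data
    // through an internal pinned buffer. More importantly, only pinned host
    // memory allows the asynchronous copies used in the streamed version
    // below to overlap with kernels running on other instances.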
    // -- Streamed implementation: the work is split into ndiv chunks that are
    //    distributed round-robin over nInstances execution space instances
    //    (i.e. nInstances CUDA streams).
    const int nInstances = 2;
    const int ndiv = 20;

    // Create the execution space instances
    Device device;                           // execution space to partition
    std::vector<int> weights(nInstances, 1); // equal weights
    auto Instances = Kokkos::Experimental::partition_space(device, weights);

    // Declare subviews and policies
    View<_TYPE_> subviews[ndiv];
    HostPinnedView<_TYPE_> hp_subviews[ndiv]; // pinned! otherwise the D2H copy is blocking
    policy_t policies[ndiv];
    const int streamSize = dataSize / ndiv;

    // Initialize the subviews that split the work into ndiv parts (they alias
    // the existing allocations, no new allocation), and the matching policies.
    for (int i = 0; i < ndiv; i++)
    {
        const int instance_id = i % nInstances;
        const int beg = i * streamSize;
        const int end = (i == ndiv - 1) ? dataSize : beg + streamSize; // last chunk absorbs the remainder
        policies[i] = policy_t(Instances[instance_id], beg, end);
        subviews[i] = Kokkos::subview(data, std::make_pair(beg, end));
        hp_subviews[i] = Kokkos::subview(hp_data, std::make_pair(beg, end));
    }
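    // Each chunk's H2D copy, kernel launch, and D2H copy are enqueued on one
    // of the two execution space instances; consecutive chunks alternate
    // instances, so a copy on one CUDA stream can overlap with compute on the
    // other. Note that the kernel receives the full `data` view: the ranged
    // policy [beg, end) restricts which global indices each launch touches.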
    cudaEventRecord(start);
    for (int i = 0; i < ndiv; i++)
    {
        const int instance_id = i % nInstances;
        Kokkos::deep_copy(Instances[instance_id], subviews[i], hp_subviews[i]); // async H2D on this instance
        kokkos_kernel<_TYPE_>(data, policies[i]);                               // kernel on the same instance (via the policy)
        Kokkos::deep_copy(Instances[instance_id], hp_subviews[i], subviews[i]); // async D2H on this instance
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms_stream;
    cudaEventElapsedTime(&ms_stream, start, stop);
    std::cout << "kokkos nInstances = " << nInstances << " time: " << ms_stream << "\n";
    std::cout << "NOTE: Concurrency requires pinned memory on the host!\n";

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}
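// Minimal usage sketch (hypothetical, not part of this header): assuming the
// aliases above and a CUDA-enabled Kokkos build, the benchmark could be
// driven from a translation unit such as:
//
//   #include "streams_kokkos.h"
//
//   int main(int argc, char *argv[])
//   {
//       Kokkos::ScopeGuard guard(argc, argv); // initialize/finalize Kokkos
//       streams_kokkos<float>(256);           // 256 MB arrays (example value)
//       streams_kokkos<double>(256);
//       return 0;
//   }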