-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathCrc32TestMultithreaded.cpp
203 lines (169 loc) · 6.58 KB
/
Crc32TestMultithreaded.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
// //////////////////////////////////////////////////////////
// Crc32TestMultithreaded.cpp
// Copyright (c) 2019 Stephan Brumme. All rights reserved.
// see http://create.stephan-brumme.com/disclaimer.html
//
#include "Crc32.h"
#include <cstdlib>
#include <cstdio>
#include <vector>
#include <string>
// C++11 multithreading
#include <thread>
#include <future>
// //////////////////////////////////////////////////////////
// test code
/// one gigabyte
const size_t NumBytes = 1024*1024*1024;
#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#else
#include <ctime>
#endif
// timing
static double seconds()
{
#if defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER frequency, now;
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter (&now);
return now.QuadPart / double(frequency.QuadPart);
#else
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
return now.tv_sec + now.tv_nsec / 1000000000.0;
#endif
}
// //////////////////////////////////////////////////////////
// run a CRC32 algorithm on multiple threads
typedef uint32_t (*Crc32Algorithm)(const void* data, size_t length, uint32_t previousCrc32);
// compute CRC32 of up to maxBlockSize bytes and start recursively a new thread for excess data
uint32_t asyncCrc32(Crc32Algorithm myCrc32, const void* data, size_t numBytes, size_t maxBlockSize)
{
// last block ?
if (numBytes <= maxBlockSize)
return myCrc32(data, numBytes, 0); // we're done
// compute CRC of the remaining bytes in a separate thread
auto dataLeft = (const char*)data + maxBlockSize;
auto bytesLeft = numBytes - maxBlockSize;
auto remainder = std::async(std::launch::async, asyncCrc32, myCrc32, dataLeft, bytesLeft, maxBlockSize);
// compute CRC of the current block
auto currentCrc = myCrc32(data, maxBlockSize, 0);
// get CRC of the remainder
auto remainderCrc = remainder.get();
// and merge both
return crc32_combine(currentCrc, remainderCrc, bytesLeft);
}
// call: run(crc32_8bytes, data, NumBytes, 8) if you have an octocore CPU
uint32_t run(Crc32Algorithm myCrc32, const void* data, size_t numBytes, size_t numThreads = 0)
{
// run on all cores
if (numThreads == 0)
numThreads = std::thread::hardware_concurrency();
// split data evenly, rounding up
auto defaultBlocksize = (numBytes + numThreads - 1) / numThreads;
return asyncCrc32(myCrc32, data, numBytes, defaultBlocksize);
}
// test original sequential CRC32 algorithm against crc32_combine
bool testCombine(const char* data, size_t maxBytes = 1024)
{
bool ok = true;
for (size_t lengthA = 1; lengthA < maxBytes; lengthA++)
{
// split bytes into two blocks of lengthA and lengthB
auto lengthB = maxBytes - lengthA;
// compute CRC of both blocks
auto crcA = crc32_1byte(data, lengthA);
auto crcB = crc32_1byte(data + lengthA, lengthB);
// CRC of the whole block
auto crcAtOnce = crc32_1byte(data, maxBytes);
// CRC of both blocks in a sequential fashion
auto crcSequential = crc32_1byte(data + lengthA, lengthB, crcA);
// CRC using the new crc32_combine function
auto crcCombined = crc32_combine(crcA, crcB, lengthB);
// check results
if (crcAtOnce != crcSequential || crcAtOnce != crcCombined)
{
printf("FAILED @ %d: %08X %08X %08X %08X %08X\n", lengthA, crcA, crcB, crcAtOnce, crcSequential, crcCombined);
ok = false;
}
}
return ok;
}
int main(int argc, char* argv[])
{
// //////////////////////////////////////////////////////////
printf("Please wait ...\n");
uint32_t randomNumber = 0x27121978;
// initialize
char* data = new char[NumBytes];
for (size_t i = 0; i < NumBytes; i++)
{
data[i] = char(randomNumber & 0xFF);
// simple LCG, see http://en.wikipedia.org/wiki/Linear_congruential_generator
randomNumber = 1664525 * randomNumber + 1013904223;
}
// re-use variables
double startTime, duration;
uint32_t crc;
std::vector<std::future<uint32_t>> futures;
// number of threads: use all cores by default or set number as command-line parameter
auto numThreads = 0;
if (argc == 2)
numThreads = std::stoi(argv[1]);
if (numThreads <= 0)
numThreads = std::thread::hardware_concurrency();
printf("use %d threads:\n", numThreads);
// //////////////////////////////////////////////////////////
// one byte at once
startTime = seconds();
crc = run(crc32_1byte, data, NumBytes, numThreads);
duration = seconds() - startTime;
printf(" 1 byte at once / %d threads: CRC=%08X, %.3fs, %.3f MB/s\n",
numThreads, crc, duration, (NumBytes / (1024*1024)) / duration);
// four bytes at once
startTime = seconds();
crc = run(crc32_4bytes, data, NumBytes, numThreads);
duration = seconds() - startTime;
printf(" 4 bytes at once / %d threads: CRC=%08X, %.3fs, %.3f MB/s\n",
numThreads, crc, duration, (NumBytes / (1024*1024)) / duration);
// eight bytes at once
startTime = seconds();
crc = run(crc32_8bytes, data, NumBytes, numThreads);
duration = seconds() - startTime;
printf(" 8 bytes at once / %d threads: CRC=%08X, %.3fs, %.3f MB/s\n",
numThreads, crc, duration, (NumBytes / (1024*1024)) / duration);
// eight bytes at once, unrolled 4 times (=> 32 bytes per loop)
startTime = seconds();
crc = run(crc32_4x8bytes, data, NumBytes, numThreads);
duration = seconds() - startTime;
printf("4x8 bytes at once / %d threads: CRC=%08X, %.3fs, %.3f MB/s\n",
numThreads, crc, duration, (NumBytes / (1024*1024)) / duration);
// sixteen bytes at once
startTime = seconds();
crc = run(crc32_16bytes, data, NumBytes, numThreads);
duration = seconds() - startTime;
printf(" 16 bytes at once / %d threads: CRC=%08X, %.3fs, %.3f MB/s\n",
numThreads, crc, duration, (NumBytes / (1024*1024)) / duration);
// //////////////////////////////////////////////////////////
// slowly increment number of threads to determine scalability
printf("run slicing-by-8 algorithm with 1 to %d threads:\n", numThreads);
for (auto scaleThreads = 1; scaleThreads <= numThreads; scaleThreads++)
{
// eight bytes at once
startTime = seconds();
if (scaleThreads == 1)
crc = crc32_8bytes (data, NumBytes); // single-threaded
else
crc = run(crc32_8bytes, data, NumBytes, scaleThreads); // multi-threaded
duration = seconds() - startTime;
printf(" 8 bytes at once / %d threads: CRC=%08X, %.3fs, %.3f MB/s\n",
scaleThreads, crc, duration, (NumBytes / (1024*1024)) / duration);
}
// //////////////////////////////////////////////////////////
// verify crc32_combine
if (!testCombine(data, 1024))
printf("ERROR in crc32_combine !!!\n");
delete[] data;
return 0;
}