-
Notifications
You must be signed in to change notification settings - Fork 104
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
# Contents of this release Examples: multi-gpu example #381 Examples: updates example compares Radix2 and MixedRadix NTTs #383 Feat: add vector operations bindings to Rust #384 Examples: update examples with new vec ops #388 Feat: Grumpkin curve implementation #379 Feat: mixed-radix NTT fast twiddles mode #382 Docs: Update README.md #385 #387 README: Update Hall of Fame section #394 Examples: add rust poseidon example #392 Feat: GoLang bindings for v1.x #386
- Loading branch information
Showing
281 changed files
with
23,881 additions
and
11,322 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Build configuration for the Icicle multi-GPU Poseidon example.
cmake_minimum_required(VERSION 3.18)
# The Icicle headers require C++17 on both the host and device compilers.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
  # Pre-3.24 CMake has no "native" keyword: take the target architecture
  # from the user-supplied CUDA_ARCH cache variable instead.
  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
  set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)

# --expt-relaxed-constexpr lets device code call constexpr host functions (nvcc flag).
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
# Release flags intentionally cleared; debug builds add device debug info (-G) and disable optimization.
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
  example
  example.cu
)
# NVML is used for GPU enumeration; link against the CUDA stub library.
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Icicle example: using multiple GPU to hash large dataset | ||
|
||
## Best-Practices | ||
|
||
This example builds on the [single GPU Poseidon example](../poseidon/README.md), so we recommend running it first.
|
||
## Key-Takeaway | ||
|
||
Use `device_context::DeviceContext` variable to select GPU to use. | ||
Use C++ threads to compute `Icicle` primitives on different GPUs in parallel. | ||
|
||
## Concise Usage Explanation | ||
|
||
1. Include c++ threads | ||
|
||
```c++ | ||
#include <thread> | ||
``` | ||
|
||
2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id. | ||
|
||
```c++ | ||
void threadPoseidon(device_context::DeviceContext ctx, ...) {...} | ||
``` | ||
3. Initialize device contexts for different GPUs | ||
```c++ | ||
device_context::DeviceContext ctx0 = device_context::get_default_device_context(); | ||
ctx0.device_id=0; | ||
device_context::DeviceContext ctx1 = device_context::get_default_device_context(); | ||
ctx1.device_id=1; | ||
``` | ||
|
||
4. Finally, spawn the threads and wait for their completion | ||
|
||
```c++ | ||
std::thread thread0(threadPoseidon, ctx0, ...); | ||
std::thread thread1(threadPoseidon, ctx1, ...); | ||
thread0.join(); | ||
thread1.join(); | ||
``` | ||
## What's in the example | ||
This is a **toy** example executing the first step of Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseidon hashes for each column of an $11 \times 2^{30}$ matrix.
1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions` | ||
2. Hash two partitions in parallel on two GPUs | ||
3. Hash two partitions in series on one GPU | ||
4. Compare execution times | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/bash

# Abort on the first command that fails.
set -e

# Start from a clean build tree, then configure and build with CMake.
BUILD_DIR=build
rm -rf "${BUILD_DIR}"
mkdir -p "${BUILD_DIR}"
cmake -S . -B "${BUILD_DIR}"
cmake --build "${BUILD_DIR}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <thread>

#include <nvml.h>

// select the curve
#define CURVE_ID 2
#include "appUtils/poseidon/poseidon.cu"
#include "utils/error_handler.cuh"
|
||
using namespace poseidon; | ||
using namespace curve_config; | ||
|
||
void checkCudaError(cudaError_t error) { | ||
if (error != cudaSuccess) { | ||
std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl; | ||
// Handle the error, e.g., exit the program or throw an exception. | ||
} | ||
} | ||
|
||
// these global constants go into template calls
// Column height of the input matrix (presumably the 11-row column arity of
// Filecoin's PC2 column hashing — confirm against the README).
const int size_col = 11;
|
||
// Thread entry point: runs one Poseidon hash batch on the GPU selected by
// ctx.device_id. Each std::thread receives its own context so two GPUs can
// work in parallel.
//   ctx            - device context; ctx.device_id selects the GPU
//   size_partition - number of column hashes computed by this call
//   layers         - host input buffer (size_col * size_partition scalars)
//   column_hashes  - host output buffer (size_partition scalars)
//   constants      - Poseidon round constants pre-initialized for this GPU
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
  // Bind this thread to the requested GPU before any CUDA work.
  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
  if (err_result != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
    return;
  }
  // CHK_IF_RETURN(); I can't use it in a standard thread function
  // NOTE(review): the field comments below mirror the assumed member order of
  // PoseidonConfig — confirm against the library's declaration if it changes.
  PoseidonConfig column_config = {
    ctx,   // ctx
    false, // are_inputs_on_device
    false, // are_outputs_on_device
    false, // input_is_a_state
    false, // aligned
    false, // loop_state
    false, // is_async
  };
  // Template arity is size_col+1 — presumably input width plus one state
  // element; TODO confirm against poseidon_hash's documentation.
  cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
  checkCudaError(err);
}
|
||
// Wall-clock timing helpers: START_TIMER(t) records a start point named
// t_start; END_TIMER(t, msg) prints the elapsed milliseconds labeled msg.
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());

// Abort with a clear message when a host allocation fails.
#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
  std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
  exit(EXIT_FAILURE); \
}
|
||
int main() { | ||
const unsigned size_row = (1<<30); | ||
const unsigned nof_partitions = 64; | ||
const unsigned size_partition = size_row / nof_partitions; | ||
// layers is allocated only for one partition, need to reuse for different partitions | ||
const uint32_t size_layers = size_col * size_partition; | ||
|
||
nvmlInit(); | ||
unsigned int deviceCount; | ||
nvmlDeviceGetCount(&deviceCount); | ||
std::cout << "Available GPUs: " << deviceCount << std::endl; | ||
|
||
for (unsigned int i = 0; i < deviceCount; ++i) { | ||
nvmlDevice_t device; | ||
nvmlMemory_t memory; | ||
char name[NVML_DEVICE_NAME_BUFFER_SIZE]; | ||
nvmlDeviceGetHandleByIndex(i, &device); | ||
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); | ||
nvmlDeviceGetMemoryInfo(device, &memory); | ||
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl; | ||
} | ||
|
||
const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024; | ||
std::cout << "Required Memory (MiB) " << memory_partition << std::endl; | ||
|
||
//=============================================================================== | ||
// Key: multiple devices are supported by device context | ||
//=============================================================================== | ||
|
||
device_context::DeviceContext ctx0 = device_context::get_default_device_context(); | ||
ctx0.device_id=0; | ||
device_context::DeviceContext ctx1 = device_context::get_default_device_context(); | ||
ctx1.device_id=1; | ||
|
||
std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl; | ||
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t))); | ||
CHECK_ALLOC(layers0); | ||
scalar_t s = scalar_t::zero(); | ||
for (unsigned i = 0; i < size_col*size_partition ; i++) { | ||
layers0[i] = s; | ||
s = s + scalar_t::one(); | ||
} | ||
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t))); | ||
CHECK_ALLOC(layers1); | ||
s = scalar_t::zero() + scalar_t::one(); | ||
for (unsigned i = 0; i < size_col*size_partition ; i++) { | ||
layers1[i] = s; | ||
s = s + scalar_t::one(); | ||
} | ||
|
||
scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t))); | ||
CHECK_ALLOC(column_hash0); | ||
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t))); | ||
CHECK_ALLOC(column_hash1); | ||
|
||
PoseidonConstants<scalar_t> column_constants0, column_constants1; | ||
init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0); | ||
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id)); | ||
if (err_result != cudaSuccess) { | ||
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl; | ||
return; | ||
} | ||
init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1); | ||
|
||
std::cout << "Parallel execution of Poseidon threads" << std::endl; | ||
START_TIMER(parallel); | ||
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0); | ||
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1); | ||
|
||
// Wait for the threads to finish | ||
thread0.join(); | ||
thread1.join(); | ||
END_TIMER(parallel,"2 GPUs"); | ||
std::cout << "Output Data from Thread 0: "; | ||
std::cout << column_hash0[0] << std::endl; | ||
std::cout << "Output Data from Thread 1: "; | ||
std::cout << column_hash1[0] << std::endl; | ||
|
||
std::cout << "Sequential execution of Poseidon threads" << std::endl; | ||
START_TIMER(sequential); | ||
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0); | ||
thread2.join(); | ||
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0); | ||
thread3.join(); | ||
END_TIMER(sequential,"1 GPU"); | ||
std::cout << "Output Data from Thread 2: "; | ||
std::cout << column_hash0[0] << std::endl; | ||
std::cout << "Output Data from Thread 3: "; | ||
std::cout << column_hash1[0] << std::endl; | ||
|
||
nvmlShutdown(); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/bin/bash
# Run the example binary produced by compile.sh.
./build/example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.