Skip to content

Commit

Permalink
Release v1.5.0 (#393)
Browse files Browse the repository at this point in the history
# Contents of this release

Examples: multi-gpu example #381
Examples: updates example compares Radix2 and MixedRadix NTTs #383
Feat: add vector operations bindings to Rust #384 
Examples: update examples with new vec ops #388 
Feat: Grumpkin curve implementation #379 
Feat: mixed-radix NTT fast twiddles mode #382 
Docs: Update README.md #385 #387 
README: Update Hall of Fame section #394 
Examples: add rust poseidon example #392 
Feat: GoLang bindings for v1.x #386
  • Loading branch information
jeremyfelder authored Feb 23, 2024
2 parents fc6badc + e8cd2d7 commit e603569
Show file tree
Hide file tree
Showing 281 changed files with 23,881 additions and 11,322 deletions.
4 changes: 3 additions & 1 deletion .github/changed-files.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
golang:
- goicicle/**/*.go'
- wrappers/golang/**/*.go'
- wrappers/golang/**/*.h'
- wrappers/golang/**/*.tmpl'
- go.mod
rust:
- wrappers/rust
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ concurrency:

jobs:
test-examples:
runs-on: [self-hosted, Linux, X64, icicle] # ubuntu-latest
runs-on: [self-hosted, Linux, X64, icicle, examples]
steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down
28 changes: 16 additions & 12 deletions .github/workflows/main-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,18 +80,22 @@ jobs:
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose

# TODO: Re-enable once Golang bindings for v1+ is finished
# build-golang-linux:
# name: Build Golang on Linux
# runs-on: [self-hosted, Linux, X64, icicle]
# needs: check-changed-files
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v3
# - name: Build CUDA libs
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# run: make all
# working-directory: ./goicicle
build-golang-linux:
name: Build Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
working-directory: ./wrappers/golang
run: |
export CPATH=$CPATH:/usr/local/cuda/include
./build.sh ${{ matrix.curve }} ON
# TODO: Add once Golang make file supports building for Windows
# build-golang-windows:
Expand Down
39 changes: 22 additions & 17 deletions .github/workflows/main-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,20 +75,25 @@ jobs:
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest

# TODO: Re-enable once Golang bindings for v1+ is finished
# test-golang-linux:
# name: Test Golang on Linux
# runs-on: [self-hosted, Linux, X64, icicle]
# needs: check-changed-files
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v3
# - name: Build CUDA libs
# working-directory: ./goicicle
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# run: make libbn254.so
# - name: Run Golang Tests
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# run: |
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/goicicle
# go test ./goicicle/curves/bn254 -count=1
test-golang-linux:
name: Test Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
# strategy:
# matrix:
# curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# builds all curves with g2 ON
run: |
export CPATH=$CPATH:/usr/local/cuda/include
./build.sh all ON
- name: Run Golang Tests
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
export CPATH=$CPATH:/usr/local/cuda/include
go test --tags=g2 ./... -count=1 -timeout 60m
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ This will ensure our custom hooks are run and will make it easier to follow our
- [Robik](https://github.com/robik75), for his ongoing support and mentorship
- [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE

## Help & Support

Expand Down Expand Up @@ -142,10 +143,10 @@ See [LICENSE-MIT][LMIT] for details.
[GRANT_PROGRAM]: https://medium.com/@ingonyama/icicle-for-researchers-grants-challenges-9be1f040998e
[ICICLE-CORE]: ./icicle/
[ICICLE-RUST]: ./wrappers/rust/
[ICICLE-GO]: ./goicicle/
[ICICLE-GO]: ./wrappers/golang/
[ICICLE-CORE-README]: ./icicle/README.md
[ICICLE-RUST-README]: ./wrappers/rust/README.md
[ICICLE-GO-README]: ./goicicle/README.md
[ICICLE-GO-README]: ./wrappers/golang/README.md
[documentation]: https://dev.ingonyama.com/icicle/overview
[examples]: ./examples/

Expand Down
25 changes: 25 additions & 0 deletions examples/c++/multi-gpu-poseidon/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Build script for the multi-GPU Poseidon example: compiles example.cu into the
# `example` executable and links it against the NVML library (nvidia-ml).
cmake_minimum_required(VERSION 3.18)
# Require C++17 for both host and CUDA code, with no fallback to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
# Must be set before project() enables the CUDA language.
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
# CUDA_ARCH is not defined in this file - presumably passed by the caller (e.g. -DCUDA_ARCH=...); TODO confirm
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)

# Appended to whatever CUDA flags were already supplied by the caller/environment.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
# Replaces the Release-configuration CUDA flags with an empty string.
set(CMAKE_CUDA_FLAGS_RELEASE "")
# Debug configuration: host (-g) and device (-G) debug info, optimizations disabled.
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
# Looks for nvidia-ml, additionally searching the CUDA toolkit stubs directory.
# NOTE(review): the result is not checked here; if the library is not found,
# NVML_LIBRARY is left as NOTFOUND when it reaches target_link_libraries.
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

52 changes: 52 additions & 0 deletions examples/c++/multi-gpu-poseidon/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Icicle example: using multiple GPUs to hash a large dataset

## Best-Practices

This example builds on the [single-GPU Poseidon example](../poseidon/README.md), so we recommend running that one first.

## Key-Takeaway

Use a `device_context::DeviceContext` variable to select which GPU to use.
Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.

## Concise Usage Explanation

1. Include c++ threads

```c++
#include <thread>
```

2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.

```c++
void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
```
3. Initialize device contexts for different GPUs
```c++
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;
```

4. Finally, spawn the threads and wait for their completion

```c++
std::thread thread0(threadPoseidon, ctx0, ...);
std::thread thread1(threadPoseidon, ctx1, ...);
thread0.join();
thread1.join();
```
## What's in the example
This is a **toy** example executing the first step of Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseidon hashes, one for each column of an $11 \times 2^{30}$ matrix.
1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions` partitions
2. Hash two partitions in parallel on two GPUs
3. Hash two partitions in series on one GPU
4. Compare execution times
9 changes: 9 additions & 0 deletions examples/c++/multi-gpu-poseidon/compile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

# Configures and builds the example from scratch into ./build.
# Stop at the first command that fails.
set -o errexit

build_dir=build

# Always start from an empty build directory.
rm -rf "${build_dir}"
mkdir -p "${build_dir}"

# Configure, then compile.
cmake -S . -B "${build_dir}"
cmake --build "${build_dir}"
148 changes: 148 additions & 0 deletions examples/c++/multi-gpu-poseidon/example.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <thread>

#include <nvml.h>

// select the curve
#define CURVE_ID 2
#include "appUtils/poseidon/poseidon.cu"
#include "utils/error_handler.cuh"

using namespace poseidon;
using namespace curve_config;

// Reports a failed CUDA call on stderr. Reporting only: the function returns
// normally in every case, so callers keep running after an error.
void checkCudaError(cudaError_t error) {
  if (error == cudaSuccess) {
    return;
  }
  // Handle the error here (e.g. exit the program or throw) if reporting is not enough.
  std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
}

// Number of scalars per hashed column. It is used as a template argument
// (poseidon_hash<scalar_t, size_col+1>), so it has to be a compile-time
// constant; it also sizes the per-partition `layers` buffers in main().
const int size_col = 11;

// this function executes the Poseidon thread
// Thread entry point: hashes one partition on the GPU named by ctx.device_id.
//   ctx            - device context; its device_id selects the GPU for this thread
//   size_partition - number of hashes to compute
//   layers         - input scalars (host memory, per the config below)
//   column_hashes  - output buffer for the hashes (host memory, per the config below)
//   constants      - Poseidon constants to use for this call
// Failures are printed to stderr; nothing is returned to the spawning thread.
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
  // CHK_IF_RETURN() cannot be used in a standard thread function, so the status is checked by hand.
  const cudaError_t set_device_status = CHK_STICKY(cudaSetDevice(ctx.device_id));
  if (set_device_status != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(set_device_status) << std::endl;
    return;
  }

  PoseidonConfig hash_config = {
    ctx,   // ctx
    false, // are_inputs_on_device
    false, // are_outputs_on_device
    false, // input_is_a_state
    false, // aligned
    false, // loop_state
    false, // is_async
  };

  const cudaError_t hash_status = poseidon_hash<scalar_t, size_col+1>(
    layers, column_hashes, static_cast<size_t>(size_partition), *constants, hash_config);
  checkCudaError(hash_status);
}

// Interval-timing helpers.
//   START_TIMER(name)    - records the current time in a local named <name>_start.
//   END_TIMER(name, msg) - prints "<msg>: <elapsed> ms" for the matching START_TIMER.
// Both macros read std::chrono::steady_clock: it is monotonic, whereas
// high_resolution_clock may be an alias of the adjustable system clock.
// The trailing semicolons are kept so existing call sites are unaffected.
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::steady_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::steady_clock::now() - timer##_start).count());


// Terminates the program with EXIT_FAILURE if `ptr` is null, after naming the
// failed expression on stderr. Wrapped in do { } while (0) so the macro acts
// as a single statement and stays safe inside an unbraced if/else.
#define CHECK_ALLOC(ptr) \
  do { \
    if ((ptr) == nullptr) { \
      std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
      exit(EXIT_FAILURE); \
    } \
  } while (0)

int main() {
const unsigned size_row = (1<<30);
const unsigned nof_partitions = 64;
const unsigned size_partition = size_row / nof_partitions;
// layers is allocated only for one partition, need to reuse for different partitions
const uint32_t size_layers = size_col * size_partition;

nvmlInit();
unsigned int deviceCount;
nvmlDeviceGetCount(&deviceCount);
std::cout << "Available GPUs: " << deviceCount << std::endl;

for (unsigned int i = 0; i < deviceCount; ++i) {
nvmlDevice_t device;
nvmlMemory_t memory;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
nvmlDeviceGetMemoryInfo(device, &memory);
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl;
}

const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
std::cout << "Required Memory (MiB) " << memory_partition << std::endl;

//===============================================================================
// Key: multiple devices are supported by device context
//===============================================================================

device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;

std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers0);
scalar_t s = scalar_t::zero();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers0[i] = s;
s = s + scalar_t::one();
}
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers1);
s = scalar_t::zero() + scalar_t::one();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers1[i] = s;
s = s + scalar_t::one();
}

scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash0);
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);

PoseidonConstants<scalar_t> column_constants0, column_constants1;
init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);

std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);

// Wait for the threads to finish
thread0.join();
thread1.join();
END_TIMER(parallel,"2 GPUs");
std::cout << "Output Data from Thread 0: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 1: ";
std::cout << column_hash1[0] << std::endl;

std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
thread3.join();
END_TIMER(sequential,"1 GPU");
std::cout << "Output Data from Thread 2: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 3: ";
std::cout << column_hash1[0] << std::endl;

nvmlShutdown();
return 0;
}
2 changes: 2 additions & 0 deletions examples/c++/multi-gpu-poseidon/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Runs the example binary at ./build/example. The path is relative, so invoke
# this script from the directory that contains the build/ folder.
./build/example
12 changes: 6 additions & 6 deletions examples/c++/multiply/example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@

using namespace curve_config;

// select scalar or point field
//typedef scalar_t T;
typedef point_field_t T;
typedef scalar_t T;

int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
{
const bool is_on_device = true;
const bool is_montgomery = false;
cudaError_t err = vec_ops::Mul<T,T>(vec_a, vec_b, n_elments, is_on_device, is_montgomery, ctx, vec_result);
vec_ops::VecOpsConfig<scalar_t> config = vec_ops::DefaultVecOpsConfig<scalar_t>();
config.is_a_on_device = true;
config.is_b_on_device = true;
config.is_result_on_device = true;
cudaError_t err = vec_ops::Mul<T>(vec_a, vec_b, n_elments, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
Expand Down
Loading

0 comments on commit e603569

Please sign in to comment.