Release v1.5.0 (#393)

# Contents of this release Examples: multi-gpu example #381 Examples: updates example compares Radix2 and MixedRadix NTTs #383 Feat: add vector operations bindings to Rust #384 Examples: update examples with new vec ops #388 Feat: Grumpkin curve implementation #379 Feat: mixed-radix NTT fast twiddles mode #382 Docs: Update README.md #385 #387 README: Update Hall of Fame section #394 Examples: add rust poseidon example #392 Feat: GoLang bindings for v1.x #386
ingonyama-zk · Feb 23, 2024 · e603569 · e603569
2 parents fc6badc + e8cd2d7
commit e603569
Show file tree

Hide file tree

Showing 281 changed files with 23,881 additions and 11,322 deletions.
diff --git a/.github/changed-files.yml b/.github/changed-files.yml
@@ -1,5 +1,7 @@
 golang:
-  - goicicle/**/*.go'
+  # Go sources, C headers and templates of the Go wrapper (no trailing quote: it would become part of the glob)
+  - wrappers/golang/**/*.go
+  - wrappers/golang/**/*.h
+  - wrappers/golang/**/*.tmpl
   - go.mod
 rust:
   - wrappers/rust

diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml
@@ -23,7 +23,7 @@ concurrency:
 
 jobs:  
   test-examples:
-    runs-on: [self-hosted, Linux, X64, icicle] # ubuntu-latest
+    runs-on: [self-hosted, Linux, X64, icicle, examples]
     steps:
     - name: Checkout
       uses: actions/checkout@v2

diff --git a/.github/workflows/main-build.yml b/.github/workflows/main-build.yml
@@ -80,18 +80,22 @@ jobs:
       # Building from the root workspace will build all members of the workspace by default
       run: cargo build --release --verbose
 
-  # TODO: Re-enable once Golang bindings for v1+ is finished
-  # build-golang-linux:
-  #   name: Build Golang on Linux
-  #   runs-on: [self-hosted, Linux, X64, icicle]
-  #   needs: check-changed-files
-  #   steps:
-  #   - name: Checkout Repo
-  #     uses: actions/checkout@v3
-  #   - name: Build CUDA libs
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     run: make all
-  #     working-directory: ./goicicle
+  build-golang-linux:
+    name: Build Golang on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    strategy:
+      matrix:
+        curve: [bn254, bls12_381, bls12_377, bw6_761]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build CUDA libs
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      working-directory: ./wrappers/golang
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        ./build.sh ${{ matrix.curve }} ON
 
   # TODO: Add once Golang make file supports building for Windows
   # build-golang-windows:

diff --git a/.github/workflows/main-test.yml b/.github/workflows/main-test.yml
@@ -75,20 +75,25 @@ jobs:
       if: needs.check-changed-files.outputs.cpp_cuda == 'true'
       run: ctest
 
-  # TODO: Re-enable once Golang bindings for v1+ is finished
-  # test-golang-linux:
-  #   name: Test Golang on Linux
-  #   runs-on: [self-hosted, Linux, X64, icicle]
-  #   needs: check-changed-files
-  #   steps:
-  #   - name: Checkout Repo
-  #     uses: actions/checkout@v3
-  #   - name: Build CUDA libs
-  #     working-directory: ./goicicle
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     run: make libbn254.so
-  #   - name: Run Golang Tests
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     run: |
-  #       export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/goicicle
-  #       go test ./goicicle/curves/bn254 -count=1
+  test-golang-linux:
+    name: Test Golang on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    # strategy:
+    #   matrix:
+    #     curve: [bn254, bls12_381, bls12_377, bw6_761]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build CUDA libs
+      working-directory: ./wrappers/golang
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      # builds all curves with g2 ON
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        ./build.sh all ON
+    - name: Run Golang Tests
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        go test --tags=g2 ./... -count=1 -timeout 60m
diff --git a/README.md b/README.md
@@ -114,6 +114,7 @@ This will ensure our custom hooks are run and will make it easier to follow our
 - [Robik](https://github.com/robik75), for his ongoing support and mentorship
 - [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
 - [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
+- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
 
 ## Help & Support
 
@@ -142,10 +143,10 @@ See [LICENSE-MIT][LMIT] for details.
 [GRANT_PROGRAM]: https://medium.com/@ingonyama/icicle-for-researchers-grants-challenges-9be1f040998e
 [ICICLE-CORE]: ./icicle/
 [ICICLE-RUST]: ./wrappers/rust/
-[ICICLE-GO]: ./goicicle/
+[ICICLE-GO]: ./wrappers/golang/
 [ICICLE-CORE-README]: ./icicle/README.md
 [ICICLE-RUST-README]: ./wrappers/rust/README.md
-[ICICLE-GO-README]: ./goicicle/README.md
+[ICICLE-GO-README]: ./wrappers/golang/README.md
 [documentation]: https://dev.ingonyama.com/icicle/overview
 [examples]: ./examples/
 

diff --git a/examples/c++/multi-gpu-poseidon/CMakeLists.txt b/examples/c++/multi-gpu-poseidon/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) # CUDA_ARCH is not defined in this file; presumably supplied by the caller (e.g. -DCUDA_ARCH=...)
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(icicle LANGUAGES CUDA CXX)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS_RELEASE "") # clears the CUDA flags that are specific to Release builds
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+# change the path to your Icicle location
+include_directories("../../../icicle")
+add_executable(
+  example
+  example.cu
+)
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ ) # locate the NVML library, also searching the CUDA toolkit stubs directory
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
diff --git a/examples/c++/multi-gpu-poseidon/README.md b/examples/c++/multi-gpu-poseidon/README.md
@@ -0,0 +1,52 @@
+# Icicle example: using multiple GPUs to hash a large dataset
+
+## Best-Practices
+
+This example builds on the [single GPU Poseidon example](../poseidon/README.md), so we recommend running that one first.
+
+## Key-Takeaway
+
+Use a `device_context::DeviceContext` variable to select which GPU to use.
+Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.
+
+## Concise Usage Explanation
+
+1. Include c++ threads
+
+```c++
+#include <thread>
+```
+
+2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.
+
+```c++
+void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
+```
+
+3. Initialize device contexts for different GPUs
+
+```c++
+device_context::DeviceContext ctx0 = device_context::get_default_device_context();
+ctx0.device_id=0;
+device_context::DeviceContext ctx1 = device_context::get_default_device_context();
+ctx1.device_id=1;
+``` 
+
+4. Finally, spawn the threads and wait for their completion
+
+```c++
+std::thread thread0(threadPoseidon, ctx0, ...);
+std::thread thread1(threadPoseidon, ctx1, ...);
+thread0.join();
+thread1.join();
+```
+
+## What's in the example
+
+This is a **toy** example executing the first step of Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseidon hashes for each column of an $11 \times 2^{30}$ matrix.
+
+1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions`
+2. Hash two partitions in parallel on two GPUs
+3. Hash two partitions in series on one GPU
+4. Compare execution times
+
diff --git a/examples/c++/multi-gpu-poseidon/compile.sh b/examples/c++/multi-gpu-poseidon/compile.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build # discard any previous build tree so the configuration starts clean
+mkdir -p build
+cmake -S . -B build # configure: sources in the current directory, build files in ./build
+cmake --build build # compile with the build system generated above
diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
@@ -0,0 +1,148 @@
+#include <iostream>
+#include <thread>
+#include <chrono>
+
+#include <nvml.h>
+
+// select the curve
+#define CURVE_ID 2
+#include "appUtils/poseidon/poseidon.cu"
+#include "utils/error_handler.cuh"
+
+using namespace poseidon;
+using namespace curve_config;
+
+// Reports a failed CUDA status on stderr; a successful status is ignored.
+// The error is only logged: the function returns normally either way.
+void checkCudaError(cudaError_t error) {
+    if (error == cudaSuccess) return;
+    std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
+}
+
+// compile-time constant: used as size_col+1 in the poseidon_hash template argument and to size the input buffers
+const int size_col = 11;
+
+// Worker meant to run on its own host thread: makes the GPU named by
+// ctx.device_id current for the calling thread, then calls poseidon_hash on
+// `layers` with `size_partition` as the count, writing into `column_hashes`.
+// Failures are printed to stderr; nothing is returned to the caller.
+void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
+    cudaError_t set_device_status = CHK_STICKY(cudaSetDevice(ctx.device_id));
+    if (set_device_status != cudaSuccess) {
+        // CHK_IF_RETURN() cannot be used here: a thread function returns void
+        checkCudaError(set_device_status);
+        return;
+    }
+    // all flags are false: host-side inputs and outputs, synchronous call
+    PoseidonConfig config{
+        ctx,   // ctx
+        false, // are_inputs_on_device
+        false, // are_outputs_on_device
+        false, // input_is_a_state
+        false, // aligned
+        false, // loop_state
+        false, // is_async
+    };
+    checkCudaError(poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, config));
+}
+
+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+
+#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
+    std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
+    exit(EXIT_FAILURE); \
+}
+
+// Entry point of the multi-GPU Poseidon example.
+//
+// Lists the GPUs reported by NVML, builds two host-side input partitions,
+// hashes them concurrently on device 0 and device 1 (one std::thread each),
+// then hashes the same two partitions one after the other on device 0 and
+// prints the wall-clock time of both runs.
+//
+// Returns 0 on success and EXIT_FAILURE when device 1 cannot be selected.
+// A failed host allocation terminates the program through CHECK_ALLOC.
+int main() {
+    const unsigned size_row = (1<<30);
+    const unsigned nof_partitions = 64;
+    const unsigned size_partition = size_row / nof_partitions;
+    // layers is allocated only for one partition, need to reuse for different partitions
+    const uint32_t size_layers = size_col * size_partition;
+
+    nvmlInit();
+    // zero-initialized so that no device is enumerated if the NVML query does not fill it in
+    unsigned int deviceCount = 0;
+    nvmlDeviceGetCount(&deviceCount);
+    std::cout << "Available GPUs: " << deviceCount << std::endl;
+
+    for (unsigned int i = 0; i < deviceCount; ++i) {
+        nvmlDevice_t device;
+        nvmlMemory_t memory;
+        char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+        nvmlDeviceGetHandleByIndex(i, &device);
+        nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
+        nvmlDeviceGetMemoryInfo(device, &memory);
+        std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/"  << memory.free/1024/1024 << std::endl;
+    }
+
+    const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
+    std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
+
+    //===============================================================================
+    // Key: multiple devices are supported by device context
+    //===============================================================================
+
+    device_context::DeviceContext ctx0 = device_context::get_default_device_context();
+    ctx0.device_id=0;
+    device_context::DeviceContext ctx1 = device_context::get_default_device_context();
+    ctx1.device_id=1;
+
+    // writes `count` values into `data`: `value`, then each previous value plus scalar_t::one()
+    auto fill_sequence = [](scalar_t* data, uint32_t count, scalar_t value) {
+        for (uint32_t i = 0; i < count; i++) {
+            data[i] = value;
+            value = value + scalar_t::one();
+        }
+    };
+
+    std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
+    scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    CHECK_ALLOC(layers0);
+    fill_sequence(layers0, size_layers, scalar_t::zero());
+    scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    CHECK_ALLOC(layers1);
+    // the second partition starts one step later than the first
+    fill_sequence(layers1, size_layers, scalar_t::zero() + scalar_t::one());
+
+    scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+    CHECK_ALLOC(column_hash0);
+    scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+    CHECK_ALLOC(column_hash1);
+
+    // frees the host buffers and shuts NVML down; shared by the error path and the normal exit
+    auto release_resources = [&]() {
+        free(column_hash1);
+        free(column_hash0);
+        free(layers1);
+        free(layers0);
+        nvmlShutdown();
+    };
+
+    PoseidonConstants<scalar_t> column_constants0, column_constants1;
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
+    // device 1 is made current before the constants for ctx1 are initialized
+    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx1.device_id));
+    if (err_result != cudaSuccess) {
+        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
+        release_resources();
+        return EXIT_FAILURE;
+    }
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
+
+    std::cout << "Parallel execution of Poseidon threads" << std::endl;
+    START_TIMER(parallel);
+    std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+    std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
+
+    // Wait for the threads to finish
+    thread0.join();
+    thread1.join();
+    END_TIMER(parallel,"2 GPUs");
+    std::cout << "Output Data from Thread 0: ";
+    std::cout << column_hash0[0] << std::endl;
+    std::cout << "Output Data from Thread 1: ";
+    std::cout << column_hash1[0] << std::endl;
+
+    // same two partitions, one after the other, both on device 0 with its constants
+    std::cout << "Sequential execution of Poseidon threads" << std::endl;
+    START_TIMER(sequential);
+    std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+    thread2.join();
+    std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
+    thread3.join();
+    END_TIMER(sequential,"1 GPU");
+    std::cout << "Output Data from Thread 2: ";
+    std::cout << column_hash0[0] << std::endl;
+    std::cout << "Output Data from Thread 3: ";
+    std::cout << column_hash1[0] << std::endl;
+
+    release_resources();
+    return 0;
+}
diff --git a/examples/c++/multi-gpu-poseidon/run.sh b/examples/c++/multi-gpu-poseidon/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example # run the example binary from the ./build directory
diff --git a/examples/c++/multiply/example.cu b/examples/c++/multiply/example.cu
@@ -10,15 +10,15 @@
 
 using namespace curve_config;
 
-// select scalar or point field
-//typedef scalar_t T;
-typedef point_field_t T;
+typedef scalar_t T;
 
 int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
 {
-  const bool is_on_device = true;
-  const bool is_montgomery = false;
-  cudaError_t err =  vec_ops::Mul<T,T>(vec_a, vec_b, n_elments, is_on_device, is_montgomery, ctx, vec_result);
+  vec_ops::VecOpsConfig<scalar_t> config = vec_ops::DefaultVecOpsConfig<scalar_t>();
+  config.is_a_on_device = true;
+  config.is_b_on_device = true;
+  config.is_result_on_device = true;
+  cudaError_t err =  vec_ops::Mul<T>(vec_a, vec_b, n_elments, config, vec_result);
   if (err != cudaSuccess) {
     std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
     return 0;