Insert MSM and FFT code and their benchmarks. (#86)

* Insert MSM and FFT code and their benchmarks. Resolves taikoxyz/zkevm-circuits#150. * feedback * Add instructions * feedback * Implement feedback: Actually supply the correct arguments to `best_multiexp`. Split into `singlecore` and `multicore` benchmarks so Criterion's result caching and comparison over multiple runs makes sense. Rewrite point and scalar generation. * Use slicing and parallelism to decrease running time. Laptop measurements: k=22: 109 sec k=16: 1 sec * Refactor msm * Refactor fft * Update module comments * Fix formatting * Implement suggestion for fixing CI
privacy-scaling-explorations · Sep 22, 2023 · ee7cb86 · ee7cb86
1 parent 2f3e388
commit ee7cb86
Show file tree

Hide file tree

Showing 7 changed files with 491 additions and 1 deletion.
diff --git a/Cargo.toml b/Cargo.toml
@@ -33,9 +33,11 @@ serde = { version = "1.0", default-features = false, optional = true }
 serde_arrays = { version = "0.1.0", optional = true }
 hex = { version = "0.4", optional = true, default-features = false, features = ["alloc", "serde"] }
 blake2b_simd = "1"
+maybe-rayon = { version = "0.1.0", default-features = false }
 
 [features]
-default = ["reexport", "bits"]
+default = ["reexport", "bits", "multicore"]
+multicore = ["maybe-rayon/threads"]
 asm = []
 bits = ["ff/bits"]
 bn256-table = []
@@ -69,3 +71,12 @@ harness = false
 [[bench]]
 name = "hash_to_curve"
 harness = false
+
+[[bench]]
+name = "fft"
+harness = false
+
+[[bench]]
+name = "msm"
+harness = false
+required-features = ["multicore"]
diff --git a/benches/fft.rs b/benches/fft.rs
@@ -0,0 +1,57 @@
+//! This benchmarks the Fast Fourier Transform (FFT).
+//! Since it is over a finite field, it is actually the Number Theoretic
+//! Transform (NTT).  It uses the `Fr` scalar field from the BN256 curve.
+//!
+//! To run this benchmark:
+//!
+//!     cargo bench -- fft
+//!
+//! Caveat:  The multicore benchmark assumes:
+//!     1. a multi-core system
+//!     2. that the `multicore` feature is enabled.  It is by default.
+
+#[macro_use]
+extern crate criterion;
+
+use criterion::{BenchmarkId, Criterion};
+use group::ff::Field;
+use halo2curves::bn256::Fr as Scalar;
+use halo2curves::fft::best_fft;
+use rand_core::OsRng;
+use std::ops::Range;
+use std::time::SystemTime;
+
+const RANGE: Range<u32> = 3..19;
+
+/// Builds a vector of `2^k` random `Scalar` values drawn from `OsRng`,
+/// printing a progress line before and the elapsed whole seconds after.
+fn generate_data(k: u32) -> Vec<Scalar> {
+    let n = 1 << k;
+    let started = SystemTime::now();
+    println!("\n\nGenerating 2^{k} = {n} values..");
+    let values = (0..n)
+        .map(|_| Scalar::random(OsRng))
+        .collect::<Vec<Scalar>>();
+    let secs = started.elapsed().unwrap().as_secs();
+    println!("Generating 2^{k} = {n} values took: {} sec.\n\n", secs);
+    values
+}
+
+/// Criterion entry point: generates one data set sized for the largest `k`
+/// in `RANGE`, then benchmarks `best_fft` on the leading `2^k` elements for
+/// every `k` in `RANGE`.
+fn fft(c: &mut Criterion) {
+    let largest_k = RANGE.max().unwrap_or(16);
+    let mut data = generate_data(largest_k);
+    let omega = Scalar::random(OsRng);
+    let mut group = c.benchmark_group("fft");
+    for k in RANGE {
+        let id = BenchmarkId::new("k", k);
+        group.bench_function(id, |bencher| {
+            let len = 1 << k;
+            assert!(len <= data.len());
+            // The same prefix is transformed in place on every iteration.
+            bencher.iter(|| {
+                best_fft(&mut data[..len], omega, k);
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, fft);
+criterion_main!(benches);
diff --git a/benches/msm.rs b/benches/msm.rs
@@ -0,0 +1,116 @@
+//! This benchmarks Multi Scalar Multiplication (MSM).
+//! It measures `G1` from the BN256 curve.
+//!
+//! To run this benchmark:
+//!
+//!     cargo bench -- msm
+//!
+//! Caveat:  The multicore benchmark assumes:
+//!     1. a multi-core system
+//!     2. that the `multicore` feature is enabled.  It is by default.
+
+#[macro_use]
+extern crate criterion;
+
+use criterion::{BenchmarkId, Criterion};
+use ff::Field;
+use group::prime::PrimeCurveAffine;
+use halo2curves::bn256::{Fr as Scalar, G1Affine as Point};
+use halo2curves::msm::{best_multiexp, multiexp_serial};
+use maybe_rayon::current_thread_index;
+use maybe_rayon::prelude::{IntoParallelIterator, ParallelIterator};
+use rand_core::SeedableRng;
+use rand_xorshift::XorShiftRng;
+use std::time::SystemTime;
+
+const SAMPLE_SIZE: usize = 10;
+const SINGLECORE_RANGE: [u8; 6] = [3, 8, 10, 12, 14, 16];
+const MULTICORE_RANGE: [u8; 9] = [3, 8, 10, 12, 14, 16, 18, 20, 22];
+const SEED: [u8; 16] = [
+    0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 0xbc, 0xe5,
+];
+
+/// Creates the RNG used by one worker thread: `SEED` perturbed by the bytes
+/// of the current thread index, so each worker gets its own deterministic
+/// stream.
+///
+/// Must be called from inside the thread pool (it unwraps
+/// `current_thread_index()`).
+fn seeded_thread_rng() -> XorShiftRng {
+    // The loop below writes `thread_seed[i]` and `thread_seed[i + 8]`, which
+    // only stays inside the 16-byte seed when `usize` is 8 bytes wide.
+    assert!(std::mem::size_of::<usize>() == 8);
+    let uniq = current_thread_index()
+        .expect("must run inside the thread pool")
+        .to_ne_bytes();
+    let mut thread_seed = SEED;
+    for (i, byte) in uniq.iter().enumerate() {
+        // Wrapping addition: a plain `+=` on `u8` panics under overflow
+        // checks once a seed byte plus an index byte exceeds 255.
+        thread_seed[i] = thread_seed[i].wrapping_add(*byte);
+        thread_seed[i + 8] = thread_seed[i + 8].wrapping_add(*byte);
+    }
+    XorShiftRng::from_seed(thread_seed)
+}
+
+/// Generates `2^k` random scalars and `2^k` random curve points in parallel.
+///
+/// Both collections are produced with per-thread RNGs from
+/// `seeded_thread_rng`, and the elapsed whole seconds are printed afterwards.
+///
+/// # Panics
+///
+/// Panics if `k >= 64`.
+fn generate_coefficients_and_curvepoints(k: u8) -> (Vec<Scalar>, Vec<Point>) {
+    let n: u64 = {
+        assert!(k < 64);
+        1 << k
+    };
+
+    println!("\n\nGenerating 2^{k} = {n} coefficients and curve points..",);
+    let timer = SystemTime::now();
+    let coeffs: Vec<Scalar> = (0..n)
+        .into_par_iter()
+        .map_init(seeded_thread_rng, |rng, _| Scalar::random(rng))
+        .collect();
+    let bases: Vec<Point> = (0..n)
+        .into_par_iter()
+        .map_init(seeded_thread_rng, |rng, _| Point::random(rng))
+        .collect();
+    let end = timer.elapsed().unwrap();
+    println!(
+        "Generating 2^{k} = {n} coefficients and curve points took: {} sec.\n\n",
+        end.as_secs()
+    );
+
+    (coeffs, bases)
+}
+
+/// Criterion entry point for the MSM benchmarks.
+///
+/// Generates one set of coefficients and curve points sized for the largest
+/// `k` in either range, then benchmarks `multiexp_serial` over
+/// `SINGLECORE_RANGE` and `best_multiexp` over `MULTICORE_RANGE`, each on the
+/// leading `2^k` elements.
+fn msm(c: &mut Criterion) {
+    let mut group = c.benchmark_group("msm");
+    // Configure the sample size before the first benchmark runs. Chaining
+    // `.sample_size(..)` after `bench_function` only affects the benchmarks
+    // that follow, which left the first one on Criterion's default.
+    group.sample_size(SAMPLE_SIZE);
+    let max_k = *SINGLECORE_RANGE
+        .iter()
+        .chain(MULTICORE_RANGE.iter())
+        .max()
+        .unwrap_or(&16);
+    let (coeffs, bases) = generate_coefficients_and_curvepoints(max_k);
+
+    for k in SINGLECORE_RANGE {
+        group.bench_function(BenchmarkId::new("singlecore", k), |b| {
+            assert!(k < 64);
+            let n: usize = 1 << k;
+            let mut acc = Point::identity().into();
+            b.iter(|| multiexp_serial(&coeffs[..n], &bases[..n], &mut acc));
+        });
+    }
+    for k in MULTICORE_RANGE {
+        group.bench_function(BenchmarkId::new("multicore", k), |b| {
+            assert!(k < 64);
+            let n: usize = 1 << k;
+            // Return the result so `Bencher::iter` can keep the computation
+            // from being optimized away.
+            b.iter(|| best_multiexp(&coeffs[..n], &bases[..n]))
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, msm);
+criterion_main!(benches);
diff --git a/src/fft.rs b/src/fft.rs
@@ -0,0 +1,134 @@
+use crate::multicore;
+pub use crate::{CurveAffine, CurveExt};
+use ff::Field;
+use group::{GroupOpsOwned, ScalarMulOwned};
+
+/// This represents an element of a group with basic operations that can be
+/// performed. This allows an FFT implementation (for example) to operate
+/// generically over either a field or elliptic curve group.
+///
+/// The trait has no methods of its own; it only bundles the supertrait bounds
+/// listed below.
+pub trait FftGroup<Scalar: Field>:
+    Copy + Send + Sync + 'static + GroupOpsOwned + ScalarMulOwned<Scalar>
+{
+}
+
+// Blanket implementation: every type satisfying the bounds is automatically
+// an `FftGroup`, so no manual impls are needed.
+impl<T, Scalar> FftGroup<Scalar> for T
+where
+    Scalar: Field,
+    T: Copy + Send + Sync + 'static + GroupOpsOwned + ScalarMulOwned<Scalar>,
+{
+}
+
+/// Performs a radix-$2$ Fast-Fourier Transformation (FFT) on a vector of size
+/// $n = 2^k$, when provided `log_n` = $k$ and an element of multiplicative
+/// order $n$ called `omega` ($\omega$). The result is that the vector `a`, when
+/// interpreted as the coefficients of a polynomial of degree $n - 1$, is
+/// transformed into the evaluations of this polynomial at each of the $n$
+/// distinct powers of $\omega$. This transformation is invertible by providing
+/// $\omega^{-1}$ in place of $\omega$ and dividing each resulting field element
+/// by $n$.
+///
+/// This will use multithreading if beneficial.
+///
+/// # Panics
+///
+/// Panics if `a.len() != 1 << log_n`.
+pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(a: &mut [G], omega: Scalar, log_n: u32) {
+    // Reverses the lowest `l` bits of `n`.
+    fn bitreverse(mut n: usize, l: usize) -> usize {
+        let mut r = 0;
+        for _ in 0..l {
+            r = (r << 1) | (n & 1);
+            n >>= 1;
+        }
+        r
+    }
+
+    let threads = multicore::current_num_threads();
+    let log_threads = threads.ilog2();
+    let n = a.len();
+    assert_eq!(n, 1 << log_n);
+
+    // Bit-reversal permutation; `k < rk` ensures each pair is swapped once.
+    for k in 0..n {
+        let rk = bitreverse(k, log_n as usize);
+        if k < rk {
+            a.swap(rk, k);
+        }
+    }
+
+    // precompute twiddle factors: omega^0, omega^1, ..., omega^(n/2 - 1)
+    let twiddles: Vec<_> = (0..(n / 2))
+        .scan(Scalar::ONE, |w, _| {
+            let tw = *w;
+            *w *= &omega;
+            Some(tw)
+        })
+        .collect();
+
+    if log_n <= log_threads {
+        // Small input: run the butterfly passes iteratively on this thread.
+        // Each pass doubles the chunk size and halves the twiddle stride.
+        let mut chunk = 2_usize;
+        let mut twiddle_chunk = n / 2;
+        for _ in 0..log_n {
+            a.chunks_mut(chunk).for_each(|coeffs| {
+                let (left, right) = coeffs.split_at_mut(chunk / 2);
+
+                // case when twiddle factor is one
+                let (a, left) = left.split_at_mut(1);
+                let (b, right) = right.split_at_mut(1);
+                let t = b[0];
+                b[0] = a[0];
+                a[0] += &t;
+                b[0] -= &t;
+
+                // Remaining pairs: (a, b) <- (a + w*b, a - w*b), where `w` is
+                // the twiddle at index `(i + 1) * twiddle_chunk`.
+                left.iter_mut()
+                    .zip(right.iter_mut())
+                    .enumerate()
+                    .for_each(|(i, (a, b))| {
+                        let mut t = *b;
+                        t *= &twiddles[(i + 1) * twiddle_chunk];
+                        *b = *a;
+                        *a += &t;
+                        *b -= &t;
+                    });
+            });
+            chunk *= 2;
+            twiddle_chunk /= 2;
+        }
+    } else {
+        // Larger input: recurse, splitting the halves via `multicore::join`.
+        recursive_butterfly_arithmetic(a, n, 1, &twiddles)
+    }
+}
+
+/// Performs the butterfly passes of a radix-2 FFT recursively.
+///
+/// The two halves of `a` are processed first (through `multicore::join`) and
+/// then combined with one butterfly pass over the whole slice.
+///
+/// * `a` - the values to transform in place; the code splits it at `n / 2`,
+///   so it is expected to hold `n` elements.
+/// * `n` - the transform size at this recursion level. `best_fft` passes a
+///   power of two; sizes `0` and `1` are a no-op.
+/// * `twiddle_chunk` - stride into `twiddles` at this level; it doubles with
+///   each recursion step (`best_fft` starts at `1`).
+/// * `twiddles` - twiddle factor table, indexed as
+///   `twiddles[(i + 1) * twiddle_chunk]`.
+pub fn recursive_butterfly_arithmetic<Scalar: Field, G: FftGroup<Scalar>>(
+    a: &mut [G],
+    n: usize,
+    twiddle_chunk: usize,
+    twiddles: &[Scalar],
+) {
+    // A transform of size 0 or 1 leaves the data unchanged. Without this
+    // guard `n / 2` stays at 0 and the recursion never terminates.
+    if n < 2 {
+        return;
+    }
+
+    if n == 2 {
+        let t = a[1];
+        a[1] = a[0];
+        a[0] += &t;
+        a[1] -= &t;
+    } else {
+        let (left, right) = a.split_at_mut(n / 2);
+        multicore::join(
+            || recursive_butterfly_arithmetic(left, n / 2, twiddle_chunk * 2, twiddles),
+            || recursive_butterfly_arithmetic(right, n / 2, twiddle_chunk * 2, twiddles),
+        );
+
+        // case when twiddle factor is one
+        let (a, left) = left.split_at_mut(1);
+        let (b, right) = right.split_at_mut(1);
+        let t = b[0];
+        b[0] = a[0];
+        a[0] += &t;
+        b[0] -= &t;
+
+        // Remaining pairs: (a, b) <- (a + w*b, a - w*b).
+        left.iter_mut()
+            .zip(right.iter_mut())
+            .enumerate()
+            .for_each(|(i, (a, b))| {
+                let mut t = *b;
+                t *= &twiddles[(i + 1) * twiddle_chunk];
+                *b = *a;
+                *a += &t;
+                *b -= &t;
+            });
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,5 +1,8 @@
 mod arithmetic;
+pub mod fft;
 pub mod hash_to_curve;
+pub mod msm;
+pub mod multicore;
 #[macro_use]
 pub mod legendre;
 pub mod serde;