From df8bccd7195796ab64107cb4961fad5252b39a4c Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 21 Jan 2025 12:15:18 -0600 Subject: [PATCH 1/9] feat: add AVX512 small fft kernel for koalabear --- ecc/bls12-377/fr/asm_avx.go | 2 +- ecc/bls12-377/fr/fft/fft.go | 101 ++-- ecc/bls12-377/fr/fft/fft_test.go | 17 + ecc/bls12-377/fr/fft/kernel_purego.go | 28 + ecc/bls12-377/fr/sis/sis.go | 1 - ecc/bls12-381/fr/asm_avx.go | 2 +- ecc/bls12-381/fr/fft/fft.go | 101 ++-- ecc/bls12-381/fr/fft/fft_test.go | 17 + ecc/bls12-381/fr/fft/kernel_purego.go | 28 + ecc/bls24-315/fr/asm_avx.go | 2 +- ecc/bls24-315/fr/fft/fft.go | 101 ++-- ecc/bls24-315/fr/fft/fft_test.go | 17 + ecc/bls24-315/fr/fft/kernel_purego.go | 28 + ecc/bls24-317/fr/asm_avx.go | 2 +- ecc/bls24-317/fr/fft/fft.go | 101 ++-- ecc/bls24-317/fr/fft/fft_test.go | 17 + ecc/bls24-317/fr/fft/kernel_purego.go | 28 + ecc/bn254/fp/asm_avx.go | 2 +- ecc/bn254/fr/asm_avx.go | 2 +- ecc/bn254/fr/fft/fft.go | 101 ++-- ecc/bn254/fr/fft/fft_test.go | 17 + ecc/bn254/fr/fft/kernel_purego.go | 28 + ecc/bw6-633/fr/fft/fft.go | 101 ++-- ecc/bw6-633/fr/fft/fft_test.go | 17 + ecc/bw6-633/fr/fft/kernel_purego.go | 28 + ecc/bw6-761/fr/fft/fft.go | 101 ++-- ecc/bw6-761/fr/fft/fft_test.go | 17 + ecc/bw6-761/fr/fft/kernel_purego.go | 28 + ecc/stark-curve/fp/asm_avx.go | 2 +- ecc/stark-curve/fr/asm_avx.go | 2 +- field/babybear/asm_avx.go | 2 +- field/babybear/fft/fft.go | 127 ++--- field/babybear/fft/fft_test.go | 17 + field/babybear/fft/kernel_amd64.go | 53 ++ field/babybear/fft/kernel_amd64.s | 389 ++++++++++++++ field/babybear/fft/kernel_purego.go | 23 + field/babybear/sis/sis.go | 51 +- field/babybear/vector_amd64.go | 3 + field/generator/asm/amd64/build.go | 22 + field/generator/asm/amd64/element_vec_F31.go | 505 +++++++++++++++++- field/generator/generator_fft.go | 50 +- field/generator/generator_sis.go | 2 + field/generator/internal/addchain/addchain.go | 1 - .../internal/templates/element/asm.go | 2 +- 
.../templates/element/vector_ops_asm.go | 9 +- .../templates/element/vector_ops_purego.go | 1 + .../internal/templates/fft/fft.go.tmpl | 68 ++- .../templates/fft/kernel.amd64.go.tmpl | 50 ++ .../templates/fft/kernel.purego.go.tmpl | 18 + .../internal/templates/fft/tests/fft.go.tmpl | 17 + .../internal/templates/sis/sis.go.tmpl | 64 ++- field/goldilocks/fft/fft.go | 101 ++-- field/goldilocks/fft/fft_test.go | 17 + field/goldilocks/fft/kernel_purego.go | 28 + field/goldilocks/sis/sis.go | 1 - field/internal/main.go | 1 - field/koalabear/asm_avx.go | 2 +- field/koalabear/fft/fft.go | 127 ++--- field/koalabear/fft/fft_test.go | 17 + field/koalabear/fft/kernel_amd64.go | 53 ++ field/koalabear/fft/kernel_amd64.s | 389 ++++++++++++++ field/koalabear/fft/kernel_purego.go | 23 + field/koalabear/sis/sis.go | 51 +- field/koalabear/vector_amd64.go | 3 + .../generator/crypto/hash/mimc/generate.go | 1 - .../crypto/hash/poseidon2/generate.go | 1 - internal/generator/ecc/generate.go | 1 - internal/generator/fflonk/generator.go | 1 - internal/generator/fri/template/generate.go | 1 - internal/generator/gkr/generate.go | 3 +- .../gkr/template/gkr.test.vectors.gen.go.tmpl | 1 + internal/generator/gkr/test_vectors/main.go | 1 + internal/generator/iop/generate.go | 1 - internal/generator/kzg/generate.go | 1 - internal/generator/main.go | 4 +- internal/generator/pairing/generate.go | 1 - internal/generator/pedersen/generate.go | 1 - internal/generator/permutation/generator.go | 1 - internal/generator/plookup/generate.go | 1 - internal/generator/polynomial/generate.go | 1 - internal/generator/shplonk/generator.go | 1 - internal/generator/sumcheck/generate.go | 3 +- .../generator/test_vector_utils/generate.go | 4 +- internal/generator/tower/generate.go | 1 + 84 files changed, 2625 insertions(+), 731 deletions(-) create mode 100644 ecc/bls12-377/fr/fft/kernel_purego.go create mode 100644 ecc/bls12-381/fr/fft/kernel_purego.go create mode 100644 ecc/bls24-315/fr/fft/kernel_purego.go create 
mode 100644 ecc/bls24-317/fr/fft/kernel_purego.go create mode 100644 ecc/bn254/fr/fft/kernel_purego.go create mode 100644 ecc/bw6-633/fr/fft/kernel_purego.go create mode 100644 ecc/bw6-761/fr/fft/kernel_purego.go create mode 100644 field/babybear/fft/kernel_amd64.go create mode 100644 field/babybear/fft/kernel_amd64.s create mode 100644 field/babybear/fft/kernel_purego.go create mode 100644 field/generator/internal/templates/fft/kernel.amd64.go.tmpl create mode 100644 field/generator/internal/templates/fft/kernel.purego.go.tmpl create mode 100644 field/goldilocks/fft/kernel_purego.go create mode 100644 field/koalabear/fft/kernel_amd64.go create mode 100644 field/koalabear/fft/kernel_amd64.s create mode 100644 field/koalabear/fft/kernel_purego.go diff --git a/ecc/bls12-377/fr/asm_avx.go b/ecc/bls12-377/fr/asm_avx.go index f89a44926d..b4865c8da2 100644 --- a/ecc/bls12-377/fr/asm_avx.go +++ b/ecc/bls12-377/fr/asm_avx.go @@ -10,6 +10,6 @@ package fr import "golang.org/x/sys/cpu" var ( - supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/ecc/bls12-377/fr/fft/fft.go b/ecc/bls12-377/fr/fft/fft.go index 9ab74a303b..32b74acfc0 100644 --- a/ecc/bls12-377/fr/fft/fft.go +++ b/ecc/bls12-377/fr/fft/fft.go @@ -201,14 +201,13 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + } else if n == 1<<8 { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -258,7 +257,7 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } -func innerDIFWithTwiddles(a []fr.Element, 
twiddles []fr.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +265,6 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i for i := start; i < end; i++ { fr.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := fr.Vector(a[start+m : end+m]) v2 := fr.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +290,13 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDITNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 256 { + } else if n == 1<<8 { kerDITNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,7 +371,43 @@ func innerDITWithoutTwiddles(a []fr.Element, at, w fr.Element, start, end, m int } } -func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDIFNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) + for offset := 0; offset < 32; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + for offset := 0; offset < 32; offset += 8 { + innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 4 { + innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } +} + +func kerDITNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + for offset 
:= 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } + for offset := 0; offset < 32; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) +} + +func kerDIFNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) @@ -401,7 +434,7 @@ func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } } -func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDITNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl for offset := 0; offset < 256; offset += 2 { @@ -427,45 +460,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; offset < 64; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset 
+= 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) -} diff --git a/ecc/bls12-377/fr/fft/fft_test.go b/ecc/bls12-377/fr/fft/fft_test.go index cf3b2711a8..4c47f0f5b4 100644 --- a/ecc/bls12-377/fr/fft/fft_test.go +++ b/ecc/bls12-377/fr/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []fr.Element, val fr.Element) fr.Element { var acc, res, tmp fr.Element res.Set(&pol[0]) diff --git a/ecc/bls12-377/fr/fft/kernel_purego.go b/ecc/bls12-377/fr/fft/kernel_purego.go new file mode 100644 index 0000000000..4d89a65a2b --- /dev/null +++ b/ecc/bls12-377/fr/fft/kernel_purego.go @@ -0,0 +1,28 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" +) + +func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_32generic(a, twiddles, stage) +} +func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_32generic(a, twiddles, stage) +} + +func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_256generic(a, twiddles, stage) +} +func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_256generic(a, twiddles, stage) +} diff --git a/ecc/bls12-377/fr/sis/sis.go b/ecc/bls12-377/fr/sis/sis.go index 93c461c4f7..c773a03441 100644 --- a/ecc/bls12-377/fr/sis/sis.go +++ b/ecc/bls12-377/fr/sis/sis.go @@ -159,7 +159,6 @@ func (r *RSis) Hash(v, res []fr.Element) error { // full FFT mask = uint64(len(partialFFT_64) - 1) } - // inner hash it := NewLimbIterator(&VectorIterator{v: v}, r.LogTwoBound/8) for i := 0; i < len(r.Ag); i++ { diff --git a/ecc/bls12-381/fr/asm_avx.go b/ecc/bls12-381/fr/asm_avx.go index f89a44926d..b4865c8da2 100644 --- a/ecc/bls12-381/fr/asm_avx.go +++ b/ecc/bls12-381/fr/asm_avx.go @@ -10,6 +10,6 @@ package fr import "golang.org/x/sys/cpu" var ( - supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/ecc/bls12-381/fr/fft/fft.go b/ecc/bls12-381/fr/fft/fft.go index 17f7023b91..088fb10e91 100644 --- a/ecc/bls12-381/fr/fft/fft.go +++ b/ecc/bls12-381/fr/fft/fft.go @@ -201,14 +201,13 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, 
stage-twiddlesStartStage) + if n == 1<<5 { + kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + } else if n == 1<<8 { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -258,7 +257,7 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } -func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +265,6 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i for i := start; i < end; i++ { fr.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := fr.Vector(a[start+m : end+m]) v2 := fr.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +290,13 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDITNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 256 { + } else if n == 1<<8 { kerDITNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,7 +371,43 @@ func innerDITWithoutTwiddles(a []fr.Element, at, w fr.Element, start, end, m int } } -func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDIFNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) + for offset := 0; offset < 32; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + for offset := 0; offset < 32; offset += 8 { + 
innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 4 { + innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } +} + +func kerDITNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } + for offset := 0; offset < 32; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) +} + +func kerDIFNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) @@ -401,7 +434,7 @@ func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } } -func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDITNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl for offset := 0; offset < 256; offset += 2 { @@ -427,45 +460,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - 
innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; offset < 64; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) -} diff --git a/ecc/bls12-381/fr/fft/fft_test.go b/ecc/bls12-381/fr/fft/fft_test.go index a2de1acce0..a150e790dd 100644 --- a/ecc/bls12-381/fr/fft/fft_test.go +++ b/ecc/bls12-381/fr/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []fr.Element, val fr.Element) fr.Element { var acc, res, tmp 
fr.Element res.Set(&pol[0]) diff --git a/ecc/bls12-381/fr/fft/kernel_purego.go b/ecc/bls12-381/fr/fft/kernel_purego.go new file mode 100644 index 0000000000..c14f582e7e --- /dev/null +++ b/ecc/bls12-381/fr/fft/kernel_purego.go @@ -0,0 +1,28 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" +) + +func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_32generic(a, twiddles, stage) +} +func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_32generic(a, twiddles, stage) +} + +func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_256generic(a, twiddles, stage) +} +func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_256generic(a, twiddles, stage) +} diff --git a/ecc/bls24-315/fr/asm_avx.go b/ecc/bls24-315/fr/asm_avx.go index f89a44926d..b4865c8da2 100644 --- a/ecc/bls24-315/fr/asm_avx.go +++ b/ecc/bls24-315/fr/asm_avx.go @@ -10,6 +10,6 @@ package fr import "golang.org/x/sys/cpu" var ( - supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/ecc/bls24-315/fr/fft/fft.go b/ecc/bls24-315/fr/fft/fft.go index 8630671963..1508d143a2 100644 --- a/ecc/bls24-315/fr/fft/fft.go +++ b/ecc/bls24-315/fr/fft/fft.go @@ -201,14 +201,13 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { 
+ kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + } else if n == 1<<8 { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -258,7 +257,7 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } -func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +265,6 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i for i := start; i < end; i++ { fr.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := fr.Vector(a[start+m : end+m]) v2 := fr.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +290,13 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDITNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 256 { + } else if n == 1<<8 { kerDITNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,7 +371,43 @@ func innerDITWithoutTwiddles(a []fr.Element, at, w fr.Element, start, end, m int } } -func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDIFNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) + for offset := 0; offset < 32; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + for offset := 0; offset < 32; offset += 8 { + innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for 
offset := 0; offset < 32; offset += 4 { + innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } +} + +func kerDITNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } + for offset := 0; offset < 32; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) +} + +func kerDIFNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) @@ -401,7 +434,7 @@ func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } } -func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDITNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl for offset := 0; offset < 256; offset += 2 { @@ -427,45 +460,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; 
offset < 64; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) -} diff --git a/ecc/bls24-315/fr/fft/fft_test.go b/ecc/bls24-315/fr/fft/fft_test.go index 73000af837..7f6f55a59b 100644 --- a/ecc/bls24-315/fr/fft/fft_test.go +++ b/ecc/bls24-315/fr/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []fr.Element, val fr.Element) fr.Element { var acc, res, tmp fr.Element res.Set(&pol[0]) diff --git a/ecc/bls24-315/fr/fft/kernel_purego.go 
b/ecc/bls24-315/fr/fft/kernel_purego.go new file mode 100644 index 0000000000..fe96f2bbb2 --- /dev/null +++ b/ecc/bls24-315/fr/fft/kernel_purego.go @@ -0,0 +1,28 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" +) + +func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_32generic(a, twiddles, stage) +} +func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_32generic(a, twiddles, stage) +} + +func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_256generic(a, twiddles, stage) +} +func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_256generic(a, twiddles, stage) +} diff --git a/ecc/bls24-317/fr/asm_avx.go b/ecc/bls24-317/fr/asm_avx.go index f89a44926d..b4865c8da2 100644 --- a/ecc/bls24-317/fr/asm_avx.go +++ b/ecc/bls24-317/fr/asm_avx.go @@ -10,6 +10,6 @@ package fr import "golang.org/x/sys/cpu" var ( - supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/ecc/bls24-317/fr/fft/fft.go b/ecc/bls24-317/fr/fft/fft.go index 42efabd2ff..65a9e85f4d 100644 --- a/ecc/bls24-317/fr/fft/fft.go +++ b/ecc/bls24-317/fr/fft/fft.go @@ -201,14 +201,13 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 
64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + } else if n == 1<<8 { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -258,7 +257,7 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } -func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +265,6 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i for i := start; i < end; i++ { fr.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := fr.Vector(a[start+m : end+m]) v2 := fr.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +290,13 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDITNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 256 { + } else if n == 1<<8 { kerDITNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,7 +371,43 @@ func innerDITWithoutTwiddles(a []fr.Element, at, w fr.Element, start, end, m int } } -func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDIFNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) + for offset := 0; offset < 32; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + for offset := 0; offset < 32; offset += 8 { + innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 4 { + 
innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } +} + +func kerDITNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } + for offset := 0; offset < 32; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) +} + +func kerDIFNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) @@ -401,7 +434,7 @@ func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } } -func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDITNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl for offset := 0; offset < 256; offset += 2 { @@ -427,45 +460,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; offset < 64; offset += 16 { - 
innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) -} diff --git a/ecc/bls24-317/fr/fft/fft_test.go b/ecc/bls24-317/fr/fft/fft_test.go index 0cbc6ccca2..db52f90a95 100644 --- a/ecc/bls24-317/fr/fft/fft_test.go +++ b/ecc/bls24-317/fr/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []fr.Element, val fr.Element) fr.Element { var acc, res, tmp fr.Element res.Set(&pol[0]) diff --git a/ecc/bls24-317/fr/fft/kernel_purego.go b/ecc/bls24-317/fr/fft/kernel_purego.go new file 
mode 100644 index 0000000000..2a1738dcad --- /dev/null +++ b/ecc/bls24-317/fr/fft/kernel_purego.go @@ -0,0 +1,28 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" +) + +func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_32generic(a, twiddles, stage) +} +func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_32generic(a, twiddles, stage) +} + +func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_256generic(a, twiddles, stage) +} +func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_256generic(a, twiddles, stage) +} diff --git a/ecc/bn254/fp/asm_avx.go b/ecc/bn254/fp/asm_avx.go index 45e1ab3f0d..93490b31fd 100644 --- a/ecc/bn254/fp/asm_avx.go +++ b/ecc/bn254/fp/asm_avx.go @@ -10,6 +10,6 @@ package fp import "golang.org/x/sys/cpu" var ( - supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/ecc/bn254/fr/asm_avx.go b/ecc/bn254/fr/asm_avx.go index f89a44926d..b4865c8da2 100644 --- a/ecc/bn254/fr/asm_avx.go +++ b/ecc/bn254/fr/asm_avx.go @@ -10,6 +10,6 @@ package fr import "golang.org/x/sys/cpu" var ( - supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/ecc/bn254/fr/fft/fft.go b/ecc/bn254/fr/fft/fft.go index 4def2f70d5..4e972fb442 100644 --- a/ecc/bn254/fr/fft/fft.go +++ 
b/ecc/bn254/fr/fft/fft.go @@ -201,14 +201,13 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + } else if n == 1<<8 { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -258,7 +257,7 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } -func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +265,6 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i for i := start; i < end; i++ { fr.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := fr.Vector(a[start+m : end+m]) v2 := fr.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +290,13 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDITNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 256 { + } else if n == 1<<8 { kerDITNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,7 +371,43 @@ func innerDITWithoutTwiddles(a []fr.Element, at, w fr.Element, start, end, m int } } -func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDIFNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + innerDIFWithTwiddles(a[:32], 
twiddles[stage+0], 0, 16, 16) + for offset := 0; offset < 32; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + for offset := 0; offset < 32; offset += 8 { + innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 4 { + innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } +} + +func kerDITNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } + for offset := 0; offset < 32; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) +} + +func kerDIFNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) @@ -401,7 +434,7 @@ func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } } -func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDITNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl for offset := 0; offset < 256; offset += 2 { @@ -427,45 +460,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // 
code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; offset < 64; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) -} diff --git a/ecc/bn254/fr/fft/fft_test.go b/ecc/bn254/fr/fft/fft_test.go index 89237d314b..c3e5a5e1da 100644 --- a/ecc/bn254/fr/fft/fft_test.go +++ b/ecc/bn254/fr/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + 
b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []fr.Element, val fr.Element) fr.Element { var acc, res, tmp fr.Element res.Set(&pol[0]) diff --git a/ecc/bn254/fr/fft/kernel_purego.go b/ecc/bn254/fr/fft/kernel_purego.go new file mode 100644 index 0000000000..c7b657402c --- /dev/null +++ b/ecc/bn254/fr/fft/kernel_purego.go @@ -0,0 +1,28 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/ecc/bn254/fr" +) + +func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_32generic(a, twiddles, stage) +} +func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_32generic(a, twiddles, stage) +} + +func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_256generic(a, twiddles, stage) +} +func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_256generic(a, twiddles, stage) +} diff --git a/ecc/bw6-633/fr/fft/fft.go b/ecc/bw6-633/fr/fft/fft.go index 51f49f4836..dd6ff7fa41 100644 --- a/ecc/bw6-633/fr/fft/fft.go +++ b/ecc/bw6-633/fr/fft/fft.go @@ -201,14 +201,13 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + } else if n == 1<<8 { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -258,7 +257,7 @@ func difFFT(a []fr.Element, w 
fr.Element, twiddles [][]fr.Element, twiddlesStart } -func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +265,6 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i for i := start; i < end; i++ { fr.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := fr.Vector(a[start+m : end+m]) v2 := fr.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +290,13 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDITNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 256 { + } else if n == 1<<8 { kerDITNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,7 +371,43 @@ func innerDITWithoutTwiddles(a []fr.Element, at, w fr.Element, start, end, m int } } -func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDIFNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) + for offset := 0; offset < 32; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + for offset := 0; offset < 32; offset += 8 { + innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 4 { + innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } +} + +func kerDITNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) 
{ + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } + for offset := 0; offset < 32; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) +} + +func kerDIFNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) @@ -401,7 +434,7 @@ func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } } -func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDITNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl for offset := 0; offset < 256; offset += 2 { @@ -427,45 +460,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; offset < 64; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 4 { - 
innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) -} diff --git a/ecc/bw6-633/fr/fft/fft_test.go b/ecc/bw6-633/fr/fft/fft_test.go index e4ce874e62..ce7bb73ef1 100644 --- a/ecc/bw6-633/fr/fft/fft_test.go +++ b/ecc/bw6-633/fr/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []fr.Element, val fr.Element) fr.Element { var acc, res, tmp fr.Element res.Set(&pol[0]) diff --git a/ecc/bw6-633/fr/fft/kernel_purego.go b/ecc/bw6-633/fr/fft/kernel_purego.go new file mode 100644 index 0000000000..1e53b9c614 --- /dev/null +++ b/ecc/bw6-633/fr/fft/kernel_purego.go @@ -0,0 +1,28 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" +) + +func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_32generic(a, twiddles, stage) +} +func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_32generic(a, twiddles, stage) +} + +func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_256generic(a, twiddles, stage) +} +func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_256generic(a, twiddles, stage) +} diff --git a/ecc/bw6-761/fr/fft/fft.go b/ecc/bw6-761/fr/fft/fft.go index a11ce4eeef..b7996c817a 100644 --- a/ecc/bw6-761/fr/fft/fft.go +++ b/ecc/bw6-761/fr/fft/fft.go @@ -201,14 +201,13 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + } else if n == 1<<8 { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -258,7 +257,7 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } -func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +265,6 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i for i := start; i < end; i++ { fr.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not 
reduced mod q v1 := fr.Vector(a[start+m : end+m]) v2 := fr.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +290,13 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDITNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 256 { + } else if n == 1<<8 { kerDITNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,7 +371,43 @@ func innerDITWithoutTwiddles(a []fr.Element, at, w fr.Element, start, end, m int } } -func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDIFNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) + for offset := 0; offset < 32; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + for offset := 0; offset < 32; offset += 8 { + innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 4 { + innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } +} + +func kerDITNP_32generic(a []fr.Element, twiddles [][]fr.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + for offset := 0; offset < 32; offset += 2 { + fr.Butterfly(&a[offset], &a[offset+1]) + } + for offset := 0; offset < 32; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 
8, 8) + } + innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) +} + +func kerDIFNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) @@ -401,7 +434,7 @@ func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } } -func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { +func kerDITNP_256generic(a []fr.Element, twiddles [][]fr.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl for offset := 0; offset < 256; offset += 2 { @@ -427,45 +460,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; offset < 64; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_64(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for 
offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) -} diff --git a/ecc/bw6-761/fr/fft/fft_test.go b/ecc/bw6-761/fr/fft/fft_test.go index cf7b941e6a..b56e9c7fac 100644 --- a/ecc/bw6-761/fr/fft/fft_test.go +++ b/ecc/bw6-761/fr/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []fr.Element, val fr.Element) fr.Element { var acc, res, tmp fr.Element res.Set(&pol[0]) diff --git a/ecc/bw6-761/fr/fft/kernel_purego.go b/ecc/bw6-761/fr/fft/kernel_purego.go new file mode 100644 index 0000000000..7f742c4043 --- /dev/null +++ b/ecc/bw6-761/fr/fft/kernel_purego.go @@ -0,0 +1,28 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" +) + +func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_32generic(a, twiddles, stage) +} +func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_32generic(a, twiddles, stage) +} + +func kerDIFNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDIFNP_256generic(a, twiddles, stage) +} +func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { + kerDITNP_256generic(a, twiddles, stage) +} diff --git a/ecc/stark-curve/fp/asm_avx.go b/ecc/stark-curve/fp/asm_avx.go index 45e1ab3f0d..93490b31fd 100644 --- a/ecc/stark-curve/fp/asm_avx.go +++ b/ecc/stark-curve/fp/asm_avx.go @@ -10,6 +10,6 @@ package fp import "golang.org/x/sys/cpu" var ( - supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/ecc/stark-curve/fr/asm_avx.go b/ecc/stark-curve/fr/asm_avx.go index f89a44926d..b4865c8da2 100644 --- a/ecc/stark-curve/fr/asm_avx.go +++ b/ecc/stark-curve/fr/asm_avx.go @@ -10,6 +10,6 @@ package fr import "golang.org/x/sys/cpu" var ( - supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = supportAdx && cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/field/babybear/asm_avx.go b/field/babybear/asm_avx.go index 585a4df090..92aac84260 100644 --- a/field/babybear/asm_avx.go +++ b/field/babybear/asm_avx.go @@ -10,6 +10,6 @@ package babybear import "golang.org/x/sys/cpu" var ( - supportAvx512 = cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ 
&& cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/field/babybear/fft/fft.go b/field/babybear/fft/fft.go index 6ad4e6f884..8fbd1eb12a 100644 --- a/field/babybear/fft/fft.go +++ b/field/babybear/fft/fft.go @@ -201,14 +201,10 @@ func difFFT(a []babybear.Element, w babybear.Element, twiddles [][]babybear.Elem if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<7 { + kerDIFNP_128(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -232,13 +228,7 @@ func difFFT(a []babybear.Element, w babybear.Element, twiddles [][]babybear.Elem // compute next twiddle w.Square(&w) } else { - if parallelButterfly { - parallel.Execute(m, func(start, end int) { - innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], start, end, m) - }, nbTasks/(1<<(stage))) - } else { - innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) - } + innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) } if m == 1 { @@ -258,7 +248,7 @@ func difFFT(a []babybear.Element, w babybear.Element, twiddles [][]babybear.Elem } -func innerDIFWithTwiddles(a []babybear.Element, twiddles []babybear.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []babybear.Element, twiddles []babybear.Element, start, end, m int) { if start == 0 { babybear.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +256,6 @@ func innerDIFWithTwiddles(a []babybear.Element, twiddles []babybear.Element, sta for i := start; i < end; i++ { babybear.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := babybear.Vector(a[start+m : end+m]) v2 := babybear.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +281,10 @@ func ditFFT(a []babybear.Element, w babybear.Element, twiddles [][]babybear.Elem if n == 1 { return } else if stage >= 
twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + if n == 1<<7 { + kerDITNP_128(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,98 +359,50 @@ func innerDITWithoutTwiddles(a []babybear.Element, at, w babybear.Element, start } } -func kerDIFNP_256(a []babybear.Element, twiddles [][]babybear.Element, stage int) { +func kerDIFNP_128generic(a []babybear.Element, twiddles [][]babybear.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) - for offset := 0; offset < 256; offset += 128 { - innerDIFWithTwiddles(a[offset:offset+128], twiddles[stage+1], 0, 64, 64) + innerDIFWithTwiddles(a[:128], twiddles[stage+0], 0, 64, 64) + for offset := 0; offset < 128; offset += 64 { + innerDIFWithTwiddles(a[offset:offset+64], twiddles[stage+1], 0, 32, 32) } - for offset := 0; offset < 256; offset += 64 { - innerDIFWithTwiddles(a[offset:offset+64], twiddles[stage+2], 0, 32, 32) + for offset := 0; offset < 128; offset += 32 { + innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+2], 0, 16, 16) } - for offset := 0; offset < 256; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+3], 0, 16, 16) + for offset := 0; offset < 128; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+3], 0, 8, 8) } - for offset := 0; offset < 256; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+4], 0, 8, 8) + for offset := 0; offset < 128; offset += 8 { + innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+4], 0, 4, 4) } - for offset := 0; offset < 256; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+5], 0, 4, 4) + for offset := 0; offset < 128; offset += 4 { + innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+5], 0, 2, 2) } - for offset := 0; offset 
< 256; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+6], 0, 2, 2) - } - for offset := 0; offset < 256; offset += 2 { - babybear.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_256(a []babybear.Element, twiddles [][]babybear.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 256; offset += 2 { + for offset := 0; offset < 128; offset += 2 { babybear.Butterfly(&a[offset], &a[offset+1]) } - for offset := 0; offset < 256; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+6], 0, 2, 2) - } - for offset := 0; offset < 256; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+5], 0, 4, 4) - } - for offset := 0; offset < 256; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+4], 0, 8, 8) - } - for offset := 0; offset < 256; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+3], 0, 16, 16) - } - for offset := 0; offset < 256; offset += 64 { - innerDITWithTwiddles(a[offset:offset+64], twiddles[stage+2], 0, 32, 32) - } - for offset := 0; offset < 256; offset += 128 { - innerDITWithTwiddles(a[offset:offset+128], twiddles[stage+1], 0, 64, 64) - } - innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } -func kerDIFNP_64(a []babybear.Element, twiddles [][]babybear.Element, stage int) { +func kerDITNP_128generic(a []babybear.Element, twiddles [][]babybear.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; offset < 64; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - 
} - for offset := 0; offset < 64; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 2 { + for offset := 0; offset < 128; offset += 2 { babybear.Butterfly(&a[offset], &a[offset+1]) } -} - -func kerDITNP_64(a []babybear.Element, twiddles [][]babybear.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - babybear.Butterfly(&a[offset], &a[offset+1]) + for offset := 0; offset < 128; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+5], 0, 2, 2) } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) + for offset := 0; offset < 128; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+4], 0, 4, 4) } - for offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) + for offset := 0; offset < 128; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+3], 0, 8, 8) } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) + for offset := 0; offset < 128; offset += 32 { + innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+2], 0, 16, 16) } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) + for offset := 0; offset < 128; offset += 64 { + innerDITWithTwiddles(a[offset:offset+64], twiddles[stage+1], 0, 32, 32) } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) + innerDITWithTwiddles(a[:128], twiddles[stage+0], 0, 64, 64) } diff --git a/field/babybear/fft/fft_test.go b/field/babybear/fft/fft_test.go index 1e8144a4e0..daf36bfa85 100644 --- a/field/babybear/fft/fft_test.go +++ b/field/babybear/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func 
BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]babybear.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []babybear.Element, val babybear.Element) babybear.Element { var acc, res, tmp babybear.Element res.Set(&pol[0]) diff --git a/field/babybear/fft/kernel_amd64.go b/field/babybear/fft/kernel_amd64.go new file mode 100644 index 0000000000..6a8c668f33 --- /dev/null +++ b/field/babybear/fft/kernel_amd64.go @@ -0,0 +1,53 @@ +//go:build !purego + +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/field/babybear" + "golang.org/x/sys/cpu" +) + +var ( + supportAVX512 = cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 +) + +// q + r'.r = 1, i.e., qInvNeg = - q⁻¹ mod r +// used for Montgomery reduction +const qInvNeg = 2013265919 +const q = 2013265921 + +// index table used in avx512 shuffling +var vInterleaveIndices = []uint64{ + 2, 3, 8, 9, 6, 7, 12, 13, +} + +//go:noescape +func innerDIFWithTwiddles_avx512(a []babybear.Element, twiddles []babybear.Element, start, end, m int) + +func innerDIFWithTwiddles(a []babybear.Element, twiddles []babybear.Element, start, end, m int) { + if !supportAVX512 { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) + return + } + innerDIFWithTwiddles_avx512(a, twiddles, start, end, m) +} + +//go:noescape +func kerDIFNP_128_avx512(a []babybear.Element, twiddles [][]babybear.Element, stage int) + +func kerDIFNP_128(a []babybear.Element, twiddles [][]babybear.Element, stage int) { + if !supportAVX512 { + kerDIFNP_128generic(a, twiddles, stage) + return + } + kerDIFNP_128_avx512(a, twiddles, 
// innerDIFWithTwiddles_avx512(a []Element, twiddles []Element, start, end, m int)
//
// Vectorised form of the Go loop quoted below, 16 butterflies per iteration.
// On the vector path `start` is never read: the loop always begins at a[0] /
// twiddles[0] and runs end/16 times.
//
// Register roles on the vector path:
//   R15 = &a[i], DX = &a[i+m], CX = &twiddles[i], SI = remaining iterations
//   Z2  = q on dword lanes, Z8 = q on qword lanes, Z9 = qInvNeg on qword lanes
//   Z11 = 0x00000000FFFFFFFF in every qword (low-dword mask)
TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72

	// func innerDIFWithTwiddles(a []Element, twiddles []Element, start, end, m int) {
	// 	for i := start; i < end; i++ {
	// 		Butterfly(&a[i], &a[i+m])
	// 		a[i+m].Mul(&a[i+m], &twiddles[i])
	// 	}
	// }

	// prepare constants needed for mul and reduce ops
	MOVD         $const_q, AX
	VPBROADCASTD AX, Z2
	VPBROADCASTQ AX, Z8
	MOVD         $const_qInvNeg, AX
	VPBROADCASTQ AX, Z9

	// all-ones in Y0, zero-extended dword->qword: keeps only the low dword of each qword
	VPCMPEQB  Y0, Y0, Y0
	VPMOVZXDQ Y0, Z11

	// load arguments
	MOVQ a+0(FP), R15
	MOVQ twiddles+24(FP), CX
	MOVQ end+56(FP), SI
	MOVQ m+64(FP), BX
	CMPQ BX, $0x0000000000000010
	JL   smallerThan16_1 // m < 16
	SHRQ $4, SI // we are processing 16 elements at a time
	SHLQ $2, BX // offset = m * 4bytes
	MOVQ R15, DX
	ADDQ BX, DX // DX = &a[m]

	// performs a butterfly between 2 vectors of dwords
	// first vector is in [0, q) and second vector is in [0, 2q)
	// in0 <- min(in0+in1, in0+in1-q) (unsigned), in1 <- in0-in1+q; Z3 is scratch
#define BUTTERFLYD2Q(in0, in1) \
	VPADDD  in0, in1, Z3  \
	VPSUBD  in1, in0, in1 \
	VPSUBD  Z2, Z3, in0   \
	VPMINUD Z3, in0, in0  \
	VPADDD  Z2, in1, in1  \

	// Montgomery multiplication on qword lanes, result in in0:
	// Z12 = in0*in1 ; Z10 = ((Z12 mod 2^32) * qInvNeg) mod 2^32 ; Z12 = (Z12 + Z10*q) >> 32
	// in0 = min(Z12, Z12-q) (unsigned), i.e. one conditional subtraction of q
#define MUL(in0, in1) \
	VPMULUDQ in0, in1, Z12 \
	VPANDQ   Z11, Z12, Z10 \
	VPMULUDQ Z10, Z9, Z10  \
	VPANDQ   Z11, Z10, Z10 \
	VPMULUDQ Z10, Z8, Z10  \
	VPADDQ   Z12, Z10, Z12 \
	VPSRLQ   $32, Z12, Z12 \
	VPSUBQ   Z8, Z12, Z10  \
	VPMINUQ  Z12, Z10, in0 \

loop_3:
	TESTQ     SI, SI
	JEQ       done_2 // n == 0, we are done

	// NOTE(review): VMOVDQA32 is the aligned-move form; presumably this needs
	// &a[i] and &a[i+m] to be 64-byte aligned. Confirm callers guarantee it.
	VMOVDQA32 0(R15), Z0 // load a[i]
	VMOVDQA32 0(DX), Z1 // load a[i+m]
	BUTTERFLYD2Q(Z0, Z1)
	VMOVDQA32 Z0, 0(R15) // store a[i]

	// split the 16 dwords of a[i+m] into two halves of 8 and widen each to qwords
	VEXTRACTI32X8 $0, Z1, Y20
	VEXTRACTI32X8 $1, Z1, Y21
	VPMOVZXDQ     Y20, Z13
	VPMOVZXDQ     Y21, Z14

	// load 2x8 twiddles, widened to qwords
	VPMOVZXDQ 0(CX), Z15
	VPMOVZXDQ 32(CX), Z16
	MUL(Z13, Z15)
	MUL(Z14, Z16)

	// narrow the products back to dwords and store them as a[i+m]
	VPMOVQD Z13, 0(DX)
	VPMOVQD Z14, 32(DX)

	// advance the three pointers by 16 elements (64 bytes)
	ADDQ $64, R15
	ADDQ $64, DX
	ADDQ $64, CX
	DECQ SI // decrement n
	JMP  loop_3

done_2:
	RET

smallerThan16_1:
	// m < 16, we call the generic one
	// note that this should happen only when doing a FFT smaller than the smallest generated kernel
	// NOTE(review): the frame is declared as $0, yet the outgoing arguments are
	// stored at 0(SP)..64(SP); only the first word of each slice header is copied
	// (8(SP), 16(SP), 32(SP) and 40(SP) are not written). Confirm this call
	// sequence is valid before relying on this path.
	MOVQ a+0(FP), AX
	MOVQ AX, (SP)
	MOVQ twiddles+24(FP), AX
	MOVQ AX, 24(SP)
	MOVQ start+48(FP), AX
	MOVQ AX, 48(SP)
	MOVQ end+56(FP), AX
	MOVQ AX, 56(SP)
	MOVQ m+64(FP), AX
	MOVQ AX, 64(SP)
	CALL ·innerDIFWithTwiddlesGeneric(SB)
	RET
Z19 \ + VPMULUDQ Z19, Z18, Z19 \ + VPANDQ Z20, Z19, Z19 \ + VPMULUDQ Z19, Z17, Z19 \ + VPADDQ Z16, Z19, Z16 \ + VPSRLQ $32, Z16, Z16 \ + VPSUBQ Z17, Z16, Z19 \ + VPMINUQ Z16, Z19, in0 \ + + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Z23 + VPMOVZXDQ 32(BX), Z24 + VPMOVZXDQ 64(BX), Z25 + VPMOVZXDQ 96(BX), Z26 + VPMOVZXDQ 128(BX), Z27 + VPMOVZXDQ 160(BX), Z28 + VPMOVZXDQ 192(BX), Z29 + VPMOVZXDQ 224(BX), Z30 + BUTTERFLY(Z0, Z8) + MUL_0(Z8, Z23) + BUTTERFLY(Z1, Z9) + MUL_0(Z9, Z24) + BUTTERFLY(Z2, Z10) + MUL_0(Z10, Z25) + BUTTERFLY(Z3, Z11) + MUL_0(Z11, Z26) + BUTTERFLY(Z4, Z12) + MUL_0(Z12, Z27) + BUTTERFLY(Z5, Z13) + MUL_0(Z13, Z28) + BUTTERFLY(Z6, Z14) + MUL_0(Z14, Z29) + BUTTERFLY(Z7, Z15) + MUL_0(Z15, Z30) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Z23 + VPMOVZXDQ 32(BX), Z24 + VPMOVZXDQ 64(BX), Z25 + VPMOVZXDQ 96(BX), Z26 + BUTTERFLY(Z0, Z4) + MUL_0(Z4, Z23) + BUTTERFLY(Z1, Z5) + MUL_0(Z5, Z24) + BUTTERFLY(Z2, Z6) + MUL_0(Z6, Z25) + BUTTERFLY(Z3, Z7) + MUL_0(Z7, Z26) + BUTTERFLY(Z8, Z12) + MUL_0(Z12, Z23) + BUTTERFLY(Z9, Z13) + MUL_0(Z13, Z24) + BUTTERFLY(Z10, Z14) + MUL_0(Z14, Z25) + BUTTERFLY(Z11, Z15) + MUL_0(Z15, Z26) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Z23 + VPMOVZXDQ 32(BX), Z24 + BUTTERFLY(Z0, Z2) + MUL_0(Z2, Z23) + BUTTERFLY(Z1, Z3) + MUL_0(Z3, Z24) + BUTTERFLY(Z4, Z6) + MUL_0(Z6, Z23) + BUTTERFLY(Z5, Z7) + MUL_0(Z7, Z24) + BUTTERFLY(Z8, Z10) + MUL_0(Z10, Z23) + BUTTERFLY(Z9, Z11) + MUL_0(Z11, Z24) + BUTTERFLY(Z12, Z14) + MUL_0(Z14, Z23) + BUTTERFLY(Z13, Z15) + MUL_0(Z15, Z24) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Z23 + BUTTERFLY(Z0, Z1) + MUL_0(Z1, Z23) + BUTTERFLY(Z2, Z3) + MUL_0(Z3, Z23) + BUTTERFLY(Z4, Z5) + MUL_0(Z5, Z23) + BUTTERFLY(Z6, Z7) + MUL_0(Z7, Z23) + BUTTERFLY(Z8, Z9) + MUL_0(Z9, Z23) + BUTTERFLY(Z10, Z11) + MUL_0(Z11, Z23) + BUTTERFLY(Z12, Z13) + MUL_0(Z13, Z23) + BUTTERFLY(Z14, Z15) + MUL_0(Z15, Z23) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Y23 // zero extend 4x uint32 to 4x uint64 + VINSERTI64X4 $1, 
Y23, Z23, Z23 + MOVQ $0x0000000000000f0f, AX + KMOVQ AX, K1 + +#define PERMUTE4X4(in0, in1) \ + VSHUFI64X2 $0x000000000000004e, in1, in0, Z21 \ + VPBLENDMQ in0, Z21, K1, in0 \ + VPBLENDMQ Z21, in1, K1, in1 \ + + PERMUTE4X4(Z0, Z1) + BUTTERFLY(Z0, Z1) + MUL_0(Z1, Z23) + PERMUTE4X4(Z0, Z1) + PERMUTE4X4(Z2, Z3) + BUTTERFLY(Z2, Z3) + MUL_0(Z3, Z23) + PERMUTE4X4(Z2, Z3) + PERMUTE4X4(Z4, Z5) + BUTTERFLY(Z4, Z5) + MUL_0(Z5, Z23) + PERMUTE4X4(Z4, Z5) + PERMUTE4X4(Z6, Z7) + BUTTERFLY(Z6, Z7) + MUL_0(Z7, Z23) + PERMUTE4X4(Z6, Z7) + PERMUTE4X4(Z8, Z9) + BUTTERFLY(Z8, Z9) + MUL_0(Z9, Z23) + PERMUTE4X4(Z8, Z9) + PERMUTE4X4(Z10, Z11) + BUTTERFLY(Z10, Z11) + MUL_0(Z11, Z23) + PERMUTE4X4(Z10, Z11) + PERMUTE4X4(Z12, Z13) + BUTTERFLY(Z12, Z13) + MUL_0(Z13, Z23) + PERMUTE4X4(Z12, Z13) + PERMUTE4X4(Z14, Z15) + BUTTERFLY(Z14, Z15) + MUL_0(Z15, Z23) + PERMUTE4X4(Z14, Z15) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), X23 // zero extend 2x uint32 to 2x uint64 + VINSERTI64X2 $1, X23, Z23, Z23 + VINSERTI64X2 $0x0000000000000002, X23, Z23, Z23 + VINSERTI64X2 $0x0000000000000003, X23, Z23, Z23 + MOVQ $0x0000000000000033, AX + KMOVQ AX, K2 + MOVQ ·vInterleaveIndices+0(SB), DI + VMOVDQU64 0(DI), Z30 + +#define PERMUTE2X2(in0, in1) \ + VMOVDQA64 Z30, Z29 \ + VPERMI2Q in1, in0, Z29 \ + VPBLENDMQ in0, Z29, K2, in0 \ + VPBLENDMQ Z29, in1, K2, in1 \ + + PERMUTE2X2(Z0, Z1) + BUTTERFLY(Z0, Z1) + MUL_0(Z1, Z23) + PERMUTE2X2(Z0, Z1) + PERMUTE2X2(Z2, Z3) + BUTTERFLY(Z2, Z3) + MUL_0(Z3, Z23) + PERMUTE2X2(Z2, Z3) + PERMUTE2X2(Z4, Z5) + BUTTERFLY(Z4, Z5) + MUL_0(Z5, Z23) + PERMUTE2X2(Z4, Z5) + PERMUTE2X2(Z6, Z7) + BUTTERFLY(Z6, Z7) + MUL_0(Z7, Z23) + PERMUTE2X2(Z6, Z7) + PERMUTE2X2(Z8, Z9) + BUTTERFLY(Z8, Z9) + MUL_0(Z9, Z23) + PERMUTE2X2(Z8, Z9) + PERMUTE2X2(Z10, Z11) + BUTTERFLY(Z10, Z11) + MUL_0(Z11, Z23) + PERMUTE2X2(Z10, Z11) + PERMUTE2X2(Z12, Z13) + BUTTERFLY(Z12, Z13) + MUL_0(Z13, Z23) + PERMUTE2X2(Z12, Z13) + PERMUTE2X2(Z14, Z15) + BUTTERFLY(Z14, Z15) + MUL_0(Z15, Z23) + PERMUTE2X2(Z14, Z15) + 
MOVQ $0x0000000000005555, AX + KMOVD AX, K3 + +#define PERMUTE1X1(in0, in1) \ + VPSHRDQ $32, in1, in0, Z21 \ + VPBLENDMD in0, Z21, K3, in0 \ + VPBLENDMD Z21, in1, K3, in1 \ + + MOVD $const_q, AX + VPBROADCASTD AX, Z17 // rebroadcast q, but on dword lanes + +#define LASTBUTTERFLY(in0, in1) \ + VPADDD in0, in1, Z21 \ + VPSUBD in1, in0, in1 \ + VPSUBD Z17, Z21, in0 \ + VPMINUD Z21, in0, in0 \ + VPADDD Z17, in1, Z22 \ + VPMINUD Z22, in1, in1 \ + +#define PACK_DWORDS(in0, in1, in2, in3) \ + VPMOVQD in0, in1 \ + VPMOVQD in2, in3 \ + VINSERTI64X4 $1, in3, in0, in0 \ + + PACK_DWORDS(Z0, Y0, Z1, Y1) + PACK_DWORDS(Z2, Y2, Z3, Y3) + PERMUTE1X1(Z0, Z2) + LASTBUTTERFLY(Z0, Z2) + PERMUTE1X1(Z0, Z2) + PACK_DWORDS(Z4, Y4, Z5, Y5) + PACK_DWORDS(Z6, Y6, Z7, Y7) + PERMUTE1X1(Z4, Z6) + LASTBUTTERFLY(Z4, Z6) + PERMUTE1X1(Z4, Z6) + PACK_DWORDS(Z8, Y8, Z9, Y9) + PACK_DWORDS(Z10, Y10, Z11, Y11) + PERMUTE1X1(Z8, Z10) + LASTBUTTERFLY(Z8, Z10) + PERMUTE1X1(Z8, Z10) + PACK_DWORDS(Z12, Y12, Z13, Y13) + PACK_DWORDS(Z14, Y14, Z15, Y15) + PERMUTE1X1(Z12, Z14) + LASTBUTTERFLY(Z12, Z14) + PERMUTE1X1(Z12, Z14) + + // store a[:128] in memory + VMOVDQA32 Z0, 0(R15) + VMOVDQA32 Z2, 64(R15) + VMOVDQA32 Z4, 128(R15) + VMOVDQA32 Z6, 192(R15) + VMOVDQA32 Z8, 256(R15) + VMOVDQA32 Z10, 320(R15) + VMOVDQA32 Z12, 384(R15) + VMOVDQA32 Z14, 448(R15) + RET diff --git a/field/babybear/fft/kernel_purego.go b/field/babybear/fft/kernel_purego.go new file mode 100644 index 0000000000..755fe17413 --- /dev/null +++ b/field/babybear/fft/kernel_purego.go @@ -0,0 +1,23 @@ +//go:build purego || !amd64 + +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
// innerDIFWithTwiddles is the entry point used when this file is selected by
// its build constraint (purego, or any architecture other than amd64); it
// forwards unchanged to innerDIFWithTwiddlesGeneric.
func innerDIFWithTwiddles(a []babybear.Element, twiddles []babybear.Element, start, end, m int) {
	innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m)
}

// kerDIFNP_128 is the size-128 DIF kernel entry point for this build; it
// forwards unchanged to kerDIFNP_128generic.
func kerDIFNP_128(a []babybear.Element, twiddles [][]babybear.Element, stage int) {
	kerDIFNP_128generic(a, twiddles, stage)
}
// kerDITNP_128 is the size-128 DIT kernel entry point for this build; it
// forwards unchanged to kerDITNP_128generic.
func kerDITNP_128(a []babybear.Element, twiddles [][]babybear.Element, stage int) {
	kerDITNP_128generic(a, twiddles, stage)
}
+ er := babybear.Element{1} // mul by 1 --> mont reduce + polId := 0 + var k512 [512]babybear.Element + vk := babybear.Vector(k512[:]) + vRes := babybear.Vector(res) + vb := babybear.Vector(k512[256:]) + + cosets, err := r.Domain.CosetTable() + if err != nil { + return err + } + vCosets := babybear.Vector(cosets) + + for j := 0; j < len(v); j += 256 { + start := j + end := j + 256 + end = min(end, len(v)) - // inner hash - it := NewLimbIterator(&VectorIterator{v: v}, r.LogTwoBound/8) - for i := 0; i < len(r.Ag); i++ { - r.InnerHash(it, res, k, r.kz, i, mask) + // use half of vk to copy the v input to batch convert to regular form + copy(vb[:], v[start:end]) + for k := (end - start); k < 256; k++ { + vb[k][0] = 0 + } + // batch montgomery -> regular + vb.ScalarMul(vb, &er) + + // do the limb split + for k := 0; k < 256; k++ { + k512[k*2][0] = uint32(uint16(vb[k][0])) + k512[k*2+1][0] = uint32(uint16(vb[k][0] >> 16)) + } + + // inner hash + vk.Mul(vk, vCosets) + r.Domain.FFT(k512[:], fft.DIF, fft.WithNbTasks(1)) + vk.Mul(vk, babybear.Vector(r.Ag[polId])) + vRes.Add(vRes, vk) + polId++ + } + } else { + // inner hash + it := NewLimbIterator(&VectorIterator{v: v}, r.LogTwoBound/8) + for i := 0; i < len(r.Ag); i++ { + r.InnerHash(it, res, k, r.kz, i, mask) + } } // reduces mod Xᵈ+1 diff --git a/field/babybear/vector_amd64.go b/field/babybear/vector_amd64.go index b6d092f7e3..795a835f2a 100644 --- a/field/babybear/vector_amd64.go +++ b/field/babybear/vector_amd64.go @@ -25,6 +25,9 @@ func scalarMulVec(res, a, b *Element, n uint64) //go:noescape func innerProdVec(t *uint64, a, b *Element, n uint64) +//go:noescape +func butterflyMulVec(a, twiddles *Element, m int) + // Add adds two vectors element-wise and stores the result in self. // It panics if the vectors don't have the same length. 
func (vector *Vector) Add(a, b Vector) { diff --git a/field/generator/asm/amd64/build.go b/field/generator/asm/amd64/build.go index 3e383b2106..fec8f0a027 100644 --- a/field/generator/asm/amd64/build.go +++ b/field/generator/asm/amd64/build.go @@ -260,6 +260,28 @@ func GenerateCommonASM(w io.Writer, nbWords, nbBits int, hasVector bool) error { return nil } +func GenerateF31FFTKernels(w io.Writer, nbBits int, kernels []int) error { + if nbBits != 31 { + return fmt.Errorf("only 31 bits supported for now") + } + f := NewFFAmd64(w, 1) + + f.Comment("Code generated by gnark-crypto/generator. DO NOT EDIT.") + + f.WriteLn("#include \"textflag.h\"") + f.WriteLn("#include \"funcdata.h\"") + f.WriteLn("#include \"go_asm.h\"") + f.WriteLn("") + + f.generateFFTInnerDIFF31() + + for _, ksize := range kernels { + f.generateFFTKernelF31(ksize) + } + + return nil +} + func GenerateF31ASM(f *FFAmd64, hasVector bool) error { if !hasVector { return nil // nothing for now. diff --git a/field/generator/asm/amd64/element_vec_F31.go b/field/generator/asm/amd64/element_vec_F31.go index 1d13fd5d4d..09a84073d4 100644 --- a/field/generator/asm/amd64/element_vec_F31.go +++ b/field/generator/asm/amd64/element_vec_F31.go @@ -15,6 +15,8 @@ package amd64 import ( + "fmt" + "github.com/consensys/bavard/amd64" ) @@ -41,7 +43,7 @@ func (f *FFAmd64) generateAddVecF31() { // load q in Z3 f.WriteLn("MOVD $const_q, AX") - f.VPBROADCASTD("AX", q) + f.VPBROADCASTD(amd64.AX, q) loop := f.NewLabel("loop") done := f.NewLabel("done") @@ -107,7 +109,7 @@ func (f *FFAmd64) generateSubVecF31() { // load q in Z3 f.WriteLn("MOVD $const_q, AX") - f.VPBROADCASTD("AX", q) + f.VPBROADCASTD(amd64.AX, q) loop := f.NewLabel("loop") done := f.NewLabel("done") @@ -246,9 +248,9 @@ func (f *FFAmd64) generateMulVecF31() { // load q in Z3 f.WriteLn("MOVD $const_q, AX") - f.VPBROADCASTQ("AX", q) + f.VPBROADCASTQ(amd64.AX, q) f.WriteLn("MOVD $const_qInvNeg, AX") - f.VPBROADCASTQ("AX", qInvNeg) + f.VPBROADCASTQ(amd64.AX, qInvNeg) 
f.Comment("Create mask for low dword in each qword") f.VPCMPEQB("Y0", "Y0", "Y0") @@ -326,9 +328,9 @@ func (f *FFAmd64) generateScalarMulVecF31() { // load q in Z3 f.WriteLn("MOVD $const_q, AX") - f.VPBROADCASTQ("AX", q) + f.VPBROADCASTQ(amd64.AX, q) f.WriteLn("MOVD $const_qInvNeg, AX") - f.VPBROADCASTQ("AX", qInvNeg) + f.VPBROADCASTQ(amd64.AX, qInvNeg) f.Comment("Create mask for low dword in each qword") f.VPCMPEQB("Y0", "Y0", "Y0") @@ -415,9 +417,9 @@ func (f *FFAmd64) generateInnerProdVecF31() { done := f.NewLabel("done") f.WriteLn("MOVD $const_q, AX") - f.VPBROADCASTQ("AX", q) + f.VPBROADCASTQ(amd64.AX, q) f.WriteLn("MOVD $const_qInvNeg, AX") - f.VPBROADCASTQ("AX", qInvNeg) + f.VPBROADCASTQ(amd64.AX, qInvNeg) f.Comment("Create mask for low dword in each qword") f.VPCMPEQB("Y0", "Y0", "Y0") @@ -468,3 +470,490 @@ func (f *FFAmd64) generateInnerProdVecF31() { f.Push(®isters, addrA, addrT, len) } + +func (f *FFAmd64) generateFFTInnerDIFF31() { + + const argSize = 9 * 8 + stackSize := f.StackSize(f.NbWords*2+4, 1, 0) + registers := f.FnHeader("innerDIFWithTwiddles_avx512", stackSize, argSize, amd64.AX) + defer f.AssertCleanStack(stackSize, 0) + + f.WriteLn(` + // func innerDIFWithTwiddles(a []Element, twiddles []Element, start, end, m int) { + // for i := start; i < end; i++ { + // Butterfly(&a[i], &a[i+m]) + // a[i+m].Mul(&a[i+m], &twiddles[i]) + // } + // } +`) + + addrA := f.Pop(®isters) + addrAPlusM := f.Pop(®isters) + addrTwiddles := f.Pop(®isters) + m := f.Pop(®isters) + len := f.Pop(®isters) + + a := amd64.Register("Z0") + am := amd64.Register("Z1") + qd := amd64.Register("Z2") + b0 := amd64.Register("Z3") + q := amd64.Register("Z8") + qInvNeg := amd64.Register("Z9") + PL := amd64.Register("Z10") + LSW := amd64.Register("Z11") + P := amd64.Register("Z12") + m1 := amd64.Register("Z13") + m2 := amd64.Register("Z14") + t0 := amd64.Register("Z15") + t1 := amd64.Register("Z16") + y1 := amd64.Register("Y20") + y2 := amd64.Register("Y21") + + f.Comment("prepare 
constants needed for mul and reduce ops") + f.WriteLn("MOVD $const_q, AX") + f.VPBROADCASTD(amd64.AX, qd) + f.VPBROADCASTQ(amd64.AX, q) + f.WriteLn("MOVD $const_qInvNeg, AX") + f.VPBROADCASTQ(amd64.AX, qInvNeg) + f.VPCMPEQB("Y0", "Y0", "Y0") + f.VPMOVZXDQ("Y0", LSW) + + f.Comment("load arguments") + f.MOVQ("a+0(FP)", addrA) + f.MOVQ("twiddles+24(FP)", addrTwiddles) + f.MOVQ("end+56(FP)", len) + f.MOVQ("m+64(FP)", m) + + // we do only m >= 16; + // if m < 16, we call the generic one; this can be called when doing a FFT + // smaller than the smallest generated kernel + lblSmallerThan16 := f.NewLabel("smallerThan16") + f.CMPQ(m, 16) + f.JL(lblSmallerThan16, "m < 16") + + // we are processing elements 16x16 so we divide len by 16 + f.SHRQ("$4", len, "we are processing 16 elements at a time") + + // offset we want to add to a is m*4bytes + f.SHLQ("$2", m, "offset = m * 4bytes") + + f.MOVQ(addrA, addrAPlusM) + f.ADDQ(m, addrAPlusM) + + f.Comment("performs a butterfly between 2 vectors of dwords") + f.Comment("first vector is in [0, q) and second vector is in [0, 2q)") + butterflyD2Q := f.Define("butterflyD2Q", 2, func(args ...amd64.Register) { + x := args[0] + y := args[1] + f.VPADDD(x, y, b0) // b0 = x + y + f.VPSUBD(y, x, y) // y = x - y + f.VPSUBD(qd, b0, x) // x = (x+y) - q + f.VPMINUD(b0, x, x) // x %= q + f.VPADDD(qd, y, y) // y = (x-y) + q --> y in [0,2q) + }) + + mul := f.Define("mul", 2, func(args ...amd64.Register) { + x := args[0] + y := args[1] + + f.VPMULUDQ(x, y, P) + f.VPANDQ(LSW, P, PL) + f.VPMULUDQ(PL, qInvNeg, PL) + f.VPANDQ(LSW, PL, PL) + f.VPMULUDQ(PL, q, PL) + f.VPADDQ(P, PL, P) + f.VPSRLQ("$32", P, P) + f.VPSUBQ(q, P, PL) + f.VPMINUQ(P, PL, x) + }) + + lblDone := f.NewLabel("done") + lblLoop := f.NewLabel("loop") + + f.LABEL(lblLoop) + + f.TESTQ(len, len) + f.JEQ(lblDone, "n == 0, we are done") + + f.VMOVDQA32(addrA.At(0), a, "load a[i]") + f.VMOVDQA32(addrAPlusM.At(0), am, "load a[i+m]") + + butterflyD2Q(a, am) + + // a is ready to be stored, but 
we need to scale am by twiddles. + f.VMOVDQA32(a, addrA.At(0), "store a[i]") + + // we split am into m1 and m2; + // that is am contains 16 uint32 + // but we want that to be 2x8 uint64 + f.VEXTRACTI32X8(0, am, y1) + f.VEXTRACTI32X8(1, am, y2) + f.VPMOVZXDQ(y1, m1) + f.VPMOVZXDQ(y2, m2) + + // load twiddles + f.VPMOVZXDQ(addrTwiddles.At(0), t0) + f.VPMOVZXDQ(addrTwiddles.At(4), t1) + + mul(m1, t0) + mul(m2, t1) + + // store m1 and m2 + f.VPMOVQD(m1, addrAPlusM.At(0)) + f.VPMOVQD(m2, addrAPlusM.At(4)) + + f.ADDQ("$64", addrA) + f.ADDQ("$64", addrAPlusM) + f.ADDQ("$64", addrTwiddles) + f.DECQ(len, "decrement n") + f.JMP(lblLoop) + + f.LABEL(lblDone) + + f.RET() + + f.LABEL(lblSmallerThan16) + f.Comment("m < 16, we call the generic one") + f.Comment("note that this should happen only when doing a FFT smaller than the smallest generated kernel") + + // TODO @gbotrel should have dedicated tests + f.MOVQ("a+0(FP)", amd64.AX) + f.MOVQ(amd64.AX, "(SP)") + f.MOVQ("twiddles+24(FP)", amd64.AX) + f.MOVQ(amd64.AX, "24(SP)") // go vet says 24(SP) should be a_cap+16(FP) + f.MOVQ("start+48(FP)", amd64.AX) + f.MOVQ(amd64.AX, "48(SP)") // go vet says 48(SP) should be twiddles_cap+40(FP) + f.MOVQ("end+56(FP)", amd64.AX) + f.MOVQ(amd64.AX, "56(SP)") + f.MOVQ("m+64(FP)", amd64.AX) + f.MOVQ(amd64.AX, "64(SP)") + + f.WriteLn("CALL ·innerDIFWithTwiddlesGeneric(SB)") + f.RET() + +} + +func (f *FFAmd64) generateFFTKernelF31(klog2 int) { + if klog2 != 7 { + panic("not implemented") + } + // for now we generate kernels of size 1 << 7 (128) only + // as we can keep the input and twiddles in registers and avoid round trips with memory. + // perf note: we could generate a larger kernel, maybe up to 512 and process the "left" part of the FFT + // fully in registers. may not be clearly worth it since it would only save 3 calls to the assembly + // innerDIFWithTwiddles ; + the latency to write a to L1 cache. 
+ n := 1 << klog2 + f.Comment(fmt.Sprintf("kerDIFNP_%d_avx512(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int)", n)) + const argSize = 7 * 8 + stackSize := f.StackSize(f.NbWords*2+4, 1, 0) + registers := f.FnHeader(fmt.Sprintf("kerDIFNP_%d_avx512", n), stackSize, argSize, amd64.AX) + defer f.AssertCleanStack(stackSize, 0) + + // registers & labels we need + addrA := f.Pop(®isters) + addrAPlusM := f.Pop(®isters) + addrTwiddlesRoot := f.Pop(®isters) + addrTwiddles := f.Pop(®isters) + innerLen := f.Pop(®isters) + addrVInterleaveIndices := f.Pop(®isters) + + // AVX512 registers + // Z0-Z15 taken by a + a := make([]amd64.Register, 16) + for i := range a { + a[i] = amd64.Register(fmt.Sprintf("Z%d", i)) + } + + P := amd64.Register("Z16") + q := amd64.Register("Z17") + qInvNeg := amd64.Register("Z18") + PL := amd64.Register("Z19") + LSW := amd64.Register("Z20") + b0 := amd64.Register("Z21") + b1 := amd64.Register("Z22") + _ = b1 + + // t takes Z23 -> Z31 + t := make([]amd64.Register, 8) + for i := range t { + t[i] = amd64.Register(fmt.Sprintf("Z%d", 23+i)) + } + tx0 := amd64.Register("X23") + ty0 := amd64.Register("Y23") + + // load q and qInvNeg + f.Comment("prepare constants needed for mul and reduce ops") + f.WriteLn("MOVD $const_q, AX") + f.VPBROADCASTQ(amd64.AX, q) + f.WriteLn("MOVD $const_qInvNeg, AX") + f.VPBROADCASTQ(amd64.AX, qInvNeg) + f.VPCMPEQB("Y0", "Y0", "Y0") + f.VPMOVZXDQ("Y0", LSW) + + f.Comment("load arguments") + f.MOVQ("a+0(FP)", addrA) + f.MOVQ("twiddles+24(FP)", addrTwiddlesRoot) + f.MOVQ("stage+48(FP)", amd64.AX) + f.IMULQ("$24", amd64.AX) + f.ADDQ(amd64.AX, addrTwiddlesRoot, "we want twiddles[stage] as starting point") + + f.Comment("load a[:128] in registers") + for i := range a { + // addr.At gives i*8 (word size); + // we want to advance by 32bytes to have 8 uint32 element loaded at a time. 
+ f.VPMOVZXDQ(addrA.At(i*4), a[i]) + } + + // step 0 + // innerDIFWithTwiddles(a[:128], twiddles[stage+0], 0, 64, 64) + f.Comment("butterfly computes") + f.Comment("in0 = in0 + in1 (in [0,q))") + f.Comment("in1 = in0 - in1 (in [0,2q))") + butterfly := f.Define("butterfly", 2, func(args ...amd64.Register) { + x := args[0] + y := args[1] + + f.VPADDQ(x, y, b0) // b0 = x + y + f.VPSUBQ(y, x, y) // y = x - y + f.VPSUBQ(q, b0, x) // x = (x+y) - q + f.VPMINUQ(b0, x, x) // x %= q + f.VPADDQ(q, y, y) // y = (x-y) + q --> y in [0,2q) + }) + + f.Comment("mul computes x = x * y") + mul := f.Define("mul", 2, func(args ...amd64.Register) { + x := args[0] + y := args[1] + + f.VPMULUDQ(x, y, P) + f.VPANDQ(LSW, P, PL) + f.VPMULUDQ(PL, qInvNeg, PL) + f.VPANDQ(LSW, PL, PL) + f.VPMULUDQ(PL, q, PL) + f.VPADDQ(P, PL, P) + f.VPSRLQ("$32", P, P) + f.VPSUBQ(q, P, PL) + f.VPMINUQ(P, PL, x) + }) + + m := n >> 1 + + // perf note: we could handle the case m == 16 a bit differently + // (see innerDIFWithTwiddles) + // and likely save some cycles; keeping as is for now for simplicity. + for m >= 8 { + + f.MOVQ(addrTwiddlesRoot.At(0), addrTwiddles) + nbTwiddles := m + for i := 0; i < nbTwiddles/8; i++ { + f.VPMOVZXDQ(addrTwiddles.At(i*4), t[i]) + } + + am := m / 8 + for offset := 0; offset < 128; offset += n { + aa := a[offset/8:] + for i := 0; i < am; i++ { + butterfly(aa[i], aa[i+am]) + mul(aa[i+am], t[i]) + } + } + + n >>= 1 + m = n >> 1 + + // increment addrTwiddlesRoot + f.ADDQ("$24", addrTwiddlesRoot) + } + + // here we should have m == 2 + if m != 4 { + panic("unexpected m value") + } + + // for m == 4, we are going to permute some lanes; + // we have for example + // Z1 = A A A A B B B B + // Z2 = C C C C D D D D + // we want + // Z1 = A A A A C C C C + // Z2 = B B B B D D D D + // and then we can do our butterfly ops, + // our scaling by twiddles + // and permute back. 
+ // + // similarly, we need to "pack" 2x4 twiddles + // into a single Z register + f.MOVQ(addrTwiddlesRoot.At(0), addrTwiddles) + f.VPMOVZXDQ(addrTwiddles.At(0), ty0, "zero extend 4x uint32 to 4x uint64") + f.VINSERTI64X4(1, ty0, t[0], t[0]) + + const kBlendEven4 = 0x0f0f + f.MOVQ(uint64(kBlendEven4), amd64.AX) + f.KMOVQ(amd64.AX, "K1") + + // we have for example + // Z1 = A A A A B B B B + // Z2 = C C C C D D D D + // we want + // Z1 = A A A A C C C C + // Z2 = B B B B D D D D + permute4x4 := f.Define("permute4x4", 2, func(args ...amd64.Register) { + x := args[0] + y := args[1] + f.VSHUFI64X2(uint64(0b01_00_11_10), y, x, b0) + f.VPBLENDMQ(x, b0, x, "K1") + f.VPBLENDMQ(b0, y, y, "K1") + }) + + // for offset := 0; offset < 128; offset += 8 { + // innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+4], 0, 4, 4) + // } + // now we process the a[i] 2 by 2 and permute before / after the ops. + for offset := 0; offset < 128; offset += n * 2 { + // note that we advance by 2*n, that is 16 uint32 + // that is 2 ZMM vectors + x := a[offset/8] + y := a[(offset/8)+1] + + // first we need to permute 4 last of x with 4 first of y + permute4x4(x, y) + butterfly(x, y) + mul(y, t[0]) + + // invert back + permute4x4(x, y) + } + + n >>= 1 + + // now m == 2 our permutation may cost a bit more but let's see. + + // increment addrTwiddlesRoot + f.ADDQ("$24", addrTwiddlesRoot) + // we could probably extract the twiddles from t[0] with a stride. 
+ // we know we want t[0] = 1 t2 1 t2 1 t2 1 t2 + // and we have from m == 4 + // t[0] = 1 t1 t2 t3 1 t1 t2 t3 + f.MOVQ(addrTwiddlesRoot.At(0), addrTwiddles) + f.VPMOVZXDQ(addrTwiddles.At(0), tx0, "zero extend 2x uint32 to 2x uint64") + f.VINSERTI64X2(1, tx0, t[0], t[0]) + f.VINSERTI64X2(2, tx0, t[0], t[0]) + f.VINSERTI64X2(3, tx0, t[0], t[0]) + + // we have for example + // Z1 = A A B B C C D D + // Z2 = L L M M N N O O + // we want + // Z1 = A A L L C C N N + // Z2 = B B M M D D O O + + const kBlendEven = 0b00110011 + f.MOVQ(uint64(kBlendEven), amd64.AX) + f.KMOVQ(amd64.AX, "K2") + + vInterleaveIndices := t[7] + f.MOVQ("·vInterleaveIndices+0(SB)", addrVInterleaveIndices) + f.VMOVDQU64(addrVInterleaveIndices.At(0), vInterleaveIndices) + + permute2x2 := f.Define("permute2x2", 2, func(args ...amd64.Register) { + x := args[0] + y := args[1] + + tmp := t[6] + f.VMOVDQA64(vInterleaveIndices, tmp) + f.VPERMI2Q(y, x, tmp) + f.VPBLENDMQ(x, tmp, x, "K2") + f.VPBLENDMQ(tmp, y, y, "K2") + }) + + // for offset := 0; offset < 128; offset += 4 { + // innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+5], 0, 2, 2) + // } + for offset := 0; offset < 128; offset += n * 4 { + // note that we advance by 4*n, that is 16 uint32 + // that is 2 ZMM vectors + x := a[offset/8] + y := a[(offset/8)+1] + + // first we need to permute 4 last of x with 4 first of y + permute2x2(x, y) + butterfly(x, y) + mul(y, t[0]) + + // invert back + permute2x2(x, y) + } + + const kBlendEven2 = 0b0101010101010101 + + f.MOVQ(uint64(kBlendEven2), amd64.AX) + f.KMOVD(amd64.AX, "K3") + + permute1x1 := f.Define("permute1x1", 2, func(args ...amd64.Register) { + x := args[0] + y := args[1] + + f.VPSHRDQ("$32", y, x, b0) + f.VPBLENDMD(x, b0, x, "K3") + f.VPBLENDMD(b0, y, y, "K3") + }) + + f.WriteLn("MOVD $const_q, AX") + f.VPBROADCASTD(amd64.AX, q, "rebroadcast q, but on dword lanes") + + // same as butterfly, but we reduce mod q the 2 results + // also uses dword lane instructions + lastButterfly := 
f.Define("lastButterfly", 2, func(args ...amd64.Register) { + x := args[0] + y := args[1] + f.VPADDD(x, y, b0) // b0 = x + y + f.VPSUBD(y, x, y) // y = x - y + f.VPSUBD(q, b0, x) // x = (x+y) - q + f.VPMINUD(b0, x, x) // x %= q + f.VPADDD(q, y, b1) // b1 = (x-y) + q --> b1 in [0,2q) + f.VPMINUD(b1, y, y) // y %= q + }) + + packDWORDS := f.Define("PACK_DWORDS", 4, func(args ...amd64.Register) { + x := args[0] + xx := args[1] + y := args[2] + xy := args[3] + + f.VPMOVQD(x, xx) + f.VPMOVQD(y, xy) + f.VINSERTI64X4(1, xy, x, x) + }) + + // now m == 1, last step is only butterflies like so + // for offset := 0; offset < 128; offset += 2 { + // koalabear.Butterfly(&a[offset], &a[offset+1]) + // } + // our a vectors are on QWORDS lanes, we can pack them into DWORD lanes to reduce nb ops + for i := 0; i < len(a); i += 4 { + u, v, w, x := a[i], a[i+1], a[i+2], a[i+3] + packDWORDS(u, zToy(u), v, zToy(v)) + packDWORDS(w, zToy(w), x, zToy(x)) + + permute1x1(u, w) + lastButterfly(u, w) + permute1x1(u, w) + } + + // end we store back a + f.Comment("store a[:128] in memory") + for i := 0; i < len(a); i += 2 { + f.VMOVDQA32(a[i], addrA.At(i*4)) + } + + f.RET() + + f.Push(®isters, addrA, addrTwiddles, addrAPlusM, innerLen) + +} + +func zToy(r amd64.Register) amd64.Register { + v := string(r) // v will be Z1, Z2, ... etc, we want to return Y1, Y2, ... 
+ vr := "Y" + v[1:] + return amd64.Register(vr) +} diff --git a/field/generator/generator_fft.go b/field/generator/generator_fft.go index 4a6ffa26a7..47a72b3c82 100644 --- a/field/generator/generator_fft.go +++ b/field/generator/generator_fft.go @@ -9,6 +9,7 @@ import ( "strings" "github.com/consensys/bavard" + "github.com/consensys/gnark-crypto/field/generator/asm/amd64" "github.com/consensys/gnark-crypto/field/generator/config" eccconfig "github.com/consensys/gnark-crypto/internal/generator/config" ) @@ -33,10 +34,18 @@ func generateFFT(F *config.Field, fft *config.FFT, outputDir string) error { FFT: *fft, FieldPackagePath: fieldImportPath, FF: F.PackageName, + HasASMKernel: F.F31, + Kernels: []int{5, 8}, Package: "fft", } outputDir = filepath.Join(outputDir, "fft") + pureGoBuildTag := "" + if data.HasASMKernel { + pureGoBuildTag = "purego || (!amd64)" + data.Kernels = []int{7} + } + entries := []bavard.Entry{ {File: filepath.Join(outputDir, "doc.go"), Templates: []string{"doc.go.tmpl"}}, {File: filepath.Join(outputDir, "domain_test.go"), Templates: []string{"tests/domain.go.tmpl"}}, @@ -44,10 +53,35 @@ func generateFFT(F *config.Field, fft *config.FFT, outputDir string) error { {File: filepath.Join(outputDir, "fft_test.go"), Templates: []string{"tests/fft.go.tmpl"}}, {File: filepath.Join(outputDir, "bitreverse_test.go"), Templates: []string{"tests/bitreverse.go.tmpl"}}, {File: filepath.Join(outputDir, "fft.go"), Templates: []string{"fft.go.tmpl"}}, + {File: filepath.Join(outputDir, "kernel_purego.go"), Templates: []string{"kernel.purego.go.tmpl"}, BuildTag: pureGoBuildTag}, {File: filepath.Join(outputDir, "bitreverse.go"), Templates: []string{"bitreverse.go.tmpl"}}, {File: filepath.Join(outputDir, "options.go"), Templates: []string{"options.go.tmpl"}}, } + if data.HasASMKernel { + data.Q = F.Q[0] + data.QInvNeg = F.QInverse[0] + entries = append(entries, + bavard.Entry{ + File: filepath.Join(outputDir, "kernel_amd64.go"), + Templates: 
[]string{"kernel.amd64.go.tmpl"}, + BuildTag: "!purego"}) + + // generate the assembly file; + fftKernels, err := os.Create(filepath.Join(outputDir, "kernel_amd64.s")) + if err != nil { + return err + } + + fftKernels.WriteString("//go:build !purego\n") + + if err := amd64.GenerateF31FFTKernels(fftKernels, F.NbBits, data.Kernels); err != nil { + fftKernels.Close() + return err + } + fftKernels.Close() + } + funcs := make(map[string]interface{}) funcs["bitReverse"] = bitReverse funcs["reverseBits"] = func(x, n any) uint64 { @@ -70,7 +104,7 @@ func generateFFT(F *config.Field, fft *config.FFT, outputDir string) error { } fftTemplatesRootDir = filepath.Join(fftTemplatesRootDir, "fft") - if err := bgen.GenerateWithOptions(data, data.Package, fftTemplatesRootDir, bavardOpts, entries...); err != nil { + if err := bgen.GenerateWithOptions(data, "fft", fftTemplatesRootDir, bavardOpts, entries...); err != nil { return err } @@ -91,14 +125,12 @@ func generateFFT(F *config.Field, fft *config.FFT, outputDir string) error { type fftTemplateData struct { config.FFT - // Package name of the generated package - Package string - - // ImportPathFiniteField path to the finite field package - FieldPackagePath string - - // FF the name of the package corresponding to the finite field - FF string + FieldPackagePath string // path to the finite field package + FF string // name of the package corresponding to the finite field + HasASMKernel bool // indicates if the kernels have an assembly impl + Kernels []int // indicates which kernels to generate + Package string // package name + Q, QInvNeg uint64 } func findTemplatesRootDir() (string, error) { diff --git a/field/generator/generator_sis.go b/field/generator/generator_sis.go index 78d320040d..9bb27b2c5c 100644 --- a/field/generator/generator_sis.go +++ b/field/generator/generator_sis.go @@ -32,12 +32,14 @@ func generateSIS(F *config.Field, outputDir string) error { FF string FieldPackagePath string HasUnrolledFFT bool + F31 bool } data 
:= &sisTemplateData{ FF: F.PackageName, FieldPackagePath: fieldImportPath, HasUnrolledFFT: F.NbBytes == 32, + F31: F.F31, } funcs := make(map[string]interface{}) diff --git a/field/generator/internal/addchain/addchain.go b/field/generator/internal/addchain/addchain.go index 9c2f9c635e..9d11b1866c 100644 --- a/field/generator/internal/addchain/addchain.go +++ b/field/generator/internal/addchain/addchain.go @@ -64,7 +64,6 @@ var ( // GetAddChain returns template data of a short addition chain for given big.Int func GetAddChain(n *big.Int) *AddChainData { - // init the cache only once. once.Do(initCache) diff --git a/field/generator/internal/templates/element/asm.go b/field/generator/internal/templates/element/asm.go index 53cadbc0a8..87e54f10d3 100644 --- a/field/generator/internal/templates/element/asm.go +++ b/field/generator/internal/templates/element/asm.go @@ -14,7 +14,7 @@ const Avx = ` import "golang.org/x/sys/cpu" var ( - supportAvx512 = {{- if not .F31 }}supportAdx && {{- end}}cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = {{- if not .F31 }}supportAdx && {{- end}}cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) ` diff --git a/field/generator/internal/templates/element/vector_ops_asm.go b/field/generator/internal/templates/element/vector_ops_asm.go index 2a1d0bf8e6..1c5999148e 100644 --- a/field/generator/internal/templates/element/vector_ops_asm.go +++ b/field/generator/internal/templates/element/vector_ops_asm.go @@ -142,6 +142,9 @@ var ( //go:noescape func mulVec(res, a, b *{{.ElementName}}, n uint64, qInvNeg uint64) + + + ` const VectorOpsArm64 = VectorOpsPureGo @@ -265,6 +268,9 @@ func scalarMulVec(res, a, b *{{.ElementName}}, n uint64) //go:noescape func innerProdVec(t *uint64, a, b *{{.ElementName}}, n uint64) +//go:noescape +func butterflyMulVec(a, twiddles *{{.ElementName}}, m int) + // Add adds two vectors element-wise and stores the result in self. 
// It panics if the vectors don't have the same length. func (vector *Vector) Add(a, b Vector) { @@ -428,7 +434,4 @@ func (vector *Vector) Mul(a, b Vector) { mulVecGeneric((*vector)[start:], a[start:], b[start:]) } } - - - ` diff --git a/field/generator/internal/templates/element/vector_ops_purego.go b/field/generator/internal/templates/element/vector_ops_purego.go index 071b710587..15827e772b 100644 --- a/field/generator/internal/templates/element/vector_ops_purego.go +++ b/field/generator/internal/templates/element/vector_ops_purego.go @@ -37,4 +37,5 @@ func (vector *Vector) InnerProduct(other Vector) (res {{.ElementName}}) { func (vector *Vector) Mul(a, b Vector) { mulVecGeneric(*vector, a, b) } + ` diff --git a/field/generator/internal/templates/fft/fft.go.tmpl b/field/generator/internal/templates/fft/fft.go.tmpl index c239e9701b..c49a8c01f2 100644 --- a/field/generator/internal/templates/fft/fft.go.tmpl +++ b/field/generator/internal/templates/fft/fft.go.tmpl @@ -7,13 +7,6 @@ import ( "{{ .FieldPackagePath }}" ) -{{- /* these params set the size of the kernel we generate & unroll */}} -{{ $sizeKernelLog2 := 8}} -{{ $sizeKernel := shl 1 $sizeKernelLog2}} - -{{ $sizeKernel2Log2 := 6}} -{{ $sizeKernel2 := shl 1 $sizeKernel2Log2}} - // Decimation is used in the FFT call to select decimation in time or in frequency type Decimation uint8 @@ -204,14 +197,13 @@ func difFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E if n == 1 { return } else if stage >= twiddlesStartStage { - if n == {{$sizeKernel}} { - kerDIFNP_{{$sizeKernel}}(a, twiddles, stage-twiddlesStartStage) - return - } else if n == {{$sizeKernel2}} { - kerDIFNP_{{$sizeKernel2}}(a, twiddles, stage-twiddlesStartStage) - return - } - + {{- range $ki, $klog2 := $.Kernels}} + {{- if ne $ki 0}} else {{- end}} if n == 1 << {{$klog2}} { + {{- $ksize := shl 1 $klog2}} + kerDIFNP_{{$ksize}}(a, twiddles, stage-twiddlesStartStage) + return + } + {{- end }} } m := n >> 1 @@ -235,13 +227,17 @@ 
func difFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E // compute next twiddle w.Square(&w) } else { - if parallelButterfly { - parallel.Execute(m, func(start, end int) { - innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], start, end, m) - }, nbTasks / (1 << (stage))) - } else { + {{- if .HasASMKernel}} innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) - } + {{- else}} + if parallelButterfly { + parallel.Execute(m, func(start, end int) { + innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], start, end, m) + }, nbTasks / (1 << (stage))) + } else { + innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) + } + {{- end}} } if m == 1 { @@ -261,7 +257,7 @@ func difFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E } -func innerDIFWithTwiddles(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) { if start == 0 { {{ .FF }}.Butterfly(&a[0], &a[m]) start++ @@ -269,7 +265,6 @@ func innerDIFWithTwiddles(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, s for i := start; i < end; i++ { {{ .FF }}.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := {{ .FF }}.Vector(a[start+m:end+m]) v2 := {{ .FF }}.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -296,14 +291,13 @@ func ditFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E if n == 1 { return } else if stage >= twiddlesStartStage { - if n == {{$sizeKernel2}} { - kerDITNP_{{$sizeKernel2}}(a, twiddles, stage-twiddlesStartStage) - return - } else if n == {{$sizeKernel}} { - kerDITNP_{{$sizeKernel}}(a, twiddles, stage-twiddlesStartStage) - return - } - + {{- range $ki, $klog2 := $.Kernels}} + {{- if ne $ki 0}} else {{- end}} if n == 1 << {{$klog2}} { + {{- $ksize := shl 1 $klog2}} + kerDITNP_{{$ksize}}(a, 
twiddles, stage-twiddlesStartStage) + return + } + {{- end }} } m := n >> 1 @@ -379,13 +373,16 @@ func innerDITWithoutTwiddles(a []{{ .FF }}.Element, at, w {{ .FF }}.Element, sta } } -{{genKernel $.FF $sizeKernel $sizeKernelLog2}} -{{genKernel $.FF $sizeKernel2 $sizeKernel2Log2}} +{{range $ki, $klog2 := $.Kernels}} + {{$ksize := shl 1 $klog2}} + {{genKernel $.FF $ksize $klog2}} +{{end}} {{define "genKernel FF sizeKernel sizeKernelLog2"}} -func kerDIFNP_{{.sizeKernel}}(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int) { +func kerDIFNP_{{.sizeKernel}}generic(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + {{ $n := shl 1 .sizeKernelLog2}} {{ $m := div $n 2}} {{ $split := 1}} @@ -411,8 +408,7 @@ func kerDIFNP_{{.sizeKernel}}(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Elem {{- end}} } - -func kerDITNP_{{.sizeKernel}}(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int) { +func kerDITNP_{{.sizeKernel}}generic(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl {{ $n := 2}} {{ $m := div $n 2}} diff --git a/field/generator/internal/templates/fft/kernel.amd64.go.tmpl b/field/generator/internal/templates/fft/kernel.amd64.go.tmpl new file mode 100644 index 0000000000..0f0696319a --- /dev/null +++ b/field/generator/internal/templates/fft/kernel.amd64.go.tmpl @@ -0,0 +1,50 @@ +import ( + "golang.org/x/sys/cpu" + "{{ .FieldPackagePath }}" +) + + +var ( + supportAVX512 = cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 +) + +// q + r'.r = 1, i.e., qInvNeg = - q⁻¹ mod r +// used for Montgomery reduction +const qInvNeg = {{.QInvNeg}} +const q = {{.Q}} + +// index table used in avx512 shuffling +var vInterleaveIndices = []uint64 { + 2, 3, 8, 9, 6, 7, 12, 13, +} + +//go:noescape +func innerDIFWithTwiddles_avx512(a []{{ .FF }}.Element, twiddles []{{ 
.FF }}.Element, start, end, m int) + +func innerDIFWithTwiddles(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) { + if !supportAVX512 { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) + return + } + innerDIFWithTwiddles_avx512(a, twiddles, start, end, m) +} + +{{range $ki, $klog2 := $.Kernels}} + {{- $ksize := shl 1 $klog2}} + +//go:noescape +func kerDIFNP_{{$ksize}}_avx512(a []{{ $.FF }}.Element, twiddles [][]{{ $.FF }}.Element, stage int) + +func kerDIFNP_{{$ksize}}(a []{{ $.FF }}.Element, twiddles [][]{{ $.FF }}.Element, stage int) { + if !supportAVX512 { + kerDIFNP_{{$ksize}}generic(a, twiddles, stage) + return + } + kerDIFNP_{{$ksize}}_avx512(a, twiddles, stage) +} + +func kerDITNP_{{$ksize}}(a []{{ $.FF }}.Element, twiddles [][]{{ $.FF }}.Element, stage int) { + kerDITNP_{{$ksize}}generic(a, twiddles, stage) +} +{{end}} + diff --git a/field/generator/internal/templates/fft/kernel.purego.go.tmpl b/field/generator/internal/templates/fft/kernel.purego.go.tmpl new file mode 100644 index 0000000000..f4caaac6f1 --- /dev/null +++ b/field/generator/internal/templates/fft/kernel.purego.go.tmpl @@ -0,0 +1,18 @@ +import ( + "{{ .FieldPackagePath }}" +) + + +func innerDIFWithTwiddles(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +{{range $ki, $klog2 := $.Kernels}} + {{- $ksize := shl 1 $klog2}} +func kerDIFNP_{{$ksize}}(a []{{ $.FF }}.Element, twiddles [][]{{ $.FF }}.Element, stage int) { + kerDIFNP_{{$ksize}}generic(a, twiddles, stage) +} +func kerDITNP_{{$ksize}}(a []{{ $.FF }}.Element, twiddles [][]{{ $.FF }}.Element, stage int) { + kerDITNP_{{$ksize}}generic(a, twiddles, stage) +} +{{end}} \ No newline at end of file diff --git a/field/generator/internal/templates/fft/tests/fft.go.tmpl b/field/generator/internal/templates/fft/tests/fft.go.tmpl index c926779cde..6275d2f069 100644 --- a/field/generator/internal/templates/fft/tests/fft.go.tmpl +++ 
b/field/generator/internal/templates/fft/tests/fft.go.tmpl @@ -303,6 +303,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]{{ .FF }}.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []{{ .FF }}.Element, val {{ .FF }}.Element) {{ .FF }}.Element { var acc, res, tmp {{ .FF }}.Element diff --git a/field/generator/internal/templates/sis/sis.go.tmpl b/field/generator/internal/templates/sis/sis.go.tmpl index 53271d7214..90d552a810 100644 --- a/field/generator/internal/templates/sis/sis.go.tmpl +++ b/field/generator/internal/templates/sis/sis.go.tmpl @@ -169,11 +169,65 @@ func (r *RSis) Hash(v, res []{{ .FF }}.Element) error { } {{- end}} - // inner hash - it := NewLimbIterator(&VectorIterator{v: v}, r.LogTwoBound/8) - for i := 0; i < len(r.Ag); i++ { - r.InnerHash(it, res, k, r.kz, i, mask) - } + {{- if .F31}} + if r.Degree == 512 && r.LogTwoBound == 16 { + // this is our hot path, we don't use the iterator because with + // avx512 instructions, it actually ends up being most of the CPU time. 
+ er := {{ .FF }}.Element{1} // mul by 1 --> mont reduce + polId := 0 + var k512 [512]{{ .FF }}.Element + vk := {{ .FF }}.Vector(k512[:]) + vRes := {{ .FF }}.Vector(res) + vb := {{ .FF }}.Vector(k512[256:]) + + cosets, err := r.Domain.CosetTable() + if err != nil { + return err + } + vCosets := {{ .FF }}.Vector(cosets) + + for j := 0; j < len(v); j+=256 { + start := j + end := j + 256 + end = min(end, len(v)) + + // use half of vk to copy the v input to batch convert to regular form + copy(vb[:], v[start:end]) + for k:= (end-start); k < 256; k++ { + vb[k][0] = 0 + } + // batch montgomery -> regular + vb.ScalarMul(vb, &er) + + // do the limb split + for k := 0; k < 256; k++ { + k512[k*2][0] = uint32(uint16(vb[k][0])) + k512[k*2+1][0] = uint32(uint16(vb[k][0] >> 16)) + } + + // inner hash + vk.Mul(vk, vCosets) + r.Domain.FFT(k512[:], fft.DIF, fft.WithNbTasks(1)) + vk.Mul(vk, {{.FF}}.Vector(r.Ag[polId])) + vRes.Add(vRes, vk) + polId++ + } + } else { + // inner hash + it := NewLimbIterator(&VectorIterator{v: v}, r.LogTwoBound/8) + for i := 0; i < len(r.Ag); i++ { + r.InnerHash(it, res, k, r.kz, i, mask) + } + } + {{- else}} + // inner hash + it := NewLimbIterator(&VectorIterator{v: v}, r.LogTwoBound/8) + for i := 0; i < len(r.Ag); i++ { + r.InnerHash(it, res, k, r.kz, i, mask) + } + {{- end}} + + // reduces mod Xᵈ+1 r.Domain.FFTInverse(res, fft.DIT, fft.OnCoset(), fft.WithNbTasks(1)) diff --git a/field/goldilocks/fft/fft.go b/field/goldilocks/fft/fft.go index 140d40be8b..ed3535a7cf 100644 --- a/field/goldilocks/fft/fft.go +++ b/field/goldilocks/fft/fft.go @@ -201,14 +201,13 @@ func difFFT(a []goldilocks.Element, w goldilocks.Element, twiddles [][]goldilock if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + } else if n == 1<<8 { + 
kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -258,7 +257,7 @@ func difFFT(a []goldilocks.Element, w goldilocks.Element, twiddles [][]goldilock } -func innerDIFWithTwiddles(a []goldilocks.Element, twiddles []goldilocks.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []goldilocks.Element, twiddles []goldilocks.Element, start, end, m int) { if start == 0 { goldilocks.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +265,6 @@ func innerDIFWithTwiddles(a []goldilocks.Element, twiddles []goldilocks.Element, for i := start; i < end; i++ { goldilocks.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := goldilocks.Vector(a[start+m : end+m]) v2 := goldilocks.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +290,13 @@ func ditFFT(a []goldilocks.Element, w goldilocks.Element, twiddles [][]goldilock if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<5 { + kerDITNP_32(a, twiddles, stage-twiddlesStartStage) return - } else if n == 256 { + } else if n == 1<<8 { kerDITNP_256(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,7 +371,43 @@ func innerDITWithoutTwiddles(a []goldilocks.Element, at, w goldilocks.Element, s } } -func kerDIFNP_256(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { +func kerDIFNP_32generic(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) + for offset := 0; offset < 32; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + for offset := 0; offset < 32; offset += 8 { + innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 4 { + 
innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 2 { + goldilocks.Butterfly(&a[offset], &a[offset+1]) + } +} + +func kerDITNP_32generic(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { + // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl + + for offset := 0; offset < 32; offset += 2 { + goldilocks.Butterfly(&a[offset], &a[offset+1]) + } + for offset := 0; offset < 32; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) + } + for offset := 0; offset < 32; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) + } + for offset := 0; offset < 32; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) + } + innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) +} + +func kerDIFNP_256generic(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) @@ -401,7 +434,7 @@ func kerDIFNP_256(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage } } -func kerDITNP_256(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { +func kerDITNP_256generic(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl for offset := 0; offset < 256; offset += 2 { @@ -427,45 +460,3 @@ func kerDITNP_256(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_64(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - 
innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; offset < 64; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 2 { - goldilocks.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_64(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - goldilocks.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) -} diff --git a/field/goldilocks/fft/fft_test.go b/field/goldilocks/fft/fft_test.go index 85fda92d12..eb98e00791 100644 --- a/field/goldilocks/fft/fft_test.go +++ b/field/goldilocks/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]goldilocks.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []goldilocks.Element, 
val goldilocks.Element) goldilocks.Element { var acc, res, tmp goldilocks.Element res.Set(&pol[0]) diff --git a/field/goldilocks/fft/kernel_purego.go b/field/goldilocks/fft/kernel_purego.go new file mode 100644 index 0000000000..7ba8f66000 --- /dev/null +++ b/field/goldilocks/fft/kernel_purego.go @@ -0,0 +1,28 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/field/goldilocks" +) + +func innerDIFWithTwiddles(a []goldilocks.Element, twiddles []goldilocks.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +func kerDIFNP_32(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { + kerDIFNP_32generic(a, twiddles, stage) +} +func kerDITNP_32(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { + kerDITNP_32generic(a, twiddles, stage) +} + +func kerDIFNP_256(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { + kerDIFNP_256generic(a, twiddles, stage) +} +func kerDITNP_256(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { + kerDITNP_256generic(a, twiddles, stage) +} diff --git a/field/goldilocks/sis/sis.go b/field/goldilocks/sis/sis.go index 51a7559a2c..433ce5bb52 100644 --- a/field/goldilocks/sis/sis.go +++ b/field/goldilocks/sis/sis.go @@ -142,7 +142,6 @@ func (r *RSis) Hash(v, res []goldilocks.Element) error { // by default, the mask is ignored (unless we unrolled the FFT and have a degree 64) mask := ^uint64(0) - // inner hash it := NewLimbIterator(&VectorIterator{v: v}, r.LogTwoBound/8) for i := 0; i < len(r.Ag); i++ { diff --git a/field/internal/main.go b/field/internal/main.go index 46412585eb..5817e656f9 100644 --- a/field/internal/main.go +++ b/field/internal/main.go @@ -11,7 +11,6 @@ import ( //go:generate go run main.go func main() { // generate 
the following fields - type field struct { name string modulus string diff --git a/field/koalabear/asm_avx.go b/field/koalabear/asm_avx.go index ec075d811d..7e1fae954c 100644 --- a/field/koalabear/asm_avx.go +++ b/field/koalabear/asm_avx.go @@ -10,6 +10,6 @@ package koalabear import "golang.org/x/sys/cpu" var ( - supportAvx512 = cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ + supportAvx512 = cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 _ = supportAvx512 ) diff --git a/field/koalabear/fft/fft.go b/field/koalabear/fft/fft.go index 850876deab..535fa4ed9b 100644 --- a/field/koalabear/fft/fft.go +++ b/field/koalabear/fft/fft.go @@ -201,14 +201,10 @@ func difFFT(a []koalabear.Element, w koalabear.Element, twiddles [][]koalabear.E if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 64 { - kerDIFNP_64(a, twiddles, stage-twiddlesStartStage) + if n == 1<<7 { + kerDIFNP_128(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -232,13 +228,7 @@ func difFFT(a []koalabear.Element, w koalabear.Element, twiddles [][]koalabear.E // compute next twiddle w.Square(&w) } else { - if parallelButterfly { - parallel.Execute(m, func(start, end int) { - innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], start, end, m) - }, nbTasks/(1<<(stage))) - } else { - innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) - } + innerDIFWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) } if m == 1 { @@ -258,7 +248,7 @@ func difFFT(a []koalabear.Element, w koalabear.Element, twiddles [][]koalabear.E } -func innerDIFWithTwiddles(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) { +func innerDIFWithTwiddlesGeneric(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) { if start == 0 { koalabear.Butterfly(&a[0], &a[m]) start++ @@ -266,7 +256,6 @@ func innerDIFWithTwiddles(a []koalabear.Element, 
twiddles []koalabear.Element, s for i := start; i < end; i++ { koalabear.Butterfly(&a[i], &a[i+m]) } - // TODO @gbotrel: here the butterfly for most cases could leave the result not reduced mod q v1 := koalabear.Vector(a[start+m : end+m]) v2 := koalabear.Vector(twiddles[start:end]) v1.Mul(v1, v2) @@ -292,14 +281,10 @@ func ditFFT(a []koalabear.Element, w koalabear.Element, twiddles [][]koalabear.E if n == 1 { return } else if stage >= twiddlesStartStage { - if n == 64 { - kerDITNP_64(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + if n == 1<<7 { + kerDITNP_128(a, twiddles, stage-twiddlesStartStage) return } - } m := n >> 1 @@ -374,98 +359,50 @@ func innerDITWithoutTwiddles(a []koalabear.Element, at, w koalabear.Element, sta } } -func kerDIFNP_256(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { +func kerDIFNP_128generic(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - innerDIFWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) - for offset := 0; offset < 256; offset += 128 { - innerDIFWithTwiddles(a[offset:offset+128], twiddles[stage+1], 0, 64, 64) + innerDIFWithTwiddles(a[:128], twiddles[stage+0], 0, 64, 64) + for offset := 0; offset < 128; offset += 64 { + innerDIFWithTwiddles(a[offset:offset+64], twiddles[stage+1], 0, 32, 32) } - for offset := 0; offset < 256; offset += 64 { - innerDIFWithTwiddles(a[offset:offset+64], twiddles[stage+2], 0, 32, 32) + for offset := 0; offset < 128; offset += 32 { + innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+2], 0, 16, 16) } - for offset := 0; offset < 256; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+3], 0, 16, 16) + for offset := 0; offset < 128; offset += 16 { + innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+3], 0, 8, 8) } - for offset := 0; offset < 256; offset += 16 { - 
innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+4], 0, 8, 8) + for offset := 0; offset < 128; offset += 8 { + innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+4], 0, 4, 4) } - for offset := 0; offset < 256; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+5], 0, 4, 4) + for offset := 0; offset < 128; offset += 4 { + innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+5], 0, 2, 2) } - for offset := 0; offset < 256; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+6], 0, 2, 2) - } - for offset := 0; offset < 256; offset += 2 { - koalabear.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_256(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 256; offset += 2 { + for offset := 0; offset < 128; offset += 2 { koalabear.Butterfly(&a[offset], &a[offset+1]) } - for offset := 0; offset < 256; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+6], 0, 2, 2) - } - for offset := 0; offset < 256; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+5], 0, 4, 4) - } - for offset := 0; offset < 256; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+4], 0, 8, 8) - } - for offset := 0; offset < 256; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+3], 0, 16, 16) - } - for offset := 0; offset < 256; offset += 64 { - innerDITWithTwiddles(a[offset:offset+64], twiddles[stage+2], 0, 32, 32) - } - for offset := 0; offset < 256; offset += 128 { - innerDITWithTwiddles(a[offset:offset+128], twiddles[stage+1], 0, 64, 64) - } - innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } -func kerDIFNP_64(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { +func kerDITNP_128generic(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { // code unrolled & generated by 
internal/generator/fft/template/fft.go.tmpl - innerDIFWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) - for offset := 0; offset < 64; offset += 32 { - innerDIFWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) - } - for offset := 0; offset < 64; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) - } - for offset := 0; offset < 64; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) - } - for offset := 0; offset < 64; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) - } - for offset := 0; offset < 64; offset += 2 { + for offset := 0; offset < 128; offset += 2 { koalabear.Butterfly(&a[offset], &a[offset+1]) } -} - -func kerDITNP_64(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 64; offset += 2 { - koalabear.Butterfly(&a[offset], &a[offset+1]) + for offset := 0; offset < 128; offset += 4 { + innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+5], 0, 2, 2) } - for offset := 0; offset < 64; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+4], 0, 2, 2) + for offset := 0; offset < 128; offset += 8 { + innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+4], 0, 4, 4) } - for offset := 0; offset < 64; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+3], 0, 4, 4) + for offset := 0; offset < 128; offset += 16 { + innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+3], 0, 8, 8) } - for offset := 0; offset < 64; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+2], 0, 8, 8) + for offset := 0; offset < 128; offset += 32 { + innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+2], 0, 16, 16) } - for offset := 0; offset < 64; offset += 32 { - innerDITWithTwiddles(a[offset:offset+32], twiddles[stage+1], 0, 16, 16) + for offset := 0; offset < 128; 
offset += 64 { + innerDITWithTwiddles(a[offset:offset+64], twiddles[stage+1], 0, 32, 32) } - innerDITWithTwiddles(a[:64], twiddles[stage+0], 0, 32, 32) + innerDITWithTwiddles(a[:128], twiddles[stage+0], 0, 64, 64) } diff --git a/field/koalabear/fft/fft_test.go b/field/koalabear/fft/fft_test.go index edce30096b..f95d8f6612 100644 --- a/field/koalabear/fft/fft_test.go +++ b/field/koalabear/fft/fft_test.go @@ -305,6 +305,23 @@ func BenchmarkFFTDIFReference(b *testing.B) { } } +func BenchmarkFFTDIFReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]koalabear.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIF) + } +} + func evaluatePolynomial(pol []koalabear.Element, val koalabear.Element) koalabear.Element { var acc, res, tmp koalabear.Element res.Set(&pol[0]) diff --git a/field/koalabear/fft/kernel_amd64.go b/field/koalabear/fft/kernel_amd64.go new file mode 100644 index 0000000000..319190bb3d --- /dev/null +++ b/field/koalabear/fft/kernel_amd64.go @@ -0,0 +1,53 @@ +//go:build !purego + +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/field/koalabear" + "golang.org/x/sys/cpu" +) + +var ( + supportAVX512 = cpu.X86.HasAVX512 && cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VBMI2 +) + +// q + r'.r = 1, i.e., qInvNeg = - q⁻¹ mod r +// used for Montgomery reduction +const qInvNeg = 2130706431 +const q = 2130706433 + +// index table used in avx512 shuffling +var vInterleaveIndices = []uint64{ + 2, 3, 8, 9, 6, 7, 12, 13, +} + +//go:noescape +func innerDIFWithTwiddles_avx512(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) + +func innerDIFWithTwiddles(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) { + if !supportAVX512 { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) + return + } + innerDIFWithTwiddles_avx512(a, twiddles, start, end, m) +} + +//go:noescape +func kerDIFNP_128_avx512(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) + +func kerDIFNP_128(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { + if !supportAVX512 { + kerDIFNP_128generic(a, twiddles, stage) + return + } + kerDIFNP_128_avx512(a, twiddles, stage) +} + +func kerDITNP_128(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { + kerDITNP_128generic(a, twiddles, stage) +} diff --git a/field/koalabear/fft/kernel_amd64.s b/field/koalabear/fft/kernel_amd64.s new file mode 100644 index 0000000000..d8e3d38bf8 --- /dev/null +++ b/field/koalabear/fft/kernel_amd64.s @@ -0,0 +1,389 @@ +//go:build !purego +// Code generated by gnark-crypto/generator. DO NOT EDIT. 
+#include "textflag.h" +#include "funcdata.h" +#include "go_asm.h" + +TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 + + // func innerDIFWithTwiddles(a []Element, twiddles []Element, start, end, m int) { + // for i := start; i < end; i++ { + // Butterfly(&a[i], &a[i+m]) + // a[i+m].Mul(&a[i+m], &twiddles[i]) + // } + // } + + // prepare constants needed for mul and reduce ops + MOVD $const_q, AX + VPBROADCASTD AX, Z2 + VPBROADCASTQ AX, Z8 + MOVD $const_qInvNeg, AX + VPBROADCASTQ AX, Z9 + VPCMPEQB Y0, Y0, Y0 + VPMOVZXDQ Y0, Z11 + + // load arguments + MOVQ a+0(FP), R15 + MOVQ twiddles+24(FP), CX + MOVQ end+56(FP), SI + MOVQ m+64(FP), BX + CMPQ BX, $0x0000000000000010 + JL smallerThan16_1 // m < 16 + SHRQ $4, SI // we are processing 16 elements at a time + SHLQ $2, BX // offset = m * 4bytes + MOVQ R15, DX + ADDQ BX, DX + + // performs a butterfly between 2 vectors of dwords + // first vector is in [0, q) and second vector is in [0, 2q) +#define BUTTERFLYD2Q(in0, in1) \ + VPADDD in0, in1, Z3 \ + VPSUBD in1, in0, in1 \ + VPSUBD Z2, Z3, in0 \ + VPMINUD Z3, in0, in0 \ + VPADDD Z2, in1, in1 \ + +#define MUL(in0, in1) \ + VPMULUDQ in0, in1, Z12 \ + VPANDQ Z11, Z12, Z10 \ + VPMULUDQ Z10, Z9, Z10 \ + VPANDQ Z11, Z10, Z10 \ + VPMULUDQ Z10, Z8, Z10 \ + VPADDQ Z12, Z10, Z12 \ + VPSRLQ $32, Z12, Z12 \ + VPSUBQ Z8, Z12, Z10 \ + VPMINUQ Z12, Z10, in0 \ + +loop_3: + TESTQ SI, SI + JEQ done_2 // n == 0, we are done + VMOVDQA32 0(R15), Z0 // load a[i] + VMOVDQA32 0(DX), Z1 // load a[i+m] + BUTTERFLYD2Q(Z0, Z1) + VMOVDQA32 Z0, 0(R15) // store a[i] + VEXTRACTI32X8 $0, Z1, Y20 + VEXTRACTI32X8 $1, Z1, Y21 + VPMOVZXDQ Y20, Z13 + VPMOVZXDQ Y21, Z14 + VPMOVZXDQ 0(CX), Z15 + VPMOVZXDQ 32(CX), Z16 + MUL(Z13, Z15) + MUL(Z14, Z16) + VPMOVQD Z13, 0(DX) + VPMOVQD Z14, 32(DX) + ADDQ $64, R15 + ADDQ $64, DX + ADDQ $64, CX + DECQ SI // decrement n + JMP loop_3 + +done_2: + RET + +smallerThan16_1: + // m < 16, we call the generic one + // note that this should happen only when doing a FFT 
smaller than the smallest generated kernel + MOVQ a+0(FP), AX + MOVQ AX, (SP) + MOVQ twiddles+24(FP), AX + MOVQ AX, 24(SP) + MOVQ start+48(FP), AX + MOVQ AX, 48(SP) + MOVQ end+56(FP), AX + MOVQ AX, 56(SP) + MOVQ m+64(FP), AX + MOVQ AX, 64(SP) + CALL ·innerDIFWithTwiddlesGeneric(SB) + RET + +// kerDIFNP_128_avx512(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) +TEXT ·kerDIFNP_128_avx512(SB), NOSPLIT, $0-56 + // prepare constants needed for mul and reduce ops + MOVD $const_q, AX + VPBROADCASTQ AX, Z17 + MOVD $const_qInvNeg, AX + VPBROADCASTQ AX, Z18 + VPCMPEQB Y0, Y0, Y0 + VPMOVZXDQ Y0, Z20 + + // load arguments + MOVQ a+0(FP), R15 + MOVQ twiddles+24(FP), CX + MOVQ stage+48(FP), AX + IMULQ $24, AX + ADDQ AX, CX // we want twiddles[stage] as starting point + + // load a[:128] in registers + VPMOVZXDQ 0(R15), Z0 + VPMOVZXDQ 32(R15), Z1 + VPMOVZXDQ 64(R15), Z2 + VPMOVZXDQ 96(R15), Z3 + VPMOVZXDQ 128(R15), Z4 + VPMOVZXDQ 160(R15), Z5 + VPMOVZXDQ 192(R15), Z6 + VPMOVZXDQ 224(R15), Z7 + VPMOVZXDQ 256(R15), Z8 + VPMOVZXDQ 288(R15), Z9 + VPMOVZXDQ 320(R15), Z10 + VPMOVZXDQ 352(R15), Z11 + VPMOVZXDQ 384(R15), Z12 + VPMOVZXDQ 416(R15), Z13 + VPMOVZXDQ 448(R15), Z14 + VPMOVZXDQ 480(R15), Z15 + + // butterfly computes + // in0 = in0 + in1 (in [0,q)) + // in1 = in0 - in1 (in [0,2q)) +#define BUTTERFLY(in0, in1) \ + VPADDQ in0, in1, Z21 \ + VPSUBQ in1, in0, in1 \ + VPSUBQ Z17, Z21, in0 \ + VPMINUQ Z21, in0, in0 \ + VPADDQ Z17, in1, in1 \ + +// mul computes x = x * y +#define MUL_0(in0, in1) \ + VPMULUDQ in0, in1, Z16 \ + VPANDQ Z20, Z16, Z19 \ + VPMULUDQ Z19, Z18, Z19 \ + VPANDQ Z20, Z19, Z19 \ + VPMULUDQ Z19, Z17, Z19 \ + VPADDQ Z16, Z19, Z16 \ + VPSRLQ $32, Z16, Z16 \ + VPSUBQ Z17, Z16, Z19 \ + VPMINUQ Z16, Z19, in0 \ + + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Z23 + VPMOVZXDQ 32(BX), Z24 + VPMOVZXDQ 64(BX), Z25 + VPMOVZXDQ 96(BX), Z26 + VPMOVZXDQ 128(BX), Z27 + VPMOVZXDQ 160(BX), Z28 + VPMOVZXDQ 192(BX), Z29 + VPMOVZXDQ 224(BX), Z30 + BUTTERFLY(Z0, Z8) + MUL_0(Z8, 
Z23) + BUTTERFLY(Z1, Z9) + MUL_0(Z9, Z24) + BUTTERFLY(Z2, Z10) + MUL_0(Z10, Z25) + BUTTERFLY(Z3, Z11) + MUL_0(Z11, Z26) + BUTTERFLY(Z4, Z12) + MUL_0(Z12, Z27) + BUTTERFLY(Z5, Z13) + MUL_0(Z13, Z28) + BUTTERFLY(Z6, Z14) + MUL_0(Z14, Z29) + BUTTERFLY(Z7, Z15) + MUL_0(Z15, Z30) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Z23 + VPMOVZXDQ 32(BX), Z24 + VPMOVZXDQ 64(BX), Z25 + VPMOVZXDQ 96(BX), Z26 + BUTTERFLY(Z0, Z4) + MUL_0(Z4, Z23) + BUTTERFLY(Z1, Z5) + MUL_0(Z5, Z24) + BUTTERFLY(Z2, Z6) + MUL_0(Z6, Z25) + BUTTERFLY(Z3, Z7) + MUL_0(Z7, Z26) + BUTTERFLY(Z8, Z12) + MUL_0(Z12, Z23) + BUTTERFLY(Z9, Z13) + MUL_0(Z13, Z24) + BUTTERFLY(Z10, Z14) + MUL_0(Z14, Z25) + BUTTERFLY(Z11, Z15) + MUL_0(Z15, Z26) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Z23 + VPMOVZXDQ 32(BX), Z24 + BUTTERFLY(Z0, Z2) + MUL_0(Z2, Z23) + BUTTERFLY(Z1, Z3) + MUL_0(Z3, Z24) + BUTTERFLY(Z4, Z6) + MUL_0(Z6, Z23) + BUTTERFLY(Z5, Z7) + MUL_0(Z7, Z24) + BUTTERFLY(Z8, Z10) + MUL_0(Z10, Z23) + BUTTERFLY(Z9, Z11) + MUL_0(Z11, Z24) + BUTTERFLY(Z12, Z14) + MUL_0(Z14, Z23) + BUTTERFLY(Z13, Z15) + MUL_0(Z15, Z24) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Z23 + BUTTERFLY(Z0, Z1) + MUL_0(Z1, Z23) + BUTTERFLY(Z2, Z3) + MUL_0(Z3, Z23) + BUTTERFLY(Z4, Z5) + MUL_0(Z5, Z23) + BUTTERFLY(Z6, Z7) + MUL_0(Z7, Z23) + BUTTERFLY(Z8, Z9) + MUL_0(Z9, Z23) + BUTTERFLY(Z10, Z11) + MUL_0(Z11, Z23) + BUTTERFLY(Z12, Z13) + MUL_0(Z13, Z23) + BUTTERFLY(Z14, Z15) + MUL_0(Z15, Z23) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), Y23 // zero extend 4x uint32 to 4x uint64 + VINSERTI64X4 $1, Y23, Z23, Z23 + MOVQ $0x0000000000000f0f, AX + KMOVQ AX, K1 + +#define PERMUTE4X4(in0, in1) \ + VSHUFI64X2 $0x000000000000004e, in1, in0, Z21 \ + VPBLENDMQ in0, Z21, K1, in0 \ + VPBLENDMQ Z21, in1, K1, in1 \ + + PERMUTE4X4(Z0, Z1) + BUTTERFLY(Z0, Z1) + MUL_0(Z1, Z23) + PERMUTE4X4(Z0, Z1) + PERMUTE4X4(Z2, Z3) + BUTTERFLY(Z2, Z3) + MUL_0(Z3, Z23) + PERMUTE4X4(Z2, Z3) + PERMUTE4X4(Z4, Z5) + BUTTERFLY(Z4, Z5) + MUL_0(Z5, Z23) + 
PERMUTE4X4(Z4, Z5) + PERMUTE4X4(Z6, Z7) + BUTTERFLY(Z6, Z7) + MUL_0(Z7, Z23) + PERMUTE4X4(Z6, Z7) + PERMUTE4X4(Z8, Z9) + BUTTERFLY(Z8, Z9) + MUL_0(Z9, Z23) + PERMUTE4X4(Z8, Z9) + PERMUTE4X4(Z10, Z11) + BUTTERFLY(Z10, Z11) + MUL_0(Z11, Z23) + PERMUTE4X4(Z10, Z11) + PERMUTE4X4(Z12, Z13) + BUTTERFLY(Z12, Z13) + MUL_0(Z13, Z23) + PERMUTE4X4(Z12, Z13) + PERMUTE4X4(Z14, Z15) + BUTTERFLY(Z14, Z15) + MUL_0(Z15, Z23) + PERMUTE4X4(Z14, Z15) + ADDQ $24, CX + MOVQ 0(CX), BX + VPMOVZXDQ 0(BX), X23 // zero extend 2x uint32 to 2x uint64 + VINSERTI64X2 $1, X23, Z23, Z23 + VINSERTI64X2 $0x0000000000000002, X23, Z23, Z23 + VINSERTI64X2 $0x0000000000000003, X23, Z23, Z23 + MOVQ $0x0000000000000033, AX + KMOVQ AX, K2 + MOVQ ·vInterleaveIndices+0(SB), DI + VMOVDQU64 0(DI), Z30 + +#define PERMUTE2X2(in0, in1) \ + VMOVDQA64 Z30, Z29 \ + VPERMI2Q in1, in0, Z29 \ + VPBLENDMQ in0, Z29, K2, in0 \ + VPBLENDMQ Z29, in1, K2, in1 \ + + PERMUTE2X2(Z0, Z1) + BUTTERFLY(Z0, Z1) + MUL_0(Z1, Z23) + PERMUTE2X2(Z0, Z1) + PERMUTE2X2(Z2, Z3) + BUTTERFLY(Z2, Z3) + MUL_0(Z3, Z23) + PERMUTE2X2(Z2, Z3) + PERMUTE2X2(Z4, Z5) + BUTTERFLY(Z4, Z5) + MUL_0(Z5, Z23) + PERMUTE2X2(Z4, Z5) + PERMUTE2X2(Z6, Z7) + BUTTERFLY(Z6, Z7) + MUL_0(Z7, Z23) + PERMUTE2X2(Z6, Z7) + PERMUTE2X2(Z8, Z9) + BUTTERFLY(Z8, Z9) + MUL_0(Z9, Z23) + PERMUTE2X2(Z8, Z9) + PERMUTE2X2(Z10, Z11) + BUTTERFLY(Z10, Z11) + MUL_0(Z11, Z23) + PERMUTE2X2(Z10, Z11) + PERMUTE2X2(Z12, Z13) + BUTTERFLY(Z12, Z13) + MUL_0(Z13, Z23) + PERMUTE2X2(Z12, Z13) + PERMUTE2X2(Z14, Z15) + BUTTERFLY(Z14, Z15) + MUL_0(Z15, Z23) + PERMUTE2X2(Z14, Z15) + MOVQ $0x0000000000005555, AX + KMOVD AX, K3 + +#define PERMUTE1X1(in0, in1) \ + VPSHRDQ $32, in1, in0, Z21 \ + VPBLENDMD in0, Z21, K3, in0 \ + VPBLENDMD Z21, in1, K3, in1 \ + + MOVD $const_q, AX + VPBROADCASTD AX, Z17 // rebroadcast q, but on dword lanes + +#define LASTBUTTERFLY(in0, in1) \ + VPADDD in0, in1, Z21 \ + VPSUBD in1, in0, in1 \ + VPSUBD Z17, Z21, in0 \ + VPMINUD Z21, in0, in0 \ + VPADDD Z17, in1, Z22 \ + VPMINUD 
Z22, in1, in1 \ + +#define PACK_DWORDS(in0, in1, in2, in3) \ + VPMOVQD in0, in1 \ + VPMOVQD in2, in3 \ + VINSERTI64X4 $1, in3, in0, in0 \ + + PACK_DWORDS(Z0, Y0, Z1, Y1) + PACK_DWORDS(Z2, Y2, Z3, Y3) + PERMUTE1X1(Z0, Z2) + LASTBUTTERFLY(Z0, Z2) + PERMUTE1X1(Z0, Z2) + PACK_DWORDS(Z4, Y4, Z5, Y5) + PACK_DWORDS(Z6, Y6, Z7, Y7) + PERMUTE1X1(Z4, Z6) + LASTBUTTERFLY(Z4, Z6) + PERMUTE1X1(Z4, Z6) + PACK_DWORDS(Z8, Y8, Z9, Y9) + PACK_DWORDS(Z10, Y10, Z11, Y11) + PERMUTE1X1(Z8, Z10) + LASTBUTTERFLY(Z8, Z10) + PERMUTE1X1(Z8, Z10) + PACK_DWORDS(Z12, Y12, Z13, Y13) + PACK_DWORDS(Z14, Y14, Z15, Y15) + PERMUTE1X1(Z12, Z14) + LASTBUTTERFLY(Z12, Z14) + PERMUTE1X1(Z12, Z14) + + // store a[:128] in memory + VMOVDQA32 Z0, 0(R15) + VMOVDQA32 Z2, 64(R15) + VMOVDQA32 Z4, 128(R15) + VMOVDQA32 Z6, 192(R15) + VMOVDQA32 Z8, 256(R15) + VMOVDQA32 Z10, 320(R15) + VMOVDQA32 Z12, 384(R15) + VMOVDQA32 Z14, 448(R15) + RET diff --git a/field/koalabear/fft/kernel_purego.go b/field/koalabear/fft/kernel_purego.go new file mode 100644 index 0000000000..80b5ee1a62 --- /dev/null +++ b/field/koalabear/fft/kernel_purego.go @@ -0,0 +1,23 @@ +//go:build purego || !amd64 + +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fft + +import ( + "github.com/consensys/gnark-crypto/field/koalabear" +) + +func innerDIFWithTwiddles(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) { + innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) +} + +func kerDIFNP_128(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { + kerDIFNP_128generic(a, twiddles, stage) +} +func kerDITNP_128(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { + kerDITNP_128generic(a, twiddles, stage) +} diff --git a/field/koalabear/sis/sis.go b/field/koalabear/sis/sis.go index 55248e4389..f0eb7d5d50 100644 --- a/field/koalabear/sis/sis.go +++ b/field/koalabear/sis/sis.go @@ -142,11 +142,54 @@ func (r *RSis) Hash(v, res []koalabear.Element) error { // by default, the mask is ignored (unless we unrolled the FFT and have a degree 64) mask := ^uint64(0) + if r.Degree == 512 && r.LogTwoBound == 16 { + // this is our hot path, we don't use the iterator because with + // avx512 instructions, it actually ends up being most of the CPU time. 
+ er := koalabear.Element{1} // mul by 1 --> mont reduce + polId := 0 + var k512 [512]koalabear.Element + vk := koalabear.Vector(k512[:]) + vRes := koalabear.Vector(res) + vb := koalabear.Vector(k512[256:]) + + cosets, err := r.Domain.CosetTable() + if err != nil { + return err + } + vCosets := koalabear.Vector(cosets) + + for j := 0; j < len(v); j += 256 { + start := j + end := j + 256 + end = min(end, len(v)) - // inner hash - it := NewLimbIterator(&VectorIterator{v: v}, r.LogTwoBound/8) - for i := 0; i < len(r.Ag); i++ { - r.InnerHash(it, res, k, r.kz, i, mask) + // use half of vk to copy the v input to batch convert to regular form + copy(vb[:], v[start:end]) + for k := (end - start); k < 256; k++ { + vb[k][0] = 0 + } + // batch montgomery -> regular + vb.ScalarMul(vb, &er) + + // do the limb split + for k := 0; k < 256; k++ { + k512[k*2][0] = uint32(uint16(vb[k][0])) + k512[k*2+1][0] = uint32(uint16(vb[k][0] >> 16)) + } + + // inner hash + vk.Mul(vk, vCosets) + r.Domain.FFT(k512[:], fft.DIF, fft.WithNbTasks(1)) + vk.Mul(vk, koalabear.Vector(r.Ag[polId])) + vRes.Add(vRes, vk) + polId++ + } + } else { + // inner hash + it := NewLimbIterator(&VectorIterator{v: v}, r.LogTwoBound/8) + for i := 0; i < len(r.Ag); i++ { + r.InnerHash(it, res, k, r.kz, i, mask) + } } // reduces mod Xᵈ+1 diff --git a/field/koalabear/vector_amd64.go b/field/koalabear/vector_amd64.go index 2d63563df0..f55d13df7c 100644 --- a/field/koalabear/vector_amd64.go +++ b/field/koalabear/vector_amd64.go @@ -25,6 +25,9 @@ func scalarMulVec(res, a, b *Element, n uint64) //go:noescape func innerProdVec(t *uint64, a, b *Element, n uint64) +//go:noescape +func butterflyMulVec(a, twiddles *Element, m int) + // Add adds two vectors element-wise and stores the result in self. // It panics if the vectors don't have the same length. 
func (vector *Vector) Add(a, b Vector) { diff --git a/internal/generator/crypto/hash/mimc/generate.go b/internal/generator/crypto/hash/mimc/generate.go index 58dc7fb099..30d34c1d81 100644 --- a/internal/generator/crypto/hash/mimc/generate.go +++ b/internal/generator/crypto/hash/mimc/generate.go @@ -9,7 +9,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - conf.Package = "mimc" entries := []bavard.Entry{ {File: filepath.Join(baseDir, "doc.go"), Templates: []string{"doc.go.tmpl"}}, diff --git a/internal/generator/crypto/hash/poseidon2/generate.go b/internal/generator/crypto/hash/poseidon2/generate.go index 2c05bb08eb..0bed64ef18 100644 --- a/internal/generator/crypto/hash/poseidon2/generate.go +++ b/internal/generator/crypto/hash/poseidon2/generate.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - conf.Package = "poseidon2" entries := []bavard.Entry{ {File: filepath.Join(baseDir, "poseidon2.go"), Templates: []string{"poseidon2.go.tmpl"}}, diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 9e29595153..ab488966f1 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -14,7 +14,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - packageName := strings.ReplaceAll(conf.Name, "-", "") var entries []bavard.Entry diff --git a/internal/generator/fflonk/generator.go b/internal/generator/fflonk/generator.go index 5eba0c6a95..40e1cd0326 100644 --- a/internal/generator/fflonk/generator.go +++ b/internal/generator/fflonk/generator.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - // kzg commitment scheme conf.Package = "fflonk" entries := []bavard.Entry{ diff --git a/internal/generator/fri/template/generate.go b/internal/generator/fri/template/generate.go index 
3ac33c51ab..49f52ac96f 100644 --- a/internal/generator/fri/template/generate.go +++ b/internal/generator/fri/template/generate.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - // fri commitment scheme conf.Package = "fri" entries := []bavard.Entry{ diff --git a/internal/generator/gkr/generate.go b/internal/generator/gkr/generate.go index 7acc39c94f..0d2e09e8f5 100644 --- a/internal/generator/gkr/generate.go +++ b/internal/generator/gkr/generate.go @@ -1,9 +1,10 @@ package gkr import ( + "path/filepath" + "github.com/consensys/bavard" "github.com/consensys/gnark-crypto/internal/generator/config" - "path/filepath" ) type Config struct { diff --git a/internal/generator/gkr/template/gkr.test.vectors.gen.go.tmpl b/internal/generator/gkr/template/gkr.test.vectors.gen.go.tmpl index bc73c9625a..71f0d48359 100644 --- a/internal/generator/gkr/template/gkr.test.vectors.gen.go.tmpl +++ b/internal/generator/gkr/template/gkr.test.vectors.gen.go.tmpl @@ -2,6 +2,7 @@ import ( "encoding/json" "fmt" fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + "github.com/consensys/gnark-crypto/internal/generator/test_vector_utils/small_rational" "github.com/consensys/gnark-crypto/internal/generator/test_vector_utils/small_rational/gkr" "github.com/consensys/gnark-crypto/internal/generator/test_vector_utils/small_rational/polynomial" diff --git a/internal/generator/gkr/test_vectors/main.go b/internal/generator/gkr/test_vectors/main.go index 6b4784e177..37e62d4d51 100644 --- a/internal/generator/gkr/test_vectors/main.go +++ b/internal/generator/gkr/test_vectors/main.go @@ -9,6 +9,7 @@ import ( "encoding/json" "fmt" fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + "github.com/consensys/gnark-crypto/internal/generator/test_vector_utils/small_rational" "github.com/consensys/gnark-crypto/internal/generator/test_vector_utils/small_rational/gkr" 
"github.com/consensys/gnark-crypto/internal/generator/test_vector_utils/small_rational/polynomial" diff --git a/internal/generator/iop/generate.go b/internal/generator/iop/generate.go index f18b0e1ed0..1ffe58c4c6 100644 --- a/internal/generator/iop/generate.go +++ b/internal/generator/iop/generate.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - // fri commitment scheme conf.Package = "iop" entries := []bavard.Entry{ diff --git a/internal/generator/kzg/generate.go b/internal/generator/kzg/generate.go index 14df851c8f..c549511970 100644 --- a/internal/generator/kzg/generate.go +++ b/internal/generator/kzg/generate.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - // kzg commitment scheme conf.Package = "kzg" entries := []bavard.Entry{ diff --git a/internal/generator/main.go b/internal/generator/main.go index c9453b7640..6038a71290 100644 --- a/internal/generator/main.go +++ b/internal/generator/main.go @@ -2,12 +2,13 @@ package main import ( "fmt" - "github.com/consensys/gnark-crypto/internal/generator/mpcsetup" "os" "os/exec" "path/filepath" "sync" + "github.com/consensys/gnark-crypto/internal/generator/mpcsetup" + "github.com/consensys/bavard" "github.com/consensys/gnark-crypto/field/generator" fieldConfig "github.com/consensys/gnark-crypto/field/generator/config" @@ -20,6 +21,7 @@ import ( "github.com/consensys/gnark-crypto/internal/generator/edwards/eddsa" "github.com/consensys/gnark-crypto/internal/generator/fflonk" fri "github.com/consensys/gnark-crypto/internal/generator/fri/template" + "github.com/consensys/gnark-crypto/internal/generator/gkr" "github.com/consensys/gnark-crypto/internal/generator/hash_to_field" "github.com/consensys/gnark-crypto/internal/generator/iop" diff --git a/internal/generator/pairing/generate.go b/internal/generator/pairing/generate.go index 879544c09d..1a6039b036 100644 --- 
a/internal/generator/pairing/generate.go +++ b/internal/generator/pairing/generate.go @@ -9,7 +9,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - packageName := strings.ReplaceAll(conf.Name, "-", "") return bgen.Generate(conf, packageName, "./pairing/template", bavard.Entry{ File: filepath.Join(baseDir, "pairing_test.go"), Templates: []string{"tests/pairing.go.tmpl"}, diff --git a/internal/generator/pedersen/generate.go b/internal/generator/pedersen/generate.go index cb3cf5dc6f..41ce35099b 100644 --- a/internal/generator/pedersen/generate.go +++ b/internal/generator/pedersen/generate.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - // pedersen commitment scheme conf.Package = "pedersen" entries := []bavard.Entry{ diff --git a/internal/generator/permutation/generator.go b/internal/generator/permutation/generator.go index 978102f93b..f337ec22d9 100644 --- a/internal/generator/permutation/generator.go +++ b/internal/generator/permutation/generator.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - // permutation data conf.Package = "permutation" entries := []bavard.Entry{ diff --git a/internal/generator/plookup/generate.go b/internal/generator/plookup/generate.go index dd8ad8625d..b4632c391c 100644 --- a/internal/generator/plookup/generate.go +++ b/internal/generator/plookup/generate.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - // kzg commitment scheme conf.Package = "plookup" entries := []bavard.Entry{ diff --git a/internal/generator/polynomial/generate.go b/internal/generator/polynomial/generate.go index 57e45b43c6..bdc2ab197a 100644 --- a/internal/generator/polynomial/generate.go +++ b/internal/generator/polynomial/generate.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.FieldDependency, baseDir 
string, generateTests bool, bgen *bavard.BatchGenerator) error { - entries := []bavard.Entry{ {File: filepath.Join(baseDir, "doc.go"), Templates: []string{"doc.go.tmpl"}}, {File: filepath.Join(baseDir, "polynomial.go"), Templates: []string{"polynomial.go.tmpl"}}, diff --git a/internal/generator/shplonk/generator.go b/internal/generator/shplonk/generator.go index 15d72c2589..00c458ab0a 100644 --- a/internal/generator/shplonk/generator.go +++ b/internal/generator/shplonk/generator.go @@ -8,7 +8,6 @@ import ( ) func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) error { - // kzg commitment scheme conf.Package = "shplonk" entries := []bavard.Entry{ diff --git a/internal/generator/sumcheck/generate.go b/internal/generator/sumcheck/generate.go index 868fb23130..8c600221ef 100644 --- a/internal/generator/sumcheck/generate.go +++ b/internal/generator/sumcheck/generate.go @@ -1,9 +1,10 @@ package sumcheck import ( + "path/filepath" + "github.com/consensys/bavard" "github.com/consensys/gnark-crypto/internal/generator/config" - "path/filepath" ) func Generate(conf config.FieldDependency, baseDir string, bgen *bavard.BatchGenerator) error { diff --git a/internal/generator/test_vector_utils/generate.go b/internal/generator/test_vector_utils/generate.go index 60a8fc9a97..f91f300690 100644 --- a/internal/generator/test_vector_utils/generate.go +++ b/internal/generator/test_vector_utils/generate.go @@ -1,12 +1,14 @@ package test_vector_utils import ( + "path/filepath" + "github.com/consensys/bavard" "github.com/consensys/gnark-crypto/internal/generator/config" + "github.com/consensys/gnark-crypto/internal/generator/gkr" "github.com/consensys/gnark-crypto/internal/generator/polynomial" "github.com/consensys/gnark-crypto/internal/generator/sumcheck" - "path/filepath" ) type Config struct { diff --git a/internal/generator/tower/generate.go b/internal/generator/tower/generate.go index 734e86aa72..0bdbefd730 100644 --- a/internal/generator/tower/generate.go 
+++ b/internal/generator/tower/generate.go @@ -7,6 +7,7 @@ import ( "github.com/consensys/bavard" "github.com/consensys/gnark-crypto/internal/generator/config" + "github.com/consensys/gnark-crypto/internal/generator/tower/asm/amd64" ) From 186d85014eaf64d9ee0bda91fe32d792d3dde095 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 21 Jan 2025 18:29:48 +0000 Subject: [PATCH 2/9] test: add MB/s measure in SIS bench --- ecc/bls12-377/fr/sis/sis_test.go | 3 +++ field/babybear/sis/sis_test.go | 3 +++ field/generator/internal/templates/sis/sis.test.go.tmpl | 3 +++ field/goldilocks/sis/sis_test.go | 3 +++ field/koalabear/sis/sis_test.go | 3 +++ 5 files changed, 15 insertions(+) diff --git a/ecc/bls12-377/fr/sis/sis_test.go b/ecc/bls12-377/fr/sis/sis_test.go index 9835765c37..c7fc2362f6 100644 --- a/ecc/bls12-377/fr/sis/sis_test.go +++ b/ecc/bls12-377/fr/sis/sis_test.go @@ -235,6 +235,9 @@ func benchmarkSIS(b *testing.B, input []fr.Element, sparse bool, logTwoBound, lo benchName += fmt.Sprintf("inputs=%v/log2-bound=%v/log2-degree=%v", n, logTwoBound, logTwoDegree) b.Run(benchName, func(b *testing.B) { + // report the throughput in MB/s + b.SetBytes(int64(len(input)) * fr.Bytes) + instance, err := NewRSis(0, logTwoDegree, logTwoBound, n) if err != nil { b.Fatal(err) diff --git a/field/babybear/sis/sis_test.go b/field/babybear/sis/sis_test.go index 6d041515e7..ca63269386 100644 --- a/field/babybear/sis/sis_test.go +++ b/field/babybear/sis/sis_test.go @@ -235,6 +235,9 @@ func benchmarkSIS(b *testing.B, input []babybear.Element, sparse bool, logTwoBou benchName += fmt.Sprintf("inputs=%v/log2-bound=%v/log2-degree=%v", n, logTwoBound, logTwoDegree) b.Run(benchName, func(b *testing.B) { + // report the throughput in MB/s + b.SetBytes(int64(len(input)) * babybear.Bytes) + instance, err := NewRSis(0, logTwoDegree, logTwoBound, n) if err != nil { b.Fatal(err) diff --git a/field/generator/internal/templates/sis/sis.test.go.tmpl 
b/field/generator/internal/templates/sis/sis.test.go.tmpl index e1ef0c7bb3..da3872ea74 100644 --- a/field/generator/internal/templates/sis/sis.test.go.tmpl +++ b/field/generator/internal/templates/sis/sis.test.go.tmpl @@ -232,6 +232,9 @@ func benchmarkSIS(b *testing.B, input []{{ .FF }}.Element, sparse bool, logTwoBo benchName += fmt.Sprintf("inputs=%v/log2-bound=%v/log2-degree=%v", n, logTwoBound, logTwoDegree) b.Run(benchName, func(b *testing.B) { + // report the throughput in MB/s + b.SetBytes(int64(len(input)) * {{ .FF }}.Bytes) + instance, err := NewRSis(0, logTwoDegree, logTwoBound, n) if err != nil { b.Fatal(err) diff --git a/field/goldilocks/sis/sis_test.go b/field/goldilocks/sis/sis_test.go index 480cbb8c33..d12f819bec 100644 --- a/field/goldilocks/sis/sis_test.go +++ b/field/goldilocks/sis/sis_test.go @@ -235,6 +235,9 @@ func benchmarkSIS(b *testing.B, input []goldilocks.Element, sparse bool, logTwoB benchName += fmt.Sprintf("inputs=%v/log2-bound=%v/log2-degree=%v", n, logTwoBound, logTwoDegree) b.Run(benchName, func(b *testing.B) { + // report the throughput in MB/s + b.SetBytes(int64(len(input)) * goldilocks.Bytes) + instance, err := NewRSis(0, logTwoDegree, logTwoBound, n) if err != nil { b.Fatal(err) diff --git a/field/koalabear/sis/sis_test.go b/field/koalabear/sis/sis_test.go index d05364dea6..7a53d3ba22 100644 --- a/field/koalabear/sis/sis_test.go +++ b/field/koalabear/sis/sis_test.go @@ -235,6 +235,9 @@ func benchmarkSIS(b *testing.B, input []koalabear.Element, sparse bool, logTwoBo benchName += fmt.Sprintf("inputs=%v/log2-bound=%v/log2-degree=%v", n, logTwoBound, logTwoDegree) b.Run(benchName, func(b *testing.B) { + // report the throughput in MB/s + b.SetBytes(int64(len(input)) * koalabear.Bytes) + instance, err := NewRSis(0, logTwoDegree, logTwoBound, n) if err != nil { b.Fatal(err) From 15149dba3a8c377e984a6c0b44232a523d83bcfb Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 21 Jan 2025 22:30:20 +0000 Subject: [PATCH 3/9] refactor: move 
common defines in beginning of .s file for fft kernels --- field/babybear/fft/kernel_amd64.s | 493 +++++++++---------- field/generator/asm/amd64/build.go | 3 + field/generator/asm/amd64/element_vec_F31.go | 317 ++++++------ field/koalabear/fft/kernel_amd64.s | 493 +++++++++---------- 4 files changed, 661 insertions(+), 645 deletions(-) diff --git a/field/babybear/fft/kernel_amd64.s b/field/babybear/fft/kernel_amd64.s index d8e3d38bf8..9f4fb27135 100644 --- a/field/babybear/fft/kernel_amd64.s +++ b/field/babybear/fft/kernel_amd64.s @@ -4,15 +4,88 @@ #include "funcdata.h" #include "go_asm.h" -TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 +// performs a butterfly between 2 vectors of dwords +// in0 = (in0 + in1) mod q +// in1 = (in0 - in1) mod 2q +// in2: q broadcasted on all dwords lanes +// in3: temporary Z register +#define BUTTERFLYD2Q(in0, in1, in2, in3) \ + VPADDD in0, in1, in3 \ + VPSUBD in1, in0, in1 \ + VPSUBD in2, in3, in0 \ + VPMINUD in3, in0, in0 \ + VPADDD in2, in1, in1 \ + +// same as butterflyD2Q but reduces in1 to [0,q) +#define BUTTERFLYD1Q(in0, in1, in2, in3, in4) \ + VPADDD in0, in1, in3 \ + VPSUBD in1, in0, in1 \ + VPSUBD in2, in3, in0 \ + VPMINUD in3, in0, in0 \ + VPADDD in2, in1, in4 \ + VPMINUD in4, in1, in1 \ + +// same as butterflyD2Q but for qwords +// in2: must be broadcasted on all qwords lanes +#define BUTTERFLYQ2Q(in0, in1, in2, in3) \ + VPADDQ in0, in1, in3 \ + VPSUBQ in1, in0, in1 \ + VPSUBQ in2, in3, in0 \ + VPMINUQ in3, in0, in0 \ + VPADDQ in2, in1, in1 \ + +// performs a multiplication in place between 2 vectors of qwords (values should be dwords zero extended) +// in0 = (in0 * in1) mod q +// in1: second operand +// in2: mask for low dword in each qword +// in3: q broadcasted on all qwords lanes +// in4: qInvNeg broadcasted on all qwords lanes +// in5: temporary Z register +// in6: temporary Z register +#define MUL(in0, in1, in2, in3, in4, in5, in6) \ + VPMULUDQ in0, in1, in5 \ + VPANDQ in2, in5, in6 \ + VPMULUDQ in6, in4, 
in6 \ + VPANDQ in2, in6, in6 \ + VPMULUDQ in6, in3, in6 \ + VPADDQ in5, in6, in5 \ + VPSRLQ $32, in5, in5 \ + VPSUBQ in3, in5, in6 \ + VPMINUQ in5, in6, in0 \ + +// goes from +// Z1 = A A A A B B B B +// Z2 = C C C C D D D D +// we want +// Z1 = A A A A C C C C +// Z2 = B B B B D D D D +#define PERMUTE4X4(in0, in1, in2, in3) \ + VSHUFI64X2 $0x000000000000004e, in1, in0, in2 \ + VPBLENDMQ in0, in2, in3, in0 \ + VPBLENDMQ in2, in1, in3, in1 \ + +// Z1 = A A B B C C D D +// Z2 = L L M M N N O O +// we want +// Z1 = A A L L C C N N +// Z2 = B B M M D D O O +#define PERMUTE2X2(in0, in1, in2, in3, in4) \ + VMOVDQA64 in2, in3 \ + VPERMI2Q in1, in0, in3 \ + VPBLENDMQ in0, in3, in4, in0 \ + VPBLENDMQ in3, in1, in4, in1 \ + +#define PERMUTE1X1(in0, in1, in2, in3) \ + VPSHRDQ $32, in1, in0, in2 \ + VPBLENDMD in0, in2, in3, in0 \ + VPBLENDMD in2, in1, in3, in1 \ - // func innerDIFWithTwiddles(a []Element, twiddles []Element, start, end, m int) { - // for i := start; i < end; i++ { - // Butterfly(&a[i], &a[i+m]) - // a[i+m].Mul(&a[i+m], &twiddles[i]) - // } - // } +#define PACK_DWORDS(in0, in1, in2, in3) \ + VPMOVQD in0, in1 \ + VPMOVQD in2, in3 \ + VINSERTI64X4 $1, in3, in0, in0 \ +TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 // prepare constants needed for mul and reduce ops MOVD $const_q, AX VPBROADCASTD AX, Z2 @@ -34,32 +107,12 @@ TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 MOVQ R15, DX ADDQ BX, DX - // performs a butterfly between 2 vectors of dwords - // first vector is in [0, q) and second vector is in [0, 2q) -#define BUTTERFLYD2Q(in0, in1) \ - VPADDD in0, in1, Z3 \ - VPSUBD in1, in0, in1 \ - VPSUBD Z2, Z3, in0 \ - VPMINUD Z3, in0, in0 \ - VPADDD Z2, in1, in1 \ - -#define MUL(in0, in1) \ - VPMULUDQ in0, in1, Z12 \ - VPANDQ Z11, Z12, Z10 \ - VPMULUDQ Z10, Z9, Z10 \ - VPANDQ Z11, Z10, Z10 \ - VPMULUDQ Z10, Z8, Z10 \ - VPADDQ Z12, Z10, Z12 \ - VPSRLQ $32, Z12, Z12 \ - VPSUBQ Z8, Z12, Z10 \ - VPMINUQ Z12, Z10, in0 \ - loop_3: TESTQ SI, SI JEQ done_2 // n 
== 0, we are done VMOVDQA32 0(R15), Z0 // load a[i] VMOVDQA32 0(DX), Z1 // load a[i+m] - BUTTERFLYD2Q(Z0, Z1) + BUTTERFLYD2Q(Z0, Z1, Z2, Z3) VMOVDQA32 Z0, 0(R15) // store a[i] VEXTRACTI32X8 $0, Z1, Y20 VEXTRACTI32X8 $1, Z1, Y21 @@ -67,8 +120,8 @@ loop_3: VPMOVZXDQ Y21, Z14 VPMOVZXDQ 0(CX), Z15 VPMOVZXDQ 32(CX), Z16 - MUL(Z13, Z15) - MUL(Z14, Z16) + MUL(Z13, Z15, Z11, Z8, Z9, Z12, Z10) + MUL(Z14, Z16, Z11, Z8, Z9, Z12, Z10) VPMOVQD Z13, 0(DX) VPMOVQD Z14, 32(DX) ADDQ $64, R15 @@ -114,45 +167,24 @@ TEXT ·kerDIFNP_128_avx512(SB), NOSPLIT, $0-56 ADDQ AX, CX // we want twiddles[stage] as starting point // load a[:128] in registers - VPMOVZXDQ 0(R15), Z0 - VPMOVZXDQ 32(R15), Z1 - VPMOVZXDQ 64(R15), Z2 - VPMOVZXDQ 96(R15), Z3 - VPMOVZXDQ 128(R15), Z4 - VPMOVZXDQ 160(R15), Z5 - VPMOVZXDQ 192(R15), Z6 - VPMOVZXDQ 224(R15), Z7 - VPMOVZXDQ 256(R15), Z8 - VPMOVZXDQ 288(R15), Z9 - VPMOVZXDQ 320(R15), Z10 - VPMOVZXDQ 352(R15), Z11 - VPMOVZXDQ 384(R15), Z12 - VPMOVZXDQ 416(R15), Z13 - VPMOVZXDQ 448(R15), Z14 - VPMOVZXDQ 480(R15), Z15 - - // butterfly computes - // in0 = in0 + in1 (in [0,q)) - // in1 = in0 - in1 (in [0,2q)) -#define BUTTERFLY(in0, in1) \ - VPADDQ in0, in1, Z21 \ - VPSUBQ in1, in0, in1 \ - VPSUBQ Z17, Z21, in0 \ - VPMINUQ Z21, in0, in0 \ - VPADDQ Z17, in1, in1 \ - -// mul computes x = x * y -#define MUL_0(in0, in1) \ - VPMULUDQ in0, in1, Z16 \ - VPANDQ Z20, Z16, Z19 \ - VPMULUDQ Z19, Z18, Z19 \ - VPANDQ Z20, Z19, Z19 \ - VPMULUDQ Z19, Z17, Z19 \ - VPADDQ Z16, Z19, Z16 \ - VPSRLQ $32, Z16, Z16 \ - VPSUBQ Z17, Z16, Z19 \ - VPMINUQ Z16, Z19, in0 \ - + VPMOVZXDQ 0(R15), Z0 + VPMOVZXDQ 32(R15), Z1 + VPMOVZXDQ 64(R15), Z2 + VPMOVZXDQ 96(R15), Z3 + VPMOVZXDQ 128(R15), Z4 + VPMOVZXDQ 160(R15), Z5 + VPMOVZXDQ 192(R15), Z6 + VPMOVZXDQ 224(R15), Z7 + VPMOVZXDQ 256(R15), Z8 + VPMOVZXDQ 288(R15), Z9 + VPMOVZXDQ 320(R15), Z10 + VPMOVZXDQ 352(R15), Z11 + VPMOVZXDQ 384(R15), Z12 + VPMOVZXDQ 416(R15), Z13 + VPMOVZXDQ 448(R15), Z14 + VPMOVZXDQ 480(R15), Z15 + MOVQ 
$0x0000000000000f0f, AX + KMOVQ AX, K1 MOVQ 0(CX), BX VPMOVZXDQ 0(BX), Z23 VPMOVZXDQ 32(BX), Z24 @@ -162,127 +194,119 @@ TEXT ·kerDIFNP_128_avx512(SB), NOSPLIT, $0-56 VPMOVZXDQ 160(BX), Z28 VPMOVZXDQ 192(BX), Z29 VPMOVZXDQ 224(BX), Z30 - BUTTERFLY(Z0, Z8) - MUL_0(Z8, Z23) - BUTTERFLY(Z1, Z9) - MUL_0(Z9, Z24) - BUTTERFLY(Z2, Z10) - MUL_0(Z10, Z25) - BUTTERFLY(Z3, Z11) - MUL_0(Z11, Z26) - BUTTERFLY(Z4, Z12) - MUL_0(Z12, Z27) - BUTTERFLY(Z5, Z13) - MUL_0(Z13, Z28) - BUTTERFLY(Z6, Z14) - MUL_0(Z14, Z29) - BUTTERFLY(Z7, Z15) - MUL_0(Z15, Z30) + BUTTERFLYQ2Q(Z0, Z8, Z17, Z21) + MUL(Z8, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z1, Z9, Z17, Z21) + MUL(Z9, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z2, Z10, Z17, Z21) + MUL(Z10, Z25, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z3, Z11, Z17, Z21) + MUL(Z11, Z26, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z4, Z12, Z17, Z21) + MUL(Z12, Z27, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z5, Z13, Z17, Z21) + MUL(Z13, Z28, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z6, Z14, Z17, Z21) + MUL(Z14, Z29, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z7, Z15, Z17, Z21) + MUL(Z15, Z30, Z20, Z17, Z18, Z16, Z19) ADDQ $24, CX MOVQ 0(CX), BX VPMOVZXDQ 0(BX), Z23 VPMOVZXDQ 32(BX), Z24 VPMOVZXDQ 64(BX), Z25 VPMOVZXDQ 96(BX), Z26 - BUTTERFLY(Z0, Z4) - MUL_0(Z4, Z23) - BUTTERFLY(Z1, Z5) - MUL_0(Z5, Z24) - BUTTERFLY(Z2, Z6) - MUL_0(Z6, Z25) - BUTTERFLY(Z3, Z7) - MUL_0(Z7, Z26) - BUTTERFLY(Z8, Z12) - MUL_0(Z12, Z23) - BUTTERFLY(Z9, Z13) - MUL_0(Z13, Z24) - BUTTERFLY(Z10, Z14) - MUL_0(Z14, Z25) - BUTTERFLY(Z11, Z15) - MUL_0(Z15, Z26) + BUTTERFLYQ2Q(Z0, Z4, Z17, Z21) + MUL(Z4, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z1, Z5, Z17, Z21) + MUL(Z5, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z2, Z6, Z17, Z21) + MUL(Z6, Z25, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z3, Z7, Z17, Z21) + MUL(Z7, Z26, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z8, Z12, Z17, Z21) + MUL(Z12, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z9, Z13, Z17, Z21) + MUL(Z13, Z24, Z20, Z17, Z18, Z16, 
Z19) + BUTTERFLYQ2Q(Z10, Z14, Z17, Z21) + MUL(Z14, Z25, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z11, Z15, Z17, Z21) + MUL(Z15, Z26, Z20, Z17, Z18, Z16, Z19) ADDQ $24, CX MOVQ 0(CX), BX VPMOVZXDQ 0(BX), Z23 VPMOVZXDQ 32(BX), Z24 - BUTTERFLY(Z0, Z2) - MUL_0(Z2, Z23) - BUTTERFLY(Z1, Z3) - MUL_0(Z3, Z24) - BUTTERFLY(Z4, Z6) - MUL_0(Z6, Z23) - BUTTERFLY(Z5, Z7) - MUL_0(Z7, Z24) - BUTTERFLY(Z8, Z10) - MUL_0(Z10, Z23) - BUTTERFLY(Z9, Z11) - MUL_0(Z11, Z24) - BUTTERFLY(Z12, Z14) - MUL_0(Z14, Z23) - BUTTERFLY(Z13, Z15) - MUL_0(Z15, Z24) + BUTTERFLYQ2Q(Z0, Z2, Z17, Z21) + MUL(Z2, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z1, Z3, Z17, Z21) + MUL(Z3, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z4, Z6, Z17, Z21) + MUL(Z6, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z5, Z7, Z17, Z21) + MUL(Z7, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z8, Z10, Z17, Z21) + MUL(Z10, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z9, Z11, Z17, Z21) + MUL(Z11, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z12, Z14, Z17, Z21) + MUL(Z14, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z13, Z15, Z17, Z21) + MUL(Z15, Z24, Z20, Z17, Z18, Z16, Z19) ADDQ $24, CX MOVQ 0(CX), BX VPMOVZXDQ 0(BX), Z23 - BUTTERFLY(Z0, Z1) - MUL_0(Z1, Z23) - BUTTERFLY(Z2, Z3) - MUL_0(Z3, Z23) - BUTTERFLY(Z4, Z5) - MUL_0(Z5, Z23) - BUTTERFLY(Z6, Z7) - MUL_0(Z7, Z23) - BUTTERFLY(Z8, Z9) - MUL_0(Z9, Z23) - BUTTERFLY(Z10, Z11) - MUL_0(Z11, Z23) - BUTTERFLY(Z12, Z13) - MUL_0(Z13, Z23) - BUTTERFLY(Z14, Z15) - MUL_0(Z15, Z23) + BUTTERFLYQ2Q(Z0, Z1, Z17, Z21) + MUL(Z1, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z2, Z3, Z17, Z21) + MUL(Z3, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z4, Z5, Z17, Z21) + MUL(Z5, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z6, Z7, Z17, Z21) + MUL(Z7, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z8, Z9, Z17, Z21) + MUL(Z9, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z10, Z11, Z17, Z21) + MUL(Z11, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z12, Z13, Z17, Z21) + MUL(Z13, Z23, Z20, Z17, Z18, Z16, Z19) 
+ BUTTERFLYQ2Q(Z14, Z15, Z17, Z21) + MUL(Z15, Z23, Z20, Z17, Z18, Z16, Z19) ADDQ $24, CX MOVQ 0(CX), BX - VPMOVZXDQ 0(BX), Y23 // zero extend 4x uint32 to 4x uint64 + VPMOVZXDQ 0(BX), Y23 // zero extend 4x uint32 to 4x uint64 VINSERTI64X4 $1, Y23, Z23, Z23 - MOVQ $0x0000000000000f0f, AX - KMOVQ AX, K1 - -#define PERMUTE4X4(in0, in1) \ - VSHUFI64X2 $0x000000000000004e, in1, in0, Z21 \ - VPBLENDMQ in0, Z21, K1, in0 \ - VPBLENDMQ Z21, in1, K1, in1 \ - - PERMUTE4X4(Z0, Z1) - BUTTERFLY(Z0, Z1) - MUL_0(Z1, Z23) - PERMUTE4X4(Z0, Z1) - PERMUTE4X4(Z2, Z3) - BUTTERFLY(Z2, Z3) - MUL_0(Z3, Z23) - PERMUTE4X4(Z2, Z3) - PERMUTE4X4(Z4, Z5) - BUTTERFLY(Z4, Z5) - MUL_0(Z5, Z23) - PERMUTE4X4(Z4, Z5) - PERMUTE4X4(Z6, Z7) - BUTTERFLY(Z6, Z7) - MUL_0(Z7, Z23) - PERMUTE4X4(Z6, Z7) - PERMUTE4X4(Z8, Z9) - BUTTERFLY(Z8, Z9) - MUL_0(Z9, Z23) - PERMUTE4X4(Z8, Z9) - PERMUTE4X4(Z10, Z11) - BUTTERFLY(Z10, Z11) - MUL_0(Z11, Z23) - PERMUTE4X4(Z10, Z11) - PERMUTE4X4(Z12, Z13) - BUTTERFLY(Z12, Z13) - MUL_0(Z13, Z23) - PERMUTE4X4(Z12, Z13) - PERMUTE4X4(Z14, Z15) - BUTTERFLY(Z14, Z15) - MUL_0(Z15, Z23) - PERMUTE4X4(Z14, Z15) + PERMUTE4X4(Z0, Z1, Z21, K1) + BUTTERFLYQ2Q(Z0, Z1, Z17, Z21) + MUL(Z1, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z0, Z1, Z21, K1) + PERMUTE4X4(Z2, Z3, Z21, K1) + BUTTERFLYQ2Q(Z2, Z3, Z17, Z21) + MUL(Z3, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z2, Z3, Z21, K1) + PERMUTE4X4(Z4, Z5, Z21, K1) + BUTTERFLYQ2Q(Z4, Z5, Z17, Z21) + MUL(Z5, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z4, Z5, Z21, K1) + PERMUTE4X4(Z6, Z7, Z21, K1) + BUTTERFLYQ2Q(Z6, Z7, Z17, Z21) + MUL(Z7, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z6, Z7, Z21, K1) + PERMUTE4X4(Z8, Z9, Z21, K1) + BUTTERFLYQ2Q(Z8, Z9, Z17, Z21) + MUL(Z9, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z8, Z9, Z21, K1) + PERMUTE4X4(Z10, Z11, Z21, K1) + BUTTERFLYQ2Q(Z10, Z11, Z17, Z21) + MUL(Z11, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z10, Z11, Z21, K1) + PERMUTE4X4(Z12, Z13, Z21, K1) + BUTTERFLYQ2Q(Z12, Z13, Z17, Z21) + MUL(Z13, Z23, Z20, 
Z17, Z18, Z16, Z19) + PERMUTE4X4(Z12, Z13, Z21, K1) + PERMUTE4X4(Z14, Z15, Z21, K1) + BUTTERFLYQ2Q(Z14, Z15, Z17, Z21) + MUL(Z15, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z14, Z15, Z21, K1) ADDQ $24, CX MOVQ 0(CX), BX VPMOVZXDQ 0(BX), X23 // zero extend 2x uint32 to 2x uint64 @@ -293,89 +317,62 @@ TEXT ·kerDIFNP_128_avx512(SB), NOSPLIT, $0-56 KMOVQ AX, K2 MOVQ ·vInterleaveIndices+0(SB), DI VMOVDQU64 0(DI), Z30 - -#define PERMUTE2X2(in0, in1) \ - VMOVDQA64 Z30, Z29 \ - VPERMI2Q in1, in0, Z29 \ - VPBLENDMQ in0, Z29, K2, in0 \ - VPBLENDMQ Z29, in1, K2, in1 \ - - PERMUTE2X2(Z0, Z1) - BUTTERFLY(Z0, Z1) - MUL_0(Z1, Z23) - PERMUTE2X2(Z0, Z1) - PERMUTE2X2(Z2, Z3) - BUTTERFLY(Z2, Z3) - MUL_0(Z3, Z23) - PERMUTE2X2(Z2, Z3) - PERMUTE2X2(Z4, Z5) - BUTTERFLY(Z4, Z5) - MUL_0(Z5, Z23) - PERMUTE2X2(Z4, Z5) - PERMUTE2X2(Z6, Z7) - BUTTERFLY(Z6, Z7) - MUL_0(Z7, Z23) - PERMUTE2X2(Z6, Z7) - PERMUTE2X2(Z8, Z9) - BUTTERFLY(Z8, Z9) - MUL_0(Z9, Z23) - PERMUTE2X2(Z8, Z9) - PERMUTE2X2(Z10, Z11) - BUTTERFLY(Z10, Z11) - MUL_0(Z11, Z23) - PERMUTE2X2(Z10, Z11) - PERMUTE2X2(Z12, Z13) - BUTTERFLY(Z12, Z13) - MUL_0(Z13, Z23) - PERMUTE2X2(Z12, Z13) - PERMUTE2X2(Z14, Z15) - BUTTERFLY(Z14, Z15) - MUL_0(Z15, Z23) - PERMUTE2X2(Z14, Z15) - MOVQ $0x0000000000005555, AX - KMOVD AX, K3 - -#define PERMUTE1X1(in0, in1) \ - VPSHRDQ $32, in1, in0, Z21 \ - VPBLENDMD in0, Z21, K3, in0 \ - VPBLENDMD Z21, in1, K3, in1 \ - + PERMUTE2X2(Z0, Z1, Z30, Z29, K2) + BUTTERFLYQ2Q(Z0, Z1, Z17, Z21) + MUL(Z1, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z0, Z1, Z30, Z29, K2) + PERMUTE2X2(Z2, Z3, Z30, Z29, K2) + BUTTERFLYQ2Q(Z2, Z3, Z17, Z21) + MUL(Z3, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z2, Z3, Z30, Z29, K2) + PERMUTE2X2(Z4, Z5, Z30, Z29, K2) + BUTTERFLYQ2Q(Z4, Z5, Z17, Z21) + MUL(Z5, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z4, Z5, Z30, Z29, K2) + PERMUTE2X2(Z6, Z7, Z30, Z29, K2) + BUTTERFLYQ2Q(Z6, Z7, Z17, Z21) + MUL(Z7, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z6, Z7, Z30, Z29, K2) + PERMUTE2X2(Z8, Z9, Z30, Z29, 
K2) + BUTTERFLYQ2Q(Z8, Z9, Z17, Z21) + MUL(Z9, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z8, Z9, Z30, Z29, K2) + PERMUTE2X2(Z10, Z11, Z30, Z29, K2) + BUTTERFLYQ2Q(Z10, Z11, Z17, Z21) + MUL(Z11, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z10, Z11, Z30, Z29, K2) + PERMUTE2X2(Z12, Z13, Z30, Z29, K2) + BUTTERFLYQ2Q(Z12, Z13, Z17, Z21) + MUL(Z13, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z12, Z13, Z30, Z29, K2) + PERMUTE2X2(Z14, Z15, Z30, Z29, K2) + BUTTERFLYQ2Q(Z14, Z15, Z17, Z21) + MUL(Z15, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z14, Z15, Z30, Z29, K2) + MOVQ $0x0000000000005555, AX + KMOVD AX, K3 MOVD $const_q, AX - VPBROADCASTD AX, Z17 // rebroadcast q, but on dword lanes - -#define LASTBUTTERFLY(in0, in1) \ - VPADDD in0, in1, Z21 \ - VPSUBD in1, in0, in1 \ - VPSUBD Z17, Z21, in0 \ - VPMINUD Z21, in0, in0 \ - VPADDD Z17, in1, Z22 \ - VPMINUD Z22, in1, in1 \ - -#define PACK_DWORDS(in0, in1, in2, in3) \ - VPMOVQD in0, in1 \ - VPMOVQD in2, in3 \ - VINSERTI64X4 $1, in3, in0, in0 \ - + VPBROADCASTD AX, Z17 // rebroadcast q, but on dword lanes PACK_DWORDS(Z0, Y0, Z1, Y1) PACK_DWORDS(Z2, Y2, Z3, Y3) - PERMUTE1X1(Z0, Z2) - LASTBUTTERFLY(Z0, Z2) - PERMUTE1X1(Z0, Z2) + PERMUTE1X1(Z0, Z2, Z21, K3) + BUTTERFLYD1Q(Z0, Z2, Z17, Z21, Z22) + PERMUTE1X1(Z0, Z2, Z21, K3) PACK_DWORDS(Z4, Y4, Z5, Y5) PACK_DWORDS(Z6, Y6, Z7, Y7) - PERMUTE1X1(Z4, Z6) - LASTBUTTERFLY(Z4, Z6) - PERMUTE1X1(Z4, Z6) + PERMUTE1X1(Z4, Z6, Z21, K3) + BUTTERFLYD1Q(Z4, Z6, Z17, Z21, Z22) + PERMUTE1X1(Z4, Z6, Z21, K3) PACK_DWORDS(Z8, Y8, Z9, Y9) PACK_DWORDS(Z10, Y10, Z11, Y11) - PERMUTE1X1(Z8, Z10) - LASTBUTTERFLY(Z8, Z10) - PERMUTE1X1(Z8, Z10) + PERMUTE1X1(Z8, Z10, Z21, K3) + BUTTERFLYD1Q(Z8, Z10, Z17, Z21, Z22) + PERMUTE1X1(Z8, Z10, Z21, K3) PACK_DWORDS(Z12, Y12, Z13, Y13) PACK_DWORDS(Z14, Y14, Z15, Y15) - PERMUTE1X1(Z12, Z14) - LASTBUTTERFLY(Z12, Z14) - PERMUTE1X1(Z12, Z14) + PERMUTE1X1(Z12, Z14, Z21, K3) + BUTTERFLYD1Q(Z12, Z14, Z17, Z21, Z22) + PERMUTE1X1(Z12, Z14, Z21, K3) // store a[:128] in memory 
VMOVDQA32 Z0, 0(R15) diff --git a/field/generator/asm/amd64/build.go b/field/generator/asm/amd64/build.go index fec8f0a027..395e6b256d 100644 --- a/field/generator/asm/amd64/build.go +++ b/field/generator/asm/amd64/build.go @@ -62,6 +62,7 @@ func (f *FFAmd64) StackSize(maxNbRegistersNeeded, nbRegistersReserved, minStackS } func (f *FFAmd64) DefineFn(name string) (fn defineFn, err error) { + name = strings.ToUpper(name) fn, ok := f.mDefines[name] if !ok { return nil, fmt.Errorf("function %s not defined", name) @@ -273,6 +274,8 @@ func GenerateF31FFTKernels(w io.Writer, nbBits int, kernels []int) error { f.WriteLn("#include \"go_asm.h\"") f.WriteLn("") + f.generateFFTDefinesF31() + f.generateFFTInnerDIFF31() for _, ksize := range kernels { diff --git a/field/generator/asm/amd64/element_vec_F31.go b/field/generator/asm/amd64/element_vec_F31.go index 09a84073d4..36f037b6e9 100644 --- a/field/generator/asm/amd64/element_vec_F31.go +++ b/field/generator/asm/amd64/element_vec_F31.go @@ -471,21 +471,151 @@ func (f *FFAmd64) generateInnerProdVecF31() { f.Push(®isters, addrA, addrT, len) } -func (f *FFAmd64) generateFFTInnerDIFF31() { +func (f *FFAmd64) generateFFTDefinesF31() { + f.Comment("performs a butterfly between 2 vectors of dwords") + f.Comment("in0 = (in0 + in1) mod q") + f.Comment("in1 = (in0 - in1) mod 2q") + f.Comment("in2: q broadcasted on all dwords lanes") + f.Comment("in3: temporary Z register") + _ = f.Define("butterflyD2Q", 4, func(args ...amd64.Register) { + x := args[0] + y := args[1] + qd := args[2] + b0 := args[3] + f.VPADDD(x, y, b0) // b0 = x + y + f.VPSUBD(y, x, y) // y = x - y + f.VPSUBD(qd, b0, x) // x = (x+y) - q + f.VPMINUD(b0, x, x) // x %= q + f.VPADDD(qd, y, y) // y = (x-y) + q --> y in [0,2q) + }) - const argSize = 9 * 8 - stackSize := f.StackSize(f.NbWords*2+4, 1, 0) - registers := f.FnHeader("innerDIFWithTwiddles_avx512", stackSize, argSize, amd64.AX) - defer f.AssertCleanStack(stackSize, 0) + f.Comment("same as butterflyD2Q but reduces 
in1 to [0,q)") + _ = f.Define("butterflyD1Q", 5, func(args ...amd64.Register) { + x := args[0] + y := args[1] + qd := args[2] + b0 := args[3] + b1 := args[4] + f.VPADDD(x, y, b0) // b0 = x + y + f.VPSUBD(y, x, y) // y = x - y + f.VPSUBD(qd, b0, x) // x = (x+y) - q + f.VPMINUD(b0, x, x) // x %= q + f.VPADDD(qd, y, b1) // y = (x-y) + q --> y in [0,2q) + f.VPMINUD(b1, y, y) // y %= q + }) + + f.Comment("same as butterflyD2Q but for qwords") + f.Comment("in2: must be broadcasted on all qwords lanes") + _ = f.Define("butterflyQ2Q", 4, func(args ...amd64.Register) { + x := args[0] + y := args[1] + q := args[2] + b0 := args[3] + + f.VPADDQ(x, y, b0) // b0 = x + y + f.VPSUBQ(y, x, y) // y = x - y + f.VPSUBQ(q, b0, x) // x = (x+y) - q + f.VPMINUQ(b0, x, x) // x %= q + f.VPADDQ(q, y, y) // y = (x-y) + q --> y in [0,2q) + }) + + f.Comment("performs a multiplication in place between 2 vectors of qwords (values should be dwords zero extended)") + f.Comment("in0 = (in0 * in1) mod q") + f.Comment("in1: second operand") + f.Comment("in2: mask for low dword in each qword") + f.Comment("in3: q broadcasted on all qwords lanes") + f.Comment("in4: qInvNeg broadcasted on all qwords lanes") + f.Comment("in5: temporary Z register") + f.Comment("in6: temporary Z register") + _ = f.Define("mul", 7, func(args ...amd64.Register) { + x := args[0] + y := args[1] + LSW := args[2] + q := args[3] + qInvNeg := args[4] + P := args[5] + PL := args[6] + + f.VPMULUDQ(x, y, P) + f.VPANDQ(LSW, P, PL) + f.VPMULUDQ(PL, qInvNeg, PL) + f.VPANDQ(LSW, PL, PL) + f.VPMULUDQ(PL, q, PL) + f.VPADDQ(P, PL, P) + f.VPSRLQ("$32", P, P) + f.VPSUBQ(q, P, PL) + f.VPMINUQ(P, PL, x) + }) f.WriteLn(` + // goes from + // Z1 = A A A A B B B B + // Z2 = C C C C D D D D + // we want + // Z1 = A A A A C C C C + // Z2 = B B B B D D D D`) + _ = f.Define("permute4x4", 4, func(args ...amd64.Register) { + x := args[0] + y := args[1] + b0 := args[2] + K := args[3] + f.VSHUFI64X2(uint64(0b01_00_11_10), y, x, b0) + f.VPBLENDMQ(x, b0, x, 
K) + f.VPBLENDMQ(b0, y, y, K) + }) + + f.WriteLn(` + // Z1 = A A B B C C D D + // Z2 = L L M M N N O O + // we want + // Z1 = A A L L C C N N + // Z2 = B B M M D D O O`) + _ = f.Define("permute2x2", 5, func(args ...amd64.Register) { + x := args[0] + y := args[1] + vInterleaveIndices := args[2] + tmp := args[3] + K := args[4] + f.VMOVDQA64(vInterleaveIndices, tmp) + f.VPERMI2Q(y, x, tmp) + f.VPBLENDMQ(x, tmp, x, K) + f.VPBLENDMQ(tmp, y, y, K) + }) + + _ = f.Define("permute1x1", 4, func(args ...amd64.Register) { + x := args[0] + y := args[1] + b0 := args[2] + K := args[3] + + f.VPSHRDQ("$32", y, x, b0) + f.VPBLENDMD(x, b0, x, K) + f.VPBLENDMD(b0, y, y, K) + }) + + _ = f.Define("PACK_DWORDS", 4, func(args ...amd64.Register) { + x := args[0] + xx := args[1] + y := args[2] + xy := args[3] + + f.VPMOVQD(x, xx) + f.VPMOVQD(y, xy) + f.VINSERTI64X4(1, xy, x, x) + }) +} + +func (f *FFAmd64) generateFFTInnerDIFF31() { // func innerDIFWithTwiddles(a []Element, twiddles []Element, start, end, m int) { // for i := start; i < end; i++ { // Butterfly(&a[i], &a[i+m]) // a[i+m].Mul(&a[i+m], &twiddles[i]) // } // } -`) + const argSize = 9 * 8 + stackSize := f.StackSize(f.NbWords*2+4, 1, 0) + registers := f.FnHeader("innerDIFWithTwiddles_avx512", stackSize, argSize, amd64.AX) + defer f.AssertCleanStack(stackSize, 0) addrA := f.Pop(®isters) addrAPlusM := f.Pop(®isters) @@ -524,6 +654,10 @@ func (f *FFAmd64) generateFFTInnerDIFF31() { f.MOVQ("end+56(FP)", len) f.MOVQ("m+64(FP)", m) + // get defines + butterflyD2Q, _ := f.DefineFn("butterflyD2Q") + mul, _ := f.DefineFn("mul") + // we do only m >= 16; // if m < 16, we call the generic one; this can be called when doing a FFT // smaller than the smallest generated kernel @@ -540,33 +674,6 @@ func (f *FFAmd64) generateFFTInnerDIFF31() { f.MOVQ(addrA, addrAPlusM) f.ADDQ(m, addrAPlusM) - f.Comment("performs a butterfly between 2 vectors of dwords") - f.Comment("first vector is in [0, q) and second vector is in [0, 2q)") - butterflyD2Q := 
f.Define("butterflyD2Q", 2, func(args ...amd64.Register) { - x := args[0] - y := args[1] - f.VPADDD(x, y, b0) // b0 = x + y - f.VPSUBD(y, x, y) // y = x - y - f.VPSUBD(qd, b0, x) // x = (x+y) - q - f.VPMINUD(b0, x, x) // x %= q - f.VPADDD(qd, y, y) // y = (x-y) + q --> y in [0,2q) - }) - - mul := f.Define("mul", 2, func(args ...amd64.Register) { - x := args[0] - y := args[1] - - f.VPMULUDQ(x, y, P) - f.VPANDQ(LSW, P, PL) - f.VPMULUDQ(PL, qInvNeg, PL) - f.VPANDQ(LSW, PL, PL) - f.VPMULUDQ(PL, q, PL) - f.VPADDQ(P, PL, P) - f.VPSRLQ("$32", P, P) - f.VPSUBQ(q, P, PL) - f.VPMINUQ(P, PL, x) - }) - lblDone := f.NewLabel("done") lblLoop := f.NewLabel("loop") @@ -578,7 +685,7 @@ func (f *FFAmd64) generateFFTInnerDIFF31() { f.VMOVDQA32(addrA.At(0), a, "load a[i]") f.VMOVDQA32(addrAPlusM.At(0), am, "load a[i+m]") - butterflyD2Q(a, am) + butterflyD2Q(a, am, qd, b0) // a is ready to be stored, but we need to scale am by twiddles. f.VMOVDQA32(a, addrA.At(0), "store a[i]") @@ -595,8 +702,8 @@ func (f *FFAmd64) generateFFTInnerDIFF31() { f.VPMOVZXDQ(addrTwiddles.At(0), t0) f.VPMOVZXDQ(addrTwiddles.At(4), t1) - mul(m1, t0) - mul(m2, t1) + mul(m1, t0, LSW, q, qInvNeg, P, PL) + mul(m2, t1, LSW, q, qInvNeg, P, PL) // store m1 and m2 f.VPMOVQD(m1, addrAPlusM.At(0)) @@ -704,37 +811,14 @@ func (f *FFAmd64) generateFFTKernelF31(klog2 int) { f.VPMOVZXDQ(addrA.At(i*4), a[i]) } - // step 0 - // innerDIFWithTwiddles(a[:128], twiddles[stage+0], 0, 64, 64) - f.Comment("butterfly computes") - f.Comment("in0 = in0 + in1 (in [0,q))") - f.Comment("in1 = in0 - in1 (in [0,2q))") - butterfly := f.Define("butterfly", 2, func(args ...amd64.Register) { - x := args[0] - y := args[1] + // get the defines + butterflyQ2Q, _ := f.DefineFn("butterflyQ2Q") + mul, _ := f.DefineFn("mul") - f.VPADDQ(x, y, b0) // b0 = x + y - f.VPSUBQ(y, x, y) // y = x - y - f.VPSUBQ(q, b0, x) // x = (x+y) - q - f.VPMINUQ(b0, x, x) // x %= q - f.VPADDQ(q, y, y) // y = (x-y) + q --> y in [0,2q) - }) - - f.Comment("mul computes x = x 
* y") - mul := f.Define("mul", 2, func(args ...amd64.Register) { - x := args[0] - y := args[1] - - f.VPMULUDQ(x, y, P) - f.VPANDQ(LSW, P, PL) - f.VPMULUDQ(PL, qInvNeg, PL) - f.VPANDQ(LSW, PL, PL) - f.VPMULUDQ(PL, q, PL) - f.VPADDQ(P, PL, P) - f.VPSRLQ("$32", P, P) - f.VPSUBQ(q, P, PL) - f.VPMINUQ(P, PL, x) - }) + const kBlendEven4 = 0x0f0f + f.MOVQ(uint64(kBlendEven4), amd64.AX) + f.KMOVQ(amd64.AX, "K1") + permute4x4, _ := f.DefineFn("permute4x4") m := n >> 1 @@ -753,8 +837,8 @@ func (f *FFAmd64) generateFFTKernelF31(klog2 int) { for offset := 0; offset < 128; offset += n { aa := a[offset/8:] for i := 0; i < am; i++ { - butterfly(aa[i], aa[i+am]) - mul(aa[i+am], t[i]) + butterflyQ2Q(aa[i], aa[i+am], q, b0) + mul(aa[i+am], t[i], LSW, q, qInvNeg, P, PL) } } @@ -787,27 +871,6 @@ func (f *FFAmd64) generateFFTKernelF31(klog2 int) { f.VPMOVZXDQ(addrTwiddles.At(0), ty0, "zero extend 4x uint32 to 4x uint64") f.VINSERTI64X4(1, ty0, t[0], t[0]) - const kBlendEven4 = 0x0f0f - f.MOVQ(uint64(kBlendEven4), amd64.AX) - f.KMOVQ(amd64.AX, "K1") - - // we have for example - // Z1 = A A A A B B B B - // Z2 = C C C C D D D D - // we want - // Z1 = A A A A C C C C - // Z2 = B B B B D D D D - permute4x4 := f.Define("permute4x4", 2, func(args ...amd64.Register) { - x := args[0] - y := args[1] - f.VSHUFI64X2(uint64(0b01_00_11_10), y, x, b0) - f.VPBLENDMQ(x, b0, x, "K1") - f.VPBLENDMQ(b0, y, y, "K1") - }) - - // for offset := 0; offset < 128; offset += 8 { - // innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+4], 0, 4, 4) - // } // now we process the a[i] 2 by 2 and permute before / after the ops. 
for offset := 0; offset < 128; offset += n * 2 { // note that we advance by 2*n, that is 16 uint32 @@ -816,12 +879,12 @@ func (f *FFAmd64) generateFFTKernelF31(klog2 int) { y := a[(offset/8)+1] // first we need to permute 4 last of x with 4 first of y - permute4x4(x, y) - butterfly(x, y) - mul(y, t[0]) + permute4x4(x, y, b0, "K1") + butterflyQ2Q(x, y, q, b0) + mul(y, t[0], LSW, q, qInvNeg, P, PL) - // invert back - permute4x4(x, y) + // permute back + permute4x4(x, y, b0, "K1") } n >>= 1 @@ -840,13 +903,6 @@ func (f *FFAmd64) generateFFTKernelF31(klog2 int) { f.VINSERTI64X2(2, tx0, t[0], t[0]) f.VINSERTI64X2(3, tx0, t[0], t[0]) - // we have for example - // Z1 = A A B B C C D D - // Z2 = L L M M N N O O - // we want - // Z1 = A A L L C C N N - // Z2 = B B M M D D O O - const kBlendEven = 0b00110011 f.MOVQ(uint64(kBlendEven), amd64.AX) f.KMOVQ(amd64.AX, "K2") @@ -855,16 +911,7 @@ func (f *FFAmd64) generateFFTKernelF31(klog2 int) { f.MOVQ("·vInterleaveIndices+0(SB)", addrVInterleaveIndices) f.VMOVDQU64(addrVInterleaveIndices.At(0), vInterleaveIndices) - permute2x2 := f.Define("permute2x2", 2, func(args ...amd64.Register) { - x := args[0] - y := args[1] - - tmp := t[6] - f.VMOVDQA64(vInterleaveIndices, tmp) - f.VPERMI2Q(y, x, tmp) - f.VPBLENDMQ(x, tmp, x, "K2") - f.VPBLENDMQ(tmp, y, y, "K2") - }) + permute2x2, _ := f.DefineFn("permute2x2") // for offset := 0; offset < 128; offset += 4 { // innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+5], 0, 2, 2) @@ -876,12 +923,12 @@ func (f *FFAmd64) generateFFTKernelF31(klog2 int) { y := a[(offset/8)+1] // first we need to permute 4 last of x with 4 first of y - permute2x2(x, y) - butterfly(x, y) - mul(y, t[0]) + permute2x2(x, y, vInterleaveIndices, t[6], "K2") + butterflyQ2Q(x, y, q, b0) + mul(y, t[0], LSW, q, qInvNeg, P, PL) // invert back - permute2x2(x, y) + permute2x2(x, y, vInterleaveIndices, t[6], "K2") } const kBlendEven2 = 0b0101010101010101 @@ -889,41 +936,13 @@ func (f *FFAmd64) generateFFTKernelF31(klog2 int) 
{ f.MOVQ(uint64(kBlendEven2), amd64.AX) f.KMOVD(amd64.AX, "K3") - permute1x1 := f.Define("permute1x1", 2, func(args ...amd64.Register) { - x := args[0] - y := args[1] - - f.VPSHRDQ("$32", y, x, b0) - f.VPBLENDMD(x, b0, x, "K3") - f.VPBLENDMD(b0, y, y, "K3") - }) + permute1x1, _ := f.DefineFn("permute1x1") f.WriteLn("MOVD $const_q, AX") f.VPBROADCASTD(amd64.AX, q, "rebroadcast q, but on dword lanes") - // same as butterfly, but we reduce mod q the 2 results - // also uses dword lane instructions - lastButterfly := f.Define("lastButterfly", 2, func(args ...amd64.Register) { - x := args[0] - y := args[1] - f.VPADDD(x, y, b0) // b0 = x + y - f.VPSUBD(y, x, y) // y = x - y - f.VPSUBD(q, b0, x) // x = (x+y) - q - f.VPMINUD(b0, x, x) // x %= q - f.VPADDD(q, y, b1) // b1 = (x-y) + q --> b1 in [0,2q) - f.VPMINUD(b1, y, y) // y %= q - }) - - packDWORDS := f.Define("PACK_DWORDS", 4, func(args ...amd64.Register) { - x := args[0] - xx := args[1] - y := args[2] - xy := args[3] - - f.VPMOVQD(x, xx) - f.VPMOVQD(y, xy) - f.VINSERTI64X4(1, xy, x, x) - }) + butterflyD1Q, _ := f.DefineFn("butterflyD1Q") + packDWORDS, _ := f.DefineFn("PACK_DWORDS") // now m == 1, last step is only butterflies like so // for offset := 0; offset < 128; offset += 2 { @@ -935,9 +954,9 @@ func (f *FFAmd64) generateFFTKernelF31(klog2 int) { packDWORDS(u, zToy(u), v, zToy(v)) packDWORDS(w, zToy(w), x, zToy(x)) - permute1x1(u, w) - lastButterfly(u, w) - permute1x1(u, w) + permute1x1(u, w, b0, "K3") + butterflyD1Q(u, w, q, b0, b1) + permute1x1(u, w, b0, "K3") } // end we store back a diff --git a/field/koalabear/fft/kernel_amd64.s b/field/koalabear/fft/kernel_amd64.s index d8e3d38bf8..9f4fb27135 100644 --- a/field/koalabear/fft/kernel_amd64.s +++ b/field/koalabear/fft/kernel_amd64.s @@ -4,15 +4,88 @@ #include "funcdata.h" #include "go_asm.h" -TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 +// performs a butterfly between 2 vectors of dwords +// in0 = (in0 + in1) mod q +// in1 = (in0 - in1) mod 2q +// 
in2: q broadcasted on all dwords lanes +// in3: temporary Z register +#define BUTTERFLYD2Q(in0, in1, in2, in3) \ + VPADDD in0, in1, in3 \ + VPSUBD in1, in0, in1 \ + VPSUBD in2, in3, in0 \ + VPMINUD in3, in0, in0 \ + VPADDD in2, in1, in1 \ + +// same as butterflyD2Q but reduces in1 to [0,q) +#define BUTTERFLYD1Q(in0, in1, in2, in3, in4) \ + VPADDD in0, in1, in3 \ + VPSUBD in1, in0, in1 \ + VPSUBD in2, in3, in0 \ + VPMINUD in3, in0, in0 \ + VPADDD in2, in1, in4 \ + VPMINUD in4, in1, in1 \ + +// same as butterflyD2Q but for qwords +// in2: must be broadcasted on all qwords lanes +#define BUTTERFLYQ2Q(in0, in1, in2, in3) \ + VPADDQ in0, in1, in3 \ + VPSUBQ in1, in0, in1 \ + VPSUBQ in2, in3, in0 \ + VPMINUQ in3, in0, in0 \ + VPADDQ in2, in1, in1 \ + +// performs a multiplication in place between 2 vectors of qwords (values should be dwords zero extended) +// in0 = (in0 * in1) mod q +// in1: second operand +// in2: mask for low dword in each qword +// in3: q broadcasted on all qwords lanes +// in4: qInvNeg broadcasted on all qwords lanes +// in5: temporary Z register +// in6: temporary Z register +#define MUL(in0, in1, in2, in3, in4, in5, in6) \ + VPMULUDQ in0, in1, in5 \ + VPANDQ in2, in5, in6 \ + VPMULUDQ in6, in4, in6 \ + VPANDQ in2, in6, in6 \ + VPMULUDQ in6, in3, in6 \ + VPADDQ in5, in6, in5 \ + VPSRLQ $32, in5, in5 \ + VPSUBQ in3, in5, in6 \ + VPMINUQ in5, in6, in0 \ + +// goes from +// Z1 = A A A A B B B B +// Z2 = C C C C D D D D +// we want +// Z1 = A A A A C C C C +// Z2 = B B B B D D D D +#define PERMUTE4X4(in0, in1, in2, in3) \ + VSHUFI64X2 $0x000000000000004e, in1, in0, in2 \ + VPBLENDMQ in0, in2, in3, in0 \ + VPBLENDMQ in2, in1, in3, in1 \ + +// Z1 = A A B B C C D D +// Z2 = L L M M N N O O +// we want +// Z1 = A A L L C C N N +// Z2 = B B M M D D O O +#define PERMUTE2X2(in0, in1, in2, in3, in4) \ + VMOVDQA64 in2, in3 \ + VPERMI2Q in1, in0, in3 \ + VPBLENDMQ in0, in3, in4, in0 \ + VPBLENDMQ in3, in1, in4, in1 \ + +#define PERMUTE1X1(in0, in1, in2, in3) \ + 
VPSHRDQ $32, in1, in0, in2 \ + VPBLENDMD in0, in2, in3, in0 \ + VPBLENDMD in2, in1, in3, in1 \ - // func innerDIFWithTwiddles(a []Element, twiddles []Element, start, end, m int) { - // for i := start; i < end; i++ { - // Butterfly(&a[i], &a[i+m]) - // a[i+m].Mul(&a[i+m], &twiddles[i]) - // } - // } +#define PACK_DWORDS(in0, in1, in2, in3) \ + VPMOVQD in0, in1 \ + VPMOVQD in2, in3 \ + VINSERTI64X4 $1, in3, in0, in0 \ +TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 // prepare constants needed for mul and reduce ops MOVD $const_q, AX VPBROADCASTD AX, Z2 @@ -34,32 +107,12 @@ TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 MOVQ R15, DX ADDQ BX, DX - // performs a butterfly between 2 vectors of dwords - // first vector is in [0, q) and second vector is in [0, 2q) -#define BUTTERFLYD2Q(in0, in1) \ - VPADDD in0, in1, Z3 \ - VPSUBD in1, in0, in1 \ - VPSUBD Z2, Z3, in0 \ - VPMINUD Z3, in0, in0 \ - VPADDD Z2, in1, in1 \ - -#define MUL(in0, in1) \ - VPMULUDQ in0, in1, Z12 \ - VPANDQ Z11, Z12, Z10 \ - VPMULUDQ Z10, Z9, Z10 \ - VPANDQ Z11, Z10, Z10 \ - VPMULUDQ Z10, Z8, Z10 \ - VPADDQ Z12, Z10, Z12 \ - VPSRLQ $32, Z12, Z12 \ - VPSUBQ Z8, Z12, Z10 \ - VPMINUQ Z12, Z10, in0 \ - loop_3: TESTQ SI, SI JEQ done_2 // n == 0, we are done VMOVDQA32 0(R15), Z0 // load a[i] VMOVDQA32 0(DX), Z1 // load a[i+m] - BUTTERFLYD2Q(Z0, Z1) + BUTTERFLYD2Q(Z0, Z1, Z2, Z3) VMOVDQA32 Z0, 0(R15) // store a[i] VEXTRACTI32X8 $0, Z1, Y20 VEXTRACTI32X8 $1, Z1, Y21 @@ -67,8 +120,8 @@ loop_3: VPMOVZXDQ Y21, Z14 VPMOVZXDQ 0(CX), Z15 VPMOVZXDQ 32(CX), Z16 - MUL(Z13, Z15) - MUL(Z14, Z16) + MUL(Z13, Z15, Z11, Z8, Z9, Z12, Z10) + MUL(Z14, Z16, Z11, Z8, Z9, Z12, Z10) VPMOVQD Z13, 0(DX) VPMOVQD Z14, 32(DX) ADDQ $64, R15 @@ -114,45 +167,24 @@ TEXT ·kerDIFNP_128_avx512(SB), NOSPLIT, $0-56 ADDQ AX, CX // we want twiddles[stage] as starting point // load a[:128] in registers - VPMOVZXDQ 0(R15), Z0 - VPMOVZXDQ 32(R15), Z1 - VPMOVZXDQ 64(R15), Z2 - VPMOVZXDQ 96(R15), Z3 - VPMOVZXDQ 128(R15), Z4 - VPMOVZXDQ 
160(R15), Z5 - VPMOVZXDQ 192(R15), Z6 - VPMOVZXDQ 224(R15), Z7 - VPMOVZXDQ 256(R15), Z8 - VPMOVZXDQ 288(R15), Z9 - VPMOVZXDQ 320(R15), Z10 - VPMOVZXDQ 352(R15), Z11 - VPMOVZXDQ 384(R15), Z12 - VPMOVZXDQ 416(R15), Z13 - VPMOVZXDQ 448(R15), Z14 - VPMOVZXDQ 480(R15), Z15 - - // butterfly computes - // in0 = in0 + in1 (in [0,q)) - // in1 = in0 - in1 (in [0,2q)) -#define BUTTERFLY(in0, in1) \ - VPADDQ in0, in1, Z21 \ - VPSUBQ in1, in0, in1 \ - VPSUBQ Z17, Z21, in0 \ - VPMINUQ Z21, in0, in0 \ - VPADDQ Z17, in1, in1 \ - -// mul computes x = x * y -#define MUL_0(in0, in1) \ - VPMULUDQ in0, in1, Z16 \ - VPANDQ Z20, Z16, Z19 \ - VPMULUDQ Z19, Z18, Z19 \ - VPANDQ Z20, Z19, Z19 \ - VPMULUDQ Z19, Z17, Z19 \ - VPADDQ Z16, Z19, Z16 \ - VPSRLQ $32, Z16, Z16 \ - VPSUBQ Z17, Z16, Z19 \ - VPMINUQ Z16, Z19, in0 \ - + VPMOVZXDQ 0(R15), Z0 + VPMOVZXDQ 32(R15), Z1 + VPMOVZXDQ 64(R15), Z2 + VPMOVZXDQ 96(R15), Z3 + VPMOVZXDQ 128(R15), Z4 + VPMOVZXDQ 160(R15), Z5 + VPMOVZXDQ 192(R15), Z6 + VPMOVZXDQ 224(R15), Z7 + VPMOVZXDQ 256(R15), Z8 + VPMOVZXDQ 288(R15), Z9 + VPMOVZXDQ 320(R15), Z10 + VPMOVZXDQ 352(R15), Z11 + VPMOVZXDQ 384(R15), Z12 + VPMOVZXDQ 416(R15), Z13 + VPMOVZXDQ 448(R15), Z14 + VPMOVZXDQ 480(R15), Z15 + MOVQ $0x0000000000000f0f, AX + KMOVQ AX, K1 MOVQ 0(CX), BX VPMOVZXDQ 0(BX), Z23 VPMOVZXDQ 32(BX), Z24 @@ -162,127 +194,119 @@ TEXT ·kerDIFNP_128_avx512(SB), NOSPLIT, $0-56 VPMOVZXDQ 160(BX), Z28 VPMOVZXDQ 192(BX), Z29 VPMOVZXDQ 224(BX), Z30 - BUTTERFLY(Z0, Z8) - MUL_0(Z8, Z23) - BUTTERFLY(Z1, Z9) - MUL_0(Z9, Z24) - BUTTERFLY(Z2, Z10) - MUL_0(Z10, Z25) - BUTTERFLY(Z3, Z11) - MUL_0(Z11, Z26) - BUTTERFLY(Z4, Z12) - MUL_0(Z12, Z27) - BUTTERFLY(Z5, Z13) - MUL_0(Z13, Z28) - BUTTERFLY(Z6, Z14) - MUL_0(Z14, Z29) - BUTTERFLY(Z7, Z15) - MUL_0(Z15, Z30) + BUTTERFLYQ2Q(Z0, Z8, Z17, Z21) + MUL(Z8, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z1, Z9, Z17, Z21) + MUL(Z9, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z2, Z10, Z17, Z21) + MUL(Z10, Z25, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z3, 
Z11, Z17, Z21) + MUL(Z11, Z26, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z4, Z12, Z17, Z21) + MUL(Z12, Z27, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z5, Z13, Z17, Z21) + MUL(Z13, Z28, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z6, Z14, Z17, Z21) + MUL(Z14, Z29, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z7, Z15, Z17, Z21) + MUL(Z15, Z30, Z20, Z17, Z18, Z16, Z19) ADDQ $24, CX MOVQ 0(CX), BX VPMOVZXDQ 0(BX), Z23 VPMOVZXDQ 32(BX), Z24 VPMOVZXDQ 64(BX), Z25 VPMOVZXDQ 96(BX), Z26 - BUTTERFLY(Z0, Z4) - MUL_0(Z4, Z23) - BUTTERFLY(Z1, Z5) - MUL_0(Z5, Z24) - BUTTERFLY(Z2, Z6) - MUL_0(Z6, Z25) - BUTTERFLY(Z3, Z7) - MUL_0(Z7, Z26) - BUTTERFLY(Z8, Z12) - MUL_0(Z12, Z23) - BUTTERFLY(Z9, Z13) - MUL_0(Z13, Z24) - BUTTERFLY(Z10, Z14) - MUL_0(Z14, Z25) - BUTTERFLY(Z11, Z15) - MUL_0(Z15, Z26) + BUTTERFLYQ2Q(Z0, Z4, Z17, Z21) + MUL(Z4, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z1, Z5, Z17, Z21) + MUL(Z5, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z2, Z6, Z17, Z21) + MUL(Z6, Z25, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z3, Z7, Z17, Z21) + MUL(Z7, Z26, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z8, Z12, Z17, Z21) + MUL(Z12, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z9, Z13, Z17, Z21) + MUL(Z13, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z10, Z14, Z17, Z21) + MUL(Z14, Z25, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z11, Z15, Z17, Z21) + MUL(Z15, Z26, Z20, Z17, Z18, Z16, Z19) ADDQ $24, CX MOVQ 0(CX), BX VPMOVZXDQ 0(BX), Z23 VPMOVZXDQ 32(BX), Z24 - BUTTERFLY(Z0, Z2) - MUL_0(Z2, Z23) - BUTTERFLY(Z1, Z3) - MUL_0(Z3, Z24) - BUTTERFLY(Z4, Z6) - MUL_0(Z6, Z23) - BUTTERFLY(Z5, Z7) - MUL_0(Z7, Z24) - BUTTERFLY(Z8, Z10) - MUL_0(Z10, Z23) - BUTTERFLY(Z9, Z11) - MUL_0(Z11, Z24) - BUTTERFLY(Z12, Z14) - MUL_0(Z14, Z23) - BUTTERFLY(Z13, Z15) - MUL_0(Z15, Z24) + BUTTERFLYQ2Q(Z0, Z2, Z17, Z21) + MUL(Z2, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z1, Z3, Z17, Z21) + MUL(Z3, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z4, Z6, Z17, Z21) + MUL(Z6, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z5, Z7, Z17, Z21) 
+ MUL(Z7, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z8, Z10, Z17, Z21) + MUL(Z10, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z9, Z11, Z17, Z21) + MUL(Z11, Z24, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z12, Z14, Z17, Z21) + MUL(Z14, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z13, Z15, Z17, Z21) + MUL(Z15, Z24, Z20, Z17, Z18, Z16, Z19) ADDQ $24, CX MOVQ 0(CX), BX VPMOVZXDQ 0(BX), Z23 - BUTTERFLY(Z0, Z1) - MUL_0(Z1, Z23) - BUTTERFLY(Z2, Z3) - MUL_0(Z3, Z23) - BUTTERFLY(Z4, Z5) - MUL_0(Z5, Z23) - BUTTERFLY(Z6, Z7) - MUL_0(Z7, Z23) - BUTTERFLY(Z8, Z9) - MUL_0(Z9, Z23) - BUTTERFLY(Z10, Z11) - MUL_0(Z11, Z23) - BUTTERFLY(Z12, Z13) - MUL_0(Z13, Z23) - BUTTERFLY(Z14, Z15) - MUL_0(Z15, Z23) + BUTTERFLYQ2Q(Z0, Z1, Z17, Z21) + MUL(Z1, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z2, Z3, Z17, Z21) + MUL(Z3, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z4, Z5, Z17, Z21) + MUL(Z5, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z6, Z7, Z17, Z21) + MUL(Z7, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z8, Z9, Z17, Z21) + MUL(Z9, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z10, Z11, Z17, Z21) + MUL(Z11, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z12, Z13, Z17, Z21) + MUL(Z13, Z23, Z20, Z17, Z18, Z16, Z19) + BUTTERFLYQ2Q(Z14, Z15, Z17, Z21) + MUL(Z15, Z23, Z20, Z17, Z18, Z16, Z19) ADDQ $24, CX MOVQ 0(CX), BX - VPMOVZXDQ 0(BX), Y23 // zero extend 4x uint32 to 4x uint64 + VPMOVZXDQ 0(BX), Y23 // zero extend 4x uint32 to 4x uint64 VINSERTI64X4 $1, Y23, Z23, Z23 - MOVQ $0x0000000000000f0f, AX - KMOVQ AX, K1 - -#define PERMUTE4X4(in0, in1) \ - VSHUFI64X2 $0x000000000000004e, in1, in0, Z21 \ - VPBLENDMQ in0, Z21, K1, in0 \ - VPBLENDMQ Z21, in1, K1, in1 \ - - PERMUTE4X4(Z0, Z1) - BUTTERFLY(Z0, Z1) - MUL_0(Z1, Z23) - PERMUTE4X4(Z0, Z1) - PERMUTE4X4(Z2, Z3) - BUTTERFLY(Z2, Z3) - MUL_0(Z3, Z23) - PERMUTE4X4(Z2, Z3) - PERMUTE4X4(Z4, Z5) - BUTTERFLY(Z4, Z5) - MUL_0(Z5, Z23) - PERMUTE4X4(Z4, Z5) - PERMUTE4X4(Z6, Z7) - BUTTERFLY(Z6, Z7) - MUL_0(Z7, Z23) - PERMUTE4X4(Z6, Z7) - PERMUTE4X4(Z8, 
Z9) - BUTTERFLY(Z8, Z9) - MUL_0(Z9, Z23) - PERMUTE4X4(Z8, Z9) - PERMUTE4X4(Z10, Z11) - BUTTERFLY(Z10, Z11) - MUL_0(Z11, Z23) - PERMUTE4X4(Z10, Z11) - PERMUTE4X4(Z12, Z13) - BUTTERFLY(Z12, Z13) - MUL_0(Z13, Z23) - PERMUTE4X4(Z12, Z13) - PERMUTE4X4(Z14, Z15) - BUTTERFLY(Z14, Z15) - MUL_0(Z15, Z23) - PERMUTE4X4(Z14, Z15) + PERMUTE4X4(Z0, Z1, Z21, K1) + BUTTERFLYQ2Q(Z0, Z1, Z17, Z21) + MUL(Z1, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z0, Z1, Z21, K1) + PERMUTE4X4(Z2, Z3, Z21, K1) + BUTTERFLYQ2Q(Z2, Z3, Z17, Z21) + MUL(Z3, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z2, Z3, Z21, K1) + PERMUTE4X4(Z4, Z5, Z21, K1) + BUTTERFLYQ2Q(Z4, Z5, Z17, Z21) + MUL(Z5, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z4, Z5, Z21, K1) + PERMUTE4X4(Z6, Z7, Z21, K1) + BUTTERFLYQ2Q(Z6, Z7, Z17, Z21) + MUL(Z7, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z6, Z7, Z21, K1) + PERMUTE4X4(Z8, Z9, Z21, K1) + BUTTERFLYQ2Q(Z8, Z9, Z17, Z21) + MUL(Z9, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z8, Z9, Z21, K1) + PERMUTE4X4(Z10, Z11, Z21, K1) + BUTTERFLYQ2Q(Z10, Z11, Z17, Z21) + MUL(Z11, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z10, Z11, Z21, K1) + PERMUTE4X4(Z12, Z13, Z21, K1) + BUTTERFLYQ2Q(Z12, Z13, Z17, Z21) + MUL(Z13, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z12, Z13, Z21, K1) + PERMUTE4X4(Z14, Z15, Z21, K1) + BUTTERFLYQ2Q(Z14, Z15, Z17, Z21) + MUL(Z15, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE4X4(Z14, Z15, Z21, K1) ADDQ $24, CX MOVQ 0(CX), BX VPMOVZXDQ 0(BX), X23 // zero extend 2x uint32 to 2x uint64 @@ -293,89 +317,62 @@ TEXT ·kerDIFNP_128_avx512(SB), NOSPLIT, $0-56 KMOVQ AX, K2 MOVQ ·vInterleaveIndices+0(SB), DI VMOVDQU64 0(DI), Z30 - -#define PERMUTE2X2(in0, in1) \ - VMOVDQA64 Z30, Z29 \ - VPERMI2Q in1, in0, Z29 \ - VPBLENDMQ in0, Z29, K2, in0 \ - VPBLENDMQ Z29, in1, K2, in1 \ - - PERMUTE2X2(Z0, Z1) - BUTTERFLY(Z0, Z1) - MUL_0(Z1, Z23) - PERMUTE2X2(Z0, Z1) - PERMUTE2X2(Z2, Z3) - BUTTERFLY(Z2, Z3) - MUL_0(Z3, Z23) - PERMUTE2X2(Z2, Z3) - PERMUTE2X2(Z4, Z5) - BUTTERFLY(Z4, Z5) - MUL_0(Z5, Z23) - 
PERMUTE2X2(Z4, Z5) - PERMUTE2X2(Z6, Z7) - BUTTERFLY(Z6, Z7) - MUL_0(Z7, Z23) - PERMUTE2X2(Z6, Z7) - PERMUTE2X2(Z8, Z9) - BUTTERFLY(Z8, Z9) - MUL_0(Z9, Z23) - PERMUTE2X2(Z8, Z9) - PERMUTE2X2(Z10, Z11) - BUTTERFLY(Z10, Z11) - MUL_0(Z11, Z23) - PERMUTE2X2(Z10, Z11) - PERMUTE2X2(Z12, Z13) - BUTTERFLY(Z12, Z13) - MUL_0(Z13, Z23) - PERMUTE2X2(Z12, Z13) - PERMUTE2X2(Z14, Z15) - BUTTERFLY(Z14, Z15) - MUL_0(Z15, Z23) - PERMUTE2X2(Z14, Z15) - MOVQ $0x0000000000005555, AX - KMOVD AX, K3 - -#define PERMUTE1X1(in0, in1) \ - VPSHRDQ $32, in1, in0, Z21 \ - VPBLENDMD in0, Z21, K3, in0 \ - VPBLENDMD Z21, in1, K3, in1 \ - + PERMUTE2X2(Z0, Z1, Z30, Z29, K2) + BUTTERFLYQ2Q(Z0, Z1, Z17, Z21) + MUL(Z1, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z0, Z1, Z30, Z29, K2) + PERMUTE2X2(Z2, Z3, Z30, Z29, K2) + BUTTERFLYQ2Q(Z2, Z3, Z17, Z21) + MUL(Z3, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z2, Z3, Z30, Z29, K2) + PERMUTE2X2(Z4, Z5, Z30, Z29, K2) + BUTTERFLYQ2Q(Z4, Z5, Z17, Z21) + MUL(Z5, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z4, Z5, Z30, Z29, K2) + PERMUTE2X2(Z6, Z7, Z30, Z29, K2) + BUTTERFLYQ2Q(Z6, Z7, Z17, Z21) + MUL(Z7, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z6, Z7, Z30, Z29, K2) + PERMUTE2X2(Z8, Z9, Z30, Z29, K2) + BUTTERFLYQ2Q(Z8, Z9, Z17, Z21) + MUL(Z9, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z8, Z9, Z30, Z29, K2) + PERMUTE2X2(Z10, Z11, Z30, Z29, K2) + BUTTERFLYQ2Q(Z10, Z11, Z17, Z21) + MUL(Z11, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z10, Z11, Z30, Z29, K2) + PERMUTE2X2(Z12, Z13, Z30, Z29, K2) + BUTTERFLYQ2Q(Z12, Z13, Z17, Z21) + MUL(Z13, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z12, Z13, Z30, Z29, K2) + PERMUTE2X2(Z14, Z15, Z30, Z29, K2) + BUTTERFLYQ2Q(Z14, Z15, Z17, Z21) + MUL(Z15, Z23, Z20, Z17, Z18, Z16, Z19) + PERMUTE2X2(Z14, Z15, Z30, Z29, K2) + MOVQ $0x0000000000005555, AX + KMOVD AX, K3 MOVD $const_q, AX - VPBROADCASTD AX, Z17 // rebroadcast q, but on dword lanes - -#define LASTBUTTERFLY(in0, in1) \ - VPADDD in0, in1, Z21 \ - VPSUBD in1, in0, in1 \ - VPSUBD 
Z17, Z21, in0 \ - VPMINUD Z21, in0, in0 \ - VPADDD Z17, in1, Z22 \ - VPMINUD Z22, in1, in1 \ - -#define PACK_DWORDS(in0, in1, in2, in3) \ - VPMOVQD in0, in1 \ - VPMOVQD in2, in3 \ - VINSERTI64X4 $1, in3, in0, in0 \ - + VPBROADCASTD AX, Z17 // rebroadcast q, but on dword lanes PACK_DWORDS(Z0, Y0, Z1, Y1) PACK_DWORDS(Z2, Y2, Z3, Y3) - PERMUTE1X1(Z0, Z2) - LASTBUTTERFLY(Z0, Z2) - PERMUTE1X1(Z0, Z2) + PERMUTE1X1(Z0, Z2, Z21, K3) + BUTTERFLYD1Q(Z0, Z2, Z17, Z21, Z22) + PERMUTE1X1(Z0, Z2, Z21, K3) PACK_DWORDS(Z4, Y4, Z5, Y5) PACK_DWORDS(Z6, Y6, Z7, Y7) - PERMUTE1X1(Z4, Z6) - LASTBUTTERFLY(Z4, Z6) - PERMUTE1X1(Z4, Z6) + PERMUTE1X1(Z4, Z6, Z21, K3) + BUTTERFLYD1Q(Z4, Z6, Z17, Z21, Z22) + PERMUTE1X1(Z4, Z6, Z21, K3) PACK_DWORDS(Z8, Y8, Z9, Y9) PACK_DWORDS(Z10, Y10, Z11, Y11) - PERMUTE1X1(Z8, Z10) - LASTBUTTERFLY(Z8, Z10) - PERMUTE1X1(Z8, Z10) + PERMUTE1X1(Z8, Z10, Z21, K3) + BUTTERFLYD1Q(Z8, Z10, Z17, Z21, Z22) + PERMUTE1X1(Z8, Z10, Z21, K3) PACK_DWORDS(Z12, Y12, Z13, Y13) PACK_DWORDS(Z14, Y14, Z15, Y15) - PERMUTE1X1(Z12, Z14) - LASTBUTTERFLY(Z12, Z14) - PERMUTE1X1(Z12, Z14) + PERMUTE1X1(Z12, Z14, Z21, K3) + BUTTERFLYD1Q(Z12, Z14, Z17, Z21, Z22) + PERMUTE1X1(Z12, Z14, Z21, K3) // store a[:128] in memory VMOVDQA32 Z0, 0(R15) From 7ea239f07012ab5e8558b06c69e17460ce103da8 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 21 Jan 2025 22:47:21 +0000 Subject: [PATCH 4/9] feat: add minimal DIT avx512 acceleration for koalabear --- field/babybear/fft/kernel_amd64.s | 76 ++++++++++- field/generator/asm/amd64/build.go | 2 +- field/generator/asm/amd64/element_vec_F31.go | 127 ++++++++++++++++++ .../internal/templates/fft/fft.go.tmpl | 18 ++- .../templates/fft/kernel.amd64.go.tmpl | 11 ++ .../templates/fft/kernel.purego.go.tmpl | 4 + .../internal/templates/fft/tests/fft.go.tmpl | 17 +++ field/koalabear/fft/fft.go | 10 +- field/koalabear/fft/fft_test.go | 17 +++ field/koalabear/fft/kernel_amd64.go | 11 ++ field/koalabear/fft/kernel_amd64.s | 76 ++++++++++- 
field/koalabear/fft/kernel_purego.go | 4 + 12 files changed, 345 insertions(+), 28 deletions(-) diff --git a/field/babybear/fft/kernel_amd64.s b/field/babybear/fft/kernel_amd64.s index 9f4fb27135..424db40757 100644 --- a/field/babybear/fft/kernel_amd64.s +++ b/field/babybear/fft/kernel_amd64.s @@ -34,6 +34,14 @@ VPMINUQ in3, in0, in0 \ VPADDQ in2, in1, in1 \ +#define BUTTERFLYQ1Q(in0, in1, in2, in3, in4) \ + VPADDQ in0, in1, in3 \ + VPSUBQ in1, in0, in1 \ + VPSUBQ in2, in3, in0 \ + VPMINUQ in3, in0, in0 \ + VPADDQ in2, in1, in4 \ + VPMINUQ in4, in1, in1 \ + // performs a multiplication in place between 2 vectors of qwords (values should be dwords zero extended) // in0 = (in0 * in1) mod q // in1: second operand @@ -85,6 +93,62 @@ VPMOVQD in2, in3 \ VINSERTI64X4 $1, in3, in0, in0 \ +TEXT ·innerDITWithTwiddles_avx512(SB), NOSPLIT, $0-72 + // prepare constants needed for mul and reduce ops + MOVD $const_q, AX + VPBROADCASTQ AX, Z8 + MOVD $const_qInvNeg, AX + VPBROADCASTQ AX, Z9 + VPCMPEQB Y0, Y0, Y0 + VPMOVZXDQ Y0, Z11 + + // load arguments + MOVQ a+0(FP), R15 + MOVQ twiddles+24(FP), CX + MOVQ end+56(FP), SI + MOVQ m+64(FP), BX + CMPQ BX, $0x0000000000000008 + JL smallerThan8_1 // m < 8 + SHRQ $3, SI // we are processing 8 elements at a time + SHLQ $2, BX // offset = m * 4bytes + MOVQ R15, DX + ADDQ BX, DX + +loop_3: + TESTQ SI, SI + JEQ done_2 // n == 0, we are done + VPMOVZXDQ 0(R15), Z0 // load a[i] + VPMOVZXDQ 0(DX), Z1 // load a[i+m] + VPMOVZXDQ 0(CX), Z15 + MUL(Z1, Z15, Z11, Z8, Z9, Z12, Z10) + BUTTERFLYQ1Q(Z0, Z1, Z8, Z3, Z4) + VPMOVQD Z0, 0(R15) // store a[i] + VPMOVQD Z1, 0(DX) // store a[i+m] + ADDQ $32, R15 + ADDQ $32, DX + ADDQ $32, CX + DECQ SI // decrement n + JMP loop_3 + +done_2: + RET + +smallerThan8_1: + // m < 8, we call the generic one + // note that this should happen only when doing a FFT smaller than the smallest generated kernel + MOVQ a+0(FP), AX + MOVQ AX, (SP) + MOVQ twiddles+24(FP), AX + MOVQ AX, 24(SP) + MOVQ start+48(FP), AX + MOVQ AX, 
48(SP) + MOVQ end+56(FP), AX + MOVQ AX, 56(SP) + MOVQ m+64(FP), AX + MOVQ AX, 64(SP) + CALL ·innerDITWithTwiddlesGeneric(SB) + RET + TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 // prepare constants needed for mul and reduce ops MOVD $const_q, AX @@ -101,15 +165,15 @@ TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 MOVQ end+56(FP), SI MOVQ m+64(FP), BX CMPQ BX, $0x0000000000000010 - JL smallerThan16_1 // m < 16 + JL smallerThan16_4 // m < 16 SHRQ $4, SI // we are processing 16 elements at a time SHLQ $2, BX // offset = m * 4bytes MOVQ R15, DX ADDQ BX, DX -loop_3: +loop_6: TESTQ SI, SI - JEQ done_2 // n == 0, we are done + JEQ done_5 // n == 0, we are done VMOVDQA32 0(R15), Z0 // load a[i] VMOVDQA32 0(DX), Z1 // load a[i+m] BUTTERFLYD2Q(Z0, Z1, Z2, Z3) @@ -128,12 +192,12 @@ loop_3: ADDQ $64, DX ADDQ $64, CX DECQ SI // decrement n - JMP loop_3 + JMP loop_6 -done_2: +done_5: RET -smallerThan16_1: +smallerThan16_4: // m < 16, we call the generic one // note that this should happen only when doing a FFT smaller than the smallest generated kernel MOVQ a+0(FP), AX diff --git a/field/generator/asm/amd64/build.go b/field/generator/asm/amd64/build.go index 395e6b256d..0397999023 100644 --- a/field/generator/asm/amd64/build.go +++ b/field/generator/asm/amd64/build.go @@ -275,7 +275,7 @@ func GenerateF31FFTKernels(w io.Writer, nbBits int, kernels []int) error { f.WriteLn("") f.generateFFTDefinesF31() - + f.generateFFTInnerDITF31() f.generateFFTInnerDIFF31() for _, ksize := range kernels { diff --git a/field/generator/asm/amd64/element_vec_F31.go b/field/generator/asm/amd64/element_vec_F31.go index 36f037b6e9..c6f5132886 100644 --- a/field/generator/asm/amd64/element_vec_F31.go +++ b/field/generator/asm/amd64/element_vec_F31.go @@ -519,6 +519,21 @@ func (f *FFAmd64) generateFFTDefinesF31() { f.VPADDQ(q, y, y) // y = (x-y) + q --> y in [0,2q) }) + _ = f.Define("butterflyQ1Q", 5, func(args ...amd64.Register) { + x := args[0] + y := args[1] + q := args[2] + b0 := 
args[3] + b1 := args[4] + + f.VPADDQ(x, y, b0) // b0 = x + y + f.VPSUBQ(y, x, y) // y = x - y + f.VPSUBQ(q, b0, x) // x = (x+y) - q + f.VPMINUQ(b0, x, x) // x %= q + f.VPADDQ(q, y, b1) // y = (x-y) + q --> y in [0,2q) + f.VPMINUQ(b1, y, y) // y %= q + }) + f.Comment("performs a multiplication in place between 2 vectors of qwords (values should be dwords zero extended)") f.Comment("in0 = (in0 * in1) mod q") f.Comment("in1: second operand") @@ -605,6 +620,118 @@ func (f *FFAmd64) generateFFTDefinesF31() { }) } +func (f *FFAmd64) generateFFTInnerDITF31() { + // func innerDITWithTwiddles(a []Element, twiddles []Element, start, end, m int) { + // for i := start; i < end; i++ { + // a[i+m].Mul(&a[i+m], &twiddles[i]) + // Butterfly(&a[i], &a[i+m]) + // } + // } + const argSize = 9 * 8 + stackSize := f.StackSize(f.NbWords*2+4, 1, 0) + registers := f.FnHeader("innerDITWithTwiddles_avx512", stackSize, argSize, amd64.AX) + defer f.AssertCleanStack(stackSize, 0) + + addrA := f.Pop(&registers) + addrAPlusM := f.Pop(&registers) + addrTwiddles := f.Pop(&registers) + m := f.Pop(&registers) + len := f.Pop(&registers) + + a := amd64.Register("Z0") + am := amd64.Register("Z1") + b0 := amd64.Register("Z3") + b1 := amd64.Register("Z4") + q := amd64.Register("Z8") + qInvNeg := amd64.Register("Z9") + PL := amd64.Register("Z10") + LSW := amd64.Register("Z11") + P := amd64.Register("Z12") + t0 := amd64.Register("Z15") + + f.Comment("prepare constants needed for mul and reduce ops") + f.WriteLn("MOVD $const_q, AX") + f.VPBROADCASTQ(amd64.AX, q) + f.WriteLn("MOVD $const_qInvNeg, AX") + f.VPBROADCASTQ(amd64.AX, qInvNeg) + f.VPCMPEQB("Y0", "Y0", "Y0") + f.VPMOVZXDQ("Y0", LSW) + + f.Comment("load arguments") + f.MOVQ("a+0(FP)", addrA) + f.MOVQ("twiddles+24(FP)", addrTwiddles) + f.MOVQ("end+56(FP)", len) + f.MOVQ("m+64(FP)", m) + + // get defines + butterflyQ1Q, _ := f.DefineFn("butterflyQ1Q") + mul, _ := f.DefineFn("mul") + + // we do only m >= 8; + // if m < 8, we call the generic one; this can be called
when doing a FFT + // smaller than the smallest generated kernel + lblSmallerThan8 := f.NewLabel("smallerThan8") + f.CMPQ(m, 8) + f.JL(lblSmallerThan8, "m < 8") + + f.SHRQ("$3", len, "we are processing 8 elements at a time") + + // offset we want to add to a is m*4bytes + f.SHLQ("$2", m, "offset = m * 4bytes") + + f.MOVQ(addrA, addrAPlusM) + f.ADDQ(m, addrAPlusM) + + lblDone := f.NewLabel("done") + lblLoop := f.NewLabel("loop") + + f.LABEL(lblLoop) + + f.TESTQ(len, len) + f.JEQ(lblDone, "n == 0, we are done") + + f.VPMOVZXDQ(addrA.At(0), a, "load a[i]") + f.VPMOVZXDQ(addrAPlusM.At(0), am, "load a[i+m]") + f.VPMOVZXDQ(addrTwiddles.At(0), t0) + + mul(am, t0, LSW, q, qInvNeg, P, PL) + butterflyQ1Q(a, am, q, b0, b1) + + // a is ready to be stored, but we need to scale am by twiddles. + f.VPMOVQD(a, addrA.At(0), "store a[i]") + f.VPMOVQD(am, addrAPlusM.At(0), "store a[i+m]") + + f.ADDQ("$32", addrA) + f.ADDQ("$32", addrAPlusM) + f.ADDQ("$32", addrTwiddles) + f.DECQ(len, "decrement n") + f.JMP(lblLoop) + + f.LABEL(lblDone) + + f.RET() + + f.LABEL(lblSmallerThan8) + f.Comment("m < 8, we call the generic one") + f.Comment("note that this should happen only when doing a FFT smaller than the smallest generated kernel") + + // TODO @gbotrel should have dedicated tests + f.MOVQ("a+0(FP)", amd64.AX) + f.MOVQ(amd64.AX, "(SP)") + f.MOVQ("twiddles+24(FP)", amd64.AX) + f.MOVQ(amd64.AX, "24(SP)") // go vet says 24(SP) should be a_cap+16(FP) + f.MOVQ("start+48(FP)", amd64.AX) + f.MOVQ(amd64.AX, "48(SP)") // go vet says 48(SP) should be twiddles_cap+40(FP) + f.MOVQ("end+56(FP)", amd64.AX) + f.MOVQ(amd64.AX, "56(SP)") + f.MOVQ("m+64(FP)", amd64.AX) + f.MOVQ(amd64.AX, "64(SP)") + + f.WriteLn("CALL ·innerDITWithTwiddlesGeneric(SB)") + f.RET() + +} + func (f *FFAmd64) generateFFTInnerDIFF31() { // func innerDIFWithTwiddles(a []Element, twiddles []Element, start, end, m int) { // for i := start; i < end; i++ { diff --git a/field/generator/internal/templates/fft/fft.go.tmpl 
b/field/generator/internal/templates/fft/fft.go.tmpl index c49a8c01f2..8a8eca2cbe 100644 --- a/field/generator/internal/templates/fft/fft.go.tmpl +++ b/field/generator/internal/templates/fft/fft.go.tmpl @@ -338,17 +338,21 @@ func ditFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E } return } - if parallelButterfly { - parallel.Execute(m, func(start, end int) { - innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], start, end, m) - }, nbTasks / (1 << (stage))) - } else { + {{- if .HasASMKernel}} innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) - } + {{- else}} + if parallelButterfly { + parallel.Execute(m, func(start, end int) { + innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], start, end, m) + }, nbTasks / (1 << (stage))) + } else { + innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) + } + {{- end}} } -func innerDITWithTwiddles(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) { if start == 0 { {{ .FF }}.Butterfly(&a[0], &a[m]) start++ diff --git a/field/generator/internal/templates/fft/kernel.amd64.go.tmpl b/field/generator/internal/templates/fft/kernel.amd64.go.tmpl index 0f0696319a..78ffb6c68b 100644 --- a/field/generator/internal/templates/fft/kernel.amd64.go.tmpl +++ b/field/generator/internal/templates/fft/kernel.amd64.go.tmpl @@ -21,6 +21,9 @@ var vInterleaveIndices = []uint64 { //go:noescape func innerDIFWithTwiddles_avx512(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) +//go:noescape +func innerDITWithTwiddles_avx512(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) + func innerDIFWithTwiddles(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) { if !supportAVX512 { innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) @@ -29,6 +32,14 @@ func innerDIFWithTwiddles(a []{{ .FF 
}}.Element, twiddles []{{ .FF }}.Element, s innerDIFWithTwiddles_avx512(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) { + if !supportAVX512 { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) + return + } + innerDITWithTwiddles_avx512(a, twiddles, start, end, m) +} + {{range $ki, $klog2 := $.Kernels}} {{- $ksize := shl 1 $klog2}} diff --git a/field/generator/internal/templates/fft/kernel.purego.go.tmpl b/field/generator/internal/templates/fft/kernel.purego.go.tmpl index f4caaac6f1..979d92553c 100644 --- a/field/generator/internal/templates/fft/kernel.purego.go.tmpl +++ b/field/generator/internal/templates/fft/kernel.purego.go.tmpl @@ -7,6 +7,10 @@ func innerDIFWithTwiddles(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, s innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + {{range $ki, $klog2 := $.Kernels}} {{- $ksize := shl 1 $klog2}} func kerDIFNP_{{$ksize}}(a []{{ $.FF }}.Element, twiddles [][]{{ $.FF }}.Element, stage int) { diff --git a/field/generator/internal/templates/fft/tests/fft.go.tmpl b/field/generator/internal/templates/fft/tests/fft.go.tmpl index 6275d2f069..92349398c8 100644 --- a/field/generator/internal/templates/fft/tests/fft.go.tmpl +++ b/field/generator/internal/templates/fft/tests/fft.go.tmpl @@ -286,6 +286,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]{{ .FF }}.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git 
a/field/koalabear/fft/fft.go b/field/koalabear/fft/fft.go index 535fa4ed9b..b0270656ff 100644 --- a/field/koalabear/fft/fft.go +++ b/field/koalabear/fft/fft.go @@ -325,16 +325,10 @@ func ditFFT(a []koalabear.Element, w koalabear.Element, twiddles [][]koalabear.E } return } - if parallelButterfly { - parallel.Execute(m, func(start, end int) { - innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], start, end, m) - }, nbTasks/(1<<(stage))) - } else { - innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) - } + innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) } -func innerDITWithTwiddles(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) { if start == 0 { koalabear.Butterfly(&a[0], &a[m]) start++ diff --git a/field/koalabear/fft/fft_test.go b/field/koalabear/fft/fft_test.go index f95d8f6612..5e8723f165 100644 --- a/field/koalabear/fft/fft_test.go +++ b/field/koalabear/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]koalabear.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git a/field/koalabear/fft/kernel_amd64.go b/field/koalabear/fft/kernel_amd64.go index 319190bb3d..670726f70e 100644 --- a/field/koalabear/fft/kernel_amd64.go +++ b/field/koalabear/fft/kernel_amd64.go @@ -29,6 +29,9 @@ var vInterleaveIndices = []uint64{ //go:noescape func innerDIFWithTwiddles_avx512(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) +//go:noescape +func innerDITWithTwiddles_avx512(a []koalabear.Element, twiddles 
[]koalabear.Element, start, end, m int) + func innerDIFWithTwiddles(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) { if !supportAVX512 { innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) @@ -37,6 +40,14 @@ func innerDIFWithTwiddles(a []koalabear.Element, twiddles []koalabear.Element, s innerDIFWithTwiddles_avx512(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) { + if !supportAVX512 { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) + return + } + innerDITWithTwiddles_avx512(a, twiddles, start, end, m) +} + //go:noescape func kerDIFNP_128_avx512(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) diff --git a/field/koalabear/fft/kernel_amd64.s b/field/koalabear/fft/kernel_amd64.s index 9f4fb27135..424db40757 100644 --- a/field/koalabear/fft/kernel_amd64.s +++ b/field/koalabear/fft/kernel_amd64.s @@ -34,6 +34,14 @@ VPMINUQ in3, in0, in0 \ VPADDQ in2, in1, in1 \ +#define BUTTERFLYQ1Q(in0, in1, in2, in3, in4) \ + VPADDQ in0, in1, in3 \ + VPSUBQ in1, in0, in1 \ + VPSUBQ in2, in3, in0 \ + VPMINUQ in3, in0, in0 \ + VPADDQ in2, in1, in4 \ + VPMINUQ in4, in1, in1 \ + // performs a multiplication in place between 2 vectors of qwords (values should be dwords zero extended) // in0 = (in0 * in1) mod q // in1: second operand @@ -85,6 +93,62 @@ VPMOVQD in2, in3 \ VINSERTI64X4 $1, in3, in0, in0 \ +TEXT ·innerDITWithTwiddles_avx512(SB), NOSPLIT, $0-72 + // prepare constants needed for mul and reduce ops + MOVD $const_q, AX + VPBROADCASTQ AX, Z8 + MOVD $const_qInvNeg, AX + VPBROADCASTQ AX, Z9 + VPCMPEQB Y0, Y0, Y0 + VPMOVZXDQ Y0, Z11 + + // load arguments + MOVQ a+0(FP), R15 + MOVQ twiddles+24(FP), CX + MOVQ end+56(FP), SI + MOVQ m+64(FP), BX + CMPQ BX, $0x0000000000000008 + JL smallerThan8_1 // m < 8 + SHRQ $3, SI // we are processing 8 elements at a time + SHLQ $2, BX // offset = m * 4bytes + MOVQ R15, DX + ADDQ BX, DX + +loop_3: + TESTQ SI, SI + 
JEQ done_2 // n == 0, we are done + VPMOVZXDQ 0(R15), Z0 // load a[i] + VPMOVZXDQ 0(DX), Z1 // load a[i+m] + VPMOVZXDQ 0(CX), Z15 + MUL(Z1, Z15, Z11, Z8, Z9, Z12, Z10) + BUTTERFLYQ1Q(Z0, Z1, Z8, Z3, Z4) + VPMOVQD Z0, 0(R15) // store a[i] + VPMOVQD Z1, 0(DX) // store a[i+m] + ADDQ $32, R15 + ADDQ $32, DX + ADDQ $32, CX + DECQ SI // decrement n + JMP loop_3 + +done_2: + RET + +smallerThan8_1: + // m < 8, we call the generic one + // note that this should happen only when doing a FFT smaller than the smallest generated kernel + MOVQ a+0(FP), AX + MOVQ AX, (SP) + MOVQ twiddles+24(FP), AX + MOVQ AX, 24(SP) + MOVQ start+48(FP), AX + MOVQ AX, 48(SP) + MOVQ end+56(FP), AX + MOVQ AX, 56(SP) + MOVQ m+64(FP), AX + MOVQ AX, 64(SP) + CALL ·innerDITWithTwiddlesGeneric(SB) + RET + TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 // prepare constants needed for mul and reduce ops MOVD $const_q, AX @@ -101,15 +165,15 @@ TEXT ·innerDIFWithTwiddles_avx512(SB), NOSPLIT, $0-72 MOVQ end+56(FP), SI MOVQ m+64(FP), BX CMPQ BX, $0x0000000000000010 - JL smallerThan16_1 // m < 16 + JL smallerThan16_4 // m < 16 SHRQ $4, SI // we are processing 16 elements at a time SHLQ $2, BX // offset = m * 4bytes MOVQ R15, DX ADDQ BX, DX -loop_3: +loop_6: TESTQ SI, SI - JEQ done_2 // n == 0, we are done + JEQ done_5 // n == 0, we are done VMOVDQA32 0(R15), Z0 // load a[i] VMOVDQA32 0(DX), Z1 // load a[i+m] BUTTERFLYD2Q(Z0, Z1, Z2, Z3) @@ -128,12 +192,12 @@ loop_3: ADDQ $64, DX ADDQ $64, CX DECQ SI // decrement n - JMP loop_3 + JMP loop_6 -done_2: +done_5: RET -smallerThan16_1: +smallerThan16_4: // m < 16, we call the generic one // note that this should happen only when doing a FFT smaller than the smallest generated kernel MOVQ a+0(FP), AX diff --git a/field/koalabear/fft/kernel_purego.go b/field/koalabear/fft/kernel_purego.go index 80b5ee1a62..b524bc4c9f 100644 --- a/field/koalabear/fft/kernel_purego.go +++ b/field/koalabear/fft/kernel_purego.go @@ -15,6 +15,10 @@ func innerDIFWithTwiddles(a 
[]koalabear.Element, twiddles []koalabear.Element, s innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_128(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { kerDIFNP_128generic(a, twiddles, stage) } From 183bfa75e41da8e4e85de8b57558d2291ed1b548 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 21 Jan 2025 22:53:34 +0000 Subject: [PATCH 5/9] chore: re ran go generate --- ecc/bls12-377/fr/fft/fft.go | 2 +- ecc/bls12-377/fr/fft/fft_test.go | 17 +++++++++++++++++ ecc/bls12-377/fr/fft/kernel_purego.go | 4 ++++ ecc/bls12-381/fr/fft/fft.go | 2 +- ecc/bls12-381/fr/fft/fft_test.go | 17 +++++++++++++++++ ecc/bls12-381/fr/fft/kernel_purego.go | 4 ++++ ecc/bls24-315/fr/fft/fft.go | 2 +- ecc/bls24-315/fr/fft/fft_test.go | 17 +++++++++++++++++ ecc/bls24-315/fr/fft/kernel_purego.go | 4 ++++ ecc/bls24-317/fr/fft/fft.go | 2 +- ecc/bls24-317/fr/fft/fft_test.go | 17 +++++++++++++++++ ecc/bls24-317/fr/fft/kernel_purego.go | 4 ++++ ecc/bn254/fr/fft/fft.go | 2 +- ecc/bn254/fr/fft/fft_test.go | 17 +++++++++++++++++ ecc/bn254/fr/fft/kernel_purego.go | 4 ++++ ecc/bw6-633/fr/fft/fft.go | 2 +- ecc/bw6-633/fr/fft/fft_test.go | 17 +++++++++++++++++ ecc/bw6-633/fr/fft/kernel_purego.go | 4 ++++ ecc/bw6-761/fr/fft/fft.go | 2 +- ecc/bw6-761/fr/fft/fft_test.go | 17 +++++++++++++++++ ecc/bw6-761/fr/fft/kernel_purego.go | 4 ++++ field/babybear/fft/fft.go | 10 ++-------- field/babybear/fft/fft_test.go | 17 +++++++++++++++++ field/babybear/fft/kernel_amd64.go | 11 +++++++++++ field/babybear/fft/kernel_purego.go | 4 ++++ field/goldilocks/fft/fft.go | 2 +- field/goldilocks/fft/fft_test.go | 17 +++++++++++++++++ field/goldilocks/fft/kernel_purego.go | 4 ++++ 28 files changed, 210 insertions(+), 16 deletions(-) diff --git a/ecc/bls12-377/fr/fft/fft.go b/ecc/bls12-377/fr/fft/fft.go index 
32b74acfc0..ff9dbf964d 100644 --- a/ecc/bls12-377/fr/fft/fft.go +++ b/ecc/bls12-377/fr/fft/fft.go @@ -346,7 +346,7 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } } -func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ diff --git a/ecc/bls12-377/fr/fft/fft_test.go b/ecc/bls12-377/fr/fft/fft_test.go index 4c47f0f5b4..cf36f3e33c 100644 --- a/ecc/bls12-377/fr/fft/fft_test.go +++ b/ecc/bls12-377/fr/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git a/ecc/bls12-377/fr/fft/kernel_purego.go b/ecc/bls12-377/fr/fft/kernel_purego.go index 4d89a65a2b..4c1534263f 100644 --- a/ecc/bls12-377/fr/fft/kernel_purego.go +++ b/ecc/bls12-377/fr/fft/kernel_purego.go @@ -13,6 +13,10 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { kerDIFNP_32generic(a, twiddles, stage) } diff --git a/ecc/bls12-381/fr/fft/fft.go b/ecc/bls12-381/fr/fft/fft.go index 088fb10e91..b938f47f6a 100644 --- a/ecc/bls12-381/fr/fft/fft.go +++ b/ecc/bls12-381/fr/fft/fft.go @@ -346,7 +346,7 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, 
twiddlesStart } } -func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ diff --git a/ecc/bls12-381/fr/fft/fft_test.go b/ecc/bls12-381/fr/fft/fft_test.go index a150e790dd..1d58599d73 100644 --- a/ecc/bls12-381/fr/fft/fft_test.go +++ b/ecc/bls12-381/fr/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git a/ecc/bls12-381/fr/fft/kernel_purego.go b/ecc/bls12-381/fr/fft/kernel_purego.go index c14f582e7e..cb984b95e3 100644 --- a/ecc/bls12-381/fr/fft/kernel_purego.go +++ b/ecc/bls12-381/fr/fft/kernel_purego.go @@ -13,6 +13,10 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { kerDIFNP_32generic(a, twiddles, stage) } diff --git a/ecc/bls24-315/fr/fft/fft.go b/ecc/bls24-315/fr/fft/fft.go index 1508d143a2..94ca100207 100644 --- a/ecc/bls24-315/fr/fft/fft.go +++ b/ecc/bls24-315/fr/fft/fft.go @@ -346,7 +346,7 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } } -func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m 
int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ diff --git a/ecc/bls24-315/fr/fft/fft_test.go b/ecc/bls24-315/fr/fft/fft_test.go index 7f6f55a59b..30b05ae4f8 100644 --- a/ecc/bls24-315/fr/fft/fft_test.go +++ b/ecc/bls24-315/fr/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git a/ecc/bls24-315/fr/fft/kernel_purego.go b/ecc/bls24-315/fr/fft/kernel_purego.go index fe96f2bbb2..830194890a 100644 --- a/ecc/bls24-315/fr/fft/kernel_purego.go +++ b/ecc/bls24-315/fr/fft/kernel_purego.go @@ -13,6 +13,10 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { kerDIFNP_32generic(a, twiddles, stage) } diff --git a/ecc/bls24-317/fr/fft/fft.go b/ecc/bls24-317/fr/fft/fft.go index 65a9e85f4d..3b0a18fd01 100644 --- a/ecc/bls24-317/fr/fft/fft.go +++ b/ecc/bls24-317/fr/fft/fft.go @@ -346,7 +346,7 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } } -func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ diff --git a/ecc/bls24-317/fr/fft/fft_test.go b/ecc/bls24-317/fr/fft/fft_test.go index db52f90a95..277bfc9ec5 100644 --- 
a/ecc/bls24-317/fr/fft/fft_test.go +++ b/ecc/bls24-317/fr/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git a/ecc/bls24-317/fr/fft/kernel_purego.go b/ecc/bls24-317/fr/fft/kernel_purego.go index 2a1738dcad..ed10583a78 100644 --- a/ecc/bls24-317/fr/fft/kernel_purego.go +++ b/ecc/bls24-317/fr/fft/kernel_purego.go @@ -13,6 +13,10 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { kerDIFNP_32generic(a, twiddles, stage) } diff --git a/ecc/bn254/fr/fft/fft.go b/ecc/bn254/fr/fft/fft.go index 4e972fb442..6eb9ac90e4 100644 --- a/ecc/bn254/fr/fft/fft.go +++ b/ecc/bn254/fr/fft/fft.go @@ -346,7 +346,7 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } } -func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ diff --git a/ecc/bn254/fr/fft/fft_test.go b/ecc/bn254/fr/fft/fft_test.go index c3e5a5e1da..fa2e6872a4 100644 --- a/ecc/bn254/fr/fft/fft_test.go +++ b/ecc/bn254/fr/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 
9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git a/ecc/bn254/fr/fft/kernel_purego.go b/ecc/bn254/fr/fft/kernel_purego.go index c7b657402c..2368b562bd 100644 --- a/ecc/bn254/fr/fft/kernel_purego.go +++ b/ecc/bn254/fr/fft/kernel_purego.go @@ -13,6 +13,10 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { kerDIFNP_32generic(a, twiddles, stage) } diff --git a/ecc/bw6-633/fr/fft/fft.go b/ecc/bw6-633/fr/fft/fft.go index dd6ff7fa41..4ca5ed0787 100644 --- a/ecc/bw6-633/fr/fft/fft.go +++ b/ecc/bw6-633/fr/fft/fft.go @@ -346,7 +346,7 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } } -func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ diff --git a/ecc/bw6-633/fr/fft/fft_test.go b/ecc/bw6-633/fr/fft/fft_test.go index ce7bb73ef1..1f03674697 100644 --- a/ecc/bw6-633/fr/fft/fft_test.go +++ b/ecc/bw6-633/fr/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, 
OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git a/ecc/bw6-633/fr/fft/kernel_purego.go b/ecc/bw6-633/fr/fft/kernel_purego.go index 1e53b9c614..8f37a5e72d 100644 --- a/ecc/bw6-633/fr/fft/kernel_purego.go +++ b/ecc/bw6-633/fr/fft/kernel_purego.go @@ -13,6 +13,10 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { kerDIFNP_32generic(a, twiddles, stage) } diff --git a/ecc/bw6-761/fr/fft/fft.go b/ecc/bw6-761/fr/fft/fft.go index b7996c817a..2a795a2a01 100644 --- a/ecc/bw6-761/fr/fft/fft.go +++ b/ecc/bw6-761/fr/fft/fft.go @@ -346,7 +346,7 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart } } -func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []fr.Element, twiddles []fr.Element, start, end, m int) { if start == 0 { fr.Butterfly(&a[0], &a[m]) start++ diff --git a/ecc/bw6-761/fr/fft/fft_test.go b/ecc/bw6-761/fr/fft/fft_test.go index b56e9c7fac..6ef7b6af0f 100644 --- a/ecc/bw6-761/fr/fft/fft_test.go +++ b/ecc/bw6-761/fr/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]fr.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git a/ecc/bw6-761/fr/fft/kernel_purego.go b/ecc/bw6-761/fr/fft/kernel_purego.go index 7f742c4043..99732a9744 100644 --- 
a/ecc/bw6-761/fr/fft/kernel_purego.go +++ b/ecc/bw6-761/fr/fft/kernel_purego.go @@ -13,6 +13,10 @@ func innerDIFWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m i innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []fr.Element, twiddles []fr.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { kerDIFNP_32generic(a, twiddles, stage) } diff --git a/field/babybear/fft/fft.go b/field/babybear/fft/fft.go index 8fbd1eb12a..eec062cfd4 100644 --- a/field/babybear/fft/fft.go +++ b/field/babybear/fft/fft.go @@ -325,16 +325,10 @@ func ditFFT(a []babybear.Element, w babybear.Element, twiddles [][]babybear.Elem } return } - if parallelButterfly { - parallel.Execute(m, func(start, end int) { - innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], start, end, m) - }, nbTasks/(1<<(stage))) - } else { - innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) - } + innerDITWithTwiddles(a, twiddles[stage-twiddlesStartStage], 0, m, m) } -func innerDITWithTwiddles(a []babybear.Element, twiddles []babybear.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []babybear.Element, twiddles []babybear.Element, start, end, m int) { if start == 0 { babybear.Butterfly(&a[0], &a[m]) start++ diff --git a/field/babybear/fft/fft_test.go b/field/babybear/fft/fft_test.go index daf36bfa85..5c81f7bcc0 100644 --- a/field/babybear/fft/fft_test.go +++ b/field/babybear/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]babybear.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b 
*testing.B) { const maxSize = 1 << 20 diff --git a/field/babybear/fft/kernel_amd64.go b/field/babybear/fft/kernel_amd64.go index 6a8c668f33..d2ba142a84 100644 --- a/field/babybear/fft/kernel_amd64.go +++ b/field/babybear/fft/kernel_amd64.go @@ -29,6 +29,9 @@ var vInterleaveIndices = []uint64{ //go:noescape func innerDIFWithTwiddles_avx512(a []babybear.Element, twiddles []babybear.Element, start, end, m int) +//go:noescape +func innerDITWithTwiddles_avx512(a []babybear.Element, twiddles []babybear.Element, start, end, m int) + func innerDIFWithTwiddles(a []babybear.Element, twiddles []babybear.Element, start, end, m int) { if !supportAVX512 { innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) @@ -37,6 +40,14 @@ func innerDIFWithTwiddles(a []babybear.Element, twiddles []babybear.Element, sta innerDIFWithTwiddles_avx512(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []babybear.Element, twiddles []babybear.Element, start, end, m int) { + if !supportAVX512 { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) + return + } + innerDITWithTwiddles_avx512(a, twiddles, start, end, m) +} + //go:noescape func kerDIFNP_128_avx512(a []babybear.Element, twiddles [][]babybear.Element, stage int) diff --git a/field/babybear/fft/kernel_purego.go b/field/babybear/fft/kernel_purego.go index 755fe17413..9fe7f3f769 100644 --- a/field/babybear/fft/kernel_purego.go +++ b/field/babybear/fft/kernel_purego.go @@ -15,6 +15,10 @@ func innerDIFWithTwiddles(a []babybear.Element, twiddles []babybear.Element, sta innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []babybear.Element, twiddles []babybear.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_128(a []babybear.Element, twiddles [][]babybear.Element, stage int) { kerDIFNP_128generic(a, twiddles, stage) } diff --git a/field/goldilocks/fft/fft.go b/field/goldilocks/fft/fft.go index ed3535a7cf..f63b9b1c88 100644 --- 
a/field/goldilocks/fft/fft.go +++ b/field/goldilocks/fft/fft.go @@ -346,7 +346,7 @@ func ditFFT(a []goldilocks.Element, w goldilocks.Element, twiddles [][]goldilock } } -func innerDITWithTwiddles(a []goldilocks.Element, twiddles []goldilocks.Element, start, end, m int) { +func innerDITWithTwiddlesGeneric(a []goldilocks.Element, twiddles []goldilocks.Element, start, end, m int) { if start == 0 { goldilocks.Butterfly(&a[0], &a[m]) start++ diff --git a/field/goldilocks/fft/fft_test.go b/field/goldilocks/fft/fft_test.go index eb98e00791..0a340ca35a 100644 --- a/field/goldilocks/fft/fft_test.go +++ b/field/goldilocks/fft/fft_test.go @@ -288,6 +288,23 @@ func BenchmarkFFTDITCosetReference(b *testing.B) { } } +func BenchmarkFFTDITReferenceSmall(b *testing.B) { + const maxSize = 1 << 9 + + pol := make([]goldilocks.Element, maxSize) + pol[0].SetRandom() + for i := 1; i < maxSize; i++ { + pol[i] = pol[i-1] + } + + domain := NewDomain(maxSize) + + b.ResetTimer() + for j := 0; j < b.N; j++ { + domain.FFT(pol, DIT, OnCoset()) + } +} + func BenchmarkFFTDIFReference(b *testing.B) { const maxSize = 1 << 20 diff --git a/field/goldilocks/fft/kernel_purego.go b/field/goldilocks/fft/kernel_purego.go index 7ba8f66000..9f5654e708 100644 --- a/field/goldilocks/fft/kernel_purego.go +++ b/field/goldilocks/fft/kernel_purego.go @@ -13,6 +13,10 @@ func innerDIFWithTwiddles(a []goldilocks.Element, twiddles []goldilocks.Element, innerDIFWithTwiddlesGeneric(a, twiddles, start, end, m) } +func innerDITWithTwiddles(a []goldilocks.Element, twiddles []goldilocks.Element, start, end, m int) { + innerDITWithTwiddlesGeneric(a, twiddles, start, end, m) +} + func kerDIFNP_32(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { kerDIFNP_32generic(a, twiddles, stage) } From fdd8045a6df67d5dd80aca72b9dbba9d7a4c8947 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 21 Jan 2025 23:24:13 +0000 Subject: [PATCH 6/9] fix: replace dword instr by qword in vec mul f31 --- 
field/asm/element_31b_amd64.s | 8 ++++---- field/babybear/element_amd64.s | 2 +- field/generator/asm/amd64/element_vec_F31.go | 8 ++++---- field/koalabear/element_amd64.s | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/field/asm/element_31b_amd64.s b/field/asm/element_31b_amd64.s index 82451ced17..0c0f3df5ee 100644 --- a/field/asm/element_31b_amd64.s +++ b/field/asm/element_31b_amd64.s @@ -124,8 +124,8 @@ loop_7: VPMULUDQ Z5, Z3, Z5 // m = m * q VPADDQ Z2, Z5, Z2 // P = P + m VPSRLQ $32, Z2, Z2 // P = P >> 32 - VPSUBD Z3, Z2, Z5 // PL = P - q - VPMINUD Z2, Z5, Z2 // P = min(P, PL) + VPSUBQ Z3, Z2, Z5 // PL = P - q + VPMINUQ Z2, Z5, Z2 // P = min(P, PL) VPMOVQD Z2, 0(CX) // res = P // increment pointers to visit next element @@ -166,8 +166,8 @@ loop_9: VPMULUDQ Z5, Z3, Z5 // m = m * q VPADDQ Z2, Z5, Z2 // P = P + m VPSRLQ $32, Z2, Z2 // P = P >> 32 - VPSUBD Z3, Z2, Z5 // PL = P - q - VPMINUD Z2, Z5, Z2 // P = min(P, PL) + VPSUBQ Z3, Z2, Z5 // PL = P - q + VPMINUQ Z2, Z5, Z2 // P = min(P, PL) VPMOVQD Z2, 0(CX) // res = P // increment pointers to visit next element diff --git a/field/babybear/element_amd64.s b/field/babybear/element_amd64.s index 7a3efcf7bd..f2d1aa723a 100644 --- a/field/babybear/element_amd64.s +++ b/field/babybear/element_amd64.s @@ -5,6 +5,6 @@ // Code generated by consensys/gnark-crypto DO NOT EDIT -// We include the hash to force the Go compiler to recompile: 17349933987904761959 +// We include the hash to force the Go compiler to recompile: 382550775135823111 #include "../asm/element_31b_amd64.s" diff --git a/field/generator/asm/amd64/element_vec_F31.go b/field/generator/asm/amd64/element_vec_F31.go index c6f5132886..ef4fad1eaa 100644 --- a/field/generator/asm/amd64/element_vec_F31.go +++ b/field/generator/asm/amd64/element_vec_F31.go @@ -281,8 +281,8 @@ func (f *FFAmd64) generateMulVecF31() { f.VPADDQ(P, PL, P, "P = P + m") f.VPSRLQ("$32", P, P, "P = P >> 32") - f.VPSUBD(q, P, PL, "PL = P - q") - f.VPMINUD(P, PL, P, "P = 
min(P, PL)") + f.VPSUBQ(q, P, PL, "PL = P - q") + f.VPMINUQ(P, PL, P, "P = min(P, PL)") // move P to res f.VPMOVQD(P, addrRes.At(0), "res = P") @@ -363,8 +363,8 @@ func (f *FFAmd64) generateScalarMulVecF31() { f.VPADDQ(P, PL, P, "P = P + m") f.VPSRLQ("$32", P, P, "P = P >> 32") - f.VPSUBD(q, P, PL, "PL = P - q") - f.VPMINUD(P, PL, P, "P = min(P, PL)") + f.VPSUBQ(q, P, PL, "PL = P - q") + f.VPMINUQ(P, PL, P, "P = min(P, PL)") // move P to res f.VPMOVQD(P, addrRes.At(0), "res = P") diff --git a/field/koalabear/element_amd64.s b/field/koalabear/element_amd64.s index 7a3efcf7bd..f2d1aa723a 100644 --- a/field/koalabear/element_amd64.s +++ b/field/koalabear/element_amd64.s @@ -5,6 +5,6 @@ // Code generated by consensys/gnark-crypto DO NOT EDIT -// We include the hash to force the Go compiler to recompile: 17349933987904761959 +// We include the hash to force the Go compiler to recompile: 382550775135823111 #include "../asm/element_31b_amd64.s" From a6470938b24df23cb127acee761c7c610c13aed0 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 21 Jan 2025 23:25:12 +0000 Subject: [PATCH 7/9] fix: fix static check issue --- ecc/bls12-377/fr/sis/sis_test.go | 2 +- field/babybear/sis/sis_test.go | 2 +- field/generator/internal/templates/sis/sis.test.go.tmpl | 2 +- field/goldilocks/sis/sis_test.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ecc/bls12-377/fr/sis/sis_test.go b/ecc/bls12-377/fr/sis/sis_test.go index c7fc2362f6..384c09b8cd 100644 --- a/ecc/bls12-377/fr/sis/sis_test.go +++ b/ecc/bls12-377/fr/sis/sis_test.go @@ -236,7 +236,7 @@ func benchmarkSIS(b *testing.B, input []fr.Element, sparse bool, logTwoBound, lo b.Run(benchName, func(b *testing.B) { // report the throughput in MB/s - b.SetBytes(int64(len(input)) * koalabear.Bytes) + b.SetBytes(int64(len(input)) * fr.Bytes) instance, err := NewRSis(0, logTwoDegree, logTwoBound, n) if err != nil { diff --git a/field/babybear/sis/sis_test.go b/field/babybear/sis/sis_test.go index 
ca63269386..9e6b39dd0e 100644 --- a/field/babybear/sis/sis_test.go +++ b/field/babybear/sis/sis_test.go @@ -236,7 +236,7 @@ func benchmarkSIS(b *testing.B, input []babybear.Element, sparse bool, logTwoBou b.Run(benchName, func(b *testing.B) { // report the throughput in MB/s - b.SetBytes(int64(len(input)) * koalabear.Bytes) + b.SetBytes(int64(len(input)) * babybear.Bytes) instance, err := NewRSis(0, logTwoDegree, logTwoBound, n) if err != nil { diff --git a/field/generator/internal/templates/sis/sis.test.go.tmpl b/field/generator/internal/templates/sis/sis.test.go.tmpl index da3872ea74..6d0c101ccb 100644 --- a/field/generator/internal/templates/sis/sis.test.go.tmpl +++ b/field/generator/internal/templates/sis/sis.test.go.tmpl @@ -233,7 +233,7 @@ func benchmarkSIS(b *testing.B, input []{{ .FF }}.Element, sparse bool, logTwoBo b.Run(benchName, func(b *testing.B) { // report the throughput in MB/s - b.SetBytes(int64(len(input)) * koalabear.Bytes) + b.SetBytes(int64(len(input)) * {{.FF}}.Bytes) instance, err := NewRSis(0, logTwoDegree, logTwoBound, n) if err != nil { diff --git a/field/goldilocks/sis/sis_test.go b/field/goldilocks/sis/sis_test.go index d12f819bec..16aeac6653 100644 --- a/field/goldilocks/sis/sis_test.go +++ b/field/goldilocks/sis/sis_test.go @@ -236,7 +236,7 @@ func benchmarkSIS(b *testing.B, input []goldilocks.Element, sparse bool, logTwoB b.Run(benchName, func(b *testing.B) { // report the throughput in MB/s - b.SetBytes(int64(len(input)) * koalabear.Bytes) + b.SetBytes(int64(len(input)) * goldilocks.Bytes) instance, err := NewRSis(0, logTwoDegree, logTwoBound, n) if err != nil { From 7f8ccc6ce951e4a001ae860f4e1bf52cc2712df1 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 22 Jan 2025 19:28:42 +0000 Subject: [PATCH 8/9] perf: prepare sis full avx, checkpoint limb split --- field/babybear/fft/kernel_amd64.go | 3 + field/babybear/fft/kernel_amd64.s | 364 ++++++++++++++++++ field/babybear/sis/sis.go | 29 +- field/generator/asm/amd64/build.go | 2 
+ field/generator/asm/amd64/element_vec_F31.go | 65 ++++ .../templates/fft/kernel.amd64.go.tmpl | 3 + .../internal/templates/sis/sis.go.tmpl | 30 +- field/koalabear/fft/kernel_amd64.go | 3 + field/koalabear/fft/kernel_amd64.s | 364 ++++++++++++++++++ field/koalabear/sis/sis.go | 29 +- 10 files changed, 853 insertions(+), 39 deletions(-) diff --git a/field/babybear/fft/kernel_amd64.go b/field/babybear/fft/kernel_amd64.go index d2ba142a84..6001d025c8 100644 --- a/field/babybear/fft/kernel_amd64.go +++ b/field/babybear/fft/kernel_amd64.go @@ -26,6 +26,9 @@ var vInterleaveIndices = []uint64{ 2, 3, 8, 9, 6, 7, 12, 13, } +//go:noescape +func SISToRefactor(k256, k512 []babybear.Element) + //go:noescape func innerDIFWithTwiddles_avx512(a []babybear.Element, twiddles []babybear.Element, start, end, m int) diff --git a/field/babybear/fft/kernel_amd64.s b/field/babybear/fft/kernel_amd64.s index 424db40757..af87c9d533 100644 --- a/field/babybear/fft/kernel_amd64.s +++ b/field/babybear/fft/kernel_amd64.s @@ -448,3 +448,367 @@ TEXT ·kerDIFNP_128_avx512(SB), NOSPLIT, $0-56 VMOVDQA32 Z12, 384(R15) VMOVDQA32 Z14, 448(R15) RET + +TEXT ·SISToRefactor(SB), NOSPLIT, $0-48 + // prepare constants needed for mul and reduce ops + MOVD $const_q, AX + VPBROADCASTQ AX, Z1 + MOVD $const_qInvNeg, AX + VPBROADCASTQ AX, Z2 + VPCMPEQB Y0, Y0, Y0 + VPMOVZXDQ Y0, Z3 + MOVQ k256+0(FP), R15 + MOVQ k512+24(FP), DX + VPMOVZXDQ 0(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 0(DX) + VPMOVZXDQ 32(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 64(DX) + VPMOVZXDQ 64(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, 
Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 128(DX) + VPMOVZXDQ 96(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 192(DX) + VPMOVZXDQ 128(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 256(DX) + VPMOVZXDQ 160(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 320(DX) + VPMOVZXDQ 192(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 384(DX) + VPMOVZXDQ 224(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 448(DX) + VPMOVZXDQ 256(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 512(DX) + VPMOVZXDQ 288(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 
+ VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 576(DX) + VPMOVZXDQ 320(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 640(DX) + VPMOVZXDQ 352(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 704(DX) + VPMOVZXDQ 384(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 768(DX) + VPMOVZXDQ 416(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 832(DX) + VPMOVZXDQ 448(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 896(DX) + VPMOVZXDQ 480(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 960(DX) + VPMOVZXDQ 512(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW 
$0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1024(DX) + VPMOVZXDQ 544(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1088(DX) + VPMOVZXDQ 576(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1152(DX) + VPMOVZXDQ 608(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1216(DX) + VPMOVZXDQ 640(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1280(DX) + VPMOVZXDQ 672(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1344(DX) + VPMOVZXDQ 704(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1408(DX) + VPMOVZXDQ 736(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1472(DX) + VPMOVZXDQ 768(R15), Z0 + 
VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1536(DX) + VPMOVZXDQ 800(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1600(DX) + VPMOVZXDQ 832(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1664(DX) + VPMOVZXDQ 864(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1728(DX) + VPMOVZXDQ 896(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1792(DX) + VPMOVZXDQ 928(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1856(DX) + VPMOVZXDQ 960(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1920(DX) + VPMOVZXDQ 992(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 
+ VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1984(DX) + RET diff --git a/field/babybear/sis/sis.go b/field/babybear/sis/sis.go index a1ebe05d24..97a45727b3 100644 --- a/field/babybear/sis/sis.go +++ b/field/babybear/sis/sis.go @@ -145,12 +145,11 @@ func (r *RSis) Hash(v, res []babybear.Element) error { if r.Degree == 512 && r.LogTwoBound == 16 { // this is our hot path, we don't use the iterator because with // avx512 instructions, it actually ends up being most of the CPU time. - er := babybear.Element{1} // mul by 1 --> mont reduce + // er := babybear.Element{1} // mul by 1 --> mont reduce polId := 0 var k512 [512]babybear.Element vk := babybear.Vector(k512[:]) vRes := babybear.Vector(res) - vb := babybear.Vector(k512[256:]) cosets, err := r.Domain.CosetTable() if err != nil { @@ -158,24 +157,28 @@ func (r *RSis) Hash(v, res []babybear.Element) error { } vCosets := babybear.Vector(cosets) + var k256 [256]babybear.Element + for j := 0; j < len(v); j += 256 { start := j end := j + 256 end = min(end, len(v)) - // use half of vk to copy the v input to batch convert to regular form - copy(vb[:], v[start:end]) - for k := (end - start); k < 256; k++ { - vb[k][0] = 0 + _v := babybear.Vector(v[start:end]) + if len(_v) != 256 { + // we need a buffer here + copy(k256[:], _v) + for k := len(_v); k < 256; k++ { + k256[k][0] = 0 + } } - // batch montgomery -> regular - vb.ScalarMul(vb, &er) + fft.SISToRefactor(k256[:], k512[:]) - // do the limb split - for k := 0; k < 256; k++ { - k512[k*2][0] = uint32(uint16(vb[k][0])) - k512[k*2+1][0] = uint32(uint16(vb[k][0] >> 16)) - } + // // do the limb split + // for k := 0; k < 256; k++ { + // k512[k*2][0] = uint32(uint16(vb[k][0])) + // k512[k*2+1][0] = uint32(uint16(vb[k][0] >> 16)) + // } // inner hash vk.Mul(vk, vCosets) diff --git a/field/generator/asm/amd64/build.go b/field/generator/asm/amd64/build.go index 
0397999023..e3f97bee39 100644 --- a/field/generator/asm/amd64/build.go +++ b/field/generator/asm/amd64/build.go @@ -282,6 +282,8 @@ func GenerateF31FFTKernels(w io.Writer, nbBits int, kernels []int) error { f.generateFFTKernelF31(ksize) } + f.generateSISToRefactorF31() + return nil } diff --git a/field/generator/asm/amd64/element_vec_F31.go b/field/generator/asm/amd64/element_vec_F31.go index ef4fad1eaa..f1ae339565 100644 --- a/field/generator/asm/amd64/element_vec_F31.go +++ b/field/generator/asm/amd64/element_vec_F31.go @@ -1103,3 +1103,68 @@ func zToy(r amd64.Register) amd64.Register { vr := "Y" + v[1:] return amd64.Register(vr) } + +func (f *FFAmd64) generateSISToRefactorF31() { + const argSize = 6 * 8 + stackSize := f.StackSize(f.NbWords*2+4, 1, 0) + registers := f.FnHeader("SISToRefactor", stackSize, argSize, amd64.AX) + defer f.AssertCleanStack(stackSize, 0) + + // for now we get + // SISToRefactor(k256, k512 []uint32) + // we "limb split" k256 into k512 to start with. + addrK256 := f.Pop(&registers) + addrK512 := f.Pop(&registers) + + x := amd64.Register("Z0") + q := amd64.Register("Z1") + qInvNeg := amd64.Register("Z2") + LSW := amd64.Register("Z3") + PL := amd64.Register("Z4") + + // load q and qInvNeg + f.Comment("prepare constants needed for mul and reduce ops") + f.WriteLn("MOVD $const_q, AX") + f.VPBROADCASTQ(amd64.AX, q) + f.WriteLn("MOVD $const_qInvNeg, AX") + f.VPBROADCASTQ(amd64.AX, qInvNeg) + f.VPCMPEQB("Y0", "Y0", "Y0") + f.VPMOVZXDQ("Y0", LSW) + + f.MOVQ("k256+0(FP)", addrK256) + f.MOVQ("k512+24(FP)", addrK512) + + n := 256 / 8 + + for i := 0; i < n; i++ { + // load 8 uint32 from k256 into a zmm register (zero extended) + f.VPMOVZXDQ(addrK256.At(i*4), x) + + // from Montgomery to regular form + // f.VPMULUDQ(x, y, P) + // f.VPANDQ(LSW, P, PL) + f.VPMULUDQ(x, qInvNeg, PL) + f.VPANDQ(LSW, PL, PL) + f.VPMULUDQ(PL, q, PL) + f.VPADDQ(x, PL, x) + f.VPSRLQ("$32", x, x) + f.VPSUBQ(q, x, PL) + f.VPMINUQ(x, PL, x) + + // we have + // z0 = [ 0 0 a0 a1 | 0 0 b0 
b1 | 0 0 c0 c1 | ... ] + // we want + // z0 = [ 0 a1 0 a0 | 0 b1 0 b0 | 0 c1 0 c0 | ... ] + f.VPSHUFLW(0b11011100, x, x) + f.VPSHUFHW(0b11011100, x, x) + + // now we consider that as a vector of dwords and move it into k512 + f.VMOVDQU32(x, addrK512.At(i*8)) + + // f.ADDQ("$32", addrK256) + // f.ADDQ("$64", addrK512) + } + + f.RET() + +} diff --git a/field/generator/internal/templates/fft/kernel.amd64.go.tmpl b/field/generator/internal/templates/fft/kernel.amd64.go.tmpl index 78ffb6c68b..4e03e8d2dc 100644 --- a/field/generator/internal/templates/fft/kernel.amd64.go.tmpl +++ b/field/generator/internal/templates/fft/kernel.amd64.go.tmpl @@ -18,6 +18,9 @@ var vInterleaveIndices = []uint64 { 2, 3, 8, 9, 6, 7, 12, 13, } +//go:noescape +func SISToRefactor(k256, k512 []{{ .FF }}.Element) + //go:noescape func innerDIFWithTwiddles_avx512(a []{{ .FF }}.Element, twiddles []{{ .FF }}.Element, start, end, m int) diff --git a/field/generator/internal/templates/sis/sis.go.tmpl b/field/generator/internal/templates/sis/sis.go.tmpl index 90d552a810..ab303d9a35 100644 --- a/field/generator/internal/templates/sis/sis.go.tmpl +++ b/field/generator/internal/templates/sis/sis.go.tmpl @@ -173,12 +173,12 @@ func (r *RSis) Hash(v, res []{{ .FF }}.Element) error { if r.Degree == 512 && r.LogTwoBound == 16 { // this is our hot path, we don't use the iterator because with // avx512 instructions, it actually ends up being most of the CPU time. 
- er := {{ .FF }}.Element{1} // mul by 1 --> mont reduce + // er := {{ .FF }}.Element{1} // mul by 1 --> mont reduce polId := 0 var k512 [512]{{ .FF }}.Element vk := {{ .FF }}.Vector(k512[:]) vRes := {{ .FF }}.Vector(res) - vb := {{ .FF }}.Vector(k512[256:]) + cosets, err := r.Domain.CosetTable() if err != nil { @@ -186,24 +186,28 @@ func (r *RSis) Hash(v, res []{{ .FF }}.Element) error { } vCosets := {{ .FF }}.Vector(cosets) + var k256 [256]{{ .FF }}.Element + for j := 0; j < len(v); j+=256 { start := j end := j + 256 end = min(end, len(v)) - // use half of vk to copy the v input to batch convert to regular form - copy(vb[:], v[start:end]) - for k:= (end-start); k < 256; k++ { - vb[k][0] = 0 + _v := {{ .FF }}.Vector(v[start:end]) + if len(_v) != 256 { + // we need a buffer here + copy(k256[:], _v) + for k := len(_v); k < 256; k++ { + k256[k][0] = 0 + } } - // batch montgomery -> regular - vb.ScalarMul(vb, &er) + fft.SISToRefactor(k256[:], k512[:]) - // do the limb split - for k := 0; k < 256; k++ { - k512[k*2][0] = uint32(uint16(vb[k][0])) - k512[k*2+1][0] = uint32(uint16(vb[k][0] >> 16)) - } + // // do the limb split + // for k := 0; k < 256; k++ { + // k512[k*2][0] = uint32(uint16(vb[k][0])) + // k512[k*2+1][0] = uint32(uint16(vb[k][0] >> 16)) + // } // inner hash vk.Mul(vk, vCosets) diff --git a/field/koalabear/fft/kernel_amd64.go b/field/koalabear/fft/kernel_amd64.go index 670726f70e..5e1e0bf323 100644 --- a/field/koalabear/fft/kernel_amd64.go +++ b/field/koalabear/fft/kernel_amd64.go @@ -26,6 +26,9 @@ var vInterleaveIndices = []uint64{ 2, 3, 8, 9, 6, 7, 12, 13, } +//go:noescape +func SISToRefactor(k256, k512 []koalabear.Element) + //go:noescape func innerDIFWithTwiddles_avx512(a []koalabear.Element, twiddles []koalabear.Element, start, end, m int) diff --git a/field/koalabear/fft/kernel_amd64.s b/field/koalabear/fft/kernel_amd64.s index 424db40757..af87c9d533 100644 --- a/field/koalabear/fft/kernel_amd64.s +++ b/field/koalabear/fft/kernel_amd64.s @@ -448,3 
+448,367 @@ TEXT ·kerDIFNP_128_avx512(SB), NOSPLIT, $0-56 VMOVDQA32 Z12, 384(R15) VMOVDQA32 Z14, 448(R15) RET + +TEXT ·SISToRefactor(SB), NOSPLIT, $0-48 + // prepare constants needed for mul and reduce ops + MOVD $const_q, AX + VPBROADCASTQ AX, Z1 + MOVD $const_qInvNeg, AX + VPBROADCASTQ AX, Z2 + VPCMPEQB Y0, Y0, Y0 + VPMOVZXDQ Y0, Z3 + MOVQ k256+0(FP), R15 + MOVQ k512+24(FP), DX + VPMOVZXDQ 0(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 0(DX) + VPMOVZXDQ 32(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 64(DX) + VPMOVZXDQ 64(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 128(DX) + VPMOVZXDQ 96(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 192(DX) + VPMOVZXDQ 128(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 256(DX) + VPMOVZXDQ 160(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + 
VMOVDQU32 Z0, 320(DX) + VPMOVZXDQ 192(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 384(DX) + VPMOVZXDQ 224(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 448(DX) + VPMOVZXDQ 256(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 512(DX) + VPMOVZXDQ 288(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 576(DX) + VPMOVZXDQ 320(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 640(DX) + VPMOVZXDQ 352(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 704(DX) + VPMOVZXDQ 384(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 768(DX) + VPMOVZXDQ 416(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 
+ VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 832(DX) + VPMOVZXDQ 448(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 896(DX) + VPMOVZXDQ 480(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 960(DX) + VPMOVZXDQ 512(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1024(DX) + VPMOVZXDQ 544(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1088(DX) + VPMOVZXDQ 576(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1152(DX) + VPMOVZXDQ 608(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1216(DX) + VPMOVZXDQ 640(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + 
VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1280(DX) + VPMOVZXDQ 672(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1344(DX) + VPMOVZXDQ 704(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1408(DX) + VPMOVZXDQ 736(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1472(DX) + VPMOVZXDQ 768(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1536(DX) + VPMOVZXDQ 800(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1600(DX) + VPMOVZXDQ 832(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1664(DX) + VPMOVZXDQ 864(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW 
$0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1728(DX) + VPMOVZXDQ 896(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1792(DX) + VPMOVZXDQ 928(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1856(DX) + VPMOVZXDQ 960(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1920(DX) + VPMOVZXDQ 992(R15), Z0 + VPMULUDQ Z0, Z2, Z4 + VPANDQ Z3, Z4, Z4 + VPMULUDQ Z4, Z1, Z4 + VPADDQ Z0, Z4, Z0 + VPSRLQ $32, Z0, Z0 + VPSUBQ Z1, Z0, Z4 + VPMINUQ Z0, Z4, Z0 + VPSHUFLW $0x00000000000000dc, Z0, Z0 + VPSHUFHW $0x00000000000000dc, Z0, Z0 + VMOVDQU32 Z0, 1984(DX) + RET diff --git a/field/koalabear/sis/sis.go b/field/koalabear/sis/sis.go index f0eb7d5d50..0687607c6d 100644 --- a/field/koalabear/sis/sis.go +++ b/field/koalabear/sis/sis.go @@ -145,12 +145,11 @@ func (r *RSis) Hash(v, res []koalabear.Element) error { if r.Degree == 512 && r.LogTwoBound == 16 { // this is our hot path, we don't use the iterator because with // avx512 instructions, it actually ends up being most of the CPU time. 
- er := koalabear.Element{1} // mul by 1 --> mont reduce + // er := koalabear.Element{1} // mul by 1 --> mont reduce polId := 0 var k512 [512]koalabear.Element vk := koalabear.Vector(k512[:]) vRes := koalabear.Vector(res) - vb := koalabear.Vector(k512[256:]) cosets, err := r.Domain.CosetTable() if err != nil { @@ -158,24 +157,28 @@ func (r *RSis) Hash(v, res []koalabear.Element) error { } vCosets := koalabear.Vector(cosets) + var k256 [256]koalabear.Element + for j := 0; j < len(v); j += 256 { start := j end := j + 256 end = min(end, len(v)) - // use half of vk to copy the v input to batch convert to regular form - copy(vb[:], v[start:end]) - for k := (end - start); k < 256; k++ { - vb[k][0] = 0 + _v := koalabear.Vector(v[start:end]) + if len(_v) != 256 { + // we need a buffer here + copy(k256[:], _v) + for k := len(_v); k < 256; k++ { + k256[k][0] = 0 + } } - // batch montgomery -> regular - vb.ScalarMul(vb, &er) + fft.SISToRefactor(k256[:], k512[:]) - // do the limb split - for k := 0; k < 256; k++ { - k512[k*2][0] = uint32(uint16(vb[k][0])) - k512[k*2+1][0] = uint32(uint16(vb[k][0] >> 16)) - } + // // do the limb split + // for k := 0; k < 256; k++ { + // k512[k*2][0] = uint32(uint16(vb[k][0])) + // k512[k*2+1][0] = uint32(uint16(vb[k][0] >> 16)) + // } // inner hash vk.Mul(vk, vCosets) From 2f04baa9600e7cc91dc98cbe54d9c1a840a5086f Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 22 Jan 2025 13:42:08 -0600 Subject: [PATCH 9/9] test: regen larger test vectors --- field/koalabear/sis/sis.sage | 4 +- field/koalabear/sis/test_cases.json | 4258 ++++++++++++++------------- 2 files changed, 2256 insertions(+), 2006 deletions(-) diff --git a/field/koalabear/sis/sis.sage b/field/koalabear/sis/sis.sage index 23cdd8994b..47ae312b06 100644 --- a/field/koalabear/sis/sis.sage +++ b/field/koalabear/sis/sis.sage @@ -222,12 +222,12 @@ degrees = [5,6,7,8,9] for bound in bounds: for degree in degrees: - PARAMS.append(SISParams(5, degree, bound, 10)) + 
PARAMS.append(SISParams(5, degree, bound, 260)) def random_inputs(size, modulus): return [GFR(random.randint(0, modulus - 1)) for _ in range(size)] -INPUTS = random_inputs(10, R) +INPUTS = random_inputs(260, R) TEST_CASES = {} diff --git a/field/koalabear/sis/test_cases.json b/field/koalabear/sis/test_cases.json index c37fb0d6bc..855c64e7db 100644 --- a/field/koalabear/sis/test_cases.json +++ b/field/koalabear/sis/test_cases.json @@ -1,15 +1,265 @@ { "inputs": [ - "2054924575", - "1532313989", - "1541391506", - "1662844892", - "630002143", - "1720899385", - "1042654129", - "189772050", - "2069485154", - "152959968" + "1231936313", + "272635701", + "1363195145", + "146502394", + "786022921", + "1600538057", + "1101920074", + "816467081", + "938509234", + "156688770", + "905523614", + "1908124650", + "705776240", + "1863485376", + "1754292447", + "1570979433", + "473301137", + "875579603", + "2047676735", + "1966504475", + "558409686", + "339878157", + "1097442723", + "541402894", + "1848742456", + "995768761", + "1331864525", + "214471731", + "1288306048", + "15755785", + "2109248883", + "1627979172", + "739883028", + "2010433396", + "1176685622", + "152002716", + "1958696492", + "185935591", + "141929556", + "1550779333", + "21749651", + "436304389", + "887439456", + "98945666", + "1781817703", + "849539750", + "1767122309", + "581814194", + "1168852417", + "886097536", + "414128699", + "1096414203", + "1738767812", + "1742678311", + "859252866", + "1679066979", + "1617848599", + "1060903090", + "1826235356", + "2062905540", + "1570595944", + "566543585", + "35638636", + "1413445190", + "161972208", + "135041293", + "685648669", + "734379238", + "1320291284", + "801885306", + "1942876308", + "1759892982", + "1539033731", + "1325513453", + "1430620242", + "1826491008", + "1435368049", + "863869199", + "969616733", + "7239502", + "373084573", + "1610630548", + "1618744784", + "454175157", + "3500533", + "2052327763", + "573081514", + "640117526", + "1816058735", + 
"792528516", + "563670165", + "1348392133", + "2099825175", + "1746421120", + "1665279139", + "291444085", + "1023025172", + "349862824", + "985835491", + "1449213533", + "72065976", + "181672060", + "1391263742", + "675016583", + "1533243532", + "1975439400", + "69312887", + "320307495", + "1564947793", + "1991097681", + "517021475", + "611770974", + "2114676744", + "373489597", + "1458137572", + "787144015", + "361897517", + "138241736", + "1258600688", + "988234108", + "1632465459", + "915208791", + "479056918", + "754231484", + "359124766", + "1608343812", + "918698944", + "1470262429", + "285683667", + "504325084", + "492585198", + "1072856305", + "1964958153", + "1506257961", + "1376079880", + "2126768180", + "605593938", + "1137475016", + "3262510", + "1606803594", + "226127930", + "19778512", + "688460144", + "2069086473", + "931509477", + "556870788", + "1867371211", + "1014963898", + "1130029434", + "302166976", + "175743333", + "437188347", + "1821163181", + "1925609837", + "843500570", + "1608812284", + "676175070", + "1008278877", + "18895702", + "317499306", + "1355245750", + "1863484273", + "1572094698", + "1176589464", + "141425081", + "1407535482", + "993550526", + "1485894716", + "1024093576", + "102029862", + "1871693051", + "478440352", + "2056687970", + "1671636249", + "1627027171", + "1080048089", + "122763389", + "1675502753", + "59698044", + "1395856369", + "1581768708", + "114835612", + "274399072", + "899560555", + "432889144", + "1401632028", + "866118312", + "1777003817", + "765719981", + "1738827214", + "814230170", + "2115909369", + "138512554", + "1878722843", + "1748158498", + "12832259", + "1542878964", + "1708003330", + "1949750678", + "758718977", + "829279900", + "1401051351", + "787572426", + "1252507338", + "1752945886", + "64032438", + "856425912", + "439684935", + "1412983943", + "1737450264", + "677605025", + "1439143647", + "1137291623", + "1320840017", + "562349421", + "605091797", + "1011228786", + "1058912857", + 
"1069723428", + "1535534901", + "1082473737", + "1091760274", + "1567703364", + "1681692321", + "555954833", + "865341268", + "1479436512", + "564478229", + "827739906", + "1347376912", + "1547912826", + "6984471", + "8490113", + "1248565689", + "654650919", + "1909555874", + "1330930269", + "1675802281", + "1844811910", + "838995408", + "831008755", + "1417449020", + "2046131105", + "2091734843", + "1502044744", + "118920791", + "754113579", + "1473340200", + "1100252410", + "1793089219", + "611789225", + "1537314257", + "1308382905", + "95514951", + "758069645", + "1406897841", + "86376048", + "1225902270", + "1938082418", + "1730911825" ], "entries": [ { @@ -17,41 +267,41 @@ "seed": 5, "logTwoDegree": 5, "logTwoBound": 8, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "1080428259", - "1687525615", - "354307121", - "1736829172", - "1346875548", - "1789257822", - "2096279203", - "963156319", - "2082915969", - "534292372", - "1363427365", - "716989995", - "1817287203", - "1428071073", - "971227757", - "1023521386", - "1130873962", - "1654210710", - "679723480", - "1182871182", - "1466897268", - "721599430", - "883873478", - "70584855", - "35113605", - "1844442489", - "477865125", - "1689419828", - "218389369", - "912532446", - "388927501", - "1813793590" + "972148406", + "2006855805", + "448553587", + "628050026", + "712930929", + "311071292", + "993345165", + "913305268", + "1521171571", + "1145212290", + "1935623667", + "1386280931", + "1516847620", + "81633442", + "1359360924", + "660561101", + "721470812", + "149383790", + "1693565204", + "554568118", + "332346570", + "598402191", + "1995713005", + "1441536401", + "808098250", + "248333366", + "52784092", + "1562155844", + "1469706100", + "1518844488", + "41618532", + "1072898035" ] }, { @@ -59,73 +309,73 @@ "seed": 5, "logTwoDegree": 6, "logTwoBound": 8, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "938087166", - "759136409", - "1142253596", - 
"2048750271", - "957552176", - "1145103448", - "719266699", - "1666257442", - "401905954", - "345937340", - "1776220032", - "1904887677", - "2020072714", - "387751165", - "1198873025", - "118408763", - "442090464", - "161621430", - "618868080", - "1359367047", - "338806358", - "123325467", - "1290733920", - "840361822", - "1715515031", - "1327686767", - "2004817953", - "1391331113", - "462114320", - "807409001", - "1648676766", - "371998565", - "791939920", - "1721217901", - "406077767", - "455928917", - "2038633592", - "1636385742", - "141501558", - "1305668182", - "255371176", - "1807500840", - "676300110", - "468429988", - "1512530442", - "1688558184", - "653895187", - "280445524", - "1462943595", - "1582654141", - "1460834393", - "928213154", - "1083157339", - "683996127", - "975807087", - "1853150912", - "1288419925", - "1546714850", - "342243304", - "1077075386", - "1532121899", - "308106676", - "1986275833", - "1514116058" + "148329831", + "440570540", + "1248831273", + "1212774635", + "1247841642", + "1766400148", + "1292287888", + "326565794", + "401417903", + "688607038", + "569733980", + "2112008839", + "1686654381", + "498274133", + "1678143642", + "409300304", + "1636555939", + "1999320095", + "2127315856", + "932940721", + "1527382618", + "1246600981", + "1486814673", + "2117740460", + "1873371451", + "21449514", + "1360669608", + "1270012485", + "1909839737", + "1812329971", + "2104626357", + "1493072434", + "1341454346", + "338630579", + "266640924", + "1778845490", + "879335191", + "802601440", + "1806953466", + "575812595", + "323240189", + "760808240", + "879228712", + "1551569075", + "2073533827", + "97110634", + "286143454", + "1960574667", + "536065175", + "1153107622", + "2000245102", + "1597336636", + "909781075", + "1551962656", + "633551459", + "688714891", + "977326056", + "2044288605", + "1393345636", + "1683112011", + "1427011368", + "567246823", + "1372580826", + "1601135774" ] }, { @@ -133,137 +383,137 @@ "seed": 5, "logTwoDegree": 7, 
"logTwoBound": 8, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "611261932", - "614371455", - "1964948457", - "986321679", - "1135441466", - "1544713334", - "1633307971", - "1547572074", - "350811751", - "1543043735", - "820025679", - "1721283672", - "83496008", - "807453", - "1736439418", - "259691786", - "951226007", - "276750417", - "1546829595", - "1026421103", - "1738648451", - "1087951733", - "858983569", - "1250731799", - "1516616669", - "1701053357", - "755249149", - "828889079", - "1386549734", - "1296294792", - "1871981996", - "1921405086", - "670828308", - "2035158345", - "124296418", - "439596692", - "1629277229", - "762055501", - "859831763", - "1305668182", - "255371176", - "1807500840", - "676300110", - "468429988", - "1512530442", - "1688558184", - "653895187", - "280445524", - "1462943595", - "1582654141", - "1460834393", - "928213154", - "1083157339", - "683996127", - "975807087", - "1853150912", - "1288419925", - "1546714850", - "342243304", - "1077075386", - "1532121899", - "308106676", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - 
"1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177" + "508102485", + "1985759046", + "1392979200", + "1665740080", + "2071303196", + "1113097614", + "6206419", + "1376473927", + "880351808", + "117236264", + "48379570", + "1195249869", + "535222063", + "819158266", + "290951416", + "1046995732", + "508014469", + "1440638229", + "1728555323", + "1462827153", + "156521401", + "25423028", + "1933513274", + "603021044", + "1069214961", + "798033914", + "1354764815", + "30875072", + "597829243", + "1008425480", + "74995909", + "1354631821", + "465468166", + "978054122", + "1704602603", + "1126340660", + "401526718", + "2108248517", + "906687017", + "467759940", + "1083441852", + "868316085", + "1438604123", + "1285700", + "1460556195", + "860260475", + "1622075636", + "534503639", + "42322608", + "1388753975", + "1243935448", + "99272813", + "189830498", + "1854355912", + "1634066718", + "259492619", + "1306907816", + "537154645", + "1409388568", + "890208428", + "14887893", + "1453204826", + "1243575589", + "316221213", + "80923209", + "1923801255", + "1313509939", + "934259734", + "933890400", + "614248105", + "174464753", + "2010399647", + "2062641886", + "628053230", + "1070171720", + "1244142660", + "860844378", + "353994924", + "782605897", + "923225283", + "697767498", + "744989038", + "1765038300", + "889179349", + "668108038", + "892035355", + "1897244991", + "733806713", + "320173009", + "1800250148", + "1844280442", + "1512523266", + "1982676715", + "355088144", + "1294287172", + "1609380408", + "728740826", + "1789939954", + "2042339135", + "1759967001", + "1387184774", + "130861910", + "1447606878", + "184159573", + "1654570257", + "365668001", + "1279977591", + "1615293881", + "1857232518", + "1076692590", + "2101186854", + "1516472051", + "1300028172", + "460191503", + "1349505938", + "629544381", + "1414491032", + "2108889737", + "345687027", + "234549937", + "1299425531", + "1881142575", + 
"499918173", + "1522568174", + "1075165009", + "1719820020", + "352115825", + "2053650067" ] }, { @@ -271,265 +521,265 @@ "seed": 5, "logTwoDegree": 8, "logTwoBound": 8, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "1645154281", - "1121101182", - "1331926136", - "1174242033", - "453785164", - "937802072", - "273568943", - "152329054", - "245723088", - "1912724586", - "1194348294", - "99851933", - "1363896054", - "1776753820", - "40514677", - "839679175", - "767955135", - "1612908543", - "896914240", - "2061004821", - "2123046752", - "755451854", - "857380677", - "1978606877", - "363517899", - "2020347475", - "1641610892", - "1461080750", - "1924984067", - "1390207265", - "443555161", - "926129758", - "992840281", - "1146153581", - "110509051", - "662119860", - "1620770022", - "1931969190", - "540211510", - "1305668182", - "255371176", - "1807500840", - "676300110", - "468429988", - "1512530442", - "1688558184", - "653895187", - "280445524", - "1462943595", - "1582654141", - "1460834393", - "928213154", - "1083157339", - "683996127", - "975807087", - "1853150912", - "1288419925", - "1546714850", - "342243304", - "1077075386", - "1532121899", - "308106676", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - 
"1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - 
"2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361" + "1278478533", + "655956391", + "848648648", + "1507558032", + "1427104957", + "1916661391", + "2095743411", + "1341863477", + "1458605789", + "1411292861", + "333718921", + "1874350050", + "156050413", + "1998770364", + "2056884951", + "37024319", + "216552116", + "1409871508", + "1854992196", + "1423236544", + "1585257144", + "1121848096", + "1138389596", + "1317564561", + "1830073460", + "819825096", + "111688279", + "217194261", + "660272444", + "1399026613", + "309422738", + "122349872", + "901886241", + "40334610", + "1366408511", + "201583156", + "1434254856", + "559413778", + "438096129", + "1439994618", + "517961998", + "1475121003", + "60233472", + "1338656855", + "551051687", + "1857504561", + "1441185639", + "1049035737", + "2120792294", + "187991714", + "199364166", + "1881054688", + "1704371236", + "1914278257", + "1847987038", + "1836782333", + "688533004", + "6134979", + "1670363897", + "1285096254", + "243089240", + "11672959", + "803889054", + "483302939", + "293685404", + "992780695", + "1814166900", + "1591429053", + "1325234105", + "1070142804", + "1662873729", + "1630032769", + "2002016932", + "1176935924", + "851465744", + "1078942642", + "841735726", + "1201398939", + "922167013", + "898725371", + "10796427", + "332722216", + "1500105090", + "180036504", + "1166033924", + "1228841225", + "1604158328", + "2103593549", + "1814958532", + "1354048022", + "104335414", + "928180247", + "1906571477", + "7633326", + "1047947500", + "2112220106", + "23505890", + "939553589", + "1307071991", + "1490268768", + "1043764262", + "488257465", + "795542512", + "1204286698", + "536667890", + "1061772803", + "407830784", + "365334820", + 
"936718858", + "89998822", + "1575981227", + "662994041", + "494201496", + "269936970", + "721904505", + "328245962", + "117525475", + "480412674", + "1040644614", + "2034711928", + "1443373711", + "596677502", + "1922703285", + "697243089", + "1438041554", + "2076250112", + "736167257", + "501199974", + "71252384", + "1629559204", + "1122173270", + "932309907", + "1286623155", + "1750547284", + "2952611", + "732565393", + "286489492", + "1683470362", + "1026316673", + "2019518896", + "746824585", + "228898978", + "863039166", + "621772220", + "1826779978", + "1920526862", + "1793696693", + "383541235", + "500823696", + "87031094", + "1223229476", + "278859492", + "714263285", + "1294379199", + "469957593", + "2128903977", + "1205894304", + "663339178", + "1697586501", + "1477682756", + "1041892606", + "1752417398", + "30084240", + "773676054", + "943513506", + "568003685", + "351470844", + "823942656", + "1731903298", + "1591529979", + "1630630499", + "1757894971", + "2103182661", + "1118128890", + "1439605393", + "1159888186", + "1864993521", + "267324822", + "764896848", + "1852927952", + "1433496663", + "537399207", + "1469422362", + "822532690", + "349512836", + "1834241812", + "322307742", + "541642559", + "404211759", + "1481299107", + "1177430154", + "2123919097", + "920791213", + "188360100", + "1949583581", + "2003980825", + "657865100", + "1072413966", + "2050585468", + "1254595687", + "508311501", + "1429411453", + "308740507", + "317644525", + "882773702", + "287245542", + "1427700688", + "643045102", + "771244124", + "597473730", + "246918825", + "406621780", + "270065969", + "40791006", + "1861319076", + "1868377673", + "1711032177", + "404598253", + "398251048", + "485636057", + "792973754", + "1889377909", + "785918817", + "429127263", + "512666089", + "1172828128", + "1806300532", + "1339732936", + "605825821", + "111659910", + "1463665940", + "1478282599", + "1788333240", + "930049973", + "329183210", + "732051108", + "928748206", + "490204734", 
+ "352151841", + "1964958856", + "901562602", + "1679721004", + "1775489464", + "6685047", + "1710775831", + "1614611192", + "1697262726", + "1474292656", + "98158401", + "1962056406", + "1400047220", + "72995183", + "1415567780", + "1119070702", + "576482523", + "1042592209" ] }, { @@ -537,521 +787,521 @@ "seed": 5, "logTwoDegree": 9, "logTwoBound": 8, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "938087166", - "759136409", - "1142253596", - "2048750271", - "957552176", - "1145103448", - "719266699", - "1666257442", - "401905954", - "345937340", - "1776220032", - "1904887677", - "2020072714", - "387751165", - "1198873025", - "118408763", - "442090464", - "161621430", - "618868080", - "1359367047", - "338806358", - "123325467", - "1290733920", - "840361822", - "1715515031", - "1327686767", - "2004817953", - "1391331113", - "462114320", - "807409001", - "1648676766", - "371998565", - "791939920", - "1721217901", - "406077767", - "455928917", - "2038633592", - "1636385742", - "141501558", - "1305668182", - "255371176", - "1807500840", - "676300110", - "468429988", - "1512530442", - "1688558184", - "653895187", - "280445524", - "1462943595", - "1582654141", - "1460834393", - "928213154", - "1083157339", - "683996127", - "975807087", - "1853150912", - "1288419925", - "1546714850", - "342243304", - "1077075386", - "1532121899", - "308106676", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", 
- "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", 
- "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", 
- "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", 
- "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058", - "1766212177", - "1672486122", - "1351511361", - "1005682964", - "2049639419", - "1986275833", - "1514116058" + "1096214307", + "1227255957", + "1908017975", + "1839895747", + "209499979", + "1505707558", + "965752962", + "1629953243", + "1993543383", + "1645731291", + "1957572737", + "1392853673", + "1280972051", + "279171400", + "890462415", + "347888825", + "1130489020", + "844551981", + "944590456", + "305603273", + "670604048", + "519936291", + "1981527399", + "510504325", + "244131114", + "2050463581", + "996487355", + "1077766922", + "293751906", + "425832885", + "897356518", + "518946704", + "1087963811", + "54220888", + "1447394251", + "233577304", + "1814387072", + "1219307094", + "1914073182", + "14190258", + "2015175399", + "26240984", + "95298468", + "67189721", + "296260708", + "678681248", + "1850297074", + "567631750", + "1559757411", + "273884918", + "1052381300", + "1783003056", + "1170689141", + "1297206924", + "898613113", + "641483960", + "1373302268", + "963264667", + "1213847203", + "1560117905", + "546759207", + "394985836", + "142059529", + "846932935", + "2094244579", + "950757196", + "1191911873", + "1134069577", + "1001575445", + "353404788", + "1405110386", + "1937272982", + "1120457758", + "317797186", + "759948210", + "781165029", + "1388047364", + "1121428066", + "299803060", + "124265169", + "644352562", + "746487239", + "1720017918", + "1829253446", + "1861980895", + "852910136", + "841519826", + "267793019", + "1599512989", + "1262092352", + "415229205", + 
"1525406120", + "251903037", + "51794486", + "1175968849", + "143302711", + "803802519", + "1809533084", + "1998993695", + "1708092527", + "1273474736", + "873476535", + "275355270", + "1881623804", + "1694759665", + "519454072", + "1271049468", + "965725827", + "1377565730", + "1534783840", + "118589507", + "1763458286", + "1299848579", + "1818818784", + "1298792391", + "18748465", + "2093544099", + "789349329", + "1771642587", + "2029257668", + "886240497", + "1515549989", + "470666554", + "1033970323", + "1356328161", + "2001348289", + "1166827299", + "1628592815", + "1730126885", + "1172100951", + "636394245", + "538088442", + "1495884564", + "337567777", + "1242695243", + "1235522856", + "273289738", + "849980873", + "1243780424", + "1381458812", + "1634806283", + "2005216396", + "2119671634", + "804636145", + "493141571", + "1470601782", + "767740162", + "1955110586", + "2125777868", + "1717556091", + "1208666954", + "239242418", + "1664562798", + "898875156", + "1822500661", + "1253604891", + "1037949950", + "1605225677", + "324840651", + "1867190270", + "987129199", + "72228545", + "570017505", + "2125735480", + "1246531922", + "2077451194", + "2067291500", + "1507814390", + "512799551", + "935734957", + "22183593", + "2008397689", + "1757143325", + "357257348", + "1399113219", + "502682647", + "1302415291", + "1026521890", + "1910243913", + "189336976", + "1747191353", + "66317829", + "556304944", + "892894423", + "13071202", + "364603253", + "1905399892", + "2039066715", + "980272165", + "471297533", + "1120215717", + "341074138", + "833620711", + "1773477462", + "1648529576", + "1717086442", + "1529855304", + "30821705", + "54982573", + "166448191", + "367089368", + "1925118235", + "1156378671", + "85912459", + "800093276", + "1209092224", + "507585739", + "241599649", + "1298349610", + "717166369", + "1918940694", + "2020190616", + "875622164", + "1121909485", + "1318374076", + "719929831", + "33049782", + "1241271341", + "1860705126", + "188555468", + 
"331574057", + "892184703", + "132324182", + "447268033", + "1666932985", + "896728976", + "625143560", + "642106930", + "1479949359", + "962538324", + "1351473501", + "1350084321", + "634604279", + "651010669", + "1710617297", + "1786133237", + "1836178707", + "178655329", + "1005806098", + "647883530", + "491263876", + "742623369", + "1225977682", + "609840934", + "262493375", + "1817625736", + "851876804", + "116846863", + "482104224", + "182435635", + "1800473859", + "464579314", + "784188952", + "1145479740", + "82547782", + "1191790433", + "1516935909", + "1915490129", + "221160100", + "1039122510", + "135079081", + "1890725869", + "1414313988", + "435965273", + "2100901019", + "956108557", + "852051468", + "215978015", + "1619898518", + "335233570", + "645955139", + "2089655999", + "1952368683", + "110226932", + "414393035", + "1940216659", + "606903490", + "801992318", + "1795037139", + "1944641014", + "338217721", + "1772225188", + "1437405677", + "577111954", + "1213474759", + "1412448801", + "1630998238", + "2028972096", + "1954486245", + "554881046", + "486139363", + "1955044575", + "1716119121", + "1413470141", + "1235488379", + "1947097143", + "51839062", + "1098823450", + "1776555549", + "1054422789", + "1287793198", + "1218894707", + "1717438299", + "1678471910", + "2127656296", + "508327038", + "1298781710", + "1554343818", + "543658051", + "1952900913", + "222975945", + "1980671412", + "1116319291", + "971488171", + "1271433092", + "2057211183", + "1435463445", + "1216794970", + "1757057291", + "2129295285", + "385812622", + "1121966900", + "1265829404", + "539740951", + "762135368", + "289845774", + "1065927262", + "438823212", + "975747090", + "1932253198", + "747594956", + "1726467543", + "672035346", + "1513444312", + "1016055497", + "2013243112", + "5608477", + "1835436625", + "1461188568", + "1542617412", + "719551680", + "1001844554", + "1987466706", + "47134690", + "887055815", + "1645318138", + "343451413", + "356923837", + "1860805237", 
+ "1645788865", + "564658356", + "1681428394", + "1026009371", + "296031169", + "848058855", + "44526132", + "1186467699", + "285098113", + "385356919", + "1700760264", + "1824863704", + "1974601140", + "1289441403", + "825862710", + "923824164", + "1022632961", + "1559950607", + "819627156", + "413873219", + "1998844010", + "1044439460", + "1676265531", + "1816634825", + "744874524", + "284438414", + "1492887807", + "1350167408", + "951358689", + "52288350", + "1111148466", + "436998629", + "1625515048", + "973335102", + "1551865677", + "519294385", + "237423522", + "1950388039", + "199635828", + "926062376", + "1569216823", + "681404348", + "143122759", + "2129851107", + "1549248987", + "1636929074", + "1108672170", + "1654900290", + "330383288", + "1922342481", + "1939578680", + "1023033584", + "406520463", + "1715305481", + "805455510", + "1156757447", + "520279859", + "386123843", + "613654258", + "2111380782", + "1236264538", + "2118697359", + "2126900671", + "76756620", + "1817164119", + "1892606135", + "763785521", + "1536881037", + "1541835177", + "1167793086", + "2067409637", + "993852103", + "1944352929", + "1626509626", + "932482653", + "1970198222", + "1667970280", + "112911181", + "839614855", + "645101241", + "1139094211", + "1005493034", + "481104477", + "1380328031", + "983440465", + "1322699180", + "322246526", + "1269012453", + "1196088758", + "1342920265", + "1884278873", + "588717273", + "550244931", + "1799156819", + "148553338", + "557523097", + "1141586655", + "1384870534", + "1261012381", + "1302756307", + "1390041415", + "1008472891", + "1126390558", + "1698537527", + "123603568", + "130126046", + "299957640", + "122695167", + "320894859", + "1353228096", + "1012875068", + "2028121606", + "1972693728", + "1423676515", + "1936100958", + "434301508", + "1182432750", + "414797518", + "128668857", + "1000607966", + "952411103", + "854052761", + "2012976696", + "877495740", + "1147117654", + "1910464764", + "676962795", + "1006557677", + 
"646748963", + "514340212", + "1538961319", + "1167218819", + "1442972626", + "1824517370", + "141299849", + "429868875", + "315978153", + "2063594733", + "94438822", + "985743783", + "1399168405", + "1892767584", + "1493431434", + "1567383526", + "794276366", + "1044210445", + "1368028713", + "205317951", + "936640231", + "9607630", + "1438955550", + "658008538", + "836951612", + "305139284", + "412385068", + "1815330615", + "230395717", + "602906284", + "1366815948", + "2034680970", + "1474461506", + "43057801", + "998725046" ] }, { @@ -1059,41 +1309,41 @@ "seed": 5, "logTwoDegree": 5, "logTwoBound": 16, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "2003778851", - "1094439305", - "1748204053", - "1893879957", - "853917697", - "1558309440", - "71413910", - "1425848263", - "1510352021", - "1211700477", - "478185661", - "1395703038", - "18737592", - "2014299672", - "1751777608", - "1916056171", - "1225296622", - "1208391412", - "1845225131", - "1704298162", - "1445536840", - "185443427", - "263834900", - "1257087723", - "1882409437", - "52983250", - "369032977", - "1123326106", - "1216674076", - "826801412", - "379132058", - "1051986355" + "722047616", + "160476505", + "550016991", + "133191089", + "397722478", + "1449336459", + "569388058", + "372036602", + "1358945922", + "246284442", + "890900408", + "2062906406", + "350898585", + "821031014", + "116886315", + "209430438", + "439867212", + "1622254393", + "1309128652", + "1189898547", + "1631155457", + "200857851", + "654642837", + "1227726705", + "861889228", + "1803210850", + "264157072", + "1152893012", + "1705941477", + "356536608", + "243661577", + "1504442645" ] }, { @@ -1101,73 +1351,73 @@ "seed": 5, "logTwoDegree": 6, "logTwoBound": 16, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "2048731893", - "970970551", - "1629151610", - "1715065999", - "29680993", - "286116916", - "1840456830", - "813463422", - "113350468", - "1762882656", - "221898775", 
- "2033252476", - "1269293550", - "1815392925", - "1920385464", - "1945669355", - "564838800", - "1833188257", - "291163817", - "1704298162", - "1445536840", - "185443427", - "263834900", - "1257087723", - "1882409437", - "52983250", - "369032977", - "1123326106", - "1216674076", - "826801412", - "379132058", - "1051986355", - "389832127", - "713533640", - "533579972", - "1303084217", - "1364574573", - "1451592063", - "386337148", - "1079454453", - "393871607", - "2026690023", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868" + "1663541077", + "1620772206", + "1555766478", + "951160539", + "1204701239", + "2083311821", + "798328262", + "370930055", + "1418521849", + "1385261598", + "1520061736", + "1303135992", + "456273459", + "133912129", + "703567540", + "869181227", + "409770557", + "111427581", + "158821737", + "1024318160", + "735714719", + "596571106", + "740676764", + "47113156", + "1432514829", + "107227057", + "1320270654", + "2083635348", + "981747038", + "791258237", + "1372801684", + "377713592", + "1607127233", + "720023709", + "68330780", + "1523147786", + "1770818369", + "1197266145", + "576133663", + "314511529", + "503712573", + "1338371661", + "1180918802", + "1433713714", + "1430845982", + "1447946506", + "701524322", + "1025638119", + "1044709523", + "1910033653", + "21651983", + "216334251", + "1510516031", + "649199605", + "1798903508", + "1712743383", + "1243750011", + "1188994784", + "2098171565", + "406675288", + "2070971319", + "1777142224", + "1822102120", + "944604655" ] }, { @@ -1175,137 +1425,137 @@ "seed": 5, "logTwoDegree": 7, "logTwoBound": 16, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "1222949757", - 
"1204348041", - "1258250989", - "346657268", - "1600871977", - "1884513988", - "1942024302", - "955967047", - "390729908", - "1706584310", - "1095173138", - "1183685087", - "1964780518", - "914966275", - "1672703767", - "117852045", - "826876886", - "1129447777", - "653365857", - "1704298162", - "1445536840", - "185443427", - "263834900", - "1257087723", - "1882409437", - "52983250", - "369032977", - "1123326106", - "1216674076", - "826801412", - "379132058", - "1051986355", - "389832127", - "713533640", - "533579972", - "1303084217", - "1364574573", - "1451592063", - "386337148", - "1079454453", - "393871607", - "2026690023", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968" + "776412672", + 
"2094322292", + "1858935634", + "139502662", + "1004767924", + "424703458", + "689251301", + "702265545", + "55465742", + "1697215255", + "1311827567", + "1500293014", + "532107633", + "221510384", + "2051503418", + "1843145628", + "320605521", + "654607422", + "497941996", + "90766221", + "1222284874", + "1236895407", + "1948142799", + "1408532788", + "1122764551", + "248632171", + "1539797351", + "1637296186", + "1788137987", + "1018147009", + "131202692", + "1867379995", + "410666953", + "1170204873", + "261669264", + "651725550", + "560668669", + "662217253", + "2112160662", + "468963444", + "515999476", + "1003908782", + "1318742814", + "602266939", + "854469049", + "1862507144", + "1224480302", + "1216306374", + "754298130", + "1497206257", + "1423446877", + "1417072028", + "1435365835", + "1322872564", + "1207739903", + "1379528770", + "403835319", + "853917626", + "1576311969", + "1256681036", + "1265044948", + "1146757168", + "1066461018", + "431240030", + "1533978012", + "680612876", + "1420982686", + "315860621", + "406532135", + "1916624277", + "587359251", + "1674878114", + "1485298909", + "1630039016", + "1071444600", + "1842347494", + "1065527984", + "444731975", + "657585266", + "536620993", + "360812084", + "1252209162", + "950410227", + "8642286", + "11825511", + "1521864621", + "932737879", + "573403048", + "1942089446", + "1797958151", + "1600213697", + "1055579171", + "2076240895", + "675340442", + "1790321360", + "247905816", + "956309655", + "69265983", + "1054609528", + "1788784058", + "851058522", + "1202741794", + "1212242317", + "1480983624", + "1830951467", + "2048577824", + "1515980252", + "1141218072", + "1743810256", + "1238921841", + "1076083292", + "996249286", + "542076426", + "1899781890", + "294977605", + "1571210100", + "331957351", + "1003041636", + "507246222", + "1778399857", + "1991186879", + "1826553142", + "876858751", + "927813053", + "1812708857", + "1046941267", + "992212879", + "1817654080" ] }, { @@ -1313,265 
+1563,265 @@ "seed": 5, "logTwoDegree": 8, "logTwoBound": 16, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "391151845", - "1853827466", - "757615244", - "1653682306", - "2057412302", - "745560852", - "2048039427", - "502843583", - "1950078149", - "1714221719", - "478185661", - "1395703038", - "18737592", - "2014299672", - "1751777608", - "1916056171", - "1225296622", - "1208391412", - "1845225131", - "1704298162", - "1445536840", - "185443427", - "263834900", - "1257087723", - "1882409437", - "52983250", - "369032977", - "1123326106", - "1216674076", - "826801412", - "379132058", - "1051986355", - "389832127", - "713533640", - "533579972", - "1303084217", - "1364574573", - "1451592063", - "386337148", - "1079454453", - "393871607", - "2026690023", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - 
"1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - 
"133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064" + "1740114321", + "312433114", + "693699342", + "187374852", + "433595800", + "152171531", + "473376715", + "721566873", + "2070912942", + "1451850732", + "1221992874", + "1204552651", + "1606579872", + "258721447", + "1634362959", + "803001890", + "1127856650", + "784060990", + "209914772", + "1028837758", + "1655575616", + "1626362547", + "1727812682", + "420943131", + "922095954", + "1104774948", + "1072352781", + "304845750", + "1050481898", + "1787285712", + "563775723", + "2048714414", + "1393035024", + "1234812393", + "1813219736", + "1931748604", + "719509039", + "1231188793", + "1174390882", + "284033733", + "245395860", + "1094158466", + "1635951225", + "1605676745", + "1729024508", + "1668405101", + "372679541", + "1587405912", + "122473937", + "1698569469", + "703327672", + "1876668505", + "453455784", + "339878202", + "167020220", + "1131971630", + "529539955", + "1698230225", + "566741394", + "1483148257", + "776472743", + "443570310", + "803751198", + "347093015", + "1310965756", + "1266520968", + "1068334781", + "631529589", + "2112517705", + "1048584525", + "991424266", + "1380333742", + "1513871372", + "14438595", + "908051283", + "871193761", + "1720858514", + "1346413836", + "152559594", + "1263574013", + "581184795", + "1754154562", + "1388807312", + "2078156145", + "764263429", + "633473677", + "2011492910", + "886190543", + "1892728795", + "307921864", + "1095097", + "1106444916", + "2046006476", + "1741493538", + "1592546537", + "1994428311", + "917590726", + "9383464", + "1710009311", + "1306279658", + "278532269", + "1661716220", + "1931831185", + "734124116", + "393452512", + "1121459481", + "49390954", + "1363313715", + "1802905459", + "349714937", + "948249035", + "85816015", + "1195397427", + "163172526", + "857672637", + "1371512271", + "112922116", + "1421959006", + "870686649", + "751927534", + 
"1659159648", + "1604688655", + "943596423", + "547539405", + "2030827096", + "2039054112", + "1085956688", + "1393792112", + "1481743103", + "1869302376", + "2061032836", + "1837511752", + "751065773", + "785226810", + "927247021", + "1879579929", + "62648077", + "328989515", + "165665551", + "883796337", + "1206433187", + "1523046397", + "991569002", + "218887094", + "1563174692", + "269223010", + "1285666636", + "727454980", + "633246141", + "1729458457", + "305081776", + "261756030", + "1768252964", + "762152685", + "1989587758", + "297670324", + "175762417", + "1702237575", + "1502673863", + "599526163", + "1559494958", + "655795150", + "1994434626", + "1894979880", + "877327639", + "1025402899", + "1376450540", + "3497597", + "829395895", + "355187825", + "2128651236", + "1593494448", + "1850794957", + "198055428", + "1289341622", + "962263362", + "1150022105", + "273503448", + "1979673526", + "1090766982", + "1830757551", + "2028039782", + "1396950030", + "1170596404", + "436415023", + "1139220835", + "1992416357", + "124947232", + "1777492060", + "140782923", + "1659775546", + "1791972048", + "405516779", + "314389643", + "744278552", + "881742292", + "1234353235", + "1585033357", + "1846910310", + "1982469622", + "164278756", + "919489373", + "850966333", + "1206017956", + "1251846995", + "1030943295", + "2076881579", + "2129614105", + "384947899", + "326937540", + "596620239", + "1146628856", + "1454936317", + "2014131343", + "762884434", + "401456989", + "1345006180", + "660127810", + "695298375", + "1276557422", + "1796260351", + "1181543482", + "879699543", + "774001853", + "374645809", + "1611040058", + "170291950", + "857643414", + "1607301431", + "1984240616", + "1003279315", + "1753261740", + "1758491124", + "913221590", + "1468600409", + "617081834", + "1120890872", + "1103443416", + "1574331184", + "594842561", + "1874191630", + "175515368", + "833409088", + "1295821384", + "1716211461", + "1572572105", + "149925689", + "1913416197", + 
"1444660357", + "789617967", + "351847699", + "600422309", + "1422851332", + "1929756193", + "908392303", + "686476990" ] }, { @@ -1579,521 +1829,521 @@ "seed": 5, "logTwoDegree": 9, "logTwoBound": 16, - "maxNbElementsToHash": 10 + "maxNbElementsToHash": 260 }, "expected": [ - "2048731893", - "970970551", - "1629151610", - "1715065999", - "29680993", - "286116916", - "1840456830", - "813463422", - "113350468", - "1762882656", - "221898775", - "2033252476", - "1269293550", - "1815392925", - "1920385464", - "1945669355", - "564838800", - "1833188257", - "291163817", - "1704298162", - "1445536840", - "185443427", - "263834900", - "1257087723", - "1882409437", - "52983250", - "369032977", - "1123326106", - "1216674076", - "826801412", - "379132058", - "1051986355", - "389832127", - "713533640", - "533579972", - "1303084217", - "1364574573", - "1451592063", - "386337148", - "1079454453", - "393871607", - "2026690023", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - 
"2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - 
"1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - 
"133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - 
"2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868", - "161079968", - "133634218", - "540418064", - "2002459133", - "2084851912", - "1524168781", - "1543281868" + "986264594", + "1692396266", + "182803964", + "2091370941", + "655733963", + "1808145382", + "1198331190", + "1419844844", + "1868633474", + "1663618122", + "1720419526", + "1752796518", + "181336686", + "1214436794", + "176300116", + "860587052", + "956877570", + "464555043", + "1032363463", + "83756189", + "100054838", + "1795489986", + "1870440945", + "695185336", + "1314586701", + "1129771373", + "1398675175", + "407561970", + "956500084", + "1599693105", + "1901112900", + "510447346", + "941831950", + "2084546012", + "1192915792", + "1594383079", + "1412827808", + "1480270072", + "1117292149", + "450430305", + "1936195845", + "1897488012", + "486826489", + "255411132", + "480791010", + "1618479756", + "1994601678", + "59430082", + "1301752386", + "221948512", + "106046562", + "1402736318", + "822067740", + "417355890", + "1203850374", + "293487852", + "314555358", + "425599746", + "522850996", + "198961504", + "1028850826", + "1131667912", + "999140207", + "434911644", + "894040109", + "345061555", + "1411492055", + "513594238", + "809487618", + "1745010598", + "353743794", + "581550145", + "344352992", + "661728817", + "100312215", + "750511328", + "808862055", + "1885559743", + "1409899012", + "1024642618", + "133497911", + "662484233", + "341972897", + "2050513191", + "55753190", + "1608117034", + "1131378651", + "1456611257", + "149440808", + "154590798", + "1236035990", + "319793399", + "455930014", + "1080775727", + "918807393", + "1248978189", + "966549524", + "471141584", + "1142584898", + "15424098", + "1152338420", + "20712085", + "2035913510", + "350204463", + "1623469059", + "248984121", + "1615672262", + "1514892014", + "1569090694", + "923627119", + "1033997708", + 
"865496430", + "1034835073", + "1244346076", + "1217577831", + "618797284", + "803960188", + "764445667", + "147883763", + "147878960", + "1892269746", + "472770860", + "211493259", + "1171535841", + "936190359", + "1332541447", + "353776739", + "286091426", + "2061446573", + "129199282", + "62623652", + "1171808508", + "225701193", + "1649845686", + "1078782182", + "799911231", + "871663571", + "536433912", + "1773398259", + "1224572160", + "947087151", + "292837335", + "839677239", + "1005610030", + "441724360", + "1561428544", + "98069769", + "506564843", + "856357487", + "551762755", + "1590482368", + "261763343", + "1745798537", + "495316719", + "207744472", + "1546386486", + "1953856986", + "1294493843", + "693626668", + "815547455", + "876391248", + "1877818338", + "147078742", + "713552502", + "530401685", + "1451970978", + "910023984", + "1667834590", + "813125095", + "1783637388", + "757018920", + "161762242", + "168801910", + "1041020963", + "1475876644", + "1760449181", + "2107529580", + "1833716406", + "1162855453", + "830139965", + "1977606774", + "1531432369", + "1531287898", + "1291206687", + "1619292584", + "1928505058", + "1831553097", + "1635961412", + "1581258400", + "1570895062", + "1957851390", + "1031095008", + "1006898982", + "475334967", + "485049014", + "1416320592", + "620370351", + "283423995", + "282680151", + "1173372366", + "519348194", + "1469255624", + "1142502363", + "46292506", + "208631733", + "1249313464", + "735585937", + "980619559", + "770513260", + "1997500095", + "1408538724", + "337576838", + "1223785776", + "501557211", + "1147673852", + "1444627228", + "775146094", + "583160981", + "1152617790", + "1649457911", + "176417060", + "1507340041", + "1113126687", + "2105495718", + "32228150", + "707542069", + "1051999370", + "240834067", + "120147332", + "1787746960", + "1512989215", + "1595845826", + "1030781183", + "67128482", + "355091837", + "286338443", + "1692898961", + "1746632959", + "1290385962", + "168036102", + 
"1576350153", + "1288864681", + "1780937944", + "1305126170", + "1214931435", + "1147265356", + "248685443", + "1880309012", + "2058398653", + "1482502277", + "305462347", + "437314159", + "1456761080", + "771713157", + "1161388055", + "1335849066", + "1317782458", + "2123081509", + "249682382", + "1379157382", + "712124405", + "1683800131", + "1662798945", + "1316703630", + "1642206708", + "1759947309", + "984470243", + "296370498", + "109062082", + "385857206", + "1410020637", + "2090833607", + "551092678", + "1128241865", + "709610711", + "865526289", + "372632244", + "611744810", + "127521643", + "1310641425", + "1130853726", + "761597010", + "829420330", + "322368997", + "1886209512", + "1700936277", + "455358882", + "96891105", + "959923001", + "1788440080", + "1742326996", + "1913409311", + "125669147", + "198480918", + "578223029", + "697521182", + "643434943", + "1765439545", + "1290462939", + "1471377653", + "1580764765", + "1428573375", + "272930648", + "1953269969", + "1412758577", + "692068128", + "1036910026", + "1391532306", + "356988438", + "1921358892", + "814897862", + "954153585", + "335154981", + "178122106", + "1764341780", + "1258867484", + "483062647", + "503538112", + "1831615654", + "1131089297", + "1213584611", + "1116944419", + "1023102790", + "106071008", + "884508754", + "338110929", + "634105347", + "645059269", + "701970059", + "1672127065", + "883533249", + "1602919272", + "969192260", + "778997549", + "614665703", + "1568181826", + "606000837", + "169008972", + "812840338", + "977248979", + "1103529728", + "1245226112", + "1864914030", + "1353966855", + "1045154547", + "891820738", + "450940682", + "1124469230", + "1951936245", + "1174642412", + "114837412", + "1834898018", + "596977812", + "1994204863", + "1094681637", + "730771253", + "750174378", + "1923343892", + "471478745", + "209586178", + "1651904839", + "1187749648", + "1298737223", + "380396591", + "418266754", + "210291953", + "1382032623", + "867443477", + "1300998767", 
+ "2107006925", + "1446947136", + "2084505769", + "560617700", + "1475181380", + "829467536", + "1709359817", + "1155990755", + "231640324", + "1065670836", + "1226820069", + "736751272", + "2031557805", + "267896773", + "118754167", + "1861056790", + "1875645440", + "910662509", + "788583675", + "2086902245", + "681492576", + "299856476", + "1141753062", + "660427786", + "5006379", + "360384907", + "411398515", + "1029250661", + "203481810", + "2044985095", + "57239683", + "82837438", + "460428524", + "1744466530", + "203675533", + "2023089901", + "1627375186", + "351907149", + "575443124", + "558492831", + "332269349", + "1778713992", + "1393203857", + "618100066", + "137575050", + "552398247", + "1578995748", + "1182212373", + "680438006", + "1697154387", + "513779831", + "696113455", + "496306153", + "1949274843", + "1245049930", + "1630578703", + "219053157", + "2079763793", + "1771105676", + "96316742", + "91516634", + "1864301363", + "213291695", + "1284030720", + "285337634", + "1563686513", + "1002951550", + "1007668897", + "670954520", + "1616812911", + "1659870119", + "226077586", + "805067145", + "557879422", + "961788281", + "1733578756", + "764529438", + "1674870183", + "1813584308", + "1053857448", + "395621175", + "39982737", + "1888129024", + "340087668", + "1592147667", + "881718376", + "1439187250", + "846317770", + "144443074", + "800063897", + "402695653", + "360792495", + "920355201", + "1274049172", + "1055986698", + "36989751", + "513119574", + "1743034809", + "634740231", + "844997462", + "297931486", + "405361896", + "573407073", + "1713577225", + "1443032106", + "1204613567", + "1777349413", + "627086311", + "1335295830", + "1250594928", + "1303807660", + "1563592465", + "1310915053", + "806651244", + "1303187165", + "1402939667", + "1099399610", + "600033133", + "1645445940", + "459481647", + "1242883796", + "304296133", + "329185352", + "1168448016", + "1326316286", + "49670174", + "283890362", + "805898156", + "1785237536", + 
"289537142", + "1307941862", + "195664666", + "1361320773", + "1917022400", + "1913610475", + "150631635", + "1525565425", + "904228078", + "2068480327", + "2125218048", + "631788173", + "480016815", + "1556451543" ] } ]