diff --git a/ecc/bls12-377/fr/fft/fft.go b/ecc/bls12-377/fr/fft/fft.go index ff9623cfb..afdb821d3 100644 --- a/ecc/bls12-377/fr/fft/fft.go +++ b/ecc/bls12-377/fr/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []fr.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) // find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - 
innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/ecc/bls12-377/fr/sis/sis.go b/ecc/bls12-377/fr/sis/sis.go index e15973275..5be811d82 100644 --- a/ecc/bls12-377/fr/sis/sis.go +++ b/ecc/bls12-377/fr/sis/sis.go @@ -37,6 +37,9 @@ type RSis struct { Domain *fft.Domain maxNbElementsToHash int + + smallFFT func([]fr.Element) + twiddlesCoset []fr.Element // used in conjunction with the smallFFT; } // NewRSis creates an instance of RSis. @@ -97,6 +100,18 @@ func NewRSis(seed int64, logTwoDegree, logTwoBound, maxNbElementsToHash int) (*R maxNbElementsToHash: maxNbElementsToHash, } + r.smallFFT = func(p []fr.Element) { + r.Domain.FFT(p, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + } + + // if we have a FFT kernel of the size of the domain cardinality, we use it. 
+ if r.Domain.Cardinality == 64 { + r.twiddlesCoset = PrecomputeTwiddlesCoset(r.Domain.Generator, shift) + r.smallFFT = func(a []fr.Element) { + FFT64(a, r.twiddlesCoset) + } + } + // filling A a := make([]fr.Element, n*r.Degree) ag := make([]fr.Element, n*r.Degree) @@ -171,7 +186,16 @@ func (r *RSis) InnerHash(it *LimbIterator, res, k fr.Vector, polId int) { return } - r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // for perf, go through r.smallFFT: the unrolled FFT64 kernel when the domain has 64 elements, Domain.FFT otherwise. + r.smallFFT(k) + // k.Mul(k, fr.Vector(r.cosetTable)) + // if r.Domain.KernelDIF != nil { + // r.Domain.KernelDIF(k) + // } else { + // r.Domain.FFT(k, fft.DIF, fft.WithNbTasks(1)) + // } + mulModAcc(res, r.Ag[polId], k) } diff --git a/ecc/bls12-377/fr/sis/sis_fft.go b/ecc/bls12-377/fr/sis/sis_fft.go new file mode 100644 index 000000000..f4f4db3ab --- /dev/null +++ b/ecc/bls12-377/fr/sis/sis_fft.go @@ -0,0 +1,556 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package sis + +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "math/big" +) + +// FFT64 is generated by gnark-crypto and contains the unrolled code for FFT (DIF) on 64 elements +// equivalent code: r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) +// twiddlesCoset must be pre-computed from twiddles and coset table, see PrecomputeTwiddlesCoset +func FFT64(a []fr.Element, twiddlesCoset []fr.Element) { + + a[32].Mul(&a[32], &twiddlesCoset[0]) + a[33].Mul(&a[33], &twiddlesCoset[0]) + a[34].Mul(&a[34], &twiddlesCoset[0]) + a[35].Mul(&a[35], &twiddlesCoset[0]) + a[36].Mul(&a[36], &twiddlesCoset[0]) + a[37].Mul(&a[37], &twiddlesCoset[0]) + a[38].Mul(&a[38], &twiddlesCoset[0]) + a[39].Mul(&a[39], &twiddlesCoset[0]) + a[40].Mul(&a[40], &twiddlesCoset[0]) + a[41].Mul(&a[41], &twiddlesCoset[0]) + a[42].Mul(&a[42], &twiddlesCoset[0]) + a[43].Mul(&a[43], &twiddlesCoset[0]) + a[44].Mul(&a[44], &twiddlesCoset[0]) + a[45].Mul(&a[45], &twiddlesCoset[0]) + a[46].Mul(&a[46], &twiddlesCoset[0]) + a[47].Mul(&a[47], &twiddlesCoset[0]) + a[48].Mul(&a[48], &twiddlesCoset[0]) + a[49].Mul(&a[49], &twiddlesCoset[0]) + a[50].Mul(&a[50], &twiddlesCoset[0]) + a[51].Mul(&a[51], &twiddlesCoset[0]) + a[52].Mul(&a[52], &twiddlesCoset[0]) + a[53].Mul(&a[53], &twiddlesCoset[0]) + a[54].Mul(&a[54], &twiddlesCoset[0]) + a[55].Mul(&a[55], &twiddlesCoset[0]) + a[56].Mul(&a[56], &twiddlesCoset[0]) + a[57].Mul(&a[57], &twiddlesCoset[0]) + a[58].Mul(&a[58], &twiddlesCoset[0]) + a[59].Mul(&a[59], &twiddlesCoset[0]) + a[60].Mul(&a[60], &twiddlesCoset[0]) + a[61].Mul(&a[61], &twiddlesCoset[0]) + a[62].Mul(&a[62], &twiddlesCoset[0]) + a[63].Mul(&a[63], &twiddlesCoset[0]) + fr.Butterfly(&a[0], &a[32]) + fr.Butterfly(&a[1], &a[33]) + fr.Butterfly(&a[2], &a[34]) + fr.Butterfly(&a[3], &a[35]) + fr.Butterfly(&a[4], &a[36]) + fr.Butterfly(&a[5], &a[37]) + fr.Butterfly(&a[6], &a[38]) + fr.Butterfly(&a[7], &a[39]) + 
fr.Butterfly(&a[8], &a[40]) + fr.Butterfly(&a[9], &a[41]) + fr.Butterfly(&a[10], &a[42]) + fr.Butterfly(&a[11], &a[43]) + fr.Butterfly(&a[12], &a[44]) + fr.Butterfly(&a[13], &a[45]) + fr.Butterfly(&a[14], &a[46]) + fr.Butterfly(&a[15], &a[47]) + fr.Butterfly(&a[16], &a[48]) + fr.Butterfly(&a[17], &a[49]) + fr.Butterfly(&a[18], &a[50]) + fr.Butterfly(&a[19], &a[51]) + fr.Butterfly(&a[20], &a[52]) + fr.Butterfly(&a[21], &a[53]) + fr.Butterfly(&a[22], &a[54]) + fr.Butterfly(&a[23], &a[55]) + fr.Butterfly(&a[24], &a[56]) + fr.Butterfly(&a[25], &a[57]) + fr.Butterfly(&a[26], &a[58]) + fr.Butterfly(&a[27], &a[59]) + fr.Butterfly(&a[28], &a[60]) + fr.Butterfly(&a[29], &a[61]) + fr.Butterfly(&a[30], &a[62]) + fr.Butterfly(&a[31], &a[63]) + a[16].Mul(&a[16], &twiddlesCoset[1]) + a[17].Mul(&a[17], &twiddlesCoset[1]) + a[18].Mul(&a[18], &twiddlesCoset[1]) + a[19].Mul(&a[19], &twiddlesCoset[1]) + a[20].Mul(&a[20], &twiddlesCoset[1]) + a[21].Mul(&a[21], &twiddlesCoset[1]) + a[22].Mul(&a[22], &twiddlesCoset[1]) + a[23].Mul(&a[23], &twiddlesCoset[1]) + a[24].Mul(&a[24], &twiddlesCoset[1]) + a[25].Mul(&a[25], &twiddlesCoset[1]) + a[26].Mul(&a[26], &twiddlesCoset[1]) + a[27].Mul(&a[27], &twiddlesCoset[1]) + a[28].Mul(&a[28], &twiddlesCoset[1]) + a[29].Mul(&a[29], &twiddlesCoset[1]) + a[30].Mul(&a[30], &twiddlesCoset[1]) + a[31].Mul(&a[31], &twiddlesCoset[1]) + a[48].Mul(&a[48], &twiddlesCoset[2]) + a[49].Mul(&a[49], &twiddlesCoset[2]) + a[50].Mul(&a[50], &twiddlesCoset[2]) + a[51].Mul(&a[51], &twiddlesCoset[2]) + a[52].Mul(&a[52], &twiddlesCoset[2]) + a[53].Mul(&a[53], &twiddlesCoset[2]) + a[54].Mul(&a[54], &twiddlesCoset[2]) + a[55].Mul(&a[55], &twiddlesCoset[2]) + a[56].Mul(&a[56], &twiddlesCoset[2]) + a[57].Mul(&a[57], &twiddlesCoset[2]) + a[58].Mul(&a[58], &twiddlesCoset[2]) + a[59].Mul(&a[59], &twiddlesCoset[2]) + a[60].Mul(&a[60], &twiddlesCoset[2]) + a[61].Mul(&a[61], &twiddlesCoset[2]) + a[62].Mul(&a[62], &twiddlesCoset[2]) + a[63].Mul(&a[63], &twiddlesCoset[2]) + 
fr.Butterfly(&a[0], &a[16]) + fr.Butterfly(&a[1], &a[17]) + fr.Butterfly(&a[2], &a[18]) + fr.Butterfly(&a[3], &a[19]) + fr.Butterfly(&a[4], &a[20]) + fr.Butterfly(&a[5], &a[21]) + fr.Butterfly(&a[6], &a[22]) + fr.Butterfly(&a[7], &a[23]) + fr.Butterfly(&a[8], &a[24]) + fr.Butterfly(&a[9], &a[25]) + fr.Butterfly(&a[10], &a[26]) + fr.Butterfly(&a[11], &a[27]) + fr.Butterfly(&a[12], &a[28]) + fr.Butterfly(&a[13], &a[29]) + fr.Butterfly(&a[14], &a[30]) + fr.Butterfly(&a[15], &a[31]) + fr.Butterfly(&a[32], &a[48]) + fr.Butterfly(&a[33], &a[49]) + fr.Butterfly(&a[34], &a[50]) + fr.Butterfly(&a[35], &a[51]) + fr.Butterfly(&a[36], &a[52]) + fr.Butterfly(&a[37], &a[53]) + fr.Butterfly(&a[38], &a[54]) + fr.Butterfly(&a[39], &a[55]) + fr.Butterfly(&a[40], &a[56]) + fr.Butterfly(&a[41], &a[57]) + fr.Butterfly(&a[42], &a[58]) + fr.Butterfly(&a[43], &a[59]) + fr.Butterfly(&a[44], &a[60]) + fr.Butterfly(&a[45], &a[61]) + fr.Butterfly(&a[46], &a[62]) + fr.Butterfly(&a[47], &a[63]) + a[8].Mul(&a[8], &twiddlesCoset[3]) + a[9].Mul(&a[9], &twiddlesCoset[3]) + a[10].Mul(&a[10], &twiddlesCoset[3]) + a[11].Mul(&a[11], &twiddlesCoset[3]) + a[12].Mul(&a[12], &twiddlesCoset[3]) + a[13].Mul(&a[13], &twiddlesCoset[3]) + a[14].Mul(&a[14], &twiddlesCoset[3]) + a[15].Mul(&a[15], &twiddlesCoset[3]) + a[24].Mul(&a[24], &twiddlesCoset[4]) + a[25].Mul(&a[25], &twiddlesCoset[4]) + a[26].Mul(&a[26], &twiddlesCoset[4]) + a[27].Mul(&a[27], &twiddlesCoset[4]) + a[28].Mul(&a[28], &twiddlesCoset[4]) + a[29].Mul(&a[29], &twiddlesCoset[4]) + a[30].Mul(&a[30], &twiddlesCoset[4]) + a[31].Mul(&a[31], &twiddlesCoset[4]) + a[40].Mul(&a[40], &twiddlesCoset[5]) + a[41].Mul(&a[41], &twiddlesCoset[5]) + a[42].Mul(&a[42], &twiddlesCoset[5]) + a[43].Mul(&a[43], &twiddlesCoset[5]) + a[44].Mul(&a[44], &twiddlesCoset[5]) + a[45].Mul(&a[45], &twiddlesCoset[5]) + a[46].Mul(&a[46], &twiddlesCoset[5]) + a[47].Mul(&a[47], &twiddlesCoset[5]) + a[56].Mul(&a[56], &twiddlesCoset[6]) + a[57].Mul(&a[57], &twiddlesCoset[6]) + 
a[58].Mul(&a[58], &twiddlesCoset[6]) + a[59].Mul(&a[59], &twiddlesCoset[6]) + a[60].Mul(&a[60], &twiddlesCoset[6]) + a[61].Mul(&a[61], &twiddlesCoset[6]) + a[62].Mul(&a[62], &twiddlesCoset[6]) + a[63].Mul(&a[63], &twiddlesCoset[6]) + fr.Butterfly(&a[0], &a[8]) + fr.Butterfly(&a[1], &a[9]) + fr.Butterfly(&a[2], &a[10]) + fr.Butterfly(&a[3], &a[11]) + fr.Butterfly(&a[4], &a[12]) + fr.Butterfly(&a[5], &a[13]) + fr.Butterfly(&a[6], &a[14]) + fr.Butterfly(&a[7], &a[15]) + fr.Butterfly(&a[16], &a[24]) + fr.Butterfly(&a[17], &a[25]) + fr.Butterfly(&a[18], &a[26]) + fr.Butterfly(&a[19], &a[27]) + fr.Butterfly(&a[20], &a[28]) + fr.Butterfly(&a[21], &a[29]) + fr.Butterfly(&a[22], &a[30]) + fr.Butterfly(&a[23], &a[31]) + fr.Butterfly(&a[32], &a[40]) + fr.Butterfly(&a[33], &a[41]) + fr.Butterfly(&a[34], &a[42]) + fr.Butterfly(&a[35], &a[43]) + fr.Butterfly(&a[36], &a[44]) + fr.Butterfly(&a[37], &a[45]) + fr.Butterfly(&a[38], &a[46]) + fr.Butterfly(&a[39], &a[47]) + fr.Butterfly(&a[48], &a[56]) + fr.Butterfly(&a[49], &a[57]) + fr.Butterfly(&a[50], &a[58]) + fr.Butterfly(&a[51], &a[59]) + fr.Butterfly(&a[52], &a[60]) + fr.Butterfly(&a[53], &a[61]) + fr.Butterfly(&a[54], &a[62]) + fr.Butterfly(&a[55], &a[63]) + a[4].Mul(&a[4], &twiddlesCoset[7]) + a[5].Mul(&a[5], &twiddlesCoset[7]) + a[6].Mul(&a[6], &twiddlesCoset[7]) + a[7].Mul(&a[7], &twiddlesCoset[7]) + a[12].Mul(&a[12], &twiddlesCoset[8]) + a[13].Mul(&a[13], &twiddlesCoset[8]) + a[14].Mul(&a[14], &twiddlesCoset[8]) + a[15].Mul(&a[15], &twiddlesCoset[8]) + a[20].Mul(&a[20], &twiddlesCoset[9]) + a[21].Mul(&a[21], &twiddlesCoset[9]) + a[22].Mul(&a[22], &twiddlesCoset[9]) + a[23].Mul(&a[23], &twiddlesCoset[9]) + a[28].Mul(&a[28], &twiddlesCoset[10]) + a[29].Mul(&a[29], &twiddlesCoset[10]) + a[30].Mul(&a[30], &twiddlesCoset[10]) + a[31].Mul(&a[31], &twiddlesCoset[10]) + a[36].Mul(&a[36], &twiddlesCoset[11]) + a[37].Mul(&a[37], &twiddlesCoset[11]) + a[38].Mul(&a[38], &twiddlesCoset[11]) + a[39].Mul(&a[39], &twiddlesCoset[11]) + 
a[44].Mul(&a[44], &twiddlesCoset[12]) + a[45].Mul(&a[45], &twiddlesCoset[12]) + a[46].Mul(&a[46], &twiddlesCoset[12]) + a[47].Mul(&a[47], &twiddlesCoset[12]) + a[52].Mul(&a[52], &twiddlesCoset[13]) + a[53].Mul(&a[53], &twiddlesCoset[13]) + a[54].Mul(&a[54], &twiddlesCoset[13]) + a[55].Mul(&a[55], &twiddlesCoset[13]) + a[60].Mul(&a[60], &twiddlesCoset[14]) + a[61].Mul(&a[61], &twiddlesCoset[14]) + a[62].Mul(&a[62], &twiddlesCoset[14]) + a[63].Mul(&a[63], &twiddlesCoset[14]) + fr.Butterfly(&a[0], &a[4]) + fr.Butterfly(&a[1], &a[5]) + fr.Butterfly(&a[2], &a[6]) + fr.Butterfly(&a[3], &a[7]) + fr.Butterfly(&a[8], &a[12]) + fr.Butterfly(&a[9], &a[13]) + fr.Butterfly(&a[10], &a[14]) + fr.Butterfly(&a[11], &a[15]) + fr.Butterfly(&a[16], &a[20]) + fr.Butterfly(&a[17], &a[21]) + fr.Butterfly(&a[18], &a[22]) + fr.Butterfly(&a[19], &a[23]) + fr.Butterfly(&a[24], &a[28]) + fr.Butterfly(&a[25], &a[29]) + fr.Butterfly(&a[26], &a[30]) + fr.Butterfly(&a[27], &a[31]) + fr.Butterfly(&a[32], &a[36]) + fr.Butterfly(&a[33], &a[37]) + fr.Butterfly(&a[34], &a[38]) + fr.Butterfly(&a[35], &a[39]) + fr.Butterfly(&a[40], &a[44]) + fr.Butterfly(&a[41], &a[45]) + fr.Butterfly(&a[42], &a[46]) + fr.Butterfly(&a[43], &a[47]) + fr.Butterfly(&a[48], &a[52]) + fr.Butterfly(&a[49], &a[53]) + fr.Butterfly(&a[50], &a[54]) + fr.Butterfly(&a[51], &a[55]) + fr.Butterfly(&a[56], &a[60]) + fr.Butterfly(&a[57], &a[61]) + fr.Butterfly(&a[58], &a[62]) + fr.Butterfly(&a[59], &a[63]) + a[2].Mul(&a[2], &twiddlesCoset[15]) + a[3].Mul(&a[3], &twiddlesCoset[15]) + a[6].Mul(&a[6], &twiddlesCoset[16]) + a[7].Mul(&a[7], &twiddlesCoset[16]) + a[10].Mul(&a[10], &twiddlesCoset[17]) + a[11].Mul(&a[11], &twiddlesCoset[17]) + a[14].Mul(&a[14], &twiddlesCoset[18]) + a[15].Mul(&a[15], &twiddlesCoset[18]) + a[18].Mul(&a[18], &twiddlesCoset[19]) + a[19].Mul(&a[19], &twiddlesCoset[19]) + a[22].Mul(&a[22], &twiddlesCoset[20]) + a[23].Mul(&a[23], &twiddlesCoset[20]) + a[26].Mul(&a[26], &twiddlesCoset[21]) + a[27].Mul(&a[27], 
&twiddlesCoset[21]) + a[30].Mul(&a[30], &twiddlesCoset[22]) + a[31].Mul(&a[31], &twiddlesCoset[22]) + a[34].Mul(&a[34], &twiddlesCoset[23]) + a[35].Mul(&a[35], &twiddlesCoset[23]) + a[38].Mul(&a[38], &twiddlesCoset[24]) + a[39].Mul(&a[39], &twiddlesCoset[24]) + a[42].Mul(&a[42], &twiddlesCoset[25]) + a[43].Mul(&a[43], &twiddlesCoset[25]) + a[46].Mul(&a[46], &twiddlesCoset[26]) + a[47].Mul(&a[47], &twiddlesCoset[26]) + a[50].Mul(&a[50], &twiddlesCoset[27]) + a[51].Mul(&a[51], &twiddlesCoset[27]) + a[54].Mul(&a[54], &twiddlesCoset[28]) + a[55].Mul(&a[55], &twiddlesCoset[28]) + a[58].Mul(&a[58], &twiddlesCoset[29]) + a[59].Mul(&a[59], &twiddlesCoset[29]) + a[62].Mul(&a[62], &twiddlesCoset[30]) + a[63].Mul(&a[63], &twiddlesCoset[30]) + fr.Butterfly(&a[0], &a[2]) + fr.Butterfly(&a[1], &a[3]) + fr.Butterfly(&a[4], &a[6]) + fr.Butterfly(&a[5], &a[7]) + fr.Butterfly(&a[8], &a[10]) + fr.Butterfly(&a[9], &a[11]) + fr.Butterfly(&a[12], &a[14]) + fr.Butterfly(&a[13], &a[15]) + fr.Butterfly(&a[16], &a[18]) + fr.Butterfly(&a[17], &a[19]) + fr.Butterfly(&a[20], &a[22]) + fr.Butterfly(&a[21], &a[23]) + fr.Butterfly(&a[24], &a[26]) + fr.Butterfly(&a[25], &a[27]) + fr.Butterfly(&a[28], &a[30]) + fr.Butterfly(&a[29], &a[31]) + fr.Butterfly(&a[32], &a[34]) + fr.Butterfly(&a[33], &a[35]) + fr.Butterfly(&a[36], &a[38]) + fr.Butterfly(&a[37], &a[39]) + fr.Butterfly(&a[40], &a[42]) + fr.Butterfly(&a[41], &a[43]) + fr.Butterfly(&a[44], &a[46]) + fr.Butterfly(&a[45], &a[47]) + fr.Butterfly(&a[48], &a[50]) + fr.Butterfly(&a[49], &a[51]) + fr.Butterfly(&a[52], &a[54]) + fr.Butterfly(&a[53], &a[55]) + fr.Butterfly(&a[56], &a[58]) + fr.Butterfly(&a[57], &a[59]) + fr.Butterfly(&a[60], &a[62]) + fr.Butterfly(&a[61], &a[63]) + a[1].Mul(&a[1], &twiddlesCoset[31]) + a[3].Mul(&a[3], &twiddlesCoset[32]) + a[5].Mul(&a[5], &twiddlesCoset[33]) + a[7].Mul(&a[7], &twiddlesCoset[34]) + a[9].Mul(&a[9], &twiddlesCoset[35]) + a[11].Mul(&a[11], &twiddlesCoset[36]) + a[13].Mul(&a[13], &twiddlesCoset[37]) + 
a[15].Mul(&a[15], &twiddlesCoset[38]) + a[17].Mul(&a[17], &twiddlesCoset[39]) + a[19].Mul(&a[19], &twiddlesCoset[40]) + a[21].Mul(&a[21], &twiddlesCoset[41]) + a[23].Mul(&a[23], &twiddlesCoset[42]) + a[25].Mul(&a[25], &twiddlesCoset[43]) + a[27].Mul(&a[27], &twiddlesCoset[44]) + a[29].Mul(&a[29], &twiddlesCoset[45]) + a[31].Mul(&a[31], &twiddlesCoset[46]) + a[33].Mul(&a[33], &twiddlesCoset[47]) + a[35].Mul(&a[35], &twiddlesCoset[48]) + a[37].Mul(&a[37], &twiddlesCoset[49]) + a[39].Mul(&a[39], &twiddlesCoset[50]) + a[41].Mul(&a[41], &twiddlesCoset[51]) + a[43].Mul(&a[43], &twiddlesCoset[52]) + a[45].Mul(&a[45], &twiddlesCoset[53]) + a[47].Mul(&a[47], &twiddlesCoset[54]) + a[49].Mul(&a[49], &twiddlesCoset[55]) + a[51].Mul(&a[51], &twiddlesCoset[56]) + a[53].Mul(&a[53], &twiddlesCoset[57]) + a[55].Mul(&a[55], &twiddlesCoset[58]) + a[57].Mul(&a[57], &twiddlesCoset[59]) + a[59].Mul(&a[59], &twiddlesCoset[60]) + a[61].Mul(&a[61], &twiddlesCoset[61]) + a[63].Mul(&a[63], &twiddlesCoset[62]) + fr.Butterfly(&a[0], &a[1]) + fr.Butterfly(&a[2], &a[3]) + fr.Butterfly(&a[4], &a[5]) + fr.Butterfly(&a[6], &a[7]) + fr.Butterfly(&a[8], &a[9]) + fr.Butterfly(&a[10], &a[11]) + fr.Butterfly(&a[12], &a[13]) + fr.Butterfly(&a[14], &a[15]) + fr.Butterfly(&a[16], &a[17]) + fr.Butterfly(&a[18], &a[19]) + fr.Butterfly(&a[20], &a[21]) + fr.Butterfly(&a[22], &a[23]) + fr.Butterfly(&a[24], &a[25]) + fr.Butterfly(&a[26], &a[27]) + fr.Butterfly(&a[28], &a[29]) + fr.Butterfly(&a[30], &a[31]) + fr.Butterfly(&a[32], &a[33]) + fr.Butterfly(&a[34], &a[35]) + fr.Butterfly(&a[36], &a[37]) + fr.Butterfly(&a[38], &a[39]) + fr.Butterfly(&a[40], &a[41]) + fr.Butterfly(&a[42], &a[43]) + fr.Butterfly(&a[44], &a[45]) + fr.Butterfly(&a[46], &a[47]) + fr.Butterfly(&a[48], &a[49]) + fr.Butterfly(&a[50], &a[51]) + fr.Butterfly(&a[52], &a[53]) + fr.Butterfly(&a[54], &a[55]) + fr.Butterfly(&a[56], &a[57]) + fr.Butterfly(&a[58], &a[59]) + fr.Butterfly(&a[60], &a[61]) + fr.Butterfly(&a[62], &a[63]) +} + +// 
PrecomputeTwiddlesCoset precomputes twiddlesCoset from twiddles and coset table +// it then returns all elements in the correct order for the unrolled FFT. +func PrecomputeTwiddlesCoset(generator, shifter fr.Element) []fr.Element { + toReturn := make([]fr.Element, 63) + var r, s fr.Element + e := new(big.Int) + + s = shifter + for k := 0; k < 5; k++ { + s.Square(&s) + } + toReturn[0] = s + s = shifter + for k := 0; k < 4; k++ { + s.Square(&s) + } + toReturn[1] = s + r.Exp(generator, e.SetUint64(uint64(1<<4*1))) + toReturn[2].Mul(&r, &s) + s = shifter + for k := 0; k < 3; k++ { + s.Square(&s) + } + toReturn[3] = s + r.Exp(generator, e.SetUint64(uint64(1<<3*2))) + toReturn[4].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<3*1))) + toReturn[5].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<3*3))) + toReturn[6].Mul(&r, &s) + s = shifter + for k := 0; k < 2; k++ { + s.Square(&s) + } + toReturn[7] = s + r.Exp(generator, e.SetUint64(uint64(1<<2*4))) + toReturn[8].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*2))) + toReturn[9].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*6))) + toReturn[10].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*1))) + toReturn[11].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*5))) + toReturn[12].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*3))) + toReturn[13].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*7))) + toReturn[14].Mul(&r, &s) + s = shifter + for k := 0; k < 1; k++ { + s.Square(&s) + } + toReturn[15] = s + r.Exp(generator, e.SetUint64(uint64(1<<1*8))) + toReturn[16].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*4))) + toReturn[17].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*12))) + toReturn[18].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*2))) + toReturn[19].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*10))) + toReturn[20].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*6))) + toReturn[21].Mul(&r, &s) + r.Exp(generator, 
e.SetUint64(uint64(1<<1*14))) + toReturn[22].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*1))) + toReturn[23].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*9))) + toReturn[24].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*5))) + toReturn[25].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*13))) + toReturn[26].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*3))) + toReturn[27].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*11))) + toReturn[28].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*7))) + toReturn[29].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*15))) + toReturn[30].Mul(&r, &s) + s = shifter + for k := 0; k < 0; k++ { + s.Square(&s) + } + toReturn[31] = s + r.Exp(generator, e.SetUint64(uint64(1<<0*16))) + toReturn[32].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*8))) + toReturn[33].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*24))) + toReturn[34].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*4))) + toReturn[35].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*20))) + toReturn[36].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*12))) + toReturn[37].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*28))) + toReturn[38].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*2))) + toReturn[39].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*18))) + toReturn[40].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*10))) + toReturn[41].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*26))) + toReturn[42].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*6))) + toReturn[43].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*22))) + toReturn[44].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*14))) + toReturn[45].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*30))) + toReturn[46].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*1))) + toReturn[47].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*17))) 
+ toReturn[48].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*9))) + toReturn[49].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*25))) + toReturn[50].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*5))) + toReturn[51].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*21))) + toReturn[52].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*13))) + toReturn[53].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*29))) + toReturn[54].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*3))) + toReturn[55].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*19))) + toReturn[56].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*11))) + toReturn[57].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*27))) + toReturn[58].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*7))) + toReturn[59].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*23))) + toReturn[60].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*15))) + toReturn[61].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*31))) + toReturn[62].Mul(&r, &s) + return toReturn +} diff --git a/ecc/bls12-381/fr/fft/fft.go b/ecc/bls12-381/fr/fft/fft.go index 3b2e29739..d2a5e7200 100644 --- a/ecc/bls12-381/fr/fft/fft.go +++ b/ecc/bls12-381/fr/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []fr.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) 
// find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by 
internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/ecc/bls24-315/fr/fft/fft.go b/ecc/bls24-315/fr/fft/fft.go index 506f12fb4..85e25dd54 100644 --- a/ecc/bls24-315/fr/fft/fft.go +++ b/ecc/bls24-315/fr/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []fr.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) 
// find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by 
internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/ecc/bls24-317/fr/fft/fft.go b/ecc/bls24-317/fr/fft/fft.go index 4a418f87f..fead81fe4 100644 --- a/ecc/bls24-317/fr/fft/fft.go +++ b/ecc/bls24-317/fr/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []fr.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) 
// find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by 
internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/ecc/bn254/fr/fft/fft.go b/ecc/bn254/fr/fft/fft.go index 463faffb6..1da9a883e 100644 --- a/ecc/bn254/fr/fft/fft.go +++ b/ecc/bn254/fr/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []fr.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) 
// find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by 
internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/ecc/bw6-633/fr/fft/fft.go b/ecc/bw6-633/fr/fft/fft.go index 8fd7d57e7..3cae1e9f1 100644 --- a/ecc/bw6-633/fr/fft/fft.go +++ b/ecc/bw6-633/fr/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []fr.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) 
// find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by 
internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/ecc/bw6-761/fr/fft/fft.go b/ecc/bw6-761/fr/fft/fft.go index 76d6bb7a7..c9ad068c0 100644 --- a/ecc/bw6-761/fr/fft/fft.go +++ b/ecc/bw6-761/fr/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []fr.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) 
// find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []fr.Element, w fr.Element, twiddles [][]fr.Element, twiddlesStart n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []fr.Element, twiddles [][]fr.Element, stage int) { } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []fr.Element, twiddles [][]fr.Element, stage int) { - // code unrolled & generated by 
internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - fr.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/field/babybear/fft/fft.go b/field/babybear/fft/fft.go index a623c24a9..506fb9380 100644 --- a/field/babybear/fft/fft.go +++ b/field/babybear/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []babybear.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) 
// find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []babybear.Element, w babybear.Element, twiddles [][]babybear.Elem n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []babybear.Element, w babybear.Element, twiddles [][]babybear.Elem n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []babybear.Element, twiddles [][]babybear.Element, stage int } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []babybear.Element, twiddles [][]babybear.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - babybear.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []babybear.Element, twiddles [][]babybear.Element, stage int) { - // code unrolled & 
generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - babybear.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/field/babybear/sis/sis.go b/field/babybear/sis/sis.go index 2b2c0d931..e951b4cd7 100644 --- a/field/babybear/sis/sis.go +++ b/field/babybear/sis/sis.go @@ -37,6 +37,9 @@ type RSis struct { Domain *fft.Domain maxNbElementsToHash int + + smallFFT func([]babybear.Element) + twiddlesCoset []babybear.Element // used in conjunction with the smallFFT; } // NewRSis creates an instance of RSis. @@ -97,6 +100,18 @@ func NewRSis(seed int64, logTwoDegree, logTwoBound, maxNbElementsToHash int) (*R maxNbElementsToHash: maxNbElementsToHash, } + r.smallFFT = func(p []babybear.Element) { + r.Domain.FFT(p, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + } + + // if we have a FFT kernel of the size of the domain cardinality, we use it. 
+ if r.Domain.Cardinality == 64 { + r.twiddlesCoset = PrecomputeTwiddlesCoset(r.Domain.Generator, shift) + r.smallFFT = func(a []babybear.Element) { + FFT64(a, r.twiddlesCoset) + } + } + // filling A a := make([]babybear.Element, n*r.Degree) ag := make([]babybear.Element, n*r.Degree) @@ -171,7 +186,16 @@ func (r *RSis) InnerHash(it *LimbIterator, res, k babybear.Vector, polId int) { return } - r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // for perf, we use directly what's exposed; + r.smallFFT(k) + // k.Mul(k, fr.Vector(r.cosetTable)) + // if r.Domain.KernelDIF != nil { + // r.Domain.KernelDIF(k) + // } else { + // r.Domain.FFT(k, fft.DIF, fft.WithNbTasks(1)) + // } + mulModAcc(res, r.Ag[polId], k) } diff --git a/field/babybear/sis/sis_fft.go b/field/babybear/sis/sis_fft.go new file mode 100644 index 000000000..0b30e84b2 --- /dev/null +++ b/field/babybear/sis/sis_fft.go @@ -0,0 +1,556 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package sis + +import ( + "github.com/consensys/gnark-crypto/field/babybear" + "math/big" +) + +// FFT64 is generated by gnark-crypto and contains the unrolled code for FFT (DIF) on 64 elements +// equivalent code: r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) +// twiddlesCoset must be pre-computed from twiddles and coset table, see PrecomputeTwiddlesCoset +func FFT64(a []babybear.Element, twiddlesCoset []babybear.Element) { + + a[32].Mul(&a[32], &twiddlesCoset[0]) + a[33].Mul(&a[33], &twiddlesCoset[0]) + a[34].Mul(&a[34], &twiddlesCoset[0]) + a[35].Mul(&a[35], &twiddlesCoset[0]) + a[36].Mul(&a[36], &twiddlesCoset[0]) + a[37].Mul(&a[37], &twiddlesCoset[0]) + a[38].Mul(&a[38], &twiddlesCoset[0]) + a[39].Mul(&a[39], &twiddlesCoset[0]) + a[40].Mul(&a[40], &twiddlesCoset[0]) + a[41].Mul(&a[41], &twiddlesCoset[0]) + a[42].Mul(&a[42], &twiddlesCoset[0]) + a[43].Mul(&a[43], &twiddlesCoset[0]) + a[44].Mul(&a[44], &twiddlesCoset[0]) + a[45].Mul(&a[45], &twiddlesCoset[0]) + a[46].Mul(&a[46], &twiddlesCoset[0]) + a[47].Mul(&a[47], &twiddlesCoset[0]) + a[48].Mul(&a[48], &twiddlesCoset[0]) + a[49].Mul(&a[49], &twiddlesCoset[0]) + a[50].Mul(&a[50], &twiddlesCoset[0]) + a[51].Mul(&a[51], &twiddlesCoset[0]) + a[52].Mul(&a[52], &twiddlesCoset[0]) + a[53].Mul(&a[53], &twiddlesCoset[0]) + a[54].Mul(&a[54], &twiddlesCoset[0]) + a[55].Mul(&a[55], &twiddlesCoset[0]) + a[56].Mul(&a[56], &twiddlesCoset[0]) + a[57].Mul(&a[57], &twiddlesCoset[0]) + a[58].Mul(&a[58], &twiddlesCoset[0]) + a[59].Mul(&a[59], &twiddlesCoset[0]) + a[60].Mul(&a[60], &twiddlesCoset[0]) + a[61].Mul(&a[61], &twiddlesCoset[0]) + a[62].Mul(&a[62], &twiddlesCoset[0]) + a[63].Mul(&a[63], &twiddlesCoset[0]) + babybear.Butterfly(&a[0], &a[32]) + babybear.Butterfly(&a[1], &a[33]) + babybear.Butterfly(&a[2], &a[34]) + babybear.Butterfly(&a[3], &a[35]) + babybear.Butterfly(&a[4], &a[36]) + babybear.Butterfly(&a[5], &a[37]) + babybear.Butterfly(&a[6], 
&a[38]) + babybear.Butterfly(&a[7], &a[39]) + babybear.Butterfly(&a[8], &a[40]) + babybear.Butterfly(&a[9], &a[41]) + babybear.Butterfly(&a[10], &a[42]) + babybear.Butterfly(&a[11], &a[43]) + babybear.Butterfly(&a[12], &a[44]) + babybear.Butterfly(&a[13], &a[45]) + babybear.Butterfly(&a[14], &a[46]) + babybear.Butterfly(&a[15], &a[47]) + babybear.Butterfly(&a[16], &a[48]) + babybear.Butterfly(&a[17], &a[49]) + babybear.Butterfly(&a[18], &a[50]) + babybear.Butterfly(&a[19], &a[51]) + babybear.Butterfly(&a[20], &a[52]) + babybear.Butterfly(&a[21], &a[53]) + babybear.Butterfly(&a[22], &a[54]) + babybear.Butterfly(&a[23], &a[55]) + babybear.Butterfly(&a[24], &a[56]) + babybear.Butterfly(&a[25], &a[57]) + babybear.Butterfly(&a[26], &a[58]) + babybear.Butterfly(&a[27], &a[59]) + babybear.Butterfly(&a[28], &a[60]) + babybear.Butterfly(&a[29], &a[61]) + babybear.Butterfly(&a[30], &a[62]) + babybear.Butterfly(&a[31], &a[63]) + a[16].Mul(&a[16], &twiddlesCoset[1]) + a[17].Mul(&a[17], &twiddlesCoset[1]) + a[18].Mul(&a[18], &twiddlesCoset[1]) + a[19].Mul(&a[19], &twiddlesCoset[1]) + a[20].Mul(&a[20], &twiddlesCoset[1]) + a[21].Mul(&a[21], &twiddlesCoset[1]) + a[22].Mul(&a[22], &twiddlesCoset[1]) + a[23].Mul(&a[23], &twiddlesCoset[1]) + a[24].Mul(&a[24], &twiddlesCoset[1]) + a[25].Mul(&a[25], &twiddlesCoset[1]) + a[26].Mul(&a[26], &twiddlesCoset[1]) + a[27].Mul(&a[27], &twiddlesCoset[1]) + a[28].Mul(&a[28], &twiddlesCoset[1]) + a[29].Mul(&a[29], &twiddlesCoset[1]) + a[30].Mul(&a[30], &twiddlesCoset[1]) + a[31].Mul(&a[31], &twiddlesCoset[1]) + a[48].Mul(&a[48], &twiddlesCoset[2]) + a[49].Mul(&a[49], &twiddlesCoset[2]) + a[50].Mul(&a[50], &twiddlesCoset[2]) + a[51].Mul(&a[51], &twiddlesCoset[2]) + a[52].Mul(&a[52], &twiddlesCoset[2]) + a[53].Mul(&a[53], &twiddlesCoset[2]) + a[54].Mul(&a[54], &twiddlesCoset[2]) + a[55].Mul(&a[55], &twiddlesCoset[2]) + a[56].Mul(&a[56], &twiddlesCoset[2]) + a[57].Mul(&a[57], &twiddlesCoset[2]) + a[58].Mul(&a[58], &twiddlesCoset[2]) + 
a[59].Mul(&a[59], &twiddlesCoset[2]) + a[60].Mul(&a[60], &twiddlesCoset[2]) + a[61].Mul(&a[61], &twiddlesCoset[2]) + a[62].Mul(&a[62], &twiddlesCoset[2]) + a[63].Mul(&a[63], &twiddlesCoset[2]) + babybear.Butterfly(&a[0], &a[16]) + babybear.Butterfly(&a[1], &a[17]) + babybear.Butterfly(&a[2], &a[18]) + babybear.Butterfly(&a[3], &a[19]) + babybear.Butterfly(&a[4], &a[20]) + babybear.Butterfly(&a[5], &a[21]) + babybear.Butterfly(&a[6], &a[22]) + babybear.Butterfly(&a[7], &a[23]) + babybear.Butterfly(&a[8], &a[24]) + babybear.Butterfly(&a[9], &a[25]) + babybear.Butterfly(&a[10], &a[26]) + babybear.Butterfly(&a[11], &a[27]) + babybear.Butterfly(&a[12], &a[28]) + babybear.Butterfly(&a[13], &a[29]) + babybear.Butterfly(&a[14], &a[30]) + babybear.Butterfly(&a[15], &a[31]) + babybear.Butterfly(&a[32], &a[48]) + babybear.Butterfly(&a[33], &a[49]) + babybear.Butterfly(&a[34], &a[50]) + babybear.Butterfly(&a[35], &a[51]) + babybear.Butterfly(&a[36], &a[52]) + babybear.Butterfly(&a[37], &a[53]) + babybear.Butterfly(&a[38], &a[54]) + babybear.Butterfly(&a[39], &a[55]) + babybear.Butterfly(&a[40], &a[56]) + babybear.Butterfly(&a[41], &a[57]) + babybear.Butterfly(&a[42], &a[58]) + babybear.Butterfly(&a[43], &a[59]) + babybear.Butterfly(&a[44], &a[60]) + babybear.Butterfly(&a[45], &a[61]) + babybear.Butterfly(&a[46], &a[62]) + babybear.Butterfly(&a[47], &a[63]) + a[8].Mul(&a[8], &twiddlesCoset[3]) + a[9].Mul(&a[9], &twiddlesCoset[3]) + a[10].Mul(&a[10], &twiddlesCoset[3]) + a[11].Mul(&a[11], &twiddlesCoset[3]) + a[12].Mul(&a[12], &twiddlesCoset[3]) + a[13].Mul(&a[13], &twiddlesCoset[3]) + a[14].Mul(&a[14], &twiddlesCoset[3]) + a[15].Mul(&a[15], &twiddlesCoset[3]) + a[24].Mul(&a[24], &twiddlesCoset[4]) + a[25].Mul(&a[25], &twiddlesCoset[4]) + a[26].Mul(&a[26], &twiddlesCoset[4]) + a[27].Mul(&a[27], &twiddlesCoset[4]) + a[28].Mul(&a[28], &twiddlesCoset[4]) + a[29].Mul(&a[29], &twiddlesCoset[4]) + a[30].Mul(&a[30], &twiddlesCoset[4]) + a[31].Mul(&a[31], &twiddlesCoset[4]) + 
a[40].Mul(&a[40], &twiddlesCoset[5]) + a[41].Mul(&a[41], &twiddlesCoset[5]) + a[42].Mul(&a[42], &twiddlesCoset[5]) + a[43].Mul(&a[43], &twiddlesCoset[5]) + a[44].Mul(&a[44], &twiddlesCoset[5]) + a[45].Mul(&a[45], &twiddlesCoset[5]) + a[46].Mul(&a[46], &twiddlesCoset[5]) + a[47].Mul(&a[47], &twiddlesCoset[5]) + a[56].Mul(&a[56], &twiddlesCoset[6]) + a[57].Mul(&a[57], &twiddlesCoset[6]) + a[58].Mul(&a[58], &twiddlesCoset[6]) + a[59].Mul(&a[59], &twiddlesCoset[6]) + a[60].Mul(&a[60], &twiddlesCoset[6]) + a[61].Mul(&a[61], &twiddlesCoset[6]) + a[62].Mul(&a[62], &twiddlesCoset[6]) + a[63].Mul(&a[63], &twiddlesCoset[6]) + babybear.Butterfly(&a[0], &a[8]) + babybear.Butterfly(&a[1], &a[9]) + babybear.Butterfly(&a[2], &a[10]) + babybear.Butterfly(&a[3], &a[11]) + babybear.Butterfly(&a[4], &a[12]) + babybear.Butterfly(&a[5], &a[13]) + babybear.Butterfly(&a[6], &a[14]) + babybear.Butterfly(&a[7], &a[15]) + babybear.Butterfly(&a[16], &a[24]) + babybear.Butterfly(&a[17], &a[25]) + babybear.Butterfly(&a[18], &a[26]) + babybear.Butterfly(&a[19], &a[27]) + babybear.Butterfly(&a[20], &a[28]) + babybear.Butterfly(&a[21], &a[29]) + babybear.Butterfly(&a[22], &a[30]) + babybear.Butterfly(&a[23], &a[31]) + babybear.Butterfly(&a[32], &a[40]) + babybear.Butterfly(&a[33], &a[41]) + babybear.Butterfly(&a[34], &a[42]) + babybear.Butterfly(&a[35], &a[43]) + babybear.Butterfly(&a[36], &a[44]) + babybear.Butterfly(&a[37], &a[45]) + babybear.Butterfly(&a[38], &a[46]) + babybear.Butterfly(&a[39], &a[47]) + babybear.Butterfly(&a[48], &a[56]) + babybear.Butterfly(&a[49], &a[57]) + babybear.Butterfly(&a[50], &a[58]) + babybear.Butterfly(&a[51], &a[59]) + babybear.Butterfly(&a[52], &a[60]) + babybear.Butterfly(&a[53], &a[61]) + babybear.Butterfly(&a[54], &a[62]) + babybear.Butterfly(&a[55], &a[63]) + a[4].Mul(&a[4], &twiddlesCoset[7]) + a[5].Mul(&a[5], &twiddlesCoset[7]) + a[6].Mul(&a[6], &twiddlesCoset[7]) + a[7].Mul(&a[7], &twiddlesCoset[7]) + a[12].Mul(&a[12], &twiddlesCoset[8]) + 
a[13].Mul(&a[13], &twiddlesCoset[8]) + a[14].Mul(&a[14], &twiddlesCoset[8]) + a[15].Mul(&a[15], &twiddlesCoset[8]) + a[20].Mul(&a[20], &twiddlesCoset[9]) + a[21].Mul(&a[21], &twiddlesCoset[9]) + a[22].Mul(&a[22], &twiddlesCoset[9]) + a[23].Mul(&a[23], &twiddlesCoset[9]) + a[28].Mul(&a[28], &twiddlesCoset[10]) + a[29].Mul(&a[29], &twiddlesCoset[10]) + a[30].Mul(&a[30], &twiddlesCoset[10]) + a[31].Mul(&a[31], &twiddlesCoset[10]) + a[36].Mul(&a[36], &twiddlesCoset[11]) + a[37].Mul(&a[37], &twiddlesCoset[11]) + a[38].Mul(&a[38], &twiddlesCoset[11]) + a[39].Mul(&a[39], &twiddlesCoset[11]) + a[44].Mul(&a[44], &twiddlesCoset[12]) + a[45].Mul(&a[45], &twiddlesCoset[12]) + a[46].Mul(&a[46], &twiddlesCoset[12]) + a[47].Mul(&a[47], &twiddlesCoset[12]) + a[52].Mul(&a[52], &twiddlesCoset[13]) + a[53].Mul(&a[53], &twiddlesCoset[13]) + a[54].Mul(&a[54], &twiddlesCoset[13]) + a[55].Mul(&a[55], &twiddlesCoset[13]) + a[60].Mul(&a[60], &twiddlesCoset[14]) + a[61].Mul(&a[61], &twiddlesCoset[14]) + a[62].Mul(&a[62], &twiddlesCoset[14]) + a[63].Mul(&a[63], &twiddlesCoset[14]) + babybear.Butterfly(&a[0], &a[4]) + babybear.Butterfly(&a[1], &a[5]) + babybear.Butterfly(&a[2], &a[6]) + babybear.Butterfly(&a[3], &a[7]) + babybear.Butterfly(&a[8], &a[12]) + babybear.Butterfly(&a[9], &a[13]) + babybear.Butterfly(&a[10], &a[14]) + babybear.Butterfly(&a[11], &a[15]) + babybear.Butterfly(&a[16], &a[20]) + babybear.Butterfly(&a[17], &a[21]) + babybear.Butterfly(&a[18], &a[22]) + babybear.Butterfly(&a[19], &a[23]) + babybear.Butterfly(&a[24], &a[28]) + babybear.Butterfly(&a[25], &a[29]) + babybear.Butterfly(&a[26], &a[30]) + babybear.Butterfly(&a[27], &a[31]) + babybear.Butterfly(&a[32], &a[36]) + babybear.Butterfly(&a[33], &a[37]) + babybear.Butterfly(&a[34], &a[38]) + babybear.Butterfly(&a[35], &a[39]) + babybear.Butterfly(&a[40], &a[44]) + babybear.Butterfly(&a[41], &a[45]) + babybear.Butterfly(&a[42], &a[46]) + babybear.Butterfly(&a[43], &a[47]) + babybear.Butterfly(&a[48], &a[52]) + 
babybear.Butterfly(&a[49], &a[53]) + babybear.Butterfly(&a[50], &a[54]) + babybear.Butterfly(&a[51], &a[55]) + babybear.Butterfly(&a[56], &a[60]) + babybear.Butterfly(&a[57], &a[61]) + babybear.Butterfly(&a[58], &a[62]) + babybear.Butterfly(&a[59], &a[63]) + a[2].Mul(&a[2], &twiddlesCoset[15]) + a[3].Mul(&a[3], &twiddlesCoset[15]) + a[6].Mul(&a[6], &twiddlesCoset[16]) + a[7].Mul(&a[7], &twiddlesCoset[16]) + a[10].Mul(&a[10], &twiddlesCoset[17]) + a[11].Mul(&a[11], &twiddlesCoset[17]) + a[14].Mul(&a[14], &twiddlesCoset[18]) + a[15].Mul(&a[15], &twiddlesCoset[18]) + a[18].Mul(&a[18], &twiddlesCoset[19]) + a[19].Mul(&a[19], &twiddlesCoset[19]) + a[22].Mul(&a[22], &twiddlesCoset[20]) + a[23].Mul(&a[23], &twiddlesCoset[20]) + a[26].Mul(&a[26], &twiddlesCoset[21]) + a[27].Mul(&a[27], &twiddlesCoset[21]) + a[30].Mul(&a[30], &twiddlesCoset[22]) + a[31].Mul(&a[31], &twiddlesCoset[22]) + a[34].Mul(&a[34], &twiddlesCoset[23]) + a[35].Mul(&a[35], &twiddlesCoset[23]) + a[38].Mul(&a[38], &twiddlesCoset[24]) + a[39].Mul(&a[39], &twiddlesCoset[24]) + a[42].Mul(&a[42], &twiddlesCoset[25]) + a[43].Mul(&a[43], &twiddlesCoset[25]) + a[46].Mul(&a[46], &twiddlesCoset[26]) + a[47].Mul(&a[47], &twiddlesCoset[26]) + a[50].Mul(&a[50], &twiddlesCoset[27]) + a[51].Mul(&a[51], &twiddlesCoset[27]) + a[54].Mul(&a[54], &twiddlesCoset[28]) + a[55].Mul(&a[55], &twiddlesCoset[28]) + a[58].Mul(&a[58], &twiddlesCoset[29]) + a[59].Mul(&a[59], &twiddlesCoset[29]) + a[62].Mul(&a[62], &twiddlesCoset[30]) + a[63].Mul(&a[63], &twiddlesCoset[30]) + babybear.Butterfly(&a[0], &a[2]) + babybear.Butterfly(&a[1], &a[3]) + babybear.Butterfly(&a[4], &a[6]) + babybear.Butterfly(&a[5], &a[7]) + babybear.Butterfly(&a[8], &a[10]) + babybear.Butterfly(&a[9], &a[11]) + babybear.Butterfly(&a[12], &a[14]) + babybear.Butterfly(&a[13], &a[15]) + babybear.Butterfly(&a[16], &a[18]) + babybear.Butterfly(&a[17], &a[19]) + babybear.Butterfly(&a[20], &a[22]) + babybear.Butterfly(&a[21], &a[23]) + babybear.Butterfly(&a[24], &a[26]) 
+ babybear.Butterfly(&a[25], &a[27]) + babybear.Butterfly(&a[28], &a[30]) + babybear.Butterfly(&a[29], &a[31]) + babybear.Butterfly(&a[32], &a[34]) + babybear.Butterfly(&a[33], &a[35]) + babybear.Butterfly(&a[36], &a[38]) + babybear.Butterfly(&a[37], &a[39]) + babybear.Butterfly(&a[40], &a[42]) + babybear.Butterfly(&a[41], &a[43]) + babybear.Butterfly(&a[44], &a[46]) + babybear.Butterfly(&a[45], &a[47]) + babybear.Butterfly(&a[48], &a[50]) + babybear.Butterfly(&a[49], &a[51]) + babybear.Butterfly(&a[52], &a[54]) + babybear.Butterfly(&a[53], &a[55]) + babybear.Butterfly(&a[56], &a[58]) + babybear.Butterfly(&a[57], &a[59]) + babybear.Butterfly(&a[60], &a[62]) + babybear.Butterfly(&a[61], &a[63]) + a[1].Mul(&a[1], &twiddlesCoset[31]) + a[3].Mul(&a[3], &twiddlesCoset[32]) + a[5].Mul(&a[5], &twiddlesCoset[33]) + a[7].Mul(&a[7], &twiddlesCoset[34]) + a[9].Mul(&a[9], &twiddlesCoset[35]) + a[11].Mul(&a[11], &twiddlesCoset[36]) + a[13].Mul(&a[13], &twiddlesCoset[37]) + a[15].Mul(&a[15], &twiddlesCoset[38]) + a[17].Mul(&a[17], &twiddlesCoset[39]) + a[19].Mul(&a[19], &twiddlesCoset[40]) + a[21].Mul(&a[21], &twiddlesCoset[41]) + a[23].Mul(&a[23], &twiddlesCoset[42]) + a[25].Mul(&a[25], &twiddlesCoset[43]) + a[27].Mul(&a[27], &twiddlesCoset[44]) + a[29].Mul(&a[29], &twiddlesCoset[45]) + a[31].Mul(&a[31], &twiddlesCoset[46]) + a[33].Mul(&a[33], &twiddlesCoset[47]) + a[35].Mul(&a[35], &twiddlesCoset[48]) + a[37].Mul(&a[37], &twiddlesCoset[49]) + a[39].Mul(&a[39], &twiddlesCoset[50]) + a[41].Mul(&a[41], &twiddlesCoset[51]) + a[43].Mul(&a[43], &twiddlesCoset[52]) + a[45].Mul(&a[45], &twiddlesCoset[53]) + a[47].Mul(&a[47], &twiddlesCoset[54]) + a[49].Mul(&a[49], &twiddlesCoset[55]) + a[51].Mul(&a[51], &twiddlesCoset[56]) + a[53].Mul(&a[53], &twiddlesCoset[57]) + a[55].Mul(&a[55], &twiddlesCoset[58]) + a[57].Mul(&a[57], &twiddlesCoset[59]) + a[59].Mul(&a[59], &twiddlesCoset[60]) + a[61].Mul(&a[61], &twiddlesCoset[61]) + a[63].Mul(&a[63], &twiddlesCoset[62]) + 
babybear.Butterfly(&a[0], &a[1]) + babybear.Butterfly(&a[2], &a[3]) + babybear.Butterfly(&a[4], &a[5]) + babybear.Butterfly(&a[6], &a[7]) + babybear.Butterfly(&a[8], &a[9]) + babybear.Butterfly(&a[10], &a[11]) + babybear.Butterfly(&a[12], &a[13]) + babybear.Butterfly(&a[14], &a[15]) + babybear.Butterfly(&a[16], &a[17]) + babybear.Butterfly(&a[18], &a[19]) + babybear.Butterfly(&a[20], &a[21]) + babybear.Butterfly(&a[22], &a[23]) + babybear.Butterfly(&a[24], &a[25]) + babybear.Butterfly(&a[26], &a[27]) + babybear.Butterfly(&a[28], &a[29]) + babybear.Butterfly(&a[30], &a[31]) + babybear.Butterfly(&a[32], &a[33]) + babybear.Butterfly(&a[34], &a[35]) + babybear.Butterfly(&a[36], &a[37]) + babybear.Butterfly(&a[38], &a[39]) + babybear.Butterfly(&a[40], &a[41]) + babybear.Butterfly(&a[42], &a[43]) + babybear.Butterfly(&a[44], &a[45]) + babybear.Butterfly(&a[46], &a[47]) + babybear.Butterfly(&a[48], &a[49]) + babybear.Butterfly(&a[50], &a[51]) + babybear.Butterfly(&a[52], &a[53]) + babybear.Butterfly(&a[54], &a[55]) + babybear.Butterfly(&a[56], &a[57]) + babybear.Butterfly(&a[58], &a[59]) + babybear.Butterfly(&a[60], &a[61]) + babybear.Butterfly(&a[62], &a[63]) +} + +// PrecomputeTwiddlesCoset precomputes twiddlesCoset from twiddles and coset table +// it then return all elements in the correct order for the unrolled FFT. 
+func PrecomputeTwiddlesCoset(generator, shifter babybear.Element) []babybear.Element { + toReturn := make([]babybear.Element, 63) + var r, s babybear.Element + e := new(big.Int) + + s = shifter + for k := 0; k < 5; k++ { + s.Square(&s) + } + toReturn[0] = s + s = shifter + for k := 0; k < 4; k++ { + s.Square(&s) + } + toReturn[1] = s + r.Exp(generator, e.SetUint64(uint64(1<<4*1))) + toReturn[2].Mul(&r, &s) + s = shifter + for k := 0; k < 3; k++ { + s.Square(&s) + } + toReturn[3] = s + r.Exp(generator, e.SetUint64(uint64(1<<3*2))) + toReturn[4].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<3*1))) + toReturn[5].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<3*3))) + toReturn[6].Mul(&r, &s) + s = shifter + for k := 0; k < 2; k++ { + s.Square(&s) + } + toReturn[7] = s + r.Exp(generator, e.SetUint64(uint64(1<<2*4))) + toReturn[8].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*2))) + toReturn[9].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*6))) + toReturn[10].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*1))) + toReturn[11].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*5))) + toReturn[12].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*3))) + toReturn[13].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*7))) + toReturn[14].Mul(&r, &s) + s = shifter + for k := 0; k < 1; k++ { + s.Square(&s) + } + toReturn[15] = s + r.Exp(generator, e.SetUint64(uint64(1<<1*8))) + toReturn[16].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*4))) + toReturn[17].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*12))) + toReturn[18].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*2))) + toReturn[19].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*10))) + toReturn[20].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*6))) + toReturn[21].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*14))) + toReturn[22].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*1))) + toReturn[23].Mul(&r, &s) + 
r.Exp(generator, e.SetUint64(uint64(1<<1*9))) + toReturn[24].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*5))) + toReturn[25].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*13))) + toReturn[26].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*3))) + toReturn[27].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*11))) + toReturn[28].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*7))) + toReturn[29].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*15))) + toReturn[30].Mul(&r, &s) + s = shifter + for k := 0; k < 0; k++ { + s.Square(&s) + } + toReturn[31] = s + r.Exp(generator, e.SetUint64(uint64(1<<0*16))) + toReturn[32].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*8))) + toReturn[33].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*24))) + toReturn[34].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*4))) + toReturn[35].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*20))) + toReturn[36].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*12))) + toReturn[37].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*28))) + toReturn[38].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*2))) + toReturn[39].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*18))) + toReturn[40].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*10))) + toReturn[41].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*26))) + toReturn[42].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*6))) + toReturn[43].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*22))) + toReturn[44].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*14))) + toReturn[45].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*30))) + toReturn[46].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*1))) + toReturn[47].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*17))) + toReturn[48].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*9))) + toReturn[49].Mul(&r, &s) + r.Exp(generator, 
e.SetUint64(uint64(1<<0*25))) + toReturn[50].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*5))) + toReturn[51].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*21))) + toReturn[52].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*13))) + toReturn[53].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*29))) + toReturn[54].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*3))) + toReturn[55].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*19))) + toReturn[56].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*11))) + toReturn[57].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*27))) + toReturn[58].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*7))) + toReturn[59].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*23))) + toReturn[60].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*15))) + toReturn[61].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*31))) + toReturn[62].Mul(&r, &s) + return toReturn +} diff --git a/field/generator/generator_sis.go b/field/generator/generator_sis.go index d4f251f71..f8f456b29 100644 --- a/field/generator/generator_sis.go +++ b/field/generator/generator_sis.go @@ -1,7 +1,6 @@ package generator import ( - "os" "path/filepath" "github.com/consensys/bavard" @@ -18,12 +17,11 @@ func generateSIS(F *config.Field, outputDir string) error { outputDir = filepath.Join(outputDir, "sis") entries := []bavard.Entry{ + {File: filepath.Join(outputDir, "sis_fft.go"), Templates: []string{"fft.go.tmpl"}}, {File: filepath.Join(outputDir, "sis.go"), Templates: []string{"sis.go.tmpl"}}, {File: filepath.Join(outputDir, "sis_test.go"), Templates: []string{"sis.test.go.tmpl"}}, } - os.Remove(filepath.Join(outputDir, "sis_fft.go")) - funcs := make(map[string]interface{}) funcs["bitReverse"] = bitReverse diff --git a/field/generator/internal/templates/fft/fft.go.tmpl b/field/generator/internal/templates/fft/fft.go.tmpl index a20785ba2..d777b2164 100644 --- 
a/field/generator/internal/templates/fft/fft.go.tmpl +++ b/field/generator/internal/templates/fft/fft.go.tmpl @@ -11,8 +11,6 @@ import ( {{ $sizeKernelLog2 := 8}} {{ $sizeKernel := shl 1 $sizeKernelLog2}} -{{ $sizeKernel2Log2 := 5}} -{{ $sizeKernel2 := shl 1 $sizeKernel2Log2}} // Decimation is used in the FFT call to select decimation in time or in frequency type Decimation uint8 @@ -29,7 +27,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []{{ .FF }}.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) // find the stage where we should stop spawning go routines in our recursive calls @@ -202,15 +201,9 @@ func difFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == {{$sizeKernel}} { - kerDIFNP_{{$sizeKernel}}(a, twiddles, stage-twiddlesStartStage) - return - } else if n == {{$sizeKernel2}} { - kerDIFNP_{{$sizeKernel2}}(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == {{$sizeKernel}} && stage >= twiddlesStartStage { + kerDIFNP_{{$sizeKernel}}(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -294,15 +287,9 @@ func ditFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == {{$sizeKernel2}} { - kerDITNP_{{$sizeKernel2}}(a, twiddles, stage-twiddlesStartStage) - return - } else if n == {{$sizeKernel}} { - kerDITNP_{{$sizeKernel}}(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == {{$sizeKernel}} && stage >= twiddlesStartStage { + kerDITNP_{{$sizeKernel}}(a, twiddles, 
import (
	"{{ .FieldPackagePath }}"
	"math/big"
)

// FFT64 is generated by gnark-crypto and contains the unrolled code for FFT (DIF) on 64 elements
// equivalent code: r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1))
// twiddlesCoset must be pre-computed from twiddles and coset table, see PrecomputeTwiddlesCoset
func FFT64(a []{{ .FF }}.Element, twiddlesCoset []{{ .FF }}.Element) {

	{{- /* notes:
		this function can be regenerated for a larger n ($n below)
		the stage count (the 6 in "iterate 0 6") must be updated too, such that 1 << 6 == n
		multiplications and butterflies are emitted as two separate passes per stage
		(the original note mentions size n = 8); must check perf for larger n

		$tIndex : index of the next twiddlesCoset entry, advanced once per sub-block
		$n, $m  : sub-block size and half size for the current stage
		$split  : number of sub-blocks for the current stage
	 */}}
	{{$tIndex := 0}}
	{{ $n := 64}}
	{{ $m := div $n 2}}
	{{ $split := 1}}
	{{- range $step := reverse (iterate 0 6)}}

		{{- $offset := 0}}
		{{- range $s := iterate 0 $split}}
			{{- range $i := iterate 0 $m}}
				{{- $j := add $i $offset}}
				{{- $k := add $j $m}}
				a[{{$k}}].Mul(&a[{{$k}}], &twiddlesCoset[{{$tIndex}}])
			{{- end}}
			{{- $offset = add $offset $n}}
			{{- $tIndex = add $tIndex 1}}
		{{- end}}

		{{- $offset := 0}}
		{{- range $s := iterate 0 $split}}
			{{- range $i := iterate 0 $m}}
				{{- $j := add $i $offset}}
				{{- $k := add $j $m}}
				{{ $.FF }}.Butterfly(&a[{{$j}}], &a[{{$k}}])
			{{- end}}
			{{- $offset = add $offset $n}}
		{{- end}}

		{{- $n = div $n 2}}
		{{- $m = div $n 2}}
		{{- $split = mul $split 2}}
	{{- end}}
}

// PrecomputeTwiddlesCoset precomputes twiddlesCoset from twiddles and coset table
// it then return all elements in the correct order for the unrolled FFT.
func PrecomputeTwiddlesCoset(generator, shifter {{ .FF }}.Element) []{{ .FF }}.Element {
	toReturn := make([]{{ .FF }}.Element, 63)
	var r, s {{ .FF }}.Element
	e := new(big.Int)
	{{- /* notes:
		$split : number of entries emitted for the current stage (doubles every stage)
		$j     : index of the next toReturn entry
		the entries of a stage are emitted in bit-reversed order (bitReverse $split $s)
	 */}}
	{{ $split := 1}}
	{{ $j := 0}}
	{{- range $step := reverse (iterate 0 6)}}
		s = shifter
		for k:=0; k <{{$step}};k++ {
			s.Square(&s)
		}

		{{- range $s := iterate 0 $split}}
			{{- $exp := bitReverse $split $s}}
			{{- if eq $exp 0}}
				toReturn[{{$j}}] = s
			{{- else}}
				r.Exp(generator, e.SetUint64(uint64(1<<{{$step}} * {{$exp}})))
				toReturn[{{$j}}].Mul(&r, &s)
			{{- end}}
			{{- $j = add $j 1}}
		{{- end}}

		{{- $split = mul $split 2}}
	{{- end}}
	return toReturn
}
+ if r.Domain.Cardinality == 64 { + r.twiddlesCoset = PrecomputeTwiddlesCoset(r.Domain.Generator, shift) + r.smallFFT = func(a []{{ .FF }}.Element) { + FFT64(a, r.twiddlesCoset) + } + } + + + // filling A a := make([]{{ .FF }}.Element, n*r.Degree) ag := make([]{{ .FF }}.Element, n*r.Degree) @@ -171,7 +188,16 @@ func (r *RSis) InnerHash(it *LimbIterator, res, k {{ .FF }}.Vector, polId int) { return } - r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // for perf, we use directly what's exposed; + r.smallFFT(k) + // k.Mul(k, fr.Vector(r.cosetTable)) + // if r.Domain.KernelDIF != nil { + // r.Domain.KernelDIF(k) + // } else { + // r.Domain.FFT(k, fft.DIF, fft.WithNbTasks(1)) + // } + mulModAcc(res, r.Ag[polId], k) } diff --git a/field/goldilocks/fft/fft.go b/field/goldilocks/fft/fft.go index 20c5e57c5..84ca5af4f 100644 --- a/field/goldilocks/fft/fft.go +++ b/field/goldilocks/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []goldilocks.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) 
// find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []goldilocks.Element, w goldilocks.Element, twiddles [][]goldilock n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []goldilocks.Element, w goldilocks.Element, twiddles [][]goldilock n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - goldilocks.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []goldilocks.Element, twiddles [][]goldilocks.Element, stage int) { - // code 
unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - goldilocks.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/field/goldilocks/sis/sis.go b/field/goldilocks/sis/sis.go index c5d703c1f..51b8256ea 100644 --- a/field/goldilocks/sis/sis.go +++ b/field/goldilocks/sis/sis.go @@ -37,6 +37,9 @@ type RSis struct { Domain *fft.Domain maxNbElementsToHash int + + smallFFT func([]goldilocks.Element) + twiddlesCoset []goldilocks.Element // used in conjunction with the smallFFT; } // NewRSis creates an instance of RSis. @@ -97,6 +100,18 @@ func NewRSis(seed int64, logTwoDegree, logTwoBound, maxNbElementsToHash int) (*R maxNbElementsToHash: maxNbElementsToHash, } + r.smallFFT = func(p []goldilocks.Element) { + r.Domain.FFT(p, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + } + + // if we have a FFT kernel of the size of the domain cardinality, we use it. 
+ if r.Domain.Cardinality == 64 { + r.twiddlesCoset = PrecomputeTwiddlesCoset(r.Domain.Generator, shift) + r.smallFFT = func(a []goldilocks.Element) { + FFT64(a, r.twiddlesCoset) + } + } + // filling A a := make([]goldilocks.Element, n*r.Degree) ag := make([]goldilocks.Element, n*r.Degree) @@ -171,7 +186,16 @@ func (r *RSis) InnerHash(it *LimbIterator, res, k goldilocks.Vector, polId int) return } - r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // for perf, we use directly what's exposed; + r.smallFFT(k) + // k.Mul(k, fr.Vector(r.cosetTable)) + // if r.Domain.KernelDIF != nil { + // r.Domain.KernelDIF(k) + // } else { + // r.Domain.FFT(k, fft.DIF, fft.WithNbTasks(1)) + // } + mulModAcc(res, r.Ag[polId], k) } diff --git a/field/goldilocks/sis/sis_fft.go b/field/goldilocks/sis/sis_fft.go new file mode 100644 index 000000000..321a84e90 --- /dev/null +++ b/field/goldilocks/sis/sis_fft.go @@ -0,0 +1,556 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package sis + +import ( + "github.com/consensys/gnark-crypto/field/goldilocks" + "math/big" +) + +// FFT64 is generated by gnark-crypto and contains the unrolled code for FFT (DIF) on 64 elements +// equivalent code: r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) +// twiddlesCoset must be pre-computed from twiddles and coset table, see PrecomputeTwiddlesCoset +func FFT64(a []goldilocks.Element, twiddlesCoset []goldilocks.Element) { + + a[32].Mul(&a[32], &twiddlesCoset[0]) + a[33].Mul(&a[33], &twiddlesCoset[0]) + a[34].Mul(&a[34], &twiddlesCoset[0]) + a[35].Mul(&a[35], &twiddlesCoset[0]) + a[36].Mul(&a[36], &twiddlesCoset[0]) + a[37].Mul(&a[37], &twiddlesCoset[0]) + a[38].Mul(&a[38], &twiddlesCoset[0]) + a[39].Mul(&a[39], &twiddlesCoset[0]) + a[40].Mul(&a[40], &twiddlesCoset[0]) + a[41].Mul(&a[41], &twiddlesCoset[0]) + a[42].Mul(&a[42], &twiddlesCoset[0]) + a[43].Mul(&a[43], &twiddlesCoset[0]) + a[44].Mul(&a[44], &twiddlesCoset[0]) + a[45].Mul(&a[45], &twiddlesCoset[0]) + a[46].Mul(&a[46], &twiddlesCoset[0]) + a[47].Mul(&a[47], &twiddlesCoset[0]) + a[48].Mul(&a[48], &twiddlesCoset[0]) + a[49].Mul(&a[49], &twiddlesCoset[0]) + a[50].Mul(&a[50], &twiddlesCoset[0]) + a[51].Mul(&a[51], &twiddlesCoset[0]) + a[52].Mul(&a[52], &twiddlesCoset[0]) + a[53].Mul(&a[53], &twiddlesCoset[0]) + a[54].Mul(&a[54], &twiddlesCoset[0]) + a[55].Mul(&a[55], &twiddlesCoset[0]) + a[56].Mul(&a[56], &twiddlesCoset[0]) + a[57].Mul(&a[57], &twiddlesCoset[0]) + a[58].Mul(&a[58], &twiddlesCoset[0]) + a[59].Mul(&a[59], &twiddlesCoset[0]) + a[60].Mul(&a[60], &twiddlesCoset[0]) + a[61].Mul(&a[61], &twiddlesCoset[0]) + a[62].Mul(&a[62], &twiddlesCoset[0]) + a[63].Mul(&a[63], &twiddlesCoset[0]) + goldilocks.Butterfly(&a[0], &a[32]) + goldilocks.Butterfly(&a[1], &a[33]) + goldilocks.Butterfly(&a[2], &a[34]) + goldilocks.Butterfly(&a[3], &a[35]) + goldilocks.Butterfly(&a[4], &a[36]) + goldilocks.Butterfly(&a[5], &a[37]) + 
goldilocks.Butterfly(&a[6], &a[38]) + goldilocks.Butterfly(&a[7], &a[39]) + goldilocks.Butterfly(&a[8], &a[40]) + goldilocks.Butterfly(&a[9], &a[41]) + goldilocks.Butterfly(&a[10], &a[42]) + goldilocks.Butterfly(&a[11], &a[43]) + goldilocks.Butterfly(&a[12], &a[44]) + goldilocks.Butterfly(&a[13], &a[45]) + goldilocks.Butterfly(&a[14], &a[46]) + goldilocks.Butterfly(&a[15], &a[47]) + goldilocks.Butterfly(&a[16], &a[48]) + goldilocks.Butterfly(&a[17], &a[49]) + goldilocks.Butterfly(&a[18], &a[50]) + goldilocks.Butterfly(&a[19], &a[51]) + goldilocks.Butterfly(&a[20], &a[52]) + goldilocks.Butterfly(&a[21], &a[53]) + goldilocks.Butterfly(&a[22], &a[54]) + goldilocks.Butterfly(&a[23], &a[55]) + goldilocks.Butterfly(&a[24], &a[56]) + goldilocks.Butterfly(&a[25], &a[57]) + goldilocks.Butterfly(&a[26], &a[58]) + goldilocks.Butterfly(&a[27], &a[59]) + goldilocks.Butterfly(&a[28], &a[60]) + goldilocks.Butterfly(&a[29], &a[61]) + goldilocks.Butterfly(&a[30], &a[62]) + goldilocks.Butterfly(&a[31], &a[63]) + a[16].Mul(&a[16], &twiddlesCoset[1]) + a[17].Mul(&a[17], &twiddlesCoset[1]) + a[18].Mul(&a[18], &twiddlesCoset[1]) + a[19].Mul(&a[19], &twiddlesCoset[1]) + a[20].Mul(&a[20], &twiddlesCoset[1]) + a[21].Mul(&a[21], &twiddlesCoset[1]) + a[22].Mul(&a[22], &twiddlesCoset[1]) + a[23].Mul(&a[23], &twiddlesCoset[1]) + a[24].Mul(&a[24], &twiddlesCoset[1]) + a[25].Mul(&a[25], &twiddlesCoset[1]) + a[26].Mul(&a[26], &twiddlesCoset[1]) + a[27].Mul(&a[27], &twiddlesCoset[1]) + a[28].Mul(&a[28], &twiddlesCoset[1]) + a[29].Mul(&a[29], &twiddlesCoset[1]) + a[30].Mul(&a[30], &twiddlesCoset[1]) + a[31].Mul(&a[31], &twiddlesCoset[1]) + a[48].Mul(&a[48], &twiddlesCoset[2]) + a[49].Mul(&a[49], &twiddlesCoset[2]) + a[50].Mul(&a[50], &twiddlesCoset[2]) + a[51].Mul(&a[51], &twiddlesCoset[2]) + a[52].Mul(&a[52], &twiddlesCoset[2]) + a[53].Mul(&a[53], &twiddlesCoset[2]) + a[54].Mul(&a[54], &twiddlesCoset[2]) + a[55].Mul(&a[55], &twiddlesCoset[2]) + a[56].Mul(&a[56], &twiddlesCoset[2]) + 
a[57].Mul(&a[57], &twiddlesCoset[2]) + a[58].Mul(&a[58], &twiddlesCoset[2]) + a[59].Mul(&a[59], &twiddlesCoset[2]) + a[60].Mul(&a[60], &twiddlesCoset[2]) + a[61].Mul(&a[61], &twiddlesCoset[2]) + a[62].Mul(&a[62], &twiddlesCoset[2]) + a[63].Mul(&a[63], &twiddlesCoset[2]) + goldilocks.Butterfly(&a[0], &a[16]) + goldilocks.Butterfly(&a[1], &a[17]) + goldilocks.Butterfly(&a[2], &a[18]) + goldilocks.Butterfly(&a[3], &a[19]) + goldilocks.Butterfly(&a[4], &a[20]) + goldilocks.Butterfly(&a[5], &a[21]) + goldilocks.Butterfly(&a[6], &a[22]) + goldilocks.Butterfly(&a[7], &a[23]) + goldilocks.Butterfly(&a[8], &a[24]) + goldilocks.Butterfly(&a[9], &a[25]) + goldilocks.Butterfly(&a[10], &a[26]) + goldilocks.Butterfly(&a[11], &a[27]) + goldilocks.Butterfly(&a[12], &a[28]) + goldilocks.Butterfly(&a[13], &a[29]) + goldilocks.Butterfly(&a[14], &a[30]) + goldilocks.Butterfly(&a[15], &a[31]) + goldilocks.Butterfly(&a[32], &a[48]) + goldilocks.Butterfly(&a[33], &a[49]) + goldilocks.Butterfly(&a[34], &a[50]) + goldilocks.Butterfly(&a[35], &a[51]) + goldilocks.Butterfly(&a[36], &a[52]) + goldilocks.Butterfly(&a[37], &a[53]) + goldilocks.Butterfly(&a[38], &a[54]) + goldilocks.Butterfly(&a[39], &a[55]) + goldilocks.Butterfly(&a[40], &a[56]) + goldilocks.Butterfly(&a[41], &a[57]) + goldilocks.Butterfly(&a[42], &a[58]) + goldilocks.Butterfly(&a[43], &a[59]) + goldilocks.Butterfly(&a[44], &a[60]) + goldilocks.Butterfly(&a[45], &a[61]) + goldilocks.Butterfly(&a[46], &a[62]) + goldilocks.Butterfly(&a[47], &a[63]) + a[8].Mul(&a[8], &twiddlesCoset[3]) + a[9].Mul(&a[9], &twiddlesCoset[3]) + a[10].Mul(&a[10], &twiddlesCoset[3]) + a[11].Mul(&a[11], &twiddlesCoset[3]) + a[12].Mul(&a[12], &twiddlesCoset[3]) + a[13].Mul(&a[13], &twiddlesCoset[3]) + a[14].Mul(&a[14], &twiddlesCoset[3]) + a[15].Mul(&a[15], &twiddlesCoset[3]) + a[24].Mul(&a[24], &twiddlesCoset[4]) + a[25].Mul(&a[25], &twiddlesCoset[4]) + a[26].Mul(&a[26], &twiddlesCoset[4]) + a[27].Mul(&a[27], &twiddlesCoset[4]) + a[28].Mul(&a[28], 
&twiddlesCoset[4]) + a[29].Mul(&a[29], &twiddlesCoset[4]) + a[30].Mul(&a[30], &twiddlesCoset[4]) + a[31].Mul(&a[31], &twiddlesCoset[4]) + a[40].Mul(&a[40], &twiddlesCoset[5]) + a[41].Mul(&a[41], &twiddlesCoset[5]) + a[42].Mul(&a[42], &twiddlesCoset[5]) + a[43].Mul(&a[43], &twiddlesCoset[5]) + a[44].Mul(&a[44], &twiddlesCoset[5]) + a[45].Mul(&a[45], &twiddlesCoset[5]) + a[46].Mul(&a[46], &twiddlesCoset[5]) + a[47].Mul(&a[47], &twiddlesCoset[5]) + a[56].Mul(&a[56], &twiddlesCoset[6]) + a[57].Mul(&a[57], &twiddlesCoset[6]) + a[58].Mul(&a[58], &twiddlesCoset[6]) + a[59].Mul(&a[59], &twiddlesCoset[6]) + a[60].Mul(&a[60], &twiddlesCoset[6]) + a[61].Mul(&a[61], &twiddlesCoset[6]) + a[62].Mul(&a[62], &twiddlesCoset[6]) + a[63].Mul(&a[63], &twiddlesCoset[6]) + goldilocks.Butterfly(&a[0], &a[8]) + goldilocks.Butterfly(&a[1], &a[9]) + goldilocks.Butterfly(&a[2], &a[10]) + goldilocks.Butterfly(&a[3], &a[11]) + goldilocks.Butterfly(&a[4], &a[12]) + goldilocks.Butterfly(&a[5], &a[13]) + goldilocks.Butterfly(&a[6], &a[14]) + goldilocks.Butterfly(&a[7], &a[15]) + goldilocks.Butterfly(&a[16], &a[24]) + goldilocks.Butterfly(&a[17], &a[25]) + goldilocks.Butterfly(&a[18], &a[26]) + goldilocks.Butterfly(&a[19], &a[27]) + goldilocks.Butterfly(&a[20], &a[28]) + goldilocks.Butterfly(&a[21], &a[29]) + goldilocks.Butterfly(&a[22], &a[30]) + goldilocks.Butterfly(&a[23], &a[31]) + goldilocks.Butterfly(&a[32], &a[40]) + goldilocks.Butterfly(&a[33], &a[41]) + goldilocks.Butterfly(&a[34], &a[42]) + goldilocks.Butterfly(&a[35], &a[43]) + goldilocks.Butterfly(&a[36], &a[44]) + goldilocks.Butterfly(&a[37], &a[45]) + goldilocks.Butterfly(&a[38], &a[46]) + goldilocks.Butterfly(&a[39], &a[47]) + goldilocks.Butterfly(&a[48], &a[56]) + goldilocks.Butterfly(&a[49], &a[57]) + goldilocks.Butterfly(&a[50], &a[58]) + goldilocks.Butterfly(&a[51], &a[59]) + goldilocks.Butterfly(&a[52], &a[60]) + goldilocks.Butterfly(&a[53], &a[61]) + goldilocks.Butterfly(&a[54], &a[62]) + goldilocks.Butterfly(&a[55], &a[63]) + 
a[4].Mul(&a[4], &twiddlesCoset[7]) + a[5].Mul(&a[5], &twiddlesCoset[7]) + a[6].Mul(&a[6], &twiddlesCoset[7]) + a[7].Mul(&a[7], &twiddlesCoset[7]) + a[12].Mul(&a[12], &twiddlesCoset[8]) + a[13].Mul(&a[13], &twiddlesCoset[8]) + a[14].Mul(&a[14], &twiddlesCoset[8]) + a[15].Mul(&a[15], &twiddlesCoset[8]) + a[20].Mul(&a[20], &twiddlesCoset[9]) + a[21].Mul(&a[21], &twiddlesCoset[9]) + a[22].Mul(&a[22], &twiddlesCoset[9]) + a[23].Mul(&a[23], &twiddlesCoset[9]) + a[28].Mul(&a[28], &twiddlesCoset[10]) + a[29].Mul(&a[29], &twiddlesCoset[10]) + a[30].Mul(&a[30], &twiddlesCoset[10]) + a[31].Mul(&a[31], &twiddlesCoset[10]) + a[36].Mul(&a[36], &twiddlesCoset[11]) + a[37].Mul(&a[37], &twiddlesCoset[11]) + a[38].Mul(&a[38], &twiddlesCoset[11]) + a[39].Mul(&a[39], &twiddlesCoset[11]) + a[44].Mul(&a[44], &twiddlesCoset[12]) + a[45].Mul(&a[45], &twiddlesCoset[12]) + a[46].Mul(&a[46], &twiddlesCoset[12]) + a[47].Mul(&a[47], &twiddlesCoset[12]) + a[52].Mul(&a[52], &twiddlesCoset[13]) + a[53].Mul(&a[53], &twiddlesCoset[13]) + a[54].Mul(&a[54], &twiddlesCoset[13]) + a[55].Mul(&a[55], &twiddlesCoset[13]) + a[60].Mul(&a[60], &twiddlesCoset[14]) + a[61].Mul(&a[61], &twiddlesCoset[14]) + a[62].Mul(&a[62], &twiddlesCoset[14]) + a[63].Mul(&a[63], &twiddlesCoset[14]) + goldilocks.Butterfly(&a[0], &a[4]) + goldilocks.Butterfly(&a[1], &a[5]) + goldilocks.Butterfly(&a[2], &a[6]) + goldilocks.Butterfly(&a[3], &a[7]) + goldilocks.Butterfly(&a[8], &a[12]) + goldilocks.Butterfly(&a[9], &a[13]) + goldilocks.Butterfly(&a[10], &a[14]) + goldilocks.Butterfly(&a[11], &a[15]) + goldilocks.Butterfly(&a[16], &a[20]) + goldilocks.Butterfly(&a[17], &a[21]) + goldilocks.Butterfly(&a[18], &a[22]) + goldilocks.Butterfly(&a[19], &a[23]) + goldilocks.Butterfly(&a[24], &a[28]) + goldilocks.Butterfly(&a[25], &a[29]) + goldilocks.Butterfly(&a[26], &a[30]) + goldilocks.Butterfly(&a[27], &a[31]) + goldilocks.Butterfly(&a[32], &a[36]) + goldilocks.Butterfly(&a[33], &a[37]) + goldilocks.Butterfly(&a[34], &a[38]) + 
goldilocks.Butterfly(&a[35], &a[39]) + goldilocks.Butterfly(&a[40], &a[44]) + goldilocks.Butterfly(&a[41], &a[45]) + goldilocks.Butterfly(&a[42], &a[46]) + goldilocks.Butterfly(&a[43], &a[47]) + goldilocks.Butterfly(&a[48], &a[52]) + goldilocks.Butterfly(&a[49], &a[53]) + goldilocks.Butterfly(&a[50], &a[54]) + goldilocks.Butterfly(&a[51], &a[55]) + goldilocks.Butterfly(&a[56], &a[60]) + goldilocks.Butterfly(&a[57], &a[61]) + goldilocks.Butterfly(&a[58], &a[62]) + goldilocks.Butterfly(&a[59], &a[63]) + a[2].Mul(&a[2], &twiddlesCoset[15]) + a[3].Mul(&a[3], &twiddlesCoset[15]) + a[6].Mul(&a[6], &twiddlesCoset[16]) + a[7].Mul(&a[7], &twiddlesCoset[16]) + a[10].Mul(&a[10], &twiddlesCoset[17]) + a[11].Mul(&a[11], &twiddlesCoset[17]) + a[14].Mul(&a[14], &twiddlesCoset[18]) + a[15].Mul(&a[15], &twiddlesCoset[18]) + a[18].Mul(&a[18], &twiddlesCoset[19]) + a[19].Mul(&a[19], &twiddlesCoset[19]) + a[22].Mul(&a[22], &twiddlesCoset[20]) + a[23].Mul(&a[23], &twiddlesCoset[20]) + a[26].Mul(&a[26], &twiddlesCoset[21]) + a[27].Mul(&a[27], &twiddlesCoset[21]) + a[30].Mul(&a[30], &twiddlesCoset[22]) + a[31].Mul(&a[31], &twiddlesCoset[22]) + a[34].Mul(&a[34], &twiddlesCoset[23]) + a[35].Mul(&a[35], &twiddlesCoset[23]) + a[38].Mul(&a[38], &twiddlesCoset[24]) + a[39].Mul(&a[39], &twiddlesCoset[24]) + a[42].Mul(&a[42], &twiddlesCoset[25]) + a[43].Mul(&a[43], &twiddlesCoset[25]) + a[46].Mul(&a[46], &twiddlesCoset[26]) + a[47].Mul(&a[47], &twiddlesCoset[26]) + a[50].Mul(&a[50], &twiddlesCoset[27]) + a[51].Mul(&a[51], &twiddlesCoset[27]) + a[54].Mul(&a[54], &twiddlesCoset[28]) + a[55].Mul(&a[55], &twiddlesCoset[28]) + a[58].Mul(&a[58], &twiddlesCoset[29]) + a[59].Mul(&a[59], &twiddlesCoset[29]) + a[62].Mul(&a[62], &twiddlesCoset[30]) + a[63].Mul(&a[63], &twiddlesCoset[30]) + goldilocks.Butterfly(&a[0], &a[2]) + goldilocks.Butterfly(&a[1], &a[3]) + goldilocks.Butterfly(&a[4], &a[6]) + goldilocks.Butterfly(&a[5], &a[7]) + goldilocks.Butterfly(&a[8], &a[10]) + goldilocks.Butterfly(&a[9], 
&a[11]) + goldilocks.Butterfly(&a[12], &a[14]) + goldilocks.Butterfly(&a[13], &a[15]) + goldilocks.Butterfly(&a[16], &a[18]) + goldilocks.Butterfly(&a[17], &a[19]) + goldilocks.Butterfly(&a[20], &a[22]) + goldilocks.Butterfly(&a[21], &a[23]) + goldilocks.Butterfly(&a[24], &a[26]) + goldilocks.Butterfly(&a[25], &a[27]) + goldilocks.Butterfly(&a[28], &a[30]) + goldilocks.Butterfly(&a[29], &a[31]) + goldilocks.Butterfly(&a[32], &a[34]) + goldilocks.Butterfly(&a[33], &a[35]) + goldilocks.Butterfly(&a[36], &a[38]) + goldilocks.Butterfly(&a[37], &a[39]) + goldilocks.Butterfly(&a[40], &a[42]) + goldilocks.Butterfly(&a[41], &a[43]) + goldilocks.Butterfly(&a[44], &a[46]) + goldilocks.Butterfly(&a[45], &a[47]) + goldilocks.Butterfly(&a[48], &a[50]) + goldilocks.Butterfly(&a[49], &a[51]) + goldilocks.Butterfly(&a[52], &a[54]) + goldilocks.Butterfly(&a[53], &a[55]) + goldilocks.Butterfly(&a[56], &a[58]) + goldilocks.Butterfly(&a[57], &a[59]) + goldilocks.Butterfly(&a[60], &a[62]) + goldilocks.Butterfly(&a[61], &a[63]) + a[1].Mul(&a[1], &twiddlesCoset[31]) + a[3].Mul(&a[3], &twiddlesCoset[32]) + a[5].Mul(&a[5], &twiddlesCoset[33]) + a[7].Mul(&a[7], &twiddlesCoset[34]) + a[9].Mul(&a[9], &twiddlesCoset[35]) + a[11].Mul(&a[11], &twiddlesCoset[36]) + a[13].Mul(&a[13], &twiddlesCoset[37]) + a[15].Mul(&a[15], &twiddlesCoset[38]) + a[17].Mul(&a[17], &twiddlesCoset[39]) + a[19].Mul(&a[19], &twiddlesCoset[40]) + a[21].Mul(&a[21], &twiddlesCoset[41]) + a[23].Mul(&a[23], &twiddlesCoset[42]) + a[25].Mul(&a[25], &twiddlesCoset[43]) + a[27].Mul(&a[27], &twiddlesCoset[44]) + a[29].Mul(&a[29], &twiddlesCoset[45]) + a[31].Mul(&a[31], &twiddlesCoset[46]) + a[33].Mul(&a[33], &twiddlesCoset[47]) + a[35].Mul(&a[35], &twiddlesCoset[48]) + a[37].Mul(&a[37], &twiddlesCoset[49]) + a[39].Mul(&a[39], &twiddlesCoset[50]) + a[41].Mul(&a[41], &twiddlesCoset[51]) + a[43].Mul(&a[43], &twiddlesCoset[52]) + a[45].Mul(&a[45], &twiddlesCoset[53]) + a[47].Mul(&a[47], &twiddlesCoset[54]) + a[49].Mul(&a[49], 
&twiddlesCoset[55]) + a[51].Mul(&a[51], &twiddlesCoset[56]) + a[53].Mul(&a[53], &twiddlesCoset[57]) + a[55].Mul(&a[55], &twiddlesCoset[58]) + a[57].Mul(&a[57], &twiddlesCoset[59]) + a[59].Mul(&a[59], &twiddlesCoset[60]) + a[61].Mul(&a[61], &twiddlesCoset[61]) + a[63].Mul(&a[63], &twiddlesCoset[62]) + goldilocks.Butterfly(&a[0], &a[1]) + goldilocks.Butterfly(&a[2], &a[3]) + goldilocks.Butterfly(&a[4], &a[5]) + goldilocks.Butterfly(&a[6], &a[7]) + goldilocks.Butterfly(&a[8], &a[9]) + goldilocks.Butterfly(&a[10], &a[11]) + goldilocks.Butterfly(&a[12], &a[13]) + goldilocks.Butterfly(&a[14], &a[15]) + goldilocks.Butterfly(&a[16], &a[17]) + goldilocks.Butterfly(&a[18], &a[19]) + goldilocks.Butterfly(&a[20], &a[21]) + goldilocks.Butterfly(&a[22], &a[23]) + goldilocks.Butterfly(&a[24], &a[25]) + goldilocks.Butterfly(&a[26], &a[27]) + goldilocks.Butterfly(&a[28], &a[29]) + goldilocks.Butterfly(&a[30], &a[31]) + goldilocks.Butterfly(&a[32], &a[33]) + goldilocks.Butterfly(&a[34], &a[35]) + goldilocks.Butterfly(&a[36], &a[37]) + goldilocks.Butterfly(&a[38], &a[39]) + goldilocks.Butterfly(&a[40], &a[41]) + goldilocks.Butterfly(&a[42], &a[43]) + goldilocks.Butterfly(&a[44], &a[45]) + goldilocks.Butterfly(&a[46], &a[47]) + goldilocks.Butterfly(&a[48], &a[49]) + goldilocks.Butterfly(&a[50], &a[51]) + goldilocks.Butterfly(&a[52], &a[53]) + goldilocks.Butterfly(&a[54], &a[55]) + goldilocks.Butterfly(&a[56], &a[57]) + goldilocks.Butterfly(&a[58], &a[59]) + goldilocks.Butterfly(&a[60], &a[61]) + goldilocks.Butterfly(&a[62], &a[63]) +} + +// PrecomputeTwiddlesCoset precomputes twiddlesCoset from twiddles and coset table +// it then return all elements in the correct order for the unrolled FFT. 
+func PrecomputeTwiddlesCoset(generator, shifter goldilocks.Element) []goldilocks.Element { + toReturn := make([]goldilocks.Element, 63) + var r, s goldilocks.Element + e := new(big.Int) + + s = shifter + for k := 0; k < 5; k++ { + s.Square(&s) + } + toReturn[0] = s + s = shifter + for k := 0; k < 4; k++ { + s.Square(&s) + } + toReturn[1] = s + r.Exp(generator, e.SetUint64(uint64(1<<4*1))) + toReturn[2].Mul(&r, &s) + s = shifter + for k := 0; k < 3; k++ { + s.Square(&s) + } + toReturn[3] = s + r.Exp(generator, e.SetUint64(uint64(1<<3*2))) + toReturn[4].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<3*1))) + toReturn[5].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<3*3))) + toReturn[6].Mul(&r, &s) + s = shifter + for k := 0; k < 2; k++ { + s.Square(&s) + } + toReturn[7] = s + r.Exp(generator, e.SetUint64(uint64(1<<2*4))) + toReturn[8].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*2))) + toReturn[9].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*6))) + toReturn[10].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*1))) + toReturn[11].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*5))) + toReturn[12].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*3))) + toReturn[13].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*7))) + toReturn[14].Mul(&r, &s) + s = shifter + for k := 0; k < 1; k++ { + s.Square(&s) + } + toReturn[15] = s + r.Exp(generator, e.SetUint64(uint64(1<<1*8))) + toReturn[16].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*4))) + toReturn[17].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*12))) + toReturn[18].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*2))) + toReturn[19].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*10))) + toReturn[20].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*6))) + toReturn[21].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*14))) + toReturn[22].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*1))) + toReturn[23].Mul(&r, 
&s) + r.Exp(generator, e.SetUint64(uint64(1<<1*9))) + toReturn[24].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*5))) + toReturn[25].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*13))) + toReturn[26].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*3))) + toReturn[27].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*11))) + toReturn[28].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*7))) + toReturn[29].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*15))) + toReturn[30].Mul(&r, &s) + s = shifter + for k := 0; k < 0; k++ { + s.Square(&s) + } + toReturn[31] = s + r.Exp(generator, e.SetUint64(uint64(1<<0*16))) + toReturn[32].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*8))) + toReturn[33].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*24))) + toReturn[34].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*4))) + toReturn[35].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*20))) + toReturn[36].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*12))) + toReturn[37].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*28))) + toReturn[38].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*2))) + toReturn[39].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*18))) + toReturn[40].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*10))) + toReturn[41].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*26))) + toReturn[42].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*6))) + toReturn[43].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*22))) + toReturn[44].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*14))) + toReturn[45].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*30))) + toReturn[46].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*1))) + toReturn[47].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*17))) + toReturn[48].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*9))) + toReturn[49].Mul(&r, &s) + r.Exp(generator, 
e.SetUint64(uint64(1<<0*25))) + toReturn[50].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*5))) + toReturn[51].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*21))) + toReturn[52].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*13))) + toReturn[53].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*29))) + toReturn[54].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*3))) + toReturn[55].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*19))) + toReturn[56].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*11))) + toReturn[57].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*27))) + toReturn[58].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*7))) + toReturn[59].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*23))) + toReturn[60].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*15))) + toReturn[61].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*31))) + toReturn[62].Mul(&r, &s) + return toReturn +} diff --git a/field/koalabear/fft/fft.go b/field/koalabear/fft/fft.go index 4c5e4ecfe..c8f1d282b 100644 --- a/field/koalabear/fft/fft.go +++ b/field/koalabear/fft/fft.go @@ -29,7 +29,8 @@ const butterflyThreshold = 16 // if decimation == DIT (decimation in time), the input must be in bit-reversed order // if decimation == DIF (decimation in frequency), the output will be in bit-reversed order func (domain *Domain) FFT(a []koalabear.Element, decimation Decimation, opts ...Option) { - + // perf note; this option pattern actually allocates on the heap and comes at a cost when + // doing many small FFTs! opt := fftOptions(opts...) 
// find the stage where we should stop spawning go routines in our recursive calls @@ -199,15 +200,9 @@ func difFFT(a []koalabear.Element, w koalabear.Element, twiddles [][]koalabear.E n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 256 { - kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 32 { - kerDIFNP_32(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDIFNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -290,15 +285,9 @@ func ditFFT(a []koalabear.Element, w koalabear.Element, twiddles [][]koalabear.E n := len(a) if n == 1 { return - } else if stage >= twiddlesStartStage { - if n == 32 { - kerDITNP_32(a, twiddles, stage-twiddlesStartStage) - return - } else if n == 256 { - kerDITNP_256(a, twiddles, stage-twiddlesStartStage) - return - } - + } else if n == 256 && stage >= twiddlesStartStage { + kerDITNP_256(a, twiddles, stage-twiddlesStartStage) + return } m := n >> 1 @@ -426,39 +415,3 @@ func kerDITNP_256(a []koalabear.Element, twiddles [][]koalabear.Element, stage i } innerDITWithTwiddles(a[:256], twiddles[stage+0], 0, 128, 128) } - -func kerDIFNP_32(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { - // code unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - innerDIFWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) - for offset := 0; offset < 32; offset += 16 { - innerDIFWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - for offset := 0; offset < 32; offset += 8 { - innerDIFWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 4 { - innerDIFWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 2 { - koalabear.Butterfly(&a[offset], &a[offset+1]) - } -} - -func kerDITNP_32(a []koalabear.Element, twiddles [][]koalabear.Element, stage int) { - // code 
unrolled & generated by internal/generator/fft/template/fft.go.tmpl - - for offset := 0; offset < 32; offset += 2 { - koalabear.Butterfly(&a[offset], &a[offset+1]) - } - for offset := 0; offset < 32; offset += 4 { - innerDITWithTwiddles(a[offset:offset+4], twiddles[stage+3], 0, 2, 2) - } - for offset := 0; offset < 32; offset += 8 { - innerDITWithTwiddles(a[offset:offset+8], twiddles[stage+2], 0, 4, 4) - } - for offset := 0; offset < 32; offset += 16 { - innerDITWithTwiddles(a[offset:offset+16], twiddles[stage+1], 0, 8, 8) - } - innerDITWithTwiddles(a[:32], twiddles[stage+0], 0, 16, 16) -} diff --git a/field/koalabear/sis/sis.go b/field/koalabear/sis/sis.go index 21224ccee..cfba8a332 100644 --- a/field/koalabear/sis/sis.go +++ b/field/koalabear/sis/sis.go @@ -37,6 +37,9 @@ type RSis struct { Domain *fft.Domain maxNbElementsToHash int + + smallFFT func([]koalabear.Element) + twiddlesCoset []koalabear.Element // used in conjunction with the smallFFT; } // NewRSis creates an instance of RSis. @@ -97,6 +100,18 @@ func NewRSis(seed int64, logTwoDegree, logTwoBound, maxNbElementsToHash int) (*R maxNbElementsToHash: maxNbElementsToHash, } + r.smallFFT = func(p []koalabear.Element) { + r.Domain.FFT(p, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + } + + // if we have a FFT kernel of the size of the domain cardinality, we use it. 
+ if r.Domain.Cardinality == 64 { + r.twiddlesCoset = PrecomputeTwiddlesCoset(r.Domain.Generator, shift) + r.smallFFT = func(a []koalabear.Element) { + FFT64(a, r.twiddlesCoset) + } + } + // filling A a := make([]koalabear.Element, n*r.Degree) ag := make([]koalabear.Element, n*r.Degree) @@ -171,7 +186,16 @@ func (r *RSis) InnerHash(it *LimbIterator, res, k koalabear.Vector, polId int) { return } - r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) + // for perf, we use directly what's exposed; + r.smallFFT(k) + // k.Mul(k, fr.Vector(r.cosetTable)) + // if r.Domain.KernelDIF != nil { + // r.Domain.KernelDIF(k) + // } else { + // r.Domain.FFT(k, fft.DIF, fft.WithNbTasks(1)) + // } + mulModAcc(res, r.Ag[polId], k) } diff --git a/field/koalabear/sis/sis_fft.go b/field/koalabear/sis/sis_fft.go new file mode 100644 index 000000000..7706135c9 --- /dev/null +++ b/field/koalabear/sis/sis_fft.go @@ -0,0 +1,556 @@ +// Copyright 2020-2025 Consensys Software Inc. +// Licensed under the Apache License, Version 2.0. See the LICENSE file for details. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package sis + +import ( + "github.com/consensys/gnark-crypto/field/koalabear" + "math/big" +) + +// FFT64 is generated by gnark-crypto and contains the unrolled code for FFT (DIF) on 64 elements +// equivalent code: r.Domain.FFT(k, fft.DIF, fft.OnCoset(), fft.WithNbTasks(1)) +// twiddlesCoset must be pre-computed from twiddles and coset table, see PrecomputeTwiddlesCoset +func FFT64(a []koalabear.Element, twiddlesCoset []koalabear.Element) { + + a[32].Mul(&a[32], &twiddlesCoset[0]) + a[33].Mul(&a[33], &twiddlesCoset[0]) + a[34].Mul(&a[34], &twiddlesCoset[0]) + a[35].Mul(&a[35], &twiddlesCoset[0]) + a[36].Mul(&a[36], &twiddlesCoset[0]) + a[37].Mul(&a[37], &twiddlesCoset[0]) + a[38].Mul(&a[38], &twiddlesCoset[0]) + a[39].Mul(&a[39], &twiddlesCoset[0]) + a[40].Mul(&a[40], &twiddlesCoset[0]) + a[41].Mul(&a[41], &twiddlesCoset[0]) + a[42].Mul(&a[42], &twiddlesCoset[0]) + a[43].Mul(&a[43], &twiddlesCoset[0]) + a[44].Mul(&a[44], &twiddlesCoset[0]) + a[45].Mul(&a[45], &twiddlesCoset[0]) + a[46].Mul(&a[46], &twiddlesCoset[0]) + a[47].Mul(&a[47], &twiddlesCoset[0]) + a[48].Mul(&a[48], &twiddlesCoset[0]) + a[49].Mul(&a[49], &twiddlesCoset[0]) + a[50].Mul(&a[50], &twiddlesCoset[0]) + a[51].Mul(&a[51], &twiddlesCoset[0]) + a[52].Mul(&a[52], &twiddlesCoset[0]) + a[53].Mul(&a[53], &twiddlesCoset[0]) + a[54].Mul(&a[54], &twiddlesCoset[0]) + a[55].Mul(&a[55], &twiddlesCoset[0]) + a[56].Mul(&a[56], &twiddlesCoset[0]) + a[57].Mul(&a[57], &twiddlesCoset[0]) + a[58].Mul(&a[58], &twiddlesCoset[0]) + a[59].Mul(&a[59], &twiddlesCoset[0]) + a[60].Mul(&a[60], &twiddlesCoset[0]) + a[61].Mul(&a[61], &twiddlesCoset[0]) + a[62].Mul(&a[62], &twiddlesCoset[0]) + a[63].Mul(&a[63], &twiddlesCoset[0]) + koalabear.Butterfly(&a[0], &a[32]) + koalabear.Butterfly(&a[1], &a[33]) + koalabear.Butterfly(&a[2], &a[34]) + koalabear.Butterfly(&a[3], &a[35]) + koalabear.Butterfly(&a[4], &a[36]) + koalabear.Butterfly(&a[5], &a[37]) + 
koalabear.Butterfly(&a[6], &a[38]) + koalabear.Butterfly(&a[7], &a[39]) + koalabear.Butterfly(&a[8], &a[40]) + koalabear.Butterfly(&a[9], &a[41]) + koalabear.Butterfly(&a[10], &a[42]) + koalabear.Butterfly(&a[11], &a[43]) + koalabear.Butterfly(&a[12], &a[44]) + koalabear.Butterfly(&a[13], &a[45]) + koalabear.Butterfly(&a[14], &a[46]) + koalabear.Butterfly(&a[15], &a[47]) + koalabear.Butterfly(&a[16], &a[48]) + koalabear.Butterfly(&a[17], &a[49]) + koalabear.Butterfly(&a[18], &a[50]) + koalabear.Butterfly(&a[19], &a[51]) + koalabear.Butterfly(&a[20], &a[52]) + koalabear.Butterfly(&a[21], &a[53]) + koalabear.Butterfly(&a[22], &a[54]) + koalabear.Butterfly(&a[23], &a[55]) + koalabear.Butterfly(&a[24], &a[56]) + koalabear.Butterfly(&a[25], &a[57]) + koalabear.Butterfly(&a[26], &a[58]) + koalabear.Butterfly(&a[27], &a[59]) + koalabear.Butterfly(&a[28], &a[60]) + koalabear.Butterfly(&a[29], &a[61]) + koalabear.Butterfly(&a[30], &a[62]) + koalabear.Butterfly(&a[31], &a[63]) + a[16].Mul(&a[16], &twiddlesCoset[1]) + a[17].Mul(&a[17], &twiddlesCoset[1]) + a[18].Mul(&a[18], &twiddlesCoset[1]) + a[19].Mul(&a[19], &twiddlesCoset[1]) + a[20].Mul(&a[20], &twiddlesCoset[1]) + a[21].Mul(&a[21], &twiddlesCoset[1]) + a[22].Mul(&a[22], &twiddlesCoset[1]) + a[23].Mul(&a[23], &twiddlesCoset[1]) + a[24].Mul(&a[24], &twiddlesCoset[1]) + a[25].Mul(&a[25], &twiddlesCoset[1]) + a[26].Mul(&a[26], &twiddlesCoset[1]) + a[27].Mul(&a[27], &twiddlesCoset[1]) + a[28].Mul(&a[28], &twiddlesCoset[1]) + a[29].Mul(&a[29], &twiddlesCoset[1]) + a[30].Mul(&a[30], &twiddlesCoset[1]) + a[31].Mul(&a[31], &twiddlesCoset[1]) + a[48].Mul(&a[48], &twiddlesCoset[2]) + a[49].Mul(&a[49], &twiddlesCoset[2]) + a[50].Mul(&a[50], &twiddlesCoset[2]) + a[51].Mul(&a[51], &twiddlesCoset[2]) + a[52].Mul(&a[52], &twiddlesCoset[2]) + a[53].Mul(&a[53], &twiddlesCoset[2]) + a[54].Mul(&a[54], &twiddlesCoset[2]) + a[55].Mul(&a[55], &twiddlesCoset[2]) + a[56].Mul(&a[56], &twiddlesCoset[2]) + a[57].Mul(&a[57], &twiddlesCoset[2]) + 
a[58].Mul(&a[58], &twiddlesCoset[2]) + a[59].Mul(&a[59], &twiddlesCoset[2]) + a[60].Mul(&a[60], &twiddlesCoset[2]) + a[61].Mul(&a[61], &twiddlesCoset[2]) + a[62].Mul(&a[62], &twiddlesCoset[2]) + a[63].Mul(&a[63], &twiddlesCoset[2]) + koalabear.Butterfly(&a[0], &a[16]) + koalabear.Butterfly(&a[1], &a[17]) + koalabear.Butterfly(&a[2], &a[18]) + koalabear.Butterfly(&a[3], &a[19]) + koalabear.Butterfly(&a[4], &a[20]) + koalabear.Butterfly(&a[5], &a[21]) + koalabear.Butterfly(&a[6], &a[22]) + koalabear.Butterfly(&a[7], &a[23]) + koalabear.Butterfly(&a[8], &a[24]) + koalabear.Butterfly(&a[9], &a[25]) + koalabear.Butterfly(&a[10], &a[26]) + koalabear.Butterfly(&a[11], &a[27]) + koalabear.Butterfly(&a[12], &a[28]) + koalabear.Butterfly(&a[13], &a[29]) + koalabear.Butterfly(&a[14], &a[30]) + koalabear.Butterfly(&a[15], &a[31]) + koalabear.Butterfly(&a[32], &a[48]) + koalabear.Butterfly(&a[33], &a[49]) + koalabear.Butterfly(&a[34], &a[50]) + koalabear.Butterfly(&a[35], &a[51]) + koalabear.Butterfly(&a[36], &a[52]) + koalabear.Butterfly(&a[37], &a[53]) + koalabear.Butterfly(&a[38], &a[54]) + koalabear.Butterfly(&a[39], &a[55]) + koalabear.Butterfly(&a[40], &a[56]) + koalabear.Butterfly(&a[41], &a[57]) + koalabear.Butterfly(&a[42], &a[58]) + koalabear.Butterfly(&a[43], &a[59]) + koalabear.Butterfly(&a[44], &a[60]) + koalabear.Butterfly(&a[45], &a[61]) + koalabear.Butterfly(&a[46], &a[62]) + koalabear.Butterfly(&a[47], &a[63]) + a[8].Mul(&a[8], &twiddlesCoset[3]) + a[9].Mul(&a[9], &twiddlesCoset[3]) + a[10].Mul(&a[10], &twiddlesCoset[3]) + a[11].Mul(&a[11], &twiddlesCoset[3]) + a[12].Mul(&a[12], &twiddlesCoset[3]) + a[13].Mul(&a[13], &twiddlesCoset[3]) + a[14].Mul(&a[14], &twiddlesCoset[3]) + a[15].Mul(&a[15], &twiddlesCoset[3]) + a[24].Mul(&a[24], &twiddlesCoset[4]) + a[25].Mul(&a[25], &twiddlesCoset[4]) + a[26].Mul(&a[26], &twiddlesCoset[4]) + a[27].Mul(&a[27], &twiddlesCoset[4]) + a[28].Mul(&a[28], &twiddlesCoset[4]) + a[29].Mul(&a[29], &twiddlesCoset[4]) + a[30].Mul(&a[30], 
&twiddlesCoset[4]) + a[31].Mul(&a[31], &twiddlesCoset[4]) + a[40].Mul(&a[40], &twiddlesCoset[5]) + a[41].Mul(&a[41], &twiddlesCoset[5]) + a[42].Mul(&a[42], &twiddlesCoset[5]) + a[43].Mul(&a[43], &twiddlesCoset[5]) + a[44].Mul(&a[44], &twiddlesCoset[5]) + a[45].Mul(&a[45], &twiddlesCoset[5]) + a[46].Mul(&a[46], &twiddlesCoset[5]) + a[47].Mul(&a[47], &twiddlesCoset[5]) + a[56].Mul(&a[56], &twiddlesCoset[6]) + a[57].Mul(&a[57], &twiddlesCoset[6]) + a[58].Mul(&a[58], &twiddlesCoset[6]) + a[59].Mul(&a[59], &twiddlesCoset[6]) + a[60].Mul(&a[60], &twiddlesCoset[6]) + a[61].Mul(&a[61], &twiddlesCoset[6]) + a[62].Mul(&a[62], &twiddlesCoset[6]) + a[63].Mul(&a[63], &twiddlesCoset[6]) + koalabear.Butterfly(&a[0], &a[8]) + koalabear.Butterfly(&a[1], &a[9]) + koalabear.Butterfly(&a[2], &a[10]) + koalabear.Butterfly(&a[3], &a[11]) + koalabear.Butterfly(&a[4], &a[12]) + koalabear.Butterfly(&a[5], &a[13]) + koalabear.Butterfly(&a[6], &a[14]) + koalabear.Butterfly(&a[7], &a[15]) + koalabear.Butterfly(&a[16], &a[24]) + koalabear.Butterfly(&a[17], &a[25]) + koalabear.Butterfly(&a[18], &a[26]) + koalabear.Butterfly(&a[19], &a[27]) + koalabear.Butterfly(&a[20], &a[28]) + koalabear.Butterfly(&a[21], &a[29]) + koalabear.Butterfly(&a[22], &a[30]) + koalabear.Butterfly(&a[23], &a[31]) + koalabear.Butterfly(&a[32], &a[40]) + koalabear.Butterfly(&a[33], &a[41]) + koalabear.Butterfly(&a[34], &a[42]) + koalabear.Butterfly(&a[35], &a[43]) + koalabear.Butterfly(&a[36], &a[44]) + koalabear.Butterfly(&a[37], &a[45]) + koalabear.Butterfly(&a[38], &a[46]) + koalabear.Butterfly(&a[39], &a[47]) + koalabear.Butterfly(&a[48], &a[56]) + koalabear.Butterfly(&a[49], &a[57]) + koalabear.Butterfly(&a[50], &a[58]) + koalabear.Butterfly(&a[51], &a[59]) + koalabear.Butterfly(&a[52], &a[60]) + koalabear.Butterfly(&a[53], &a[61]) + koalabear.Butterfly(&a[54], &a[62]) + koalabear.Butterfly(&a[55], &a[63]) + a[4].Mul(&a[4], &twiddlesCoset[7]) + a[5].Mul(&a[5], &twiddlesCoset[7]) + a[6].Mul(&a[6], &twiddlesCoset[7]) 
+ a[7].Mul(&a[7], &twiddlesCoset[7]) + a[12].Mul(&a[12], &twiddlesCoset[8]) + a[13].Mul(&a[13], &twiddlesCoset[8]) + a[14].Mul(&a[14], &twiddlesCoset[8]) + a[15].Mul(&a[15], &twiddlesCoset[8]) + a[20].Mul(&a[20], &twiddlesCoset[9]) + a[21].Mul(&a[21], &twiddlesCoset[9]) + a[22].Mul(&a[22], &twiddlesCoset[9]) + a[23].Mul(&a[23], &twiddlesCoset[9]) + a[28].Mul(&a[28], &twiddlesCoset[10]) + a[29].Mul(&a[29], &twiddlesCoset[10]) + a[30].Mul(&a[30], &twiddlesCoset[10]) + a[31].Mul(&a[31], &twiddlesCoset[10]) + a[36].Mul(&a[36], &twiddlesCoset[11]) + a[37].Mul(&a[37], &twiddlesCoset[11]) + a[38].Mul(&a[38], &twiddlesCoset[11]) + a[39].Mul(&a[39], &twiddlesCoset[11]) + a[44].Mul(&a[44], &twiddlesCoset[12]) + a[45].Mul(&a[45], &twiddlesCoset[12]) + a[46].Mul(&a[46], &twiddlesCoset[12]) + a[47].Mul(&a[47], &twiddlesCoset[12]) + a[52].Mul(&a[52], &twiddlesCoset[13]) + a[53].Mul(&a[53], &twiddlesCoset[13]) + a[54].Mul(&a[54], &twiddlesCoset[13]) + a[55].Mul(&a[55], &twiddlesCoset[13]) + a[60].Mul(&a[60], &twiddlesCoset[14]) + a[61].Mul(&a[61], &twiddlesCoset[14]) + a[62].Mul(&a[62], &twiddlesCoset[14]) + a[63].Mul(&a[63], &twiddlesCoset[14]) + koalabear.Butterfly(&a[0], &a[4]) + koalabear.Butterfly(&a[1], &a[5]) + koalabear.Butterfly(&a[2], &a[6]) + koalabear.Butterfly(&a[3], &a[7]) + koalabear.Butterfly(&a[8], &a[12]) + koalabear.Butterfly(&a[9], &a[13]) + koalabear.Butterfly(&a[10], &a[14]) + koalabear.Butterfly(&a[11], &a[15]) + koalabear.Butterfly(&a[16], &a[20]) + koalabear.Butterfly(&a[17], &a[21]) + koalabear.Butterfly(&a[18], &a[22]) + koalabear.Butterfly(&a[19], &a[23]) + koalabear.Butterfly(&a[24], &a[28]) + koalabear.Butterfly(&a[25], &a[29]) + koalabear.Butterfly(&a[26], &a[30]) + koalabear.Butterfly(&a[27], &a[31]) + koalabear.Butterfly(&a[32], &a[36]) + koalabear.Butterfly(&a[33], &a[37]) + koalabear.Butterfly(&a[34], &a[38]) + koalabear.Butterfly(&a[35], &a[39]) + koalabear.Butterfly(&a[40], &a[44]) + koalabear.Butterfly(&a[41], &a[45]) + 
koalabear.Butterfly(&a[42], &a[46]) + koalabear.Butterfly(&a[43], &a[47]) + koalabear.Butterfly(&a[48], &a[52]) + koalabear.Butterfly(&a[49], &a[53]) + koalabear.Butterfly(&a[50], &a[54]) + koalabear.Butterfly(&a[51], &a[55]) + koalabear.Butterfly(&a[56], &a[60]) + koalabear.Butterfly(&a[57], &a[61]) + koalabear.Butterfly(&a[58], &a[62]) + koalabear.Butterfly(&a[59], &a[63]) + a[2].Mul(&a[2], &twiddlesCoset[15]) + a[3].Mul(&a[3], &twiddlesCoset[15]) + a[6].Mul(&a[6], &twiddlesCoset[16]) + a[7].Mul(&a[7], &twiddlesCoset[16]) + a[10].Mul(&a[10], &twiddlesCoset[17]) + a[11].Mul(&a[11], &twiddlesCoset[17]) + a[14].Mul(&a[14], &twiddlesCoset[18]) + a[15].Mul(&a[15], &twiddlesCoset[18]) + a[18].Mul(&a[18], &twiddlesCoset[19]) + a[19].Mul(&a[19], &twiddlesCoset[19]) + a[22].Mul(&a[22], &twiddlesCoset[20]) + a[23].Mul(&a[23], &twiddlesCoset[20]) + a[26].Mul(&a[26], &twiddlesCoset[21]) + a[27].Mul(&a[27], &twiddlesCoset[21]) + a[30].Mul(&a[30], &twiddlesCoset[22]) + a[31].Mul(&a[31], &twiddlesCoset[22]) + a[34].Mul(&a[34], &twiddlesCoset[23]) + a[35].Mul(&a[35], &twiddlesCoset[23]) + a[38].Mul(&a[38], &twiddlesCoset[24]) + a[39].Mul(&a[39], &twiddlesCoset[24]) + a[42].Mul(&a[42], &twiddlesCoset[25]) + a[43].Mul(&a[43], &twiddlesCoset[25]) + a[46].Mul(&a[46], &twiddlesCoset[26]) + a[47].Mul(&a[47], &twiddlesCoset[26]) + a[50].Mul(&a[50], &twiddlesCoset[27]) + a[51].Mul(&a[51], &twiddlesCoset[27]) + a[54].Mul(&a[54], &twiddlesCoset[28]) + a[55].Mul(&a[55], &twiddlesCoset[28]) + a[58].Mul(&a[58], &twiddlesCoset[29]) + a[59].Mul(&a[59], &twiddlesCoset[29]) + a[62].Mul(&a[62], &twiddlesCoset[30]) + a[63].Mul(&a[63], &twiddlesCoset[30]) + koalabear.Butterfly(&a[0], &a[2]) + koalabear.Butterfly(&a[1], &a[3]) + koalabear.Butterfly(&a[4], &a[6]) + koalabear.Butterfly(&a[5], &a[7]) + koalabear.Butterfly(&a[8], &a[10]) + koalabear.Butterfly(&a[9], &a[11]) + koalabear.Butterfly(&a[12], &a[14]) + koalabear.Butterfly(&a[13], &a[15]) + koalabear.Butterfly(&a[16], &a[18]) + 
koalabear.Butterfly(&a[17], &a[19]) + koalabear.Butterfly(&a[20], &a[22]) + koalabear.Butterfly(&a[21], &a[23]) + koalabear.Butterfly(&a[24], &a[26]) + koalabear.Butterfly(&a[25], &a[27]) + koalabear.Butterfly(&a[28], &a[30]) + koalabear.Butterfly(&a[29], &a[31]) + koalabear.Butterfly(&a[32], &a[34]) + koalabear.Butterfly(&a[33], &a[35]) + koalabear.Butterfly(&a[36], &a[38]) + koalabear.Butterfly(&a[37], &a[39]) + koalabear.Butterfly(&a[40], &a[42]) + koalabear.Butterfly(&a[41], &a[43]) + koalabear.Butterfly(&a[44], &a[46]) + koalabear.Butterfly(&a[45], &a[47]) + koalabear.Butterfly(&a[48], &a[50]) + koalabear.Butterfly(&a[49], &a[51]) + koalabear.Butterfly(&a[52], &a[54]) + koalabear.Butterfly(&a[53], &a[55]) + koalabear.Butterfly(&a[56], &a[58]) + koalabear.Butterfly(&a[57], &a[59]) + koalabear.Butterfly(&a[60], &a[62]) + koalabear.Butterfly(&a[61], &a[63]) + a[1].Mul(&a[1], &twiddlesCoset[31]) + a[3].Mul(&a[3], &twiddlesCoset[32]) + a[5].Mul(&a[5], &twiddlesCoset[33]) + a[7].Mul(&a[7], &twiddlesCoset[34]) + a[9].Mul(&a[9], &twiddlesCoset[35]) + a[11].Mul(&a[11], &twiddlesCoset[36]) + a[13].Mul(&a[13], &twiddlesCoset[37]) + a[15].Mul(&a[15], &twiddlesCoset[38]) + a[17].Mul(&a[17], &twiddlesCoset[39]) + a[19].Mul(&a[19], &twiddlesCoset[40]) + a[21].Mul(&a[21], &twiddlesCoset[41]) + a[23].Mul(&a[23], &twiddlesCoset[42]) + a[25].Mul(&a[25], &twiddlesCoset[43]) + a[27].Mul(&a[27], &twiddlesCoset[44]) + a[29].Mul(&a[29], &twiddlesCoset[45]) + a[31].Mul(&a[31], &twiddlesCoset[46]) + a[33].Mul(&a[33], &twiddlesCoset[47]) + a[35].Mul(&a[35], &twiddlesCoset[48]) + a[37].Mul(&a[37], &twiddlesCoset[49]) + a[39].Mul(&a[39], &twiddlesCoset[50]) + a[41].Mul(&a[41], &twiddlesCoset[51]) + a[43].Mul(&a[43], &twiddlesCoset[52]) + a[45].Mul(&a[45], &twiddlesCoset[53]) + a[47].Mul(&a[47], &twiddlesCoset[54]) + a[49].Mul(&a[49], &twiddlesCoset[55]) + a[51].Mul(&a[51], &twiddlesCoset[56]) + a[53].Mul(&a[53], &twiddlesCoset[57]) + a[55].Mul(&a[55], &twiddlesCoset[58]) + 
a[57].Mul(&a[57], &twiddlesCoset[59]) + a[59].Mul(&a[59], &twiddlesCoset[60]) + a[61].Mul(&a[61], &twiddlesCoset[61]) + a[63].Mul(&a[63], &twiddlesCoset[62]) + koalabear.Butterfly(&a[0], &a[1]) + koalabear.Butterfly(&a[2], &a[3]) + koalabear.Butterfly(&a[4], &a[5]) + koalabear.Butterfly(&a[6], &a[7]) + koalabear.Butterfly(&a[8], &a[9]) + koalabear.Butterfly(&a[10], &a[11]) + koalabear.Butterfly(&a[12], &a[13]) + koalabear.Butterfly(&a[14], &a[15]) + koalabear.Butterfly(&a[16], &a[17]) + koalabear.Butterfly(&a[18], &a[19]) + koalabear.Butterfly(&a[20], &a[21]) + koalabear.Butterfly(&a[22], &a[23]) + koalabear.Butterfly(&a[24], &a[25]) + koalabear.Butterfly(&a[26], &a[27]) + koalabear.Butterfly(&a[28], &a[29]) + koalabear.Butterfly(&a[30], &a[31]) + koalabear.Butterfly(&a[32], &a[33]) + koalabear.Butterfly(&a[34], &a[35]) + koalabear.Butterfly(&a[36], &a[37]) + koalabear.Butterfly(&a[38], &a[39]) + koalabear.Butterfly(&a[40], &a[41]) + koalabear.Butterfly(&a[42], &a[43]) + koalabear.Butterfly(&a[44], &a[45]) + koalabear.Butterfly(&a[46], &a[47]) + koalabear.Butterfly(&a[48], &a[49]) + koalabear.Butterfly(&a[50], &a[51]) + koalabear.Butterfly(&a[52], &a[53]) + koalabear.Butterfly(&a[54], &a[55]) + koalabear.Butterfly(&a[56], &a[57]) + koalabear.Butterfly(&a[58], &a[59]) + koalabear.Butterfly(&a[60], &a[61]) + koalabear.Butterfly(&a[62], &a[63]) +} + +// PrecomputeTwiddlesCoset precomputes twiddlesCoset from twiddles and coset table +// it then return all elements in the correct order for the unrolled FFT. 
+func PrecomputeTwiddlesCoset(generator, shifter koalabear.Element) []koalabear.Element { + toReturn := make([]koalabear.Element, 63) + var r, s koalabear.Element + e := new(big.Int) + + s = shifter + for k := 0; k < 5; k++ { + s.Square(&s) + } + toReturn[0] = s + s = shifter + for k := 0; k < 4; k++ { + s.Square(&s) + } + toReturn[1] = s + r.Exp(generator, e.SetUint64(uint64(1<<4*1))) + toReturn[2].Mul(&r, &s) + s = shifter + for k := 0; k < 3; k++ { + s.Square(&s) + } + toReturn[3] = s + r.Exp(generator, e.SetUint64(uint64(1<<3*2))) + toReturn[4].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<3*1))) + toReturn[5].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<3*3))) + toReturn[6].Mul(&r, &s) + s = shifter + for k := 0; k < 2; k++ { + s.Square(&s) + } + toReturn[7] = s + r.Exp(generator, e.SetUint64(uint64(1<<2*4))) + toReturn[8].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*2))) + toReturn[9].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*6))) + toReturn[10].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*1))) + toReturn[11].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*5))) + toReturn[12].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*3))) + toReturn[13].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<2*7))) + toReturn[14].Mul(&r, &s) + s = shifter + for k := 0; k < 1; k++ { + s.Square(&s) + } + toReturn[15] = s + r.Exp(generator, e.SetUint64(uint64(1<<1*8))) + toReturn[16].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*4))) + toReturn[17].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*12))) + toReturn[18].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*2))) + toReturn[19].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*10))) + toReturn[20].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*6))) + toReturn[21].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*14))) + toReturn[22].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*1))) + toReturn[23].Mul(&r, &s) + 
r.Exp(generator, e.SetUint64(uint64(1<<1*9))) + toReturn[24].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*5))) + toReturn[25].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*13))) + toReturn[26].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*3))) + toReturn[27].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*11))) + toReturn[28].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*7))) + toReturn[29].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<1*15))) + toReturn[30].Mul(&r, &s) + s = shifter + for k := 0; k < 0; k++ { + s.Square(&s) + } + toReturn[31] = s + r.Exp(generator, e.SetUint64(uint64(1<<0*16))) + toReturn[32].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*8))) + toReturn[33].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*24))) + toReturn[34].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*4))) + toReturn[35].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*20))) + toReturn[36].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*12))) + toReturn[37].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*28))) + toReturn[38].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*2))) + toReturn[39].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*18))) + toReturn[40].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*10))) + toReturn[41].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*26))) + toReturn[42].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*6))) + toReturn[43].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*22))) + toReturn[44].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*14))) + toReturn[45].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*30))) + toReturn[46].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*1))) + toReturn[47].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*17))) + toReturn[48].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*9))) + toReturn[49].Mul(&r, &s) + r.Exp(generator, 
e.SetUint64(uint64(1<<0*25))) + toReturn[50].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*5))) + toReturn[51].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*21))) + toReturn[52].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*13))) + toReturn[53].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*29))) + toReturn[54].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*3))) + toReturn[55].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*19))) + toReturn[56].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*11))) + toReturn[57].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*27))) + toReturn[58].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*7))) + toReturn[59].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*23))) + toReturn[60].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*15))) + toReturn[61].Mul(&r, &s) + r.Exp(generator, e.SetUint64(uint64(1<<0*31))) + toReturn[62].Mul(&r, &s) + return toReturn +}