// Copyright 2020-2025 Consensys Software Inc.
// Licensed under the Apache License, Version 2.0. See the LICENSE file for details.

// Code generated by consensys/gnark-crypto DO NOT EDIT

package bls12381

import (
	"errors"
	"math"
	"runtime"

	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bls12-381/fr"
	"github.com/consensys/gnark-crypto/internal/parallel"
)

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
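//
// A minimal usage sketch, assuming points and scalars are already-populated
// slices of equal length (the zero value ecc.MultiExpConfig{} selects the
// default number of tasks):
//
//	var res G1Affine
//	if _, err := res.MultiExp(points, scalars, ecc.MultiExpConfig{}); err != nil {
//		// handle mismatched lengths / invalid config
//	}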
func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
	var _p G1Jac
	if _, err := _p.MultiExp(points, scalars, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
	// TODO @gbotrel replace the ecc.MultiExpConfig by an option pattern for maintainability.
	// note:
	// each of the msmCX methods is the same, except for the c constant it declares.
	// duplicating these methods (through template generation) allows us to declare the buckets on the stack.
	// the choice of c needs to be improved:
	// there is a theoretical value that gives optimal asymptotics,
	// but in practice other factors come into play, including:
	// * if c doesn't divide 64 (the word size), we're bound to select bits over 2 words of our scalars, instead of 1
	// * the number of CPUs
	// * cache friendliness (which depends on the host, G1 or G2...)
	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

	// for each msmCX:
	// step 1
	// for each scalar, we compute nbChunks digits over c-bit wide windows;
	// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
	// 2^c from the current digit, making it negative (see the worked example below).
	// negative digits are processed in the next step as adding -G into the bucket instead of G
	// (computing -G is cheap, and this saves us half of the buckets).
	// step 2
	// buckets are declared on the stack;
	// notice that we have 2^{c-1} buckets instead of 2^c (see step 1).
	// we use extended Jacobian formulas here as they are faster than mixed addition.
	// msmProcessChunk places points into buckets based on their selector and sends the weighted bucket sum on the given channel.
	// step 3
	// reduce the buckets' weighted sums into our result (msmReduceChunk)
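	//
	// worked example for step 1 (illustrative numbers, not from this file):
	// with c = 4, a window holds a value in [0, 15] but we only keep digits in
	// [-8, 7]; a window holding 13 becomes the digit 13 - 2^4 = -3 with a carry
	// of 1 into the next window, so bucket 3 accumulates -G instead of G and
	// only 2^{c-1} buckets are needed.
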
	// ensure len(points) == len(scalars)
	nbPoints := len(points)
	if nbPoints != len(scalars) {
		return nil, errors.New("len(points) != len(scalars)")
	}

	// if nbTasks is not set, use twice the number of available CPUs
	if config.NbTasks <= 0 {
		config.NbTasks = runtime.NumCPU() * 2
	} else if config.NbTasks > 1024 {
		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}

	// here, we compute the best C for nbPoints
	// we split recursively until nbChunks(c) >= nbTasks
	bestC := func(nbPoints int) uint64 {
		// implemented msmC methods (the c we use must be in this slice)
		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
		var C uint64
		// approximate cost (in group operations):
		// cost = bits/c * (nbPoints + 2^c)
		// this needs to be verified empirically;
		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
		min := math.MaxFloat64
		for _, c := range implementedCs {
			cc := (fr.Bits + 1) * (nbPoints + (1 << c))
			cost := float64(cc) / float64(c)
			if cost < min {
				min = cost
				C = c
			}
		}
		return C
	}

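	// for intuition (illustrative numbers): with fr.Bits = 255 and nbPoints = 1<<20,
	// c = 12 costs 256/12 * (2^20 + 2^12) ≈ 22.5M group operations, while c = 16
	// costs 256/16 * (2^20 + 2^16) ≈ 17.8M, so bestC picks 16.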
	C := bestC(nbPoints)
	nbChunks := int(computeNbChunks(C))

	// should we recursively split the msm in half? (see below)
	// we want to minimize the execution time of the algorithm;
	// splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.

	// costFunction returns a metric that represents the "wall time" of the algorithm
	costFunction := func(nbTasks, nbCpus, costPerTask int) int {
		// cost for the reduction of all tasks (msmReduceChunk)
		totalCost := nbTasks

		// cost for the computation of each task (msmProcessChunk)
		for nbTasks >= nbCpus {
			nbTasks -= nbCpus
			totalCost += costPerTask
		}
		if nbTasks > 0 {
			totalCost += costPerTask
		}
		return totalCost
	}

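	// e.g. with 16 tasks on 8 CPUs, the tasks run in two full waves, so the
	// metric above is 16 (reduction) + 2*costPerTask.
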
	// costPerTask is the approximate number of group ops per task
	costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) }

	costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))

	cPostSplit := bestC(nbPoints / 2)
	nbChunksPostSplit := int(computeNbChunks(cPostSplit))
	costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))

	// if the cost of the split msm is lower than the cost of the non-split msm, we split
	if costPostSplit < costPreSplit {
		config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
		var _p G1Jac
		chDone := make(chan struct{}, 1)
		go func() {
			_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
			close(chDone)
		}()
		p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
		<-chDone
		p.AddAssign(&_p)
		return p, nil
	}

	// if we don't split, we use the best C we found
	_innerMsmG1(p, C, points, scalars, config)

	return p, nil
}

func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac {
	// partition the scalars
	digits, chunkStats := partitionScalars(scalars, c, config.NbTasks)

	nbChunks := computeNbChunks(c)

	// for each chunk, spawn one go routine that'll loop through all the scalars in the
	// corresponding bit-window.
	// note that the buckets are an array allocated on the stack; this is critical for performance.

	// each go routine sends its result on the chChunks[i] channel
	chChunks := make([]chan g1JacExtended, nbChunks)
	for i := 0; i < len(chChunks); i++ {
		chChunks[i] = make(chan g1JacExtended, 1)
	}

	// we use a semaphore to limit the number of go routines running concurrently
	// (only if nbTasks < nbCPU)
	var sem chan struct{}
	if config.NbTasks < runtime.NumCPU() {
		// we add nbChunks to the capacity because if a chunk is overweight, we split it in two
		sem = make(chan struct{}, config.NbTasks+int(nbChunks))
		for i := 0; i < config.NbTasks; i++ {
			sem <- struct{}{}
		}
		defer func() {
			close(sem)
		}()
	}

	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
	for j := int(nbChunks - 1); j >= 0; j-- {
		processChunk := getChunkProcessorG1(c, chunkStats[j])
		if j == int(nbChunks-1) {
			processChunk = getChunkProcessorG1(lastC(c), chunkStats[j])
		}
		if chunkStats[j].weight >= 115 {
			// this chunk has more work to do than the others, so we split it across more go routines;
			// otherwise, its go routine would finish much later than the others.
			chSplit := make(chan g1JacExtended, 2)
			split := n / 2

			if sem != nil {
				sem <- struct{}{} // add another token to the semaphore, since we split in two.
			}
			go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
			go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
			go func(chunkID int) {
				s1 := <-chSplit
				s2 := <-chSplit
				close(chSplit)
				s1.add(&s2)
				chChunks[chunkID] <- s1
			}(j)
			continue
		}
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
	}

	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
}

// getChunkProcessorG1 decides, depending on the window size c and the chunk's
// statistics, which algorithm is best suited to process the chunk, and returns it.
func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) {
	switch c {

	case 3:
		return processChunkG1Jacobian[bucketg1JacExtendedC3]
	case 4:
		return processChunkG1Jacobian[bucketg1JacExtendedC4]
	case 5:
		return processChunkG1Jacobian[bucketg1JacExtendedC5]
	case 6:
		return processChunkG1Jacobian[bucketg1JacExtendedC6]
	case 7:
		return processChunkG1Jacobian[bucketg1JacExtendedC7]
	case 8:
		return processChunkG1Jacobian[bucketg1JacExtendedC8]
	case 9:
		return processChunkG1Jacobian[bucketg1JacExtendedC9]
	case 10:
		const batchSize = 80
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC10]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10]
	case 11:
		const batchSize = 150
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC11]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11]
	case 12:
		const batchSize = 200
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC12]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12]
	case 13:
		const batchSize = 350
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC13]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13]
	case 14:
		const batchSize = 400
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC14]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14]
	case 15:
		const batchSize = 500
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC15]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15]
	case 16:
		const batchSize = 640
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC16]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16]
	default:
		// should not happen: the templates do not generate values of c other than the ones above.
		return processChunkG1Jacobian[bucketg1JacExtendedC16]
	}
}

// msmReduceChunkG1Affine reduces the weighted sums of the buckets into the result of the multiExp
func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
	var _p g1JacExtended
	totalj := <-chChunks[len(chChunks)-1]
	_p.Set(&totalj)
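	// Horner-style reduction: the running total is doubled c times (i.e.
	// multiplied by 2^c) before the next lower chunk is added, so chunk j
	// ends up weighted by 2^{c*j}.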
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			_p.double(&_p)
		}
		totalj := <-chChunks[j]
		_p.add(&totalj)
	}

	return p.unsafeFromJacExtended(&_p)
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
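//
// A minimal usage sketch, assuming points is an already-populated slice and
// coeff is the folding challenge (both are placeholders):
//
//	var acc G1Affine
//	if _, err := acc.Fold(points, coeff, ecc.MultiExpConfig{}); err != nil {
//		// handle invalid config
//	}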
func (p *G1Affine) Fold(points []G1Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
	var _p G1Jac
	if _, err := _p.Fold(points, combinationCoeff, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G1Jac) Fold(points []G1Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
	scalars := make([]fr.Element, len(points))
	scalar := fr.NewElement(1)
	for i := 0; i < len(points); i++ {
		scalars[i].Set(&scalar)
		scalar.Mul(&scalar, &combinationCoeff)
	}
	return p.MultiExp(points, scalars, config)
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
	var _p G2Jac
	if _, err := _p.MultiExp(points, scalars, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
	// TODO @gbotrel replace the ecc.MultiExpConfig by an option pattern for maintainability.
	// note:
	// each of the msmCX methods is the same, except for the c constant it declares.
	// duplicating these methods (through template generation) allows us to declare the buckets on the stack.
	// the choice of c needs to be improved:
	// there is a theoretical value that gives optimal asymptotics,
	// but in practice other factors come into play, including:
	// * if c doesn't divide 64 (the word size), we're bound to select bits over 2 words of our scalars, instead of 1
	// * the number of CPUs
	// * cache friendliness (which depends on the host, G1 or G2...)
	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

	// for each msmCX:
	// step 1
	// for each scalar, we compute nbChunks digits over c-bit wide windows;
	// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
	// 2^c from the current digit, making it negative (see the worked example in the G1 version above).
	// negative digits are processed in the next step as adding -G into the bucket instead of G
	// (computing -G is cheap, and this saves us half of the buckets).
	// step 2
	// buckets are declared on the stack;
	// notice that we have 2^{c-1} buckets instead of 2^c (see step 1).
	// we use extended Jacobian formulas here as they are faster than mixed addition.
	// msmProcessChunk places points into buckets based on their selector and sends the weighted bucket sum on the given channel.
	// step 3
	// reduce the buckets' weighted sums into our result (msmReduceChunk)

	// ensure len(points) == len(scalars)
	nbPoints := len(points)
	if nbPoints != len(scalars) {
		return nil, errors.New("len(points) != len(scalars)")
	}

	// if nbTasks is not set, use twice the number of available CPUs
	if config.NbTasks <= 0 {
		config.NbTasks = runtime.NumCPU() * 2
	} else if config.NbTasks > 1024 {
		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}

	// here, we compute the best C for nbPoints
	// we split recursively until nbChunks(c) >= nbTasks
	bestC := func(nbPoints int) uint64 {
		// implemented msmC methods (the c we use must be in this slice)
		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
		var C uint64
		// approximate cost (in group operations):
		// cost = bits/c * (nbPoints + 2^c)
		// this needs to be verified empirically;
		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
		min := math.MaxFloat64
		for _, c := range implementedCs {
			cc := (fr.Bits + 1) * (nbPoints + (1 << c))
			cost := float64(cc) / float64(c)
			if cost < min {
				min = cost
				C = c
			}
		}
		return C
	}

	C := bestC(nbPoints)
	nbChunks := int(computeNbChunks(C))

	// should we recursively split the msm in half? (see below)
	// we want to minimize the execution time of the algorithm;
	// splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.

	// costFunction returns a metric that represents the "wall time" of the algorithm
	costFunction := func(nbTasks, nbCpus, costPerTask int) int {
		// cost for the reduction of all tasks (msmReduceChunk)
		totalCost := nbTasks

		// cost for the computation of each task (msmProcessChunk)
		for nbTasks >= nbCpus {
			nbTasks -= nbCpus
			totalCost += costPerTask
		}
		if nbTasks > 0 {
			totalCost += costPerTask
		}
		return totalCost
	}

	// costPerTask is the approximate number of group ops per task
	costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) }

	costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))

	cPostSplit := bestC(nbPoints / 2)
	nbChunksPostSplit := int(computeNbChunks(cPostSplit))
	costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))

	// if the cost of the split msm is lower than the cost of the non-split msm, we split
	if costPostSplit < costPreSplit {
		config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
		var _p G2Jac
		chDone := make(chan struct{}, 1)
		go func() {
			_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
			close(chDone)
		}()
		p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
		<-chDone
		p.AddAssign(&_p)
		return p, nil
	}

	// if we don't split, we use the best C we found
	_innerMsmG2(p, C, points, scalars, config)

	return p, nil
}

func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac {
	// partition the scalars
	digits, chunkStats := partitionScalars(scalars, c, config.NbTasks)

	nbChunks := computeNbChunks(c)

	// for each chunk, spawn one go routine that'll loop through all the scalars in the
	// corresponding bit-window.
	// note that the buckets are an array allocated on the stack; this is critical for performance.

	// each go routine sends its result on the chChunks[i] channel
	chChunks := make([]chan g2JacExtended, nbChunks)
	for i := 0; i < len(chChunks); i++ {
		chChunks[i] = make(chan g2JacExtended, 1)
	}

	// we use a semaphore to limit the number of go routines running concurrently
	// (only if nbTasks < nbCPU)
	var sem chan struct{}
	if config.NbTasks < runtime.NumCPU() {
		// we add nbChunks to the capacity because if a chunk is overweight, we split it in two
		sem = make(chan struct{}, config.NbTasks+int(nbChunks))
		for i := 0; i < config.NbTasks; i++ {
			sem <- struct{}{}
		}
		defer func() {
			close(sem)
		}()
	}

	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
	for j := int(nbChunks - 1); j >= 0; j-- {
		processChunk := getChunkProcessorG2(c, chunkStats[j])
		if j == int(nbChunks-1) {
			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
		}
		if chunkStats[j].weight >= 115 {
			// this chunk has more work to do than the others, so we split it across more go routines;
			// otherwise, its go routine would finish much later than the others.
			chSplit := make(chan g2JacExtended, 2)
			split := n / 2

			if sem != nil {
				sem <- struct{}{} // add another token to the semaphore, since we split in two.
			}
			go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
			go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
			go func(chunkID int) {
				s1 := <-chSplit
				s2 := <-chSplit
				close(chSplit)
				s1.add(&s2)
				chChunks[chunkID] <- s1
			}(j)
			continue
		}
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
	}

	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
}

// getChunkProcessorG2 decides, depending on the window size c and the chunk's
// statistics, which algorithm is best suited to process the chunk, and returns it.
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) {
	switch c {

	case 3:
		return processChunkG2Jacobian[bucketg2JacExtendedC3]
	case 4:
		return processChunkG2Jacobian[bucketg2JacExtendedC4]
	case 5:
		return processChunkG2Jacobian[bucketg2JacExtendedC5]
	case 6:
		return processChunkG2Jacobian[bucketg2JacExtendedC6]
	case 7:
		return processChunkG2Jacobian[bucketg2JacExtendedC7]
	case 8:
		return processChunkG2Jacobian[bucketg2JacExtendedC8]
	case 9:
		return processChunkG2Jacobian[bucketg2JacExtendedC9]
	case 10:
		const batchSize = 80
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC10]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10]
	case 11:
		const batchSize = 150
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC11]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11]
	case 12:
		const batchSize = 200
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC12]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12]
	case 13:
		const batchSize = 350
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC13]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13]
	case 14:
		const batchSize = 400
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC14]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
	case 15:
		const batchSize = 500
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC15]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
	case 16:
		const batchSize = 640
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC16]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
	default:
		// should not happen: the templates do not generate values of c other than the ones above.
		return processChunkG2Jacobian[bucketg2JacExtendedC16]
	}
}

// msmReduceChunkG2Affine reduces the weighted sums of the buckets into the result of the multiExp
func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
	var _p g2JacExtended
	totalj := <-chChunks[len(chChunks)-1]
	_p.Set(&totalj)
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			_p.double(&_p)
		}
		totalj := <-chChunks[j]
		_p.add(&totalj)
	}

	return p.unsafeFromJacExtended(&_p)
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G2Affine) Fold(points []G2Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
	var _p G2Jac
	if _, err := _p.Fold(points, combinationCoeff, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G2Jac) Fold(points []G2Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
	scalars := make([]fr.Element, len(points))
	scalar := fr.NewElement(1)
	for i := 0; i < len(points); i++ {
		scalars[i].Set(&scalar)
		scalar.Mul(&scalar, &combinationCoeff)
	}
	return p.MultiExp(points, scalars, config)
}

// selector stores the index, mask and shifts needed to select bits from a scalar.
// it is used during the multiExp algorithm or the batch scalar multiplication.
type selector struct {
	index uint64 // index in the multi-word scalar to select bits from
	mask  uint64 // mask (c-bit wide)
	shift uint64 // shift needed to get our bits on low positions

	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
	maskHigh        uint64 // same as mask, for index+1
	shiftHigh       uint64 // same as shift, for index+1
}

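// For example (illustrative values, 64-bit limbs assumed): with c = 15, the
// window starting at bit 60 has index = 0 and shift = 60; since shift > 64-c,
// multiWordSelect is set: nbBitsHigh = 11, so 4 bits come from limb 0 and the
// low 11 bits of limb 1, shifted left by shiftHigh = 4, complete the digit.
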
// computeNbChunks returns the number of chunks for a given window size c.
// the last chunk may be bigger, to accommodate a potential carry from the NAF decomposition.
func computeNbChunks(c uint64) uint64 {
	return (fr.Bits + c - 1) / c
}

// lastC returns the last window size for a scalar;
// this last window should accommodate a carry (from the NAF decomposition).
// it can be == c if we have 1 available bit,
// it can be > c if we have 0 available bits,
// and it can be < c if we have 2+ available bits.
func lastC(c uint64) uint64 {
	nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits
	return c + 1 - nbAvailableBits
}

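// For example, with fr.Bits = 255: c = 16 gives 16 chunks and
// nbAvailableBits = 256 - 255 = 1, so lastC(16) = 16; c = 13 gives 20 chunks,
// 260 - 255 = 5 available bits, and lastC(13) = 13 + 1 - 5 = 9.
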
type chunkStat struct {
	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
	weight float32

	// percentage of buckets filled in the window
	ppBucketFilled float32
	nbBucketFilled int
}

// partitionScalars computes, for each scalar, nbChunks digits over c-bit wide windows.
// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
// 2^c from the current digit, making it negative.
// negative digits can be processed in a later step as adding -G into the bucket instead of G
// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication).
func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) {
	// no benefit here to have more tasks than CPUs
	if nbTasks > runtime.NumCPU() {
		nbTasks = runtime.NumCPU()
	}

	// number of c-bit radixes in a scalar
	nbChunks := computeNbChunks(c)

	digits := make([]uint16, len(scalars)*int(nbChunks))

	mask := uint64((1 << c) - 1) // low c bits are 1
	max := int(1<<(c-1)) - 1     // max value (inclusive) we want for our digits
	cDivides64 := (64 % c) == 0  // if c doesn't divide 64, we may need to select over multiple words

	// compute offset and word selector / shift to select the right bits of our windows
	selectors := make([]selector, nbChunks)
	for chunk := uint64(0); chunk < nbChunks; chunk++ {
		jc := uint64(chunk * c)
		d := selector{}
		d.index = jc / 64
		d.shift = jc - (d.index * 64)
		d.mask = mask << d.shift
		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
		if d.multiWordSelect {
			nbBitsHigh := d.shift - uint64(64-c)
			d.maskHigh = (1 << nbBitsHigh) - 1
			d.shiftHigh = (c - nbBitsHigh)
		}
		selectors[chunk] = d
	}

	parallel.Execute(len(scalars), func(start, end int) {
		for i := start; i < end; i++ {
			if scalars[i].IsZero() {
				// everything is 0, no need to process this scalar
				continue
			}
			scalar := scalars[i].Bits()

			var carry int

			// for each chunk in the scalar, compute the current digit and a possible carry
			for chunk := uint64(0); chunk < nbChunks-1; chunk++ {
				s := selectors[chunk]

				// init with carry if any
				digit := carry
				carry = 0

				// digit = value of the c-bit window
				digit += int((scalar[s.index] & s.mask) >> s.shift)

				if s.multiWordSelect {
					// we are selecting bits over 2 words
					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
				}

				// if the digit is larger than 2^{c-1}, we borrow 2^c from the next window
				// and subtract 2^c from the current digit, making it negative.
				if digit > max {
					digit -= (1 << c)
					carry = 1
				}

				// if the digit is zero, it has no impact on the result
				if digit == 0 {
					continue
				}

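				// encode the signed digit as an unsigned value: a positive digit d
				// is stored as d<<1 (even) and a negative digit d as ((-d-1)<<1)+1
				// (odd); the low bit therefore carries the sign.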
				var bits uint16
				if digit > 0 {
					bits = uint16(digit) << 1
				} else {
					bits = (uint16(-digit-1) << 1) + 1
				}
				digits[int(chunk)*len(scalars)+i] = bits
			}

			// for the last chunk, we don't want to borrow from the next window
			// (but it may have a larger max value)
			chunk := nbChunks - 1
			s := selectors[chunk]
			// init with carry if any
			digit := carry
			// digit = value of the c-bit window
			digit += int((scalar[s.index] & s.mask) >> s.shift)
			if s.multiWordSelect {
				// we are selecting bits over 2 words
				digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
			}
			digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1
		}

	}, nbTasks)

	// aggregate chunk stats
	chunkStats := make([]chunkStat, nbChunks)
	if c <= 9 {
		// no need to compute stats for small window sizes
		return digits, chunkStats
	}
	parallel.Execute(len(chunkStats), func(start, end int) {
		// for each chunk, compute the statistics
		for chunkID := start; chunkID < end; chunkID++ {
			// indicates if a bucket is hit.
			var b bitSetC16

			// digits for the chunk
			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]

			totalOps := 0
			nz := 0 // non-zero bucket count
			for _, digit := range chunkDigits {
				if digit == 0 {
					continue
				}
				totalOps++
				bucketID := digit >> 1
				if digit&1 == 0 {
					bucketID -= 1
				}
				if !b[bucketID] {
					nz++
					b[bucketID] = true
				}
			}
			chunkStats[chunkID].weight = float32(totalOps) // count the number of ops for now; the weight is computed below
			chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1)))
			chunkStats[chunkID].nbBucketFilled = nz
		}
	}, nbTasks)

	totalOps := float32(0.0)
	for _, stat := range chunkStats {
		totalOps += stat.weight
	}

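	// normalize: weight 100 is the nominal (even) share of the work;
	// _innerMsmG1 and _innerMsmG2 treat a chunk with weight >= 115 as
	// overweight and split it in two.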
	target := totalOps / float32(nbChunks)
	if target != 0.0 {
		// if target == 0, it means all the scalars are 0 everywhere; there is no work to be done.
		for i := 0; i < len(chunkStats); i++ {
			chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target
		}
	}

	return digits, chunkStats
}