610 lines
14 KiB
Go
610 lines
14 KiB
Go
// uint256: Fixed size 256-bit math library
|
|
// Copyright 2021 uint256 Authors
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
package uint256
|
|
|
|
import (
|
|
"math/bits"
|
|
)
|
|
|
|
// Reciprocal computes a 320-bit value representing 1/m
|
|
//
|
|
// Notes:
|
|
// - specialized for m[3] != 0, hence limited to 2^192 <= m < 2^256
|
|
// - returns zero if m[3] == 0
|
|
// - starts with a 32-bit division, refines with newton-raphson iterations
|
|
func Reciprocal(m *Int) (mu [5]uint64) {
|
|
|
|
if m[3] == 0 {
|
|
return mu
|
|
}
|
|
|
|
s := bits.LeadingZeros64(m[3]) // Replace with leadingZeros(m) for general case
|
|
p := 255 - s // floor(log_2(m)), m>0
|
|
|
|
// 0 or a power of 2?
|
|
|
|
// Check if at least one bit is set in m[2], m[1] or m[0],
|
|
// or at least two bits in m[3]
|
|
|
|
if m[0]|m[1]|m[2]|(m[3]&(m[3]-1)) == 0 {
|
|
|
|
mu[4] = ^uint64(0) >> uint(p&63)
|
|
mu[3] = ^uint64(0)
|
|
mu[2] = ^uint64(0)
|
|
mu[1] = ^uint64(0)
|
|
mu[0] = ^uint64(0)
|
|
|
|
return mu
|
|
}
|
|
|
|
// Maximise division precision by left-aligning divisor
|
|
|
|
var (
|
|
y Int // left-aligned copy of m
|
|
r0 uint32 // estimate of 2^31/y
|
|
)
|
|
|
|
y.Lsh(m, uint(s)) // 1/2 < y < 1
|
|
|
|
// Extract most significant 32 bits
|
|
|
|
yh := uint32(y[3] >> 32)
|
|
|
|
if yh == 0x80000000 { // Avoid overflow in division
|
|
r0 = 0xffffffff
|
|
} else {
|
|
r0, _ = bits.Div32(0x80000000, 0, yh)
|
|
}
|
|
|
|
// First iteration: 32 -> 64
|
|
|
|
t1 := uint64(r0) // 2^31/y
|
|
t1 *= t1 // 2^62/y^2
|
|
t1, _ = bits.Mul64(t1, y[3]) // 2^62/y^2 * 2^64/y / 2^64 = 2^62/y
|
|
|
|
r1 := uint64(r0) << 32 // 2^63/y
|
|
r1 -= t1 // 2^63/y - 2^62/y = 2^62/y
|
|
r1 *= 2 // 2^63/y
|
|
|
|
if (r1 | (y[3] << 1)) == 0 {
|
|
r1 = ^uint64(0)
|
|
}
|
|
|
|
// Second iteration: 64 -> 128
|
|
|
|
// square: 2^126/y^2
|
|
a2h, a2l := bits.Mul64(r1, r1)
|
|
|
|
// multiply by y: e2h:e2l:b2h = 2^126/y^2 * 2^128/y / 2^128 = 2^126/y
|
|
b2h, _ := bits.Mul64(a2l, y[2])
|
|
c2h, c2l := bits.Mul64(a2l, y[3])
|
|
d2h, d2l := bits.Mul64(a2h, y[2])
|
|
e2h, e2l := bits.Mul64(a2h, y[3])
|
|
|
|
b2h, c := bits.Add64(b2h, c2l, 0)
|
|
e2l, c = bits.Add64(e2l, c2h, c)
|
|
e2h, _ = bits.Add64(e2h, 0, c)
|
|
|
|
_, c = bits.Add64(b2h, d2l, 0)
|
|
e2l, c = bits.Add64(e2l, d2h, c)
|
|
e2h, _ = bits.Add64(e2h, 0, c)
|
|
|
|
// subtract: t2h:t2l = 2^127/y - 2^126/y = 2^126/y
|
|
t2l, b := bits.Sub64(0, e2l, 0)
|
|
t2h, _ := bits.Sub64(r1, e2h, b)
|
|
|
|
// double: r2h:r2l = 2^127/y
|
|
r2l, c := bits.Add64(t2l, t2l, 0)
|
|
r2h, _ := bits.Add64(t2h, t2h, c)
|
|
|
|
if (r2h | r2l | (y[3] << 1)) == 0 {
|
|
r2h = ^uint64(0)
|
|
r2l = ^uint64(0)
|
|
}
|
|
|
|
// Third iteration: 128 -> 192
|
|
|
|
// square r2 (keep 256 bits): 2^190/y^2
|
|
a3h, a3l := bits.Mul64(r2l, r2l)
|
|
b3h, b3l := bits.Mul64(r2l, r2h)
|
|
c3h, c3l := bits.Mul64(r2h, r2h)
|
|
|
|
a3h, c = bits.Add64(a3h, b3l, 0)
|
|
c3l, c = bits.Add64(c3l, b3h, c)
|
|
c3h, _ = bits.Add64(c3h, 0, c)
|
|
|
|
a3h, c = bits.Add64(a3h, b3l, 0)
|
|
c3l, c = bits.Add64(c3l, b3h, c)
|
|
c3h, _ = bits.Add64(c3h, 0, c)
|
|
|
|
// multiply by y: q = 2^190/y^2 * 2^192/y / 2^192 = 2^190/y
|
|
|
|
x0 := a3l
|
|
x1 := a3h
|
|
x2 := c3l
|
|
x3 := c3h
|
|
|
|
var q0, q1, q2, q3, q4, t0 uint64
|
|
|
|
q0, _ = bits.Mul64(x2, y[0])
|
|
q1, t0 = bits.Mul64(x3, y[0])
|
|
q0, c = bits.Add64(q0, t0, 0)
|
|
q1, _ = bits.Add64(q1, 0, c)
|
|
|
|
t1, _ = bits.Mul64(x1, y[1])
|
|
q0, c = bits.Add64(q0, t1, 0)
|
|
q2, t0 = bits.Mul64(x3, y[1])
|
|
q1, c = bits.Add64(q1, t0, c)
|
|
q2, _ = bits.Add64(q2, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x2, y[1])
|
|
q0, c = bits.Add64(q0, t0, 0)
|
|
q1, c = bits.Add64(q1, t1, c)
|
|
q2, _ = bits.Add64(q2, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x1, y[2])
|
|
q0, c = bits.Add64(q0, t0, 0)
|
|
q1, c = bits.Add64(q1, t1, c)
|
|
q3, t0 = bits.Mul64(x3, y[2])
|
|
q2, c = bits.Add64(q2, t0, c)
|
|
q3, _ = bits.Add64(q3, 0, c)
|
|
|
|
t1, _ = bits.Mul64(x0, y[2])
|
|
q0, c = bits.Add64(q0, t1, 0)
|
|
t1, t0 = bits.Mul64(x2, y[2])
|
|
q1, c = bits.Add64(q1, t0, c)
|
|
q2, c = bits.Add64(q2, t1, c)
|
|
q3, _ = bits.Add64(q3, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x1, y[3])
|
|
q1, c = bits.Add64(q1, t0, 0)
|
|
q2, c = bits.Add64(q2, t1, c)
|
|
q4, t0 = bits.Mul64(x3, y[3])
|
|
q3, c = bits.Add64(q3, t0, c)
|
|
q4, _ = bits.Add64(q4, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x0, y[3])
|
|
q0, c = bits.Add64(q0, t0, 0)
|
|
q1, c = bits.Add64(q1, t1, c)
|
|
t1, t0 = bits.Mul64(x2, y[3])
|
|
q2, c = bits.Add64(q2, t0, c)
|
|
q3, c = bits.Add64(q3, t1, c)
|
|
q4, _ = bits.Add64(q4, 0, c)
|
|
|
|
// subtract: t3 = 2^191/y - 2^190/y = 2^190/y
|
|
_, b = bits.Sub64(0, q0, 0)
|
|
_, b = bits.Sub64(0, q1, b)
|
|
t3l, b := bits.Sub64(0, q2, b)
|
|
t3m, b := bits.Sub64(r2l, q3, b)
|
|
t3h, _ := bits.Sub64(r2h, q4, b)
|
|
|
|
// double: r3 = 2^191/y
|
|
r3l, c := bits.Add64(t3l, t3l, 0)
|
|
r3m, c := bits.Add64(t3m, t3m, c)
|
|
r3h, _ := bits.Add64(t3h, t3h, c)
|
|
|
|
// Fourth iteration: 192 -> 320
|
|
|
|
// square r3
|
|
|
|
a4h, a4l := bits.Mul64(r3l, r3l)
|
|
b4h, b4l := bits.Mul64(r3l, r3m)
|
|
c4h, c4l := bits.Mul64(r3l, r3h)
|
|
d4h, d4l := bits.Mul64(r3m, r3m)
|
|
e4h, e4l := bits.Mul64(r3m, r3h)
|
|
f4h, f4l := bits.Mul64(r3h, r3h)
|
|
|
|
b4h, c = bits.Add64(b4h, c4l, 0)
|
|
e4l, c = bits.Add64(e4l, c4h, c)
|
|
e4h, _ = bits.Add64(e4h, 0, c)
|
|
|
|
a4h, c = bits.Add64(a4h, b4l, 0)
|
|
d4l, c = bits.Add64(d4l, b4h, c)
|
|
d4h, c = bits.Add64(d4h, e4l, c)
|
|
f4l, c = bits.Add64(f4l, e4h, c)
|
|
f4h, _ = bits.Add64(f4h, 0, c)
|
|
|
|
a4h, c = bits.Add64(a4h, b4l, 0)
|
|
d4l, c = bits.Add64(d4l, b4h, c)
|
|
d4h, c = bits.Add64(d4h, e4l, c)
|
|
f4l, c = bits.Add64(f4l, e4h, c)
|
|
f4h, _ = bits.Add64(f4h, 0, c)
|
|
|
|
// multiply by y
|
|
|
|
x1, x0 = bits.Mul64(d4h, y[0])
|
|
x3, x2 = bits.Mul64(f4h, y[0])
|
|
t1, t0 = bits.Mul64(f4l, y[0])
|
|
x1, c = bits.Add64(x1, t0, 0)
|
|
x2, c = bits.Add64(x2, t1, c)
|
|
x3, _ = bits.Add64(x3, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(d4h, y[1])
|
|
x1, c = bits.Add64(x1, t0, 0)
|
|
x2, c = bits.Add64(x2, t1, c)
|
|
x4, t0 := bits.Mul64(f4h, y[1])
|
|
x3, c = bits.Add64(x3, t0, c)
|
|
x4, _ = bits.Add64(x4, 0, c)
|
|
t1, t0 = bits.Mul64(d4l, y[1])
|
|
x0, c = bits.Add64(x0, t0, 0)
|
|
x1, c = bits.Add64(x1, t1, c)
|
|
t1, t0 = bits.Mul64(f4l, y[1])
|
|
x2, c = bits.Add64(x2, t0, c)
|
|
x3, c = bits.Add64(x3, t1, c)
|
|
x4, _ = bits.Add64(x4, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(a4h, y[2])
|
|
x0, c = bits.Add64(x0, t0, 0)
|
|
x1, c = bits.Add64(x1, t1, c)
|
|
t1, t0 = bits.Mul64(d4h, y[2])
|
|
x2, c = bits.Add64(x2, t0, c)
|
|
x3, c = bits.Add64(x3, t1, c)
|
|
x5, t0 := bits.Mul64(f4h, y[2])
|
|
x4, c = bits.Add64(x4, t0, c)
|
|
x5, _ = bits.Add64(x5, 0, c)
|
|
t1, t0 = bits.Mul64(d4l, y[2])
|
|
x1, c = bits.Add64(x1, t0, 0)
|
|
x2, c = bits.Add64(x2, t1, c)
|
|
t1, t0 = bits.Mul64(f4l, y[2])
|
|
x3, c = bits.Add64(x3, t0, c)
|
|
x4, c = bits.Add64(x4, t1, c)
|
|
x5, _ = bits.Add64(x5, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(a4h, y[3])
|
|
x1, c = bits.Add64(x1, t0, 0)
|
|
x2, c = bits.Add64(x2, t1, c)
|
|
t1, t0 = bits.Mul64(d4h, y[3])
|
|
x3, c = bits.Add64(x3, t0, c)
|
|
x4, c = bits.Add64(x4, t1, c)
|
|
x6, t0 := bits.Mul64(f4h, y[3])
|
|
x5, c = bits.Add64(x5, t0, c)
|
|
x6, _ = bits.Add64(x6, 0, c)
|
|
t1, t0 = bits.Mul64(a4l, y[3])
|
|
x0, c = bits.Add64(x0, t0, 0)
|
|
x1, c = bits.Add64(x1, t1, c)
|
|
t1, t0 = bits.Mul64(d4l, y[3])
|
|
x2, c = bits.Add64(x2, t0, c)
|
|
x3, c = bits.Add64(x3, t1, c)
|
|
t1, t0 = bits.Mul64(f4l, y[3])
|
|
x4, c = bits.Add64(x4, t0, c)
|
|
x5, c = bits.Add64(x5, t1, c)
|
|
x6, _ = bits.Add64(x6, 0, c)
|
|
|
|
// subtract
|
|
_, b = bits.Sub64(0, x0, 0)
|
|
_, b = bits.Sub64(0, x1, b)
|
|
r4l, b := bits.Sub64(0, x2, b)
|
|
r4k, b := bits.Sub64(0, x3, b)
|
|
r4j, b := bits.Sub64(r3l, x4, b)
|
|
r4i, b := bits.Sub64(r3m, x5, b)
|
|
r4h, _ := bits.Sub64(r3h, x6, b)
|
|
|
|
// Multiply candidate for 1/4y by y, with full precision
|
|
|
|
x0 = r4l
|
|
x1 = r4k
|
|
x2 = r4j
|
|
x3 = r4i
|
|
x4 = r4h
|
|
|
|
q1, q0 = bits.Mul64(x0, y[0])
|
|
q3, q2 = bits.Mul64(x2, y[0])
|
|
q5, q4 := bits.Mul64(x4, y[0])
|
|
|
|
t1, t0 = bits.Mul64(x1, y[0])
|
|
q1, c = bits.Add64(q1, t0, 0)
|
|
q2, c = bits.Add64(q2, t1, c)
|
|
t1, t0 = bits.Mul64(x3, y[0])
|
|
q3, c = bits.Add64(q3, t0, c)
|
|
q4, c = bits.Add64(q4, t1, c)
|
|
q5, _ = bits.Add64(q5, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x0, y[1])
|
|
q1, c = bits.Add64(q1, t0, 0)
|
|
q2, c = bits.Add64(q2, t1, c)
|
|
t1, t0 = bits.Mul64(x2, y[1])
|
|
q3, c = bits.Add64(q3, t0, c)
|
|
q4, c = bits.Add64(q4, t1, c)
|
|
q6, t0 := bits.Mul64(x4, y[1])
|
|
q5, c = bits.Add64(q5, t0, c)
|
|
q6, _ = bits.Add64(q6, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x1, y[1])
|
|
q2, c = bits.Add64(q2, t0, 0)
|
|
q3, c = bits.Add64(q3, t1, c)
|
|
t1, t0 = bits.Mul64(x3, y[1])
|
|
q4, c = bits.Add64(q4, t0, c)
|
|
q5, c = bits.Add64(q5, t1, c)
|
|
q6, _ = bits.Add64(q6, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x0, y[2])
|
|
q2, c = bits.Add64(q2, t0, 0)
|
|
q3, c = bits.Add64(q3, t1, c)
|
|
t1, t0 = bits.Mul64(x2, y[2])
|
|
q4, c = bits.Add64(q4, t0, c)
|
|
q5, c = bits.Add64(q5, t1, c)
|
|
q7, t0 := bits.Mul64(x4, y[2])
|
|
q6, c = bits.Add64(q6, t0, c)
|
|
q7, _ = bits.Add64(q7, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x1, y[2])
|
|
q3, c = bits.Add64(q3, t0, 0)
|
|
q4, c = bits.Add64(q4, t1, c)
|
|
t1, t0 = bits.Mul64(x3, y[2])
|
|
q5, c = bits.Add64(q5, t0, c)
|
|
q6, c = bits.Add64(q6, t1, c)
|
|
q7, _ = bits.Add64(q7, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x0, y[3])
|
|
q3, c = bits.Add64(q3, t0, 0)
|
|
q4, c = bits.Add64(q4, t1, c)
|
|
t1, t0 = bits.Mul64(x2, y[3])
|
|
q5, c = bits.Add64(q5, t0, c)
|
|
q6, c = bits.Add64(q6, t1, c)
|
|
q8, t0 := bits.Mul64(x4, y[3])
|
|
q7, c = bits.Add64(q7, t0, c)
|
|
q8, _ = bits.Add64(q8, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x1, y[3])
|
|
q4, c = bits.Add64(q4, t0, 0)
|
|
q5, c = bits.Add64(q5, t1, c)
|
|
t1, t0 = bits.Mul64(x3, y[3])
|
|
q6, c = bits.Add64(q6, t0, c)
|
|
q7, c = bits.Add64(q7, t1, c)
|
|
q8, _ = bits.Add64(q8, 0, c)
|
|
|
|
// Final adjustment
|
|
|
|
// subtract q from 1/4
|
|
_, b = bits.Sub64(0, q0, 0)
|
|
_, b = bits.Sub64(0, q1, b)
|
|
_, b = bits.Sub64(0, q2, b)
|
|
_, b = bits.Sub64(0, q3, b)
|
|
_, b = bits.Sub64(0, q4, b)
|
|
_, b = bits.Sub64(0, q5, b)
|
|
_, b = bits.Sub64(0, q6, b)
|
|
_, b = bits.Sub64(0, q7, b)
|
|
_, b = bits.Sub64(uint64(1)<<62, q8, b)
|
|
|
|
// decrement the result
|
|
x0, t := bits.Sub64(r4l, 1, 0)
|
|
x1, t = bits.Sub64(r4k, 0, t)
|
|
x2, t = bits.Sub64(r4j, 0, t)
|
|
x3, t = bits.Sub64(r4i, 0, t)
|
|
x4, _ = bits.Sub64(r4h, 0, t)
|
|
|
|
// commit the decrement if the subtraction underflowed (reciprocal was too large)
|
|
if b != 0 {
|
|
r4h, r4i, r4j, r4k, r4l = x4, x3, x2, x1, x0
|
|
}
|
|
|
|
// Shift to correct bit alignment, truncating excess bits
|
|
|
|
p = (p & 63) - 1
|
|
|
|
x0, c = bits.Add64(r4l, r4l, 0)
|
|
x1, c = bits.Add64(r4k, r4k, c)
|
|
x2, c = bits.Add64(r4j, r4j, c)
|
|
x3, c = bits.Add64(r4i, r4i, c)
|
|
x4, _ = bits.Add64(r4h, r4h, c)
|
|
|
|
if p < 0 {
|
|
r4h, r4i, r4j, r4k, r4l = x4, x3, x2, x1, x0
|
|
p = 0 // avoid negative shift below
|
|
}
|
|
|
|
{
|
|
r := uint(p) // right shift
|
|
l := uint(64 - r) // left shift
|
|
|
|
x0 = (r4l >> r) | (r4k << l)
|
|
x1 = (r4k >> r) | (r4j << l)
|
|
x2 = (r4j >> r) | (r4i << l)
|
|
x3 = (r4i >> r) | (r4h << l)
|
|
x4 = (r4h >> r)
|
|
}
|
|
|
|
if p > 0 {
|
|
r4h, r4i, r4j, r4k, r4l = x4, x3, x2, x1, x0
|
|
}
|
|
|
|
mu[0] = r4l
|
|
mu[1] = r4k
|
|
mu[2] = r4j
|
|
mu[3] = r4i
|
|
mu[4] = r4h
|
|
|
|
return mu
|
|
}
|
|
|
|
// reduce4 computes the least non-negative residue of x modulo m
|
|
//
|
|
// requires a four-word modulus (m[3] != 0) and its inverse (mu)
|
|
func (z *Int) reduce4(x *[8]uint64, m *Int, mu *[5]uint64) *Int {
|
|
|
|
// NB: Most variable names in the comments match the pseudocode for
|
|
// Barrett reduction in the Handbook of Applied Cryptography.
|
|
|
|
// q1 = x/2^192
|
|
|
|
x0 := x[3]
|
|
x1 := x[4]
|
|
x2 := x[5]
|
|
x3 := x[6]
|
|
x4 := x[7]
|
|
|
|
// q2 = q1 * mu; q3 = q2 / 2^320
|
|
|
|
var q0, q1, q2, q3, q4, q5, t0, t1, c uint64
|
|
|
|
q0, _ = bits.Mul64(x3, mu[0])
|
|
q1, t0 = bits.Mul64(x4, mu[0])
|
|
q0, c = bits.Add64(q0, t0, 0)
|
|
q1, _ = bits.Add64(q1, 0, c)
|
|
|
|
t1, _ = bits.Mul64(x2, mu[1])
|
|
q0, c = bits.Add64(q0, t1, 0)
|
|
q2, t0 = bits.Mul64(x4, mu[1])
|
|
q1, c = bits.Add64(q1, t0, c)
|
|
q2, _ = bits.Add64(q2, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x3, mu[1])
|
|
q0, c = bits.Add64(q0, t0, 0)
|
|
q1, c = bits.Add64(q1, t1, c)
|
|
q2, _ = bits.Add64(q2, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x2, mu[2])
|
|
q0, c = bits.Add64(q0, t0, 0)
|
|
q1, c = bits.Add64(q1, t1, c)
|
|
q3, t0 = bits.Mul64(x4, mu[2])
|
|
q2, c = bits.Add64(q2, t0, c)
|
|
q3, _ = bits.Add64(q3, 0, c)
|
|
|
|
t1, _ = bits.Mul64(x1, mu[2])
|
|
q0, c = bits.Add64(q0, t1, 0)
|
|
t1, t0 = bits.Mul64(x3, mu[2])
|
|
q1, c = bits.Add64(q1, t0, c)
|
|
q2, c = bits.Add64(q2, t1, c)
|
|
q3, _ = bits.Add64(q3, 0, c)
|
|
|
|
t1, _ = bits.Mul64(x0, mu[3])
|
|
q0, c = bits.Add64(q0, t1, 0)
|
|
t1, t0 = bits.Mul64(x2, mu[3])
|
|
q1, c = bits.Add64(q1, t0, c)
|
|
q2, c = bits.Add64(q2, t1, c)
|
|
q4, t0 = bits.Mul64(x4, mu[3])
|
|
q3, c = bits.Add64(q3, t0, c)
|
|
q4, _ = bits.Add64(q4, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x1, mu[3])
|
|
q0, c = bits.Add64(q0, t0, 0)
|
|
q1, c = bits.Add64(q1, t1, c)
|
|
t1, t0 = bits.Mul64(x3, mu[3])
|
|
q2, c = bits.Add64(q2, t0, c)
|
|
q3, c = bits.Add64(q3, t1, c)
|
|
q4, _ = bits.Add64(q4, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x0, mu[4])
|
|
_, c = bits.Add64(q0, t0, 0)
|
|
q1, c = bits.Add64(q1, t1, c)
|
|
t1, t0 = bits.Mul64(x2, mu[4])
|
|
q2, c = bits.Add64(q2, t0, c)
|
|
q3, c = bits.Add64(q3, t1, c)
|
|
q5, t0 = bits.Mul64(x4, mu[4])
|
|
q4, c = bits.Add64(q4, t0, c)
|
|
q5, _ = bits.Add64(q5, 0, c)
|
|
|
|
t1, t0 = bits.Mul64(x1, mu[4])
|
|
q1, c = bits.Add64(q1, t0, 0)
|
|
q2, c = bits.Add64(q2, t1, c)
|
|
t1, t0 = bits.Mul64(x3, mu[4])
|
|
q3, c = bits.Add64(q3, t0, c)
|
|
q4, c = bits.Add64(q4, t1, c)
|
|
q5, _ = bits.Add64(q5, 0, c)
|
|
|
|
// Drop the fractional part of q3
|
|
|
|
q0 = q1
|
|
q1 = q2
|
|
q2 = q3
|
|
q3 = q4
|
|
q4 = q5
|
|
|
|
// r1 = x mod 2^320
|
|
|
|
x0 = x[0]
|
|
x1 = x[1]
|
|
x2 = x[2]
|
|
x3 = x[3]
|
|
x4 = x[4]
|
|
|
|
// r2 = q3 * m mod 2^320
|
|
|
|
var r0, r1, r2, r3, r4 uint64
|
|
|
|
r4, r3 = bits.Mul64(q0, m[3])
|
|
_, t0 = bits.Mul64(q1, m[3])
|
|
r4, _ = bits.Add64(r4, t0, 0)
|
|
|
|
t1, r2 = bits.Mul64(q0, m[2])
|
|
r3, c = bits.Add64(r3, t1, 0)
|
|
_, t0 = bits.Mul64(q2, m[2])
|
|
r4, _ = bits.Add64(r4, t0, c)
|
|
|
|
t1, t0 = bits.Mul64(q1, m[2])
|
|
r3, c = bits.Add64(r3, t0, 0)
|
|
r4, _ = bits.Add64(r4, t1, c)
|
|
|
|
t1, r1 = bits.Mul64(q0, m[1])
|
|
r2, c = bits.Add64(r2, t1, 0)
|
|
t1, t0 = bits.Mul64(q2, m[1])
|
|
r3, c = bits.Add64(r3, t0, c)
|
|
r4, _ = bits.Add64(r4, t1, c)
|
|
|
|
t1, t0 = bits.Mul64(q1, m[1])
|
|
r2, c = bits.Add64(r2, t0, 0)
|
|
r3, c = bits.Add64(r3, t1, c)
|
|
_, t0 = bits.Mul64(q3, m[1])
|
|
r4, _ = bits.Add64(r4, t0, c)
|
|
|
|
t1, r0 = bits.Mul64(q0, m[0])
|
|
r1, c = bits.Add64(r1, t1, 0)
|
|
t1, t0 = bits.Mul64(q2, m[0])
|
|
r2, c = bits.Add64(r2, t0, c)
|
|
r3, c = bits.Add64(r3, t1, c)
|
|
_, t0 = bits.Mul64(q4, m[0])
|
|
r4, _ = bits.Add64(r4, t0, c)
|
|
|
|
t1, t0 = bits.Mul64(q1, m[0])
|
|
r1, c = bits.Add64(r1, t0, 0)
|
|
r2, c = bits.Add64(r2, t1, c)
|
|
t1, t0 = bits.Mul64(q3, m[0])
|
|
r3, c = bits.Add64(r3, t0, c)
|
|
r4, _ = bits.Add64(r4, t1, c)
|
|
|
|
// r = r1 - r2
|
|
|
|
var b uint64
|
|
|
|
r0, b = bits.Sub64(x0, r0, 0)
|
|
r1, b = bits.Sub64(x1, r1, b)
|
|
r2, b = bits.Sub64(x2, r2, b)
|
|
r3, b = bits.Sub64(x3, r3, b)
|
|
r4, b = bits.Sub64(x4, r4, b)
|
|
|
|
// if r<0 then r+=m
|
|
|
|
if b != 0 {
|
|
r0, c = bits.Add64(r0, m[0], 0)
|
|
r1, c = bits.Add64(r1, m[1], c)
|
|
r2, c = bits.Add64(r2, m[2], c)
|
|
r3, c = bits.Add64(r3, m[3], c)
|
|
r4, _ = bits.Add64(r4, 0, c)
|
|
}
|
|
|
|
// while (r>=m) r-=m
|
|
|
|
for {
|
|
// q = r - m
|
|
q0, b = bits.Sub64(r0, m[0], 0)
|
|
q1, b = bits.Sub64(r1, m[1], b)
|
|
q2, b = bits.Sub64(r2, m[2], b)
|
|
q3, b = bits.Sub64(r3, m[3], b)
|
|
q4, b = bits.Sub64(r4, 0, b)
|
|
|
|
// if borrow break
|
|
if b != 0 {
|
|
break
|
|
}
|
|
|
|
// r = q
|
|
r4, r3, r2, r1, r0 = q4, q3, q2, q1, q0
|
|
}
|
|
|
|
z[3], z[2], z[1], z[0] = r3, r2, r1, r0
|
|
|
|
return z
|
|
}
|