diff --git a/PLANNING.md b/PLANNING.md index cfeae65c..7725fc1e 100644 --- a/PLANNING.md +++ b/PLANNING.md @@ -92,13 +92,13 @@ Other tracks are stretch goals, contributions towards them are accepted. - ARM assembly - Finish Nvidia GPU codegenerator up to MSM -- Implement a backend for prime moduli of special form with fast reduction - that don't need Montgomery form +- Implement a backend for Solinas prime like P256 - Implement an unsaturated finite fields backend for Risc-V, WASM, WebGPU, AMD GPU, Apple Metal, Vulkan, ... - ideally in LLVM IR so that pristine Risc-V assembly can be generated and used in zkVMs without any risk of C stdlib or syscalls being used and without depending on the Nim compiler at build time. - introduce batchAffine_vartime +- Optimized square_repeated in assembly for Montgomery and Crandall/Pseudo-Mersenne primes ### User Experience track diff --git a/benchmarks/bench_elliptic_parallel_template.nim b/benchmarks/bench_elliptic_parallel_template.nim index a6a7c77b..ce048534 100644 --- a/benchmarks/bench_elliptic_parallel_template.nim +++ b/benchmarks/bench_elliptic_parallel_template.nim @@ -35,7 +35,7 @@ export bench_elliptic_template # # ############################################################ -proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) = +proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) {.noinline.} = var points = newSeq[EC_ShortW_Aff[EC.F, EC.G]](numInputs) for i in 0 ..< numInputs: @@ -59,7 +59,7 @@ type BenchMsmContext*[EC] = object coefs: seq[BigInt[64]] # seq[getBigInt(EC.getName(), kScalarField)] points: seq[affine(EC)] -proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] = +proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] {.noinline.} = result.tp = Threadpool.new() let maxNumInputs = inputSizes.max() @@ -103,7 +103,7 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC let stop = getMonotime() stdout.write &"in {float64(inNanoSeconds(stop-start)) / 1e6:6.3f} ms\n" -proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int) = +proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int) {.noinline.} = const bits = 64 # EC.getScalarField().bits() type ECaff = affine(EC) diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim index 13aca484..cbd20624 100644 --- a/benchmarks/bench_elliptic_template.nim +++ b/benchmarks/bench_elliptic_template.nim @@ -72,7 +72,7 @@ func `+=`[F; G: static Subgroup](P: var EC_ShortW_JacExt[F, G], Q: EC_ShortW_Jac func `+=`[F; G: static Subgroup](P: var EC_ShortW_JacExt[F, G], Q: EC_ShortW_Aff[F, G]) {.inline.}= P.mixedSum_vartime(P, Q) -proc addBench*(EC: typedesc, iters: int) = +proc addBench*(EC: typedesc, iters: int) {.noinline.} = var r {.noInit.}: EC let P = rng.random_unsafe(EC) let Q = rng.random_unsafe(EC) @@ -88,7 +88,7 @@ proc addBench*(EC: typedesc, iters: int) = bench("EC Add vartime " & $EC.G, EC, iters): r.sum_vartime(P, Q) -proc mixedAddBench*(EC: typedesc, iters: int) = +proc mixedAddBench*(EC: typedesc, iters: int) {.noinline.} = var r {.noInit.}: EC let P = rng.random_unsafe(EC) let Q = rng.random_unsafe(EC) @@ -106,25 +106,25 @@ proc mixedAddBench*(EC: typedesc, iters: int) = bench("EC Mixed Addition vartime " & $EC.G, EC, iters): r.mixedSum_vartime(P, Qaff) -proc doublingBench*(EC: typedesc, iters: int) = +proc doublingBench*(EC: typedesc, iters: int) {.noinline.} 
= var r {.noInit.}: EC let P = rng.random_unsafe(EC) bench("EC Double " & $EC.G, EC, iters): r.double(P) -proc affFromProjBench*(EC: typedesc, iters: int) = +proc affFromProjBench*(EC: typedesc, iters: int) {.noinline.} = var r {.noInit.}: EC_ShortW_Aff[EC.F, EC.G] let P = rng.random_unsafe(EC) bench("EC Projective to Affine " & $EC.G, EC, iters): r.affine(P) -proc affFromJacBench*(EC: typedesc, iters: int) = +proc affFromJacBench*(EC: typedesc, iters: int) {.noinline.} = var r {.noInit.}: EC_ShortW_Aff[EC.F, EC.G] let P = rng.random_unsafe(EC) bench("EC Jacobian to Affine " & $EC.G, EC, iters): r.affine(P) -proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) = +proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) {.noinline.} = var r = newSeq[affine(EC)](numPoints) var points = newSeq[EC](numPoints) @@ -139,7 +139,7 @@ proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, ite for i in 0 ..< numPoints: r[i].affine(points[i]) -proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) = +proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) {.noinline.} = var r = newSeq[affine(EC)](numPoints) var points = newSeq[EC](numPoints) @@ -154,7 +154,7 @@ proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iter for i in 0 ..< numPoints: r[i].affine(points[i]) -proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int) = +proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -165,7 +165,7 @@ proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int) r = P r.scalarMulGeneric(exponent, window) -proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) = +proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -176,7 +176,7 @@ proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) = r = P r.scalarMulEndo(exponent) -proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) = +proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -190,7 +190,7 @@ proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) = else: {.error: "Not implemented".} -proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) = +proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -201,7 +201,7 @@ proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) r = P r.scalarMul_doubleAdd_vartime(exponent) -proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static int, iters: int) = +proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -212,7 +212,7 @@ proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static i r = P r.scalarMul_jy00_vartime(exponent) -proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: int) = +proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} = var r 
{.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -223,7 +223,7 @@ proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: i r = P r.scalarMul_wNAF_vartime(exponent, window) -proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) = +proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -234,14 +234,14 @@ proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iter r = P r.scalarMulEndo_wNAF_vartime(exponent, window) -proc subgroupCheckBench*(EC: typedesc, iters: int) = +proc subgroupCheckBench*(EC: typedesc, iters: int) {.noinline.} = var P = rng.random_unsafe(EC) P.clearCofactor() bench("Subgroup check", EC, iters): discard P.isInSubgroup() -proc subgroupCheckScalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) = +proc subgroupCheckScalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -253,7 +253,7 @@ proc subgroupCheckScalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: sta discard r.isInSubgroup() r.scalarMulEndo_wNAF_vartime(exponent, window) -proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) = +proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) {.noinline.} = var points = newSeq[EC_ShortW_Aff[EC.F, EC.G]](numPoints) for i in 0 ..< numPoints: @@ -271,7 +271,7 @@ proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) r += points[i] -proc msmBench*(EC: typedesc, numPoints: int, iters: int) = +proc msmBench*(EC: typedesc, numPoints: int, iters: int) {.noinline.} = const bits = EC.getScalarField().bits() var points = newSeq[EC_ShortW_Aff[EC.F, EC.G]](numPoints) var scalars = newSeq[BigInt[bits]](numPoints) diff --git a/benchmarks/bench_fields_template.nim b/benchmarks/bench_fields_template.nim index 5ad318e2..f919c6fc 100644 --- a/benchmarks/bench_fields_template.nim +++ b/benchmarks/bench_fields_template.nim @@ -61,37 +61,46 @@ func random_unsafe(rng: var RngState, a: var ExtensionField2x) = for i in 0 ..< a.coords.len: rng.random_unsafe(a.coords[i]) -proc addBench*(T: typedesc, iters: int) = +proc addBench*(T: typedesc, iters: int) {.noinline.} = var x = rng.random_unsafe(T) let y = rng.random_unsafe(T) bench("Addition", T, iters): x += y -proc subBench*(T: typedesc, iters: int) = +proc add10Bench*(T: typedesc, iters: int) {.noinline.} = + var xs: array[10, T] + for x in xs.mitems(): + x = rng.random_unsafe(T) + let y = rng.random_unsafe(T) + bench("Additions (10)", T, iters): + staticFor i, 0, 10: + xs[i] += y + +proc subBench*(T: typedesc, iters: int) {.noinline.} = var x = rng.random_unsafe(T) let y = rng.random_unsafe(T) preventOptimAway(x) bench("Substraction", T, iters): x -= y -proc negBench*(T: typedesc, iters: int) = +proc negBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) bench("Negation", T, iters): r.neg(x) -proc ccopyBench*(T: typedesc, iters: int) = +proc ccopyBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) bench("Conditional Copy", T, iters): r.ccopy(x, CtFalse) -proc div2Bench*(T: typedesc, iters: int) = +proc div2Bench*(T: typedesc, iters: int) {.noinline.} = var x = rng.random_unsafe(T) bench("Division by 2", T, iters): x.div2() -proc mulBench*(T: 
typedesc, iters: int) = +proc mulBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) let y = rng.random_unsafe(T) @@ -99,14 +108,14 @@ proc mulBench*(T: typedesc, iters: int) = bench("Multiplication", T, iters): r.prod(x, y) -proc sqrBench*(T: typedesc, iters: int) = +proc sqrBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) preventOptimAway(r) bench("Squaring", T, iters): r.square(x) -proc mul2xUnrBench*(T: typedesc, iters: int) = +proc mul2xUnrBench*(T: typedesc, iters: int) {.noinline.} = var r: doublePrec(T) let x = rng.random_unsafe(T) let y = rng.random_unsafe(T) @@ -114,14 +123,14 @@ proc mul2xUnrBench*(T: typedesc, iters: int) = bench("Multiplication 2x unreduced", T, iters): r.prod2x(x, y) -proc sqr2xUnrBench*(T: typedesc, iters: int) = +proc sqr2xUnrBench*(T: typedesc, iters: int) {.noinline.} = var r: doublePrec(T) let x = rng.random_unsafe(T) preventOptimAway(r) bench("Squaring 2x unreduced", T, iters): r.square2x(x) -proc rdc2xBench*(T: typedesc, iters: int) = +proc rdc2xBench*(T: typedesc, iters: int) {.noinline.} = var r: T var t: doublePrec(T) rng.random_unsafe(t) @@ -129,7 +138,7 @@ proc rdc2xBench*(T: typedesc, iters: int) = bench("Redc 2x", T, iters): r.redc2x(t) -proc sumprodBench*(T: typedesc, iters: int) = +proc sumprodBench*(T: typedesc, iters: int) {.noinline.} = var r: T let a = rng.random_unsafe(T) let b = rng.random_unsafe(T) @@ -139,40 +148,40 @@ proc sumprodBench*(T: typedesc, iters: int) = bench("Linear combination", T, iters): r.sumprod([a, b], [u, v]) -proc toBigBench*(T: typedesc, iters: int) = +proc toBigBench*(T: typedesc, iters: int) {.noinline.} = var r: T.getBigInt() let x = rng.random_unsafe(T) preventOptimAway(r) bench("BigInt <- field conversion", T, iters): r.fromField(x) -proc toFieldBench*(T: typedesc, iters: int) = +proc toFieldBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T.getBigInt()) preventOptimAway(r) bench("BigInt -> field conversion", T, iters): r.fromBig(x) -proc invBench*(T: typedesc, iters: int) = +proc invBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) preventOptimAway(r) bench("Inversion (constant-time)", T, iters): r.inv(x) -proc invVartimeBench*(T: typedesc, iters: int) = +proc invVartimeBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) preventOptimAway(r) bench("Inversion (variable-time)", T, iters): r.inv_vartime(x) -proc isSquareBench*(T: typedesc, iters: int) = +proc isSquareBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) bench("isSquare (constant-time)", T, iters): let qrt = x.isSquare() -proc sqrtBench*(T: typedesc, iters: int) = +proc sqrtBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) const algoType = block: @@ -192,14 +201,14 @@ proc sqrtBench*(T: typedesc, iters: int) = var r = x discard r.sqrt_if_square() -proc sqrtRatioBench*(T: typedesc, iters: int) = +proc sqrtRatioBench*(T: typedesc, iters: int) {.noinline.} = var r: T let u = rng.random_unsafe(T) let v = rng.random_unsafe(T) bench("Fused SquareRoot+Division+isSquare sqrt(u/v)", T, iters): let isSquare = r.sqrt_ratio_if_square(u, v) -proc sqrtVartimeBench*(T: typedesc, iters: int) = +proc sqrtVartimeBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) const algoType = block: @@ -219,21 +228,21 @@ proc sqrtVartimeBench*(T: typedesc, iters: int) = var r = x discard r.sqrt_if_square_vartime() -proc 
sqrtRatioVartimeBench*(T: typedesc, iters: int) = +proc sqrtRatioVartimeBench*(T: typedesc, iters: int) {.noinline.} = var r: T let u = rng.random_unsafe(T) let v = rng.random_unsafe(T) bench("Fused SquareRoot+Division+isSquare sqrt_vartime(u/v)", T, iters): let isSquare = r.sqrt_ratio_if_square_vartime(u, v) -proc powBench*(T: typedesc, iters: int) = +proc powBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) let exponent = rng.random_unsafe(BigInt[Fr[T.Name].bits()]) var r = x bench("Exp curve order (constant-time) - " & $exponent.bits & "-bit", T, iters): r.pow(exponent) -proc powVartimeBench*(T: typedesc, iters: int) = +proc powVartimeBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) let exponent = rng.random_unsafe(BigInt[Fr[T.Name].bits()]) var r = x diff --git a/benchmarks/bench_fp.nim b/benchmarks/bench_fp.nim index 7678340b..4e818508 100644 --- a/benchmarks/bench_fp.nim +++ b/benchmarks/bench_fp.nim @@ -44,6 +44,7 @@ proc main() = staticFor i, 0, AvailableCurves.len: const curve = AvailableCurves[i] addBench(Fp[curve], Iters) + add10Bench(Fp[curve], Iters) subBench(Fp[curve], Iters) negBench(Fp[curve], Iters) ccopyBench(Fp[curve], Iters) @@ -55,8 +56,9 @@ proc main() = sqr2xUnrBench(Fp[curve], Iters) rdc2xBench(Fp[curve], Iters) smallSeparator() - sumprodBench(Fp[curve], Iters) - smallSeparator() + when not Fp[curve].isCrandallPrimeField(): + sumprodBench(Fp[curve], Iters) + smallSeparator() toBigBench(Fp[curve], Iters) toFieldBench(Fp[curve], Iters) smallSeparator() diff --git a/constantine.nimble b/constantine.nimble index 20d8c950..45bfeb15 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -402,10 +402,10 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ ("tests/math_fields/t_io_fields", false), # ("tests/math_fields/t_finite_fields.nim", false), # ("tests/math_fields/t_finite_fields_conditional_arithmetic.nim", false), - # ("tests/math_fields/t_finite_fields_mulsquare.nim", false), - # ("tests/math_fields/t_finite_fields_sqrt.nim", false), + ("tests/math_fields/t_finite_fields_mulsquare.nim", false), + ("tests/math_fields/t_finite_fields_sqrt.nim", false), ("tests/math_fields/t_finite_fields_powinv.nim", false), - # ("tests/math_fields/t_finite_fields_vs_gmp.nim", true), + ("tests/math_fields/t_finite_fields_vs_gmp.nim", true), # ("tests/math_fields/t_fp_cubic_root.nim", false), # Double-precision finite fields diff --git a/constantine/hash_to_curve/h2c_hash_to_field.nim b/constantine/hash_to_curve/h2c_hash_to_field.nim index 3d862815..4a1d68e9 100644 --- a/constantine/hash_to_curve/h2c_hash_to_field.nim +++ b/constantine/hash_to_curve/h2c_hash_to_field.nim @@ -229,12 +229,12 @@ func hashToField*[Field; count: static int]( output[i].redc2x(big2x) output[i].mres.mulMont( output[i].mres, - Fp[Field.Name].getR3ModP(), + Fp[Field.Name].getR3modP(), Fp[Field.Name]) else: output[i].coords[j].redc2x(big2x) output[i].coords[j].mres.mulMont( output[i].coords[j].mres, - Fp[Field.Name].getR3ModP(), + Fp[Field.Name].getR3modP(), Fp[Field.Name]) diff --git a/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86.nim new file mode 100644 index 00000000..ee4b730a --- /dev/null +++ b/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86.nim @@ -0,0 +1,266 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT 
license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + constantine/platforms/abstractions + +when UseASM_X86_64: + import ./limbs_asm_mul_x86 + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + + +static: doAssert UseASM_X86_32 + +# Crandall reductions +# ------------------------------------------------------------ + +macro reduceCrandallPartial_gen*[N: static int]( + r_PIR: var array[N, SecretWord], + a_MEM: array[N*2, SecretWord], + m: static int, c: static BaseType) = + + result = newStmtList() + var ctx = init(Assembler_x86, BaseType) + + ctx.comment "Crandall reduction - Partial" + ctx.comment "----------------------------" + + let + r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it). + a = asmArray(a_MEM, 2*N, MemOffsettable, asmInput) + + tSym = ident"t" + t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber) + + hiSym = ident"hi" + hi = asmValue(hiSym, Reg, asmOutputEarlyClobber) + + csSym = ident"cs" + cs = asmValue(csSym, Reg, asmOutputEarlyClobber) + + S =(N*WordBitWidth - m) + csImm = c shl S + + # Prologue + result.add quote do: + var `hiSym`{.noinit.}, `csSym`{.noinit.}: BaseType + var `tSym`{.noInit.}: typeof(`r_PIR`) + + ctx.`xor` hi, hi + ctx.mov cs, csImm + + # Algorithm + # Note: always accumulate into a register. + # add-carry into memory is huge pessimization + + ctx.comment "First reduction pass" + ctx.comment "--------------------" + ctx.comment "(hi, r₀) <- aₙ*cs + a₀" + ctx.mov rax, a[N] + ctx.mul rdx, rax, cs, rax + ctx.add rax, a[0] + ctx.mov t[0], rax + ctx.adc hi, rdx # hi = 0 + for i in 1 ..< N: + ctx.comment " (hi, rᵢ) <- aᵢ₊ₙ*cs + aᵢ + hi" + ctx.mov rax, a[i+N] + ctx.mul rdx, rax, cs, rax + ctx.add rax, hi + ctx.adc rdx, 0 + ctx.`xor` hi, hi + ctx.add rax, a[i] + ctx.adc hi, rdx + ctx.mov t[i], rax + + # The first reduction pass may carry in `hi` + # which would be hi*2ʷⁿ ≡ hi*2ʷⁿ⁻ᵐ*c (mod p) + # ≡ hi*cs (mod p) + ctx.shld hi, t[N-1], S + # High-bits have been "carried" to `hi`, cancel them in r[N-1]. + # Note: there might be up to `c` not reduced. + ctx.mov rax, BaseType(MaxWord) shr S + ctx.`and` t[N-1], rax + + # Partially reduce to up to `m` bits + # We need to fold what's beyond `m` bits + # by repeatedly multiplying it by cs + # We distinguish 2 cases: + # 1. Folding (N..2N)*cs onto 0..N + # may overflow and a 3rd folding is necessary + # 2. Folding (N..2N)*cs onto 0..N + # may not overflow and 2 foldings are all we need. 
+  # This is possible:
+  # - if we don't use the full 2ʷⁿ
+  #   for example we use 255 bits out of 256 available
+  # - And (2ʷⁿ⁻ᵐ*c)² < 2ʷ
+  #
+  # There is a 3rd case that we don't handle
+  # c > 2ʷ, for example secp256k1 on 32-bit
+
+  if N*WordBitWidth == m: # Secp256k1 only according to eprint/iacr 2018/985
+    # We shifted hi by 2ʷⁿ⁻ᵐ so no need for cs = 2ʷⁿ⁻ᵐc
+    # we just need c
+    ctx.mov cs, c
+
+    # Second pass
+    ctx.mov rax, hi
+    ctx.mul rdx, rax, cs, rax
+    ctx.add t[0], rax
+    ctx.mov rax, 0
+    ctx.adc t[1], rdx
+    for i in 2 ..< N:
+      ctx.adc t[i], 0
+      ctx.mov r[i], t[i]
+
+    # Third pass
+    ctx.setc rax # This only sets the low 8 bits, need extra zero-out
+    ctx.mul rdx, rax, cs, rax
+    ctx.add t[0], rax
+    ctx.mov r[0], t[0]
+    ctx.adc t[1], rdx
+    ctx.mov r[1], t[1]
+
+  # We want to ensure that cs² < 2³² on 64-bit or 2¹⁶ on 32-bit
+  # But doing unsigned cs² may overflow, so we sqrt the rhs instead
+  elif csImm.uint64 < (1'u64 shl (WordBitWidth shr 1)) - 1:
+
+    # Second pass
+
+    # hi < cs, and cs² < 2ʷ (2³² on 32-bit or 2⁶⁴ on 64-bit)
+    # hence hi *= c cannot overflow
+    ctx.imul rax, hi, c
+    ctx.add t[0], rax
+    ctx.mov r[0], t[0]
+    for i in 1 ..< N:
+      ctx.adc t[i], 0
+      ctx.mov r[i], t[i]
+
+  else:
+    error "Not implemented"
+
+  # Code generation
+  result.add ctx.generate()
+
+func reduceCrandallPartial_asm*[N: static int](
+    r: var Limbs[N],
+    a: array[2*N, SecretWord],
+    m: static int, c: static SecretWord) =
+  ## Partial Reduction modulo p
+  ## with p of special form 2ᵐ-c
+  ## called "Crandall prime" or Pseudo-Mersenne Prime in the literature
+  ##
+  ## This is a partial reduction that reduces down to
+  ## 2ᵐ, i.e. the result fits in the same number of words as p
+  ## but values may be up to p+c
+  ##
+  ## Crandall primes allow fast reduction from the fact that
+  ## 2ᵐ-c ≡ 0 (mod p)
+  ## <=> 2ᵐ ≡ c (mod p)
+  ## <=> a2ᵐ+b ≡ ac + b (mod p)
+  r.reduceCrandallPartial_gen(a, m, BaseType(c))
+
+macro reduceCrandallFinal_gen*[N: static int](
+    a_PIR: var Limbs[N], p_MEM: Limbs[N]) =
+  ## Final Reduction modulo p
+  ## with p of special form 2ᵐ-c
+  ## called "Crandall prime" or Pseudo-Mersenne Prime in the literature
+  ##
+  ## This reduces `a` from [0, 2ᵐ) to [0, 2ᵐ-c)
+
+  result = newStmtList()
+  var ctx = init(Assembler_x86, BaseType)
+
+  ctx.comment "Crandall reduction - Final subtraction"
+  ctx.comment "--------------------------------------"
+
+  let a = asmArray(a_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
+  let tSym = ident"t"
+  let t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
+  let p = asmArray(p_MEM, N, MemOffsettable, asmInput)
+
+  result.add quote do:
+    var `tSym`{.noInit.}: typeof(`a_PIR`)
+
+  # Subtract the modulus, and test a < p with the last borrow
+  ctx.mov t[0], a[0]
+  ctx.sub t[0], p[0]
+  for i in 1 ..< N:
+    ctx.mov t[i], a[i]
+    ctx.sbb t[i], p[i]
+
+  # If we borrowed it means that we were smaller than
+  # the modulus and we don't need "scratch".
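+  #
+  # Toy illustration (4-bit words, p = 13; illustration only, not tied to a real curve):
+  #   a = 11: 11 - 13 borrows       -> keep a (already < p)
+  #   a = 15: 15 - 13 = 2, no borrow -> keep t = a - p
+  # The selection below uses CMOVcc on every word so that the
+  # branch-free choice does not leak whether a < p.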
+  var r0 = rax
+  var r1 = rdx
+  for i in 0 ..< N:
+    ctx.mov r0, a[i]
+    ctx.cmovnc r0, t[i]
+    ctx.mov a[i], r0
+    swap(r0, r1)
+
+  # Codegen
+  result.add ctx.generate()
+
+func reduceCrandallFinal_asm*[N: static int](
+    a: var Limbs[N], p: Limbs[N]) =
+  ## Final Reduction modulo p
+  ## with p of special form 2ᵐ-c
+  ## called "Crandall prime" or Pseudo-Mersenne Prime in the literature
+  ##
+  ## This reduces `a` from [0, 2ᵐ) to [0, 2ᵐ-c)
+  a.reduceCrandallFinal_gen(p)
+
+# Crandall Multiplication and squaring
+# ------------------------------------------------------------
+
+func mulCranPartialReduce_asm*[N: static int](
+    r: var Limbs[N],
+    a, b: Limbs[N],
+    m: static int, c: static SecretWord) =
+  static: doAssert UseASM_X86_64, "x86-32 does not have enough registers for multiplication"
+  var r2 {.noInit.}: Limbs[2*N]
+  r2.mul_asm(a, b)
+  r.reduceCrandallPartial_asm(r2, m, c)
+
+func squareCranPartialReduce_asm*[N: static int](
+    r: var Limbs[N],
+    a: Limbs[N],
+    m: static int, c: static SecretWord) =
+  static: doAssert UseASM_X86_64, "x86-32 does not have enough registers for squaring"
+  var r2 {.noInit.}: Limbs[2*N]
+  r2.square_asm(a)
+  r.reduceCrandallPartial_asm(r2, m, c)
+
+func mulCran_asm*[N: static int](
+    r: var Limbs[N],
+    a, b, p: Limbs[N],
+    m: static int, c: static SecretWord) =
+  r.mulCranPartialReduce_asm(a, b, m, c)
+  r.reduceCrandallFinal_asm(p)
+
+func squareCran_asm*[N: static int](
+    r: var Limbs[N],
+    a, p: Limbs[N],
+    m: static int, c: static SecretWord) =
+  r.squareCranPartialReduce_asm(a, m, c)
+  r.reduceCrandallFinal_asm(p)
diff --git a/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86_adx_bmi2.nim b/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86_adx_bmi2.nim
new file mode 100644
index 00000000..d4256467
--- /dev/null
+++ b/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86_adx_bmi2.nim
@@ -0,0 +1,209 @@
+# Constantine
+# Copyright (c) 2018-2019 Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
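+
+# Implementation note (illustrative only; the exact register allocation is decided by the assembler DSL below):
+# this variant targets the BMI2/ADX extensions.
+#   - MULX computes a full-width product without touching the flags,
+#     so multiplications can be interleaved with carry chains.
+#   - ADCX/ADOX add with carry through CF and OF respectively,
+#     allowing two independent carry chains to be maintained at once.
+# A single folding step  t[i] <- t[i] + cs*a[N+i] + carry  is then emitted roughly as
+#   mulx  hi, lo, a[N+i]        # rdx holds cs
+#   adox  t[i], lo              # low-word chain via OF
+#   adcx  t[i+1], hi            # high-word chain via CF
+# see reduceCrandallPartial_adx_gen below.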
+ +import + # Standard library + std/macros, + # Internal + constantine/platforms/abstractions, + ./limbs_asm_crandall_x86 + +when UseASM_X86_64: + import ./limbs_asm_mul_x86_adx_bmi2 + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + + +static: doAssert UseASM_X86_32 + +# Crandall reduction +# ------------------------------------------------------------ + +macro reduceCrandallPartial_adx_gen*[N: static int]( + r_PIR: var array[N, SecretWord], + a_MEM: array[N*2, SecretWord], + m: static int, c: static BaseType) = + + result = newStmtList() + var ctx = init(Assembler_x86, BaseType) + + ctx.comment "Crandall reduction - Partial" + ctx.comment "----------------------------" + + let + r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it). + a = asmArray(a_MEM, 2*N, MemOffsettable, asmInput) + + tSym = ident"t" + t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber) + + hiSym = ident"hi" + hi = asmValue(hiSym, Reg, asmOutputEarlyClobber) + + csSym = ident"cs" + cs = asmValue(csSym, Reg, asmOutputEarlyClobber) + + S =(N*WordBitWidth - m) + csImm = c shl S + + # Prologue + # --------- + result.add quote do: + var `hiSym`{.noinit.}, `csSym`{.noinit.}: BaseType + var `tSym`{.noInit.}: typeof(`r_PIR`) + + ctx.`xor` rax, rax + ctx.mov cs, csImm + + # Algorithm + # --------- + ctx.comment "First reduction pass" + ctx.comment "--------------------" + ctx.comment "(hi, r₀) <- aₙ*cs + a₀" + ctx.mov rdx, cs + ctx.mov t[0], a[0] + for i in 0 ..< N: + # TODO: should we alternate rax with another register? + # to deal with false dependencies? + ctx.comment " (hi, rᵢ) <- aᵢ₊ₙ*cs + aᵢ + hi" + if i != N-1: + ctx.mov t[i+1], a[i+1] + ctx.mulx hi, rax, a[N+i], rdx + ctx.adox t[i], rax + if i != N-1: + ctx.adcx t[i+1], hi + + # Final carries + ctx.mov rdx, 0 + ctx.adcx hi, rdx + ctx.adox hi, rdx + + # The first reduction pass may carry in `hi` + # which would be hi*2ʷⁿ ≡ hi*2ʷⁿ⁻ᵐ*c (mod p) + # ≡ hi*cs (mod p) + ctx.shld hi, t[N-1], S + # High-bits have been "carried" to `hi`, cancel them in r[N-1]. + # Note: there might be up to `c` not reduced. + ctx.mov rax, BaseType(MaxWord) shr S + ctx.`and` t[N-1], rax + + # Partially reduce to up to `m` bits + # We need to fold what's beyond `m` bits + # by repeatedly multiplying it by cs + # We distinguish 2 cases: + # 1. Folding (N..2N)*cs onto 0..N + # may overflow and a 3rd folding is necessary + # 2. Folding (N..2N)*cs onto 0..N + # may not overflow and 2 foldings are all we need. 
+ # This is possible: + # - if we don't use the full 2ʷⁿ + # for example we use 255 bits out of 256 available + # - And (2ʷⁿ⁻ᵐ*c)² < 2ʷ + # + # There is a 3rd case that we don't handle + # c > 2ʷ, for example secp256k1 on 32-bit + + if N*WordBitWidth == m: # Secp256k1 only according to eprint/iacr 2018/985 + # We shifted hi by 2ʷⁿ⁻ᵐ so no need for cs = 2ʷⁿ⁻ᵐc + # we just need c + ctx.mov rdx, c + + # Second pass + ctx.mulx hi, rax, hi, rdx + ctx.add t[0], rax + ctx.mov rax, 0 + ctx.adc t[1], hi + for i in 2 ..< N: + ctx.adc t[i], 0 + ctx.mov r[i], t[i] + + # Third pass + ctx.setc rax # This only sets the low 8 bits, need extra zero-out + ctx.mulx hi, rax, rax, rdx + ctx.add t[0], rax + ctx.mov r[0], t[0] + ctx.adc t[1], hi + ctx.mov r[1], t[1] + + # We want to ensure that cs² < 2³² on 64-bit or 2¹⁶ on 32-bit + # But doing unsigned cs² may overflow, so we sqrt the rhs instead + elif csImm.uint64 < (1'u64 shl (WordBitWidth shr 1)) - 1: + + # Second pass + + # hi < cs, and cs² < 2ʷ (2³² on 32-bit or 2⁶⁴ on 64-bit) + # hence hi *= c cannot overflow + ctx.imul rax, hi, c + ctx.add t[0], rax + ctx.mov r[0], t[0] + for i in 1 ..< N: + ctx.adc t[i], 0 + ctx.mov r[i], t[i] + + else: + error "Not implemented" + + # Code generation + result.add ctx.generate() + +func reduceCrandallPartial_asm_adx*[N: static int]( + r: var Limbs[N], + a: array[2*N, SecretWord], + m: static int, c: static SecretWord) = + ## Partial Reduction modulo p + ## with p with special form 2ᵐ-c + ## called "Crandall prime" or Pseudo-Mersenne Prime in the litterature + ## + ## This is a partial reduction that reduces down to + ## 2ᵐ, i.e. it fits in the same amount of word by p + ## but values my be up to p+c + ## + ## Crandal primes allow fast reduction from the fact that + ## 2ᵐ-c ≡ 0 (mod p) + ## <=> 2ᵐ ≡ c (mod p) + ## <=> a2ᵐ+b ≡ ac + b (mod p) + r.reduceCrandallPartial_adx_gen(a, m, BaseType(c)) + +# Crandall Multiplication and squaring +# ------------------------------------------------------------ + +func mulCranPartialReduce_asm_adx*[N: static int]( + r: var Limbs[N], + a, b: Limbs[N], + m: static int, c: static SecretWord) = + static: doAssert UseASM_X86_64, "x86-32 does not have enough registers for squaring" + var r2 {.noInit.}: Limbs[2*N] + r2.mul_asm_adx(a, b) + r.reduceCrandallPartial_asm_adx(r2, m, c) + +func squareCranPartialReduce_asm_adx*[N: static int]( + r: var Limbs[N], + a: Limbs[N], + m: static int, c: static SecretWord) = + static: doAssert UseASM_X86_64, "x86-32 does not have enough registers for squaring" + var r2 {.noInit.}: Limbs[2*N] + r2.square_asm_adx(a) + r.reduceCrandallPartial_asm_adx(r2, m, c) + +func mulCran_asm_adx*[N: static int]( + r: var Limbs[N], + a, b, p: Limbs[N], + m: static int, c: static SecretWord) = + r.mulCranPartialReduce_asm_adx(a, b, m, c) + r.reduceCrandallFinal_asm(p) + +func squareCran_asm_adx*[N: static int]( + r: var Limbs[N], + a, p: Limbs[N], + m: static int, c: static SecretWord) = + r.squareCranPartialReduce_asm_adx(a, m, c) + r.reduceCrandallFinal_asm(p) diff --git a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim index 797a4306..14544d47 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim @@ -33,7 +33,7 @@ static: doAssert UseASM_X86_64 macro mulMont_CIOS_sparebit_gen[N: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N], m0ninv_REG: BaseType, - skipFinalSub: static bool): 
untyped = + lazyReduce: static bool): untyped = ## Generate an optimized Montgomery Multiplication kernel ## using the CIOS method ## @@ -161,21 +161,21 @@ macro mulMont_CIOS_sparebit_gen[N: static int]( ctx.mov rax, r # move r away from scratchspace that will be used for final substraction let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite) - if skipFinalSub: + if lazyReduce: for i in 0 ..< N: ctx.mov r2[i], t[i] else: ctx.finalSubNoOverflowImpl(r2, t, M, scratch) result.add ctx.generate() -func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = +func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) = ## Constant-time Montgomery multiplication - ## If "skipFinalSub" is set + ## If "lazyReduce" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipFinalSub) + r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, lazyReduce) # Montgomery Squaring # ------------------------------------------------------------ @@ -184,11 +184,11 @@ func squareMont_CIOS_asm*[N]( r: var Limbs[N], a, M: Limbs[N], m0ninv: BaseType, - spareBits: static int, skipFinalSub: static bool) = + spareBits: static int, lazyReduce: static bool) = ## Constant-time modular squaring var r2x {.noInit.}: Limbs[2*N] square_asm(r2x, a) - r.redcMont_asm(r2x, M, m0ninv, spareBits, skipFinalSub) + r.redcMont_asm(r2x, M, m0ninv, spareBits, lazyReduce) # Montgomery Sum of Products # ------------------------------------------------------------ @@ -196,7 +196,7 @@ func squareMont_CIOS_asm*[N]( macro sumprodMont_CIOS_spare2bits_gen[N, K: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]], M_MEM: Limbs[N], m0ninv_REG: BaseType, - skipFinalSub: static bool): untyped = + lazyReduce: static bool): untyped = ## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel ## using the CIOS method ## @@ -353,7 +353,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int]( ctx.mov rax, r # move r away from scratchspace that will be used for final substraction let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite) - if skipFinalSub: + if lazyReduce: ctx.comment " Copy result" for i in 0 ..< N: ctx.mov r2[i], t[i] @@ -367,11 +367,11 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int]( func sumprodMont_CIOS_spare2bits_asm*[N, K: static int]( r: var Limbs[N], a, b: array[K, Limbs[N]], M: Limbs[N], m0ninv: BaseType, - skipFinalSub: static bool) = + lazyReduce: static bool) = ## Sum of products ⅀aᵢ.bᵢ in the Montgomery domain - ## If "skipFinalSub" is set + ## If "lazyReduce" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.sumprodMont_CIOS_spare2bits_gen(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits_gen(a, b, M, m0ninv, lazyReduce) diff --git a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim index 12298f1c..3bbd0644 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim @@ -62,6 +62,8 @@ proc 
mulx_by_word( # Steady state for j in 1 ..< N-1: + # TODO: should we alternate lo with another register? + # to deal with false dependencies? ctx.mulx t[j+1], lo, a[j], rdx if j == 1: ctx.add t[j], lo @@ -172,7 +174,7 @@ proc partialRedx( macro mulMont_CIOS_sparebit_adx_gen[N: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N], m0ninv_REG: BaseType, - skipFinalSub: static bool): untyped = + lazyReduce: static bool): untyped = ## Generate an optimized Montgomery Multiplication kernel ## using the CIOS method ## This requires the most significant word of the Modulus @@ -258,7 +260,7 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int]( M, m0ninv, lo, C) - if skipFinalSub: + if lazyReduce: for i in 0 ..< N: ctx.mov r[i], t[i] else: @@ -268,14 +270,14 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int]( result.add ctx.generate() -func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = +func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) = ## Constant-time Montgomery multiplication - ## If "skipFinalSub" is set + ## If "lazyReduce" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipFinalSub) + r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, lazyReduce) # Montgomery Squaring # ------------------------------------------------------------ @@ -284,11 +286,11 @@ func squareMont_CIOS_asm_adx*[N]( r: var Limbs[N], a, M: Limbs[N], m0ninv: BaseType, - spareBits: static int, skipFinalSub: static bool) = + spareBits: static int, lazyReduce: static bool) = ## Constant-time modular squaring var r2x {.noInit.}: Limbs[2*N] r2x.square_asm_adx(a) - r.redcMont_asm_adx(r2x, M, m0ninv, spareBits, skipFinalSub) + r.redcMont_asm_adx(r2x, M, m0ninv, spareBits, lazyReduce) # Montgomery Sum of Products # ------------------------------------------------------------ @@ -296,7 +298,7 @@ func squareMont_CIOS_asm_adx*[N]( macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]], M_MEM: Limbs[N], m0ninv_REG: BaseType, - skipFinalSub: static bool): untyped = + lazyReduce: static bool): untyped = ## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel ## using the CIOS method ## @@ -440,7 +442,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int]( ctx.mov rax, r # move r away from scratchspace that will be used for final substraction let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite) - if skipFinalSub: + if lazyReduce: ctx.comment " Copy result" for i in 0 ..< N: ctx.mov r2[i], t[i] @@ -452,11 +454,11 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int]( func sumprodMont_CIOS_spare2bits_asm_adx*[N, K: static int]( r: var Limbs[N], a, b: array[K, Limbs[N]], M: Limbs[N], m0ninv: BaseType, - skipFinalSub: static bool) = + lazyReduce: static bool) = ## Sum of products ⅀aᵢ.bᵢ in the Montgomery domain - ## If "skipFinalSub" is set + ## If "lazyReduce" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.sumprodMont_CIOS_spare2bits_adx_gen(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits_adx_gen(a, 
b, M, m0ninv, lazyReduce) diff --git a/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim b/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim index 50cad165..d7d6aabd 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim @@ -52,6 +52,8 @@ proc mulx_by_word( ctx.mov r0, rax # Steady state + # TODO: should we alternate rax with another register? + # to deal with false dependencies? for j in 1 ..< N: ctx.mulx t[j], rax, a[j], rdx if j == 1: diff --git a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim index bce6e0ea..c4743aed 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim @@ -33,7 +33,7 @@ macro redc2xMont_gen*[N: static int]( a_PIR: array[N*2, SecretWord], M_MEM: array[N, SecretWord], m0ninv_REG: BaseType, - spareBits: static int, skipFinalSub: static bool) = + spareBits: static int, lazyReduce: static bool) = # No register spilling handling doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs." doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs." @@ -134,7 +134,7 @@ macro redc2xMont_gen*[N: static int]( # Second part - Final substraction # --------------------------------------------- - if not(spareBits >= 2 and skipFinalSub): + if not(spareBits >= 2 and lazyReduce): ctx.mov rdx, r_temp let r = rdx.asArrayAddr(r_PIR, len = N, memIndirect = memWrite) @@ -150,7 +150,7 @@ macro redc2xMont_gen*[N: static int]( # v is invalidated from now on let t = repackRegisters(v, u[N], u[N+1]) - if spareBits >= 2 and skipFinalSub: + if spareBits >= 2 and lazyReduce: for i in 0 ..< N: ctx.mov r_temp[i], u[i] elif spareBits >= 1: @@ -167,10 +167,10 @@ func redcMont_asm*[N: static int]( M: array[N, SecretWord], m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool) = + lazyReduce: static bool) = ## Constant-time Montgomery reduction static: doAssert UseASM_X86_64, "This requires x86-64." - redc2xMont_gen(r, a, M, m0ninv, spareBits, skipFinalSub) + redc2xMont_gen(r, a, M, m0ninv, spareBits, lazyReduce) # Montgomery conversion # ---------------------------------------------------------- diff --git a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim index 41f21773..e814939e 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim @@ -37,7 +37,7 @@ macro redc2xMont_adx_gen[N: static int]( a_PIR: array[N*2, SecretWord], M_MEM: array[N, SecretWord], m0ninv_REG: BaseType, - spareBits: static int, skipFinalSub: static bool) = + spareBits: static int, lazyReduce: static bool) = # No register spilling handling doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." 
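The `skipFinalSub` -> `lazyReduce` rename in these reduction kernels keeps the same contract: with `lazyReduce` the output is only guaranteed to lie in [0, 2M) rather than [0, M). As the guards in these hunks show, the Montgomery reduction kernels only honor it when `spareBits >= 2`, i.e. when the modulus leaves enough headroom in its limb representation for the unreduced value to fit (for example a 381-bit modulus stored in 6x64 = 384 bits has 3 spare bits). A minimal sketch of how such lazily-reduced values are typically chained, modelled on the `square_repeated` update later in this diff (the field type `FF` and its `square` overload are the ones touched by this patch):

    func squareChainSketch(a: var FF, num: int) =
      ## Repeated squarings: intermediate results stay in [0, 2M),
      ## only the last squaring pays for the final conditional subtraction.
      for _ in 0 ..< num-1:
        a.square(lazyReduce = true)
      a.square(lazyReduce = false)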
@@ -127,7 +127,7 @@ macro redc2xMont_adx_gen[N: static int]( let t = repackRegisters(v, u[N]) - if spareBits >= 2 and skipFinalSub: + if spareBits >= 2 and lazyReduce: for i in 0 ..< N: ctx.mov r[i], t[i] elif spareBits >= 1: @@ -144,12 +144,12 @@ func redcMont_asm_adx*[N: static int]( M: array[N, SecretWord], m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool = false) = + lazyReduce: static bool = false) = ## Constant-time Montgomery reduction # Inlining redcMont_asm_adx twice in mul_fp2_complex_asm_adx # causes GCC to miscompile with -Os (--opt:size) # see https://github.com/mratsim/constantine/issues/229 - redc2xMont_adx_gen(r, a, M, m0ninv, spareBits, skipFinalSub) + redc2xMont_adx_gen(r, a, M, m0ninv, spareBits, lazyReduce) # Montgomery conversion # ---------------------------------------------------------- diff --git a/constantine/math/arithmetic/bigints_crandall.nim b/constantine/math/arithmetic/bigints_crandall.nim new file mode 100644 index 00000000..7ad51cf6 --- /dev/null +++ b/constantine/math/arithmetic/bigints_crandall.nim @@ -0,0 +1,128 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + constantine/platforms/abstractions, + ./limbs, + ./limbs_crandall, + ./bigints + +# No exceptions allowed +{.push raises: [].} +{.push inline.} + +# ############################################################ +# +# Crandall Arithmetic +# +# ############################################################ + +func mulCran*[m: static int]( + r: var BigInt[m], a, b: BigInt[m], + p: BigInt[m], + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute r <- a*b (2ᵐ-c) + mulCran(r.limbs, a.limbs, b.limbs, p.limbs, m, c, lazyReduce) + +func squareCran*[m: static int]( + r: var BigInt[m], a: BigInt[m], + p: BigInt[m], + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute r <- a² (2ᵐ-c), m = bits + squareCran(r.limbs, a.limbs, p.limbs, m, c, lazyReduce) + +func powCran*[m: static int]( + a: var BigInt[m], exponent: openarray[byte], + p: BigInt[m], + windowSize: static int, + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute a <- a^exponent (mod M) + ## ``exponent`` is a BigInt in canonical big-endian representation + ## + ## This uses fixed window optimization + ## A window size in the range [1, 5] must be chosen + ## + ## This is constant-time: the window optimization does + ## not reveal the exponent bits or hamming weight + const scratchLen = if windowSize == 1: 2 + else: (1 shl windowSize) + 1 + var scratchSpace {.noInit.}: array[scratchLen, Limbs[m.wordsRequired()]] + powCran(a.limbs, exponent, p.limbs, scratchSpace, m, c, lazyReduce) + +func powCran_vartime*[m: static int]( + a: var BigInt[m], exponent: openarray[byte], + p: BigInt[m], + windowSize: static int, + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute a <- a^exponent (mod M) + ## ``exponent`` is a BigInt in canonical big-endian representation + ## + ## Warning ⚠️ : + ## This is an optimization for public exponent + ## Otherwise bits of the exponent can be retrieved with: + ## - memory access analysis 
+ ## - power analysis + ## - timing analysis + ## + ## This uses fixed window optimization + ## A window size in the range [1, 5] must be chosen + + const scratchLen = if windowSize == 1: 2 + else: (1 shl windowSize) + 1 + var scratchSpace {.noInit.}: array[scratchLen, Limbs[m.wordsRequired()]] + powCran_vartime(a.limbs, exponent, p.limbs, scratchSpace, m, c, lazyReduce) + +func powCran*[m, eBits: static int]( + a: var BigInt[m], exponent: BigInt[eBits], + p: BigInt[m], + windowSize: static int, + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute a <- a^exponent (mod M) + ## ``exponent`` is any BigInt, in the canonical domain + ## + ## This uses fixed window optimization + ## A window size in the range [1, 5] must be chosen + ## + ## This is constant-time: the window optimization does + ## not reveal the exponent bits or hamming weight + var expBE {.noInit.}: array[ebits.ceilDiv_vartime(8), byte] + expBE.marshal(exponent, bigEndian) + + powCran(a, expBE, p, windowSize, c, lazyReduce) + +func powCran_vartime*[m, eBits: static int]( + a: var BigInt[m], exponent: BigInt[eBits], + p: BigInt[m], + windowSize: static int, + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute a <- a^exponent (mod M) + ## ``a`` in the Montgomery domain + ## ``exponent`` is any BigInt, in the canonical domain + ## + ## Warning ⚠️ : + ## This is an optimization for public exponent + ## Otherwise bits of the exponent can be retrieved with: + ## - memory access analysis + ## - power analysis + ## - timing analysis + ## + ## This uses fixed window optimization + ## A window size in the range [1, 5] must be chosen + var expBE {.noInit.}: array[ebits.ceilDiv_vartime(8), byte] + expBE.marshal(exponent, bigEndian) + + powCran_vartime(a, expBE, p, windowSize, c, lazyReduce) + +{.pop.} # inline +{.pop.} # raises no exceptions diff --git a/constantine/math/arithmetic/bigints_montgomery.nim b/constantine/math/arithmetic/bigints_montgomery.nim index 4836ed96..27dd4bca 100644 --- a/constantine/math/arithmetic/bigints_montgomery.nim +++ b/constantine/math/arithmetic/bigints_montgomery.nim @@ -51,64 +51,58 @@ func fromMont*(r: var BigInt, a, M: BigInt, m0ninv: BaseType, spareBits: static fromMont(r.limbs, a.limbs, M.limbs, m0ninv, spareBits) func mulMont*(r: var BigInt, a, b, M: BigInt, negInvModWord: BaseType, - spareBits: static int, skipFinalSub: static bool = false) = + spareBits: static int, lazyReduce: static bool = false) = ## Compute r <- a*b (mod M) in the Montgomery domain ## ## This resets r to zero before processing. Use {.noInit.} ## to avoid duplicating with Nim zero-init policy - mulMont(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits, skipFinalSub) + mulMont(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits, lazyReduce) func squareMont*(r: var BigInt, a, M: BigInt, negInvModWord: BaseType, - spareBits: static int, skipFinalSub: static bool = false) = + spareBits: static int, lazyReduce: static bool = false) = ## Compute r <- a^2 (mod M) in the Montgomery domain ## ## This resets r to zero before processing. 
Use {.noInit.} ## to avoid duplicating with Nim zero-init policy - squareMont(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits, skipFinalSub) + squareMont(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits, lazyReduce) func sumprodMont*[N: static int]( r: var BigInt, a, b: array[N, BigInt], M: BigInt, negInvModWord: BaseType, - spareBits: static int, skipFinalSub: static bool = false) = + spareBits: static int, lazyReduce: static bool = false) = ## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products) in the Montgomery domain # We rely on BigInt and Limbs having the same repr to avoid array copies sumprodMont( r.limbs, cast[ptr array[N, typeof(a[0].limbs)]](a.unsafeAddr)[], cast[ptr array[N, typeof(b[0].limbs)]](b.unsafeAddr)[], - M.limbs, negInvModWord, spareBits, skipFinalSub + M.limbs, negInvModWord, spareBits, lazyReduce ) func powMont*[mBits: static int]( a: var BigInt[mBits], exponent: openarray[byte], M, one: BigInt[mBits], negInvModWord: BaseType, windowSize: static int, - spareBits: static int - ) = + spareBits: static int) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is a BigInt in canonical big-endian representation ## - ## Warning ⚠️ : - ## This is an optimization for public exponent - ## Otherwise bits of the exponent can be retrieved with: - ## - memory access analysis - ## - power analysis - ## - timing analysis - ## ## This uses fixed window optimization ## A window size in the range [1, 5] must be chosen + ## + ## This is constant-time: the window optimization does + ## not reveal the exponent bits or hamming weight const scratchLen = if windowSize == 1: 2 else: (1 shl windowSize) + 1 - var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]] + var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired()]] powMont(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits) func powMont_vartime*[mBits: static int]( a: var BigInt[mBits], exponent: openarray[byte], M, one: BigInt[mBits], negInvModWord: BaseType, windowSize: static int, - spareBits: static int - ) = + spareBits: static int) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is a BigInt in canonical big-endian representation @@ -125,14 +119,13 @@ func powMont_vartime*[mBits: static int]( const scratchLen = if windowSize == 1: 2 else: (1 shl windowSize) + 1 - var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]] + var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired()]] powMont_vartime(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits) func powMont*[mBits, eBits: static int]( a: var BigInt[mBits], exponent: BigInt[eBits], M, one: BigInt[mBits], negInvModWord: BaseType, windowSize: static int, - spareBits: static int - ) = + spareBits: static int) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is any BigInt, in the canonical domain @@ -150,8 +143,7 @@ func powMont*[mBits, eBits: static int]( func powMont_vartime*[mBits, eBits: static int]( a: var BigInt[mBits], exponent: BigInt[eBits], M, one: BigInt[mBits], negInvModWord: BaseType, windowSize: static int, - spareBits: static int - ) = + spareBits: static int) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is any BigInt, in the canonical domain diff --git a/constantine/math/arithmetic/finite_fields.nim b/constantine/math/arithmetic/finite_fields.nim index b9a6542c..dd1b44ea 100644 --- 
a/constantine/math/arithmetic/finite_fields.nim +++ b/constantine/math/arithmetic/finite_fields.nim @@ -30,7 +30,9 @@ import constantine/platforms/abstractions, constantine/serialization/endians, constantine/named/properties_fields, - ./bigints, ./bigints_montgomery + ./bigints, + ./bigints_montgomery, + ./bigints_crandall when UseASM_X86_64: import ./assembly/limbs_asm_modular_x86 @@ -54,10 +56,13 @@ export Fp, Fr, FF func fromBig*(dst: var FF, src: BigInt) = ## Convert a BigInt to its Montgomery form - when nimvm: - dst.mres.montyResidue_precompute(src, FF.getModulus(), FF.getR2modP(), FF.getNegInvModWord()) + when FF.isCrandallPrimeField(): + dst.mres = src else: - dst.mres.getMont(src, FF.getModulus(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits()) + when nimvm: + dst.mres.montyResidue_precompute(src, FF.getModulus(), FF.getR2modP(), FF.getNegInvModWord()) + else: + dst.mres.getMont(src, FF.getModulus(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits()) func fromBig*[Name: static Algebra](T: type FF[Name], src: BigInt): FF[Name] {.noInit.} = ## Convert a BigInt to its Montgomery form @@ -65,7 +70,10 @@ func fromBig*[Name: static Algebra](T: type FF[Name], src: BigInt): FF[Name] {.n func fromField*(dst: var BigInt, src: FF) {.inline.} = ## Convert a finite-field element to a BigInt in natural representation - dst.fromMont(src.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits()) + when FF.isCrandallPrimeField(): + dst = src.mres + else: + dst.fromMont(src.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits()) func toBig*(src: FF): auto {.noInit, inline.} = ## Convert a finite-field element to a BigInt in natural representation @@ -121,11 +129,17 @@ func isZero*(a: FF): SecretBool = func isOne*(a: FF): SecretBool = ## Constant-time check if one - a.mres == FF.getMontyOne() + when FF.isCrandallPrimeField(): + a.mres.isOne() + else: + a.mres == FF.getMontyOne() func isMinusOne*(a: FF): SecretBool = ## Constant-time check if -1 (mod p) - a.mres == FF.getMontyPrimeMinus1() + when FF.isCrandallPrimeField(): + a.mres == FF.getPrimeMinus1() + else: + a.mres == FF.getMontyPrimeMinus1() func isOdd*(a: FF): SecretBool {. 
error: "Do you need the actual value to be odd\n" & @@ -141,14 +155,20 @@ func setOne*(a: var FF) = # Note: we need 1 in Montgomery residue form # TODO: Nim codegen is not optimal it uses a temporary # Check if the compiler optimizes it away - a.mres = FF.getMontyOne() + when FF.isCrandallPrimeField(): + a.mres.setOne() + else: + a.mres = FF.getMontyOne() func setMinusOne*(a: var FF) = ## Set ``a`` to -1 (mod p) # Note: we need -1 in Montgomery residue form # TODO: Nim codegen is not optimal it uses a temporary # Check if the compiler optimizes it away - a.mres = FF.getMontyPrimeMinus1() + when FF.isCrandallPrimeField(): + a.mres = FF.getPrimeMinus1() + else: + a.mres = FF.getMontyPrimeMinus1() func neg*(r: var FF, a: FF) {.meter.} = ## Negate modulo p @@ -234,22 +254,31 @@ func double*(r: var FF, a: FF) {.meter.} = overflowed = overflowed or not(r.mres < FF.getModulus()) discard csub(r.mres, FF.getModulus(), overflowed) -func prod*(r: var FF, a, b: FF, skipFinalSub: static bool = false) {.meter.} = +func prod*(r: var FF, a, b: FF, lazyReduce: static bool = false) {.meter.} = ## Store the product of ``a`` by ``b`` modulo p into ``r`` ## ``r`` is initialized / overwritten - r.mres.mulMont(a.mres, b.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub) + when FF.isCrandallPrimeField(): + r.mres.mulCran(a.mres, b.mres, FF.getModulus(), FF.getCrandallPrimeSubterm(), lazyReduce) + else: + r.mres.mulMont(a.mres, b.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), lazyReduce) -func square*(r: var FF, a: FF, skipFinalSub: static bool = false) {.meter.} = +func square*(r: var FF, a: FF, lazyReduce: static bool = false) {.meter.} = ## Squaring modulo p - r.mres.squareMont(a.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub) + when FF.isCrandallPrimeField(): + r.mres.squareCran(a.mres, FF.getModulus(), FF.getCrandallPrimeSubterm(), lazyReduce) + else: + r.mres.squareMont(a.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), lazyReduce) -func sumprod*[N: static int](r: var FF, a, b: array[N, FF], skipFinalSub: static bool = false) {.meter.} = +func sumprod*[N: static int](r: var FF, a, b: array[N, FF], lazyReduce: static bool = false) {.meter.} = ## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products) # We rely on FF and Bigints having the same repr to avoid array copies - r.mres.sumprodMont( - cast[ptr array[N, typeof(a[0].mres)]](a.unsafeAddr)[], - cast[ptr array[N, typeof(b[0].mres)]](b.unsafeAddr)[], - FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub) + when FF.isCrandallPrimeField(): + {.error: "Not implemented".} + else: + r.mres.sumprodMont( + cast[ptr array[N, typeof(a[0].mres)]](a.unsafeAddr)[], + cast[ptr array[N, typeof(b[0].mres)]](b.unsafeAddr)[], + FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), lazyReduce) # ############################################################ # @@ -329,7 +358,10 @@ func inv*(r: var FF, a: FF) = ## Incidentally this avoids extra check ## to convert Jacobian and Projective coordinates ## to affine for elliptic curve - r.mres.invmod(a.mres, FF.getR2modP(), FF.getModulus()) + when FF.isCrandallPrimeField(): + r.mres.invmod(a.mres, FF.getModulus()) + else: + r.mres.invmod(a.mres, FF.getR2modP(), FF.getModulus()) func inv*(a: var FF) = ## Inversion modulo p @@ -347,7 +379,10 @@ func inv_vartime*(r: var FF, a: FF) {.tags: [VarTime].} = ## Incidentally this avoids extra check ## to convert Jacobian and Projective coordinates ## to affine for elliptic curve - 
r.mres.invmod_vartime(a.mres, FF.getR2modP(), FF.getModulus()) + when FF.isCrandallPrimeField(): + r.mres.invmod_vartime(a.mres, FF.getModulus()) + else: + r.mres.invmod_vartime(a.mres, FF.getR2modP(), FF.getModulus()) func inv_vartime*(a: var FF) {.tags: [VarTime].} = ## Variable-time Inversion modulo p @@ -370,23 +405,23 @@ func `*=`*(a: var FF, b: FF) {.meter.} = ## Multiplication modulo p a.prod(a, b) -func square*(a: var FF, skipFinalSub: static bool = false) {.meter.} = +func square*(a: var FF, lazyReduce: static bool = false) {.meter.} = ## Squaring modulo p - a.square(a, skipFinalSub) + a.square(a, lazyReduce) -func square_repeated*(a: var FF, num: int, skipFinalSub: static bool = false) {.meter.} = +func square_repeated*(a: var FF, num: int, lazyReduce: static bool = false) {.meter.} = ## Repeated squarings ## Assumes at least 1 squaring for _ in 0 ..< num-1: - a.square(skipFinalSub = true) - a.square(skipFinalSub) + a.square(lazyReduce = true) + a.square(lazyReduce) -func square_repeated*(r: var FF, a: FF, num: int, skipFinalSub: static bool = false) {.meter.} = +func square_repeated*(r: var FF, a: FF, num: int, lazyReduce: static bool = false) {.meter.} = ## Repeated squarings - r.square(a, skipFinalSub = true) + r.square(a, lazyReduce = true) for _ in 1 ..< num-1: - r.square(skipFinalSub = true) - r.square(skipFinalSub) + r.square(lazyReduce = true) + r.square(lazyReduce) func `*=`*(a: var FF, b: static int) = ## Multiplication by a small integer known at compile-time @@ -518,31 +553,48 @@ func pow*(a: var FF, exponent: BigInt) = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a big integer - const windowSize = 5 # TODO: find best window size for each curves - a.mres.powMont( - exponent, - FF.getModulus(), FF.getMontyOne(), - FF.getNegInvModWord(), windowSize, - FF.getSpareBits() - ) + when FF.isCrandallPrimeField(): + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powCran( + exponent, + FF.getModulus(), + windowSize, + FF.getCrandallPrimeSubterm() + ) + else: + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powMont( + exponent, + FF.getModulus(), FF.getMontyOne(), + FF.getNegInvModWord(), windowSize, + FF.getSpareBits() + ) func pow*(a: var FF, exponent: openarray[byte]) = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a big integer in canonical big endian representation - const windowSize = 5 # TODO: find best window size for each curves - a.mres.powMont( - exponent, - FF.getModulus(), FF.getMontyOne(), - FF.getNegInvModWord(), windowSize, - FF.getSpareBits() - ) + when FF.isCrandallPrimeField(): + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powCran( + exponent, + FF.getModulus(), + windowSize, + FF.getCrandallPrimeSubterm() + ) + else: + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powMont( + exponent, + FF.getModulus(), FF.getMontyOne(), + FF.getNegInvModWord(), windowSize, + FF.getSpareBits() + ) func pow*(a: var FF, exponent: FF) = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a finite field element - const windowSize = 5 # TODO: find best window size for each curves a.pow(exponent.toBig()) func pow*(r: var FF, a: FF, exponent: BigInt or openArray[byte] or FF) = @@ -566,13 +618,23 @@ func pow_vartime*(a: var FF, exponent: BigInt) = ## - memory access analysis ## - power analysis ## - timing analysis - const windowSize = 5 # 
TODO: find best window size for each curves - a.mres.powMont_vartime( - exponent, - FF.getModulus(), FF.getMontyOne(), - FF.getNegInvModWord(), windowSize, - FF.getSpareBits() - ) + + when FF.isCrandallPrimeField(): + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powCran_vartime( + exponent, + FF.getModulus(), + windowSize, + FF.getCrandallPrimeSubterm() + ) + else: + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powMont_vartime( + exponent, + FF.getModulus(), FF.getMontyOne(), + FF.getNegInvModWord(), windowSize, + FF.getSpareBits() + ) func pow_vartime*(a: var FF, exponent: openarray[byte]) = ## Exponentiation modulo p @@ -585,19 +647,28 @@ func pow_vartime*(a: var FF, exponent: openarray[byte]) = ## - memory access analysis ## - power analysis ## - timing analysis - const windowSize = 5 # TODO: find best window size for each curves - a.mres.powMont_vartime( - exponent, - FF.getModulus(), FF.getMontyOne(), - FF.getNegInvModWord(), windowSize, - FF.getSpareBits() - ) + + when FF.isCrandallPrimeField(): + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powCran_vartime( + exponent, + FF.getModulus(), + windowSize, + FF.getCrandallPrimeSubterm() + ) + else: + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powMont_vartime( + exponent, + FF.getModulus(), FF.getMontyOne(), + FF.getNegInvModWord(), windowSize, + FF.getSpareBits() + ) func pow_vartime*(a: var FF, exponent: FF) = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a finite field element - const windowSize = 5 # TODO: find best window size for each curves a.pow_vartime(exponent.toBig()) func pow_vartime*(r: var FF, a: FF, exponent: BigInt or openArray[byte] or FF) = @@ -636,27 +707,27 @@ func pow_squareMultiply_vartime(a: var FF, exponent: SomeUnsignedInt) {.tags:[Va for e in 0 ..< eBytes.len-1: let e = eBytes[e] for i in countdown(7, 0): - a.square(skipFinalSub = true) + a.square(lazyReduce = true) let bit = bool((e shr i) and 1) if bit: - a.prod(a, aa, skipFinalSub = true) + a.prod(a, aa, lazyReduce = true) let e = eBytes[eBytes.len-1] block: # Epilogue, byte-level for i in countdown(7, 1): - a.square(skipFinalSub = true) + a.square(lazyReduce = true) let bit = bool((e shr i) and 1) if bit: - a.prod(a, aa, skipFinalSub = true) + a.prod(a, aa, lazyReduce = true) block: # Epilogue, bit-level # for the very last bit we can't skip final substraction let bit = bool(e and 1) if bit: - a.square(skipFinalSub = true) - a.prod(a, aa, skipFinalSub = false) + a.square(lazyReduce = true) + a.prod(a, aa, lazyReduce = false) else: - a.square(skipFinalSub = false) + a.square(lazyReduce = false) func pow_addchain_4bit_vartime(a: var FF, exponent: SomeUnsignedInt) {.tags:[VarTime], meter.} = ## **Variable-time** Exponentiation @@ -670,69 +741,69 @@ func pow_addchain_4bit_vartime(a: var FF, exponent: SomeUnsignedInt) {.tags:[Var a.square() of 3: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) + t.square(a, lazyReduce = true) a *= t of 4: a.square_repeated(2) of 5: var t {.noInit.}: typeof(a) - t.square_repeated(a, 2, skipFinalSub = true) + t.square_repeated(a, 2, lazyReduce = true) a *= t of 6: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) - t.prod(t, a, skipFinalSub = true) # 3 + t.square(a, lazyReduce = true) + t.prod(t, a, lazyReduce = true) # 3 a.square(t) of 7: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) - a.prod(a, t, skipFinalSub = true) # 
3 - t.square(skipFinalSub = true) # 4 + t.square(a, lazyReduce = true) + a.prod(a, t, lazyReduce = true) # 3 + t.square(lazyReduce = true) # 4 a *= t of 8: a.square_repeated(3) of 9: var t {.noInit.}: typeof(a) - t.square_repeated(a, 3, skipFinalSub = true) + t.square_repeated(a, 3, lazyReduce = true) a *= t of 10: var t {.noInit.}: typeof(a) - t.square_repeated(a, 2, skipFinalSub = true) # 4 - a.prod(a, t, skipFinalSub = true) # 5 + t.square_repeated(a, 2, lazyReduce = true) # 4 + a.prod(a, t, lazyReduce = true) # 5 a.square() of 11: var t {.noInit.}: typeof(a) - t.square_repeated(a, 2, skipFinalSub = true) # 4 - t.prod(t, a, skipFinalSub = true) # 5 - t.square(skipFinalSub = true) # 10 + t.square_repeated(a, 2, lazyReduce = true) # 4 + t.prod(t, a, lazyReduce = true) # 5 + t.square(lazyReduce = true) # 10 a *= t of 12: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) - t.prod(t, a, skipFinalSub = true) # 3 - t.square(skipFinalSub = true) # 6 + t.square(a, lazyReduce = true) + t.prod(t, a, lazyReduce = true) # 3 + t.square(lazyReduce = true) # 6 a.square(t) # 12 of 13: var t1 {.noInit.}, t2 {.noInit.}: typeof(a) - t1.square_repeated(a, 2, skipFinalSub = true) # 4 - t2.square(t1, skipFinalSub = true) # 8 - t1.prod(t1, t2, skipFinalSub = true) # 12 + t1.square_repeated(a, 2, lazyReduce = true) # 4 + t2.square(t1, lazyReduce = true) # 8 + t1.prod(t1, t2, lazyReduce = true) # 12 a *= t1 # 13 of 14: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) # 2 + t.square(a, lazyReduce = true) # 2 a *= t # 3 - t.square_repeated(2, skipFinalSub = true) # 8 - a.square(skipFinalSub = true) # 6 + t.square_repeated(2, lazyReduce = true) # 8 + a.square(lazyReduce = true) # 6 a *= t # 14 of 15: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) - t.prod(t, a, skipFinalSub = true) # 3 - a.square_repeated(t, 2, skipFinalSub = true) # 12 + t.square(a, lazyReduce = true) + t.prod(t, a, lazyReduce = true) # 3 + a.square_repeated(t, 2, lazyReduce = true) # 12 a *= t # 15 of 16: - a.square_repeated(4, skipFinalSub = true) + a.square_repeated(4, lazyReduce = true) else: doAssert false, "exponentiation by this small int '" & $exponent & "' is not implemented" @@ -781,7 +852,7 @@ import std/macros macro addchain*(fn: untyped): untyped = ## Modify all prod, `*=`, square, square_repeated calls - ## to skipFinalSub except the very last call. + ## to lazyReduce except the very last call. ## This assumes straight-line code. 
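##
## Illustrative example (hypothetical helper, assuming some base field Fp):
##
##   func pow3(r: var Fp, a: Fp) {.addchain.} =
##     var t {.noInit.}: Fp
##     t.square(a)
##     r.prod(t, a)
##
## Here `t.square(a)` is rewritten to `t.square(a, lazyReduce = true)` while the
## last call `r.prod(t, a)` keeps the full reduction, so `r` ends up in [0, p).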
fn.expectKind(nnkFuncDef) @@ -846,9 +917,9 @@ func batchInv*[F]( dst[i] = acc if i != N-1: - acc.prod(acc, z, skipFinalSub = true) + acc.prod(acc, z, lazyReduce = true) else: - acc.prod(acc, z, skipFinalSub = false) + acc.prod(acc, z, lazyReduce = false) acc.inv() @@ -860,7 +931,7 @@ func batchInv*[F]( # next iteration var eli = elements[i] eli.csetOne(zeros[i]) - acc.prod(acc, eli, skipFinalSub = true) + acc.prod(acc, eli, lazyReduce = true) func batchInv_vartime*[F]( dst: ptr UncheckedArray[F], @@ -889,9 +960,9 @@ func batchInv_vartime*[F]( dst[i] = acc if i != N-1: - acc.prod(acc, elements[i], skipFinalSub = true) + acc.prod(acc, elements[i], lazyReduce = true) else: - acc.prod(acc, elements[i], skipFinalSub = false) + acc.prod(acc, elements[i], lazyReduce = false) acc.inv_vartime() @@ -899,7 +970,7 @@ func batchInv_vartime*[F]( if zeros[i] == true: continue dst[i] *= acc - acc.prod(acc, elements[i], skipFinalSub = true) + acc.prod(acc, elements[i], lazyReduce = true) func batchInv*[F](dst: var openArray[F], source: openArray[F]) {.inline.} = debug: doAssert dst.len == source.len diff --git a/constantine/math/arithmetic/limbs_crandall.nim b/constantine/math/arithmetic/limbs_crandall.nim new file mode 100644 index 00000000..83915d6c --- /dev/null +++ b/constantine/math/arithmetic/limbs_crandall.nim @@ -0,0 +1,515 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + constantine/platforms/abstractions, + ./limbs, ./limbs_extmul + +when UseASM_X86_32: + import + ./assembly/limbs_asm_crandall_x86, + ./assembly/limbs_asm_crandall_x86_adx_bmi2 + +# No exceptions allowed +{.push raises: [], checks: off.} + +# ############################################################ +# +# Multiprecision Crandall prime / +# Pseudo-Mersenne Prime Arithmetic +# +# ############################################################ +# +# Crandall primes have the form p = 2ᵐ-c + +# Fast reduction +# ------------------------------------------------------------ + +func reduceCrandallPartial_impl[N: static int]( + r: var Limbs[N], + a: array[2*N, SecretWord], # using Limbs lead to type mismatch or ICE + m: static int, + c: static SecretWord) = + ## Partial Reduction modulo p + ## with p with special form 2ᵐ-c + ## called "Crandall prime" or Pseudo-Mersenne Prime in the litterature + ## + ## This is a partial reduction that reduces down to + ## 2ᵐ, i.e. 
it fits in the same number of words as p + ## but values may be up to p+c + ## + ## Crandall primes allow fast reduction from the fact that + ## 2ᵐ-c ≡ 0 (mod p) + ## <=> 2ᵐ ≡ c (mod p) + ## <=> a2ᵐ+b ≡ ac + b (mod p) + + # In our case we split at 2ʷⁿ with w the word size (32 or 64-bit) + # and N the number of words needed to represent the prime + # hence 2ʷⁿ ≡ 2ʷⁿ⁻ᵐc (mod p), we call this cs (c shifted) + # so a2ʷⁿ+b ≡ a2ʷⁿ⁻ᵐc + b (mod p) + # + # With concrete instantiations: + # for p = 2²⁵⁵-19 (Curve25519) + # 2²⁵⁵ ≡ 19 (mod p) + # 2²⁵⁶ ≡ 2*19 (mod p) + # We rewrite the 510-bit multiplication result as + # a2²⁵⁶+b ≡ a*2*19 + b (mod p) + # + # For Bitcoin/Ethereum, p = 2²⁵⁶-0x1000003D1 = + # p = 2²⁵⁶ - (2³²+2⁹+2⁸+2⁷+2⁶+2⁴+1) + # 2²⁵⁶ ≡ 0x1000003D1 (mod p) + # We rewrite the 512-bit multiplication result as + # a2²⁵⁶+b ≡ a*0x1000003D1 + b (mod p) + # + # Note: on a w-bit architecture, c MUST be less than w bits + # This is not the case for secp256k1 on 32-bit + # as it's c = 2³²+2⁹+2⁸+2⁷+2⁶+2⁴+1 + # Though as multiplying by 2³² is free + # we can special-case the problem, if a 32-bit + # platform with add-with-carry is still a valuable target. + # (otherwise unsaturated arithmetic is superior) + + const S = (N*WordBitWidth - m) + const cs = c shl S + static: doAssert 0 <= S and S < WordBitWidth + + var hi: SecretWord + + # First reduction pass + # multiply high-words by c shifted and accumulate in low-words + # assumes cs fits in a single word. + + # (hi, r₀) <- aₙ*cs + a₀ + muladd1(hi, r[0], a[N], cs, a[0]) + staticFor i, 1, N: + # (hi, rᵢ) <- aᵢ₊ₙ*cs + aᵢ + hi + muladd2(hi, r[i], a[i+N], cs, a[i], hi) + + # The first reduction pass may leave a carry in `hi` + # which would be hi*2ʷⁿ ≡ hi*2ʷⁿ⁻ᵐ*c (mod p) + # ≡ hi*cs (mod p) + + # Move all extra bits to hi, i.e. double-word shift + hi = (hi shl S) or (r[N-1] shr (WordBitWidth-S)) + + # High-bits have been "carried" to `hi`, cancel them in r[N-1]. + # Note: up to `c` may remain unreduced. + r[N-1] = r[N-1] and (MaxWord shr S) + + # Partially reduce to up to `m` bits + # We need to fold what's beyond `m` bits + # by repeatedly multiplying it by cs + # We distinguish 2 cases: + # 1. Folding (N..2N)*cs onto 0..N + # may overflow and a 3rd folding is necessary + # 2. Folding (N..2N)*cs onto 0..N + # may not overflow and 2 foldings are all we need.
+ # This is possible: + # - if we don't use the full 2ʷⁿ + # for example we use 255 bits out of 256 available + # - And (2ʷⁿ⁻ᵐ*c)² < 2ʷ + # + # There is a 3rd case that we don't handle + # c > 2ʷ, for example secp256k1 on 32-bit + + when N*WordBitWidth == m: # Secp256k1 only according to eprint/iacr 2018/985 + var t0, t1: SecretWord + var carry: Carry + + # Second pass + mul(t1, t0, hi, c) + addC(carry, r[0], r[0], t0, Carry(0)) + addC(carry, r[1], r[1], t1, carry) + staticFor i, 2, N: + addC(carry, r[i], r[i], Zero, carry) + + # Third pass - the high-word to fold can only be m-bits+1 + mul(t1, t0, SecretWord(carry), c) + addC(carry, r[0], r[0], t0, Carry(0)) + addC(carry, r[1], r[1], t1, carry) + + # We want to ensure that cs² < 2³² on 64-bit or 2¹⁶ on 32-bit + # But doing unsigned cs² may overflow, so we sqrt the rhs instead + elif uint64(cs) < (1'u64 shl (WordBitWidth shr 1)) - 1: + var carry: Carry + + # Second pass + + # hi < cs, and cs² < 2ʷ (2³² on 32-bit or 2⁶⁴ on 64-bit) + # hence hi *= c cannot overflow + hi *= c + addC(carry, r[0], r[0], hi, Carry(0)) + staticFor i, 1, N: + addC(carry, r[i], r[i], Zero, carry) + + else: + {.error: "Not implemented".} + +func reduceCrandallFinal_impl[N: static int]( + a: var Limbs[N], + p: Limbs[N]) = + ## Final Reduction modulo p + ## with p with special form 2ᵐ-c + ## called "Crandall prime" or Pseudo-Mersenne Prime in the litterature + ## + ## This reduces `a` from [0, 2ᵐ) to [0, 2ᵐ-c) + + # 1. Substract p = 2ᵐ-c + var t {.noInit.}: Limbs[N] + let underflow = t.diff(a, p) + + # 2. If underflow, a has the proper reduced result + # otherwise t has the proper reduced result + a.ccopy(t, not SecretBool(underflow)) + +func reduceCrandallFinal[N: static int]( + a: var Limbs[N], + p: Limbs[N]) {.inline.} = + ## Final Reduction modulo p + ## with p with special form 2ᵐ-c + ## called "Crandall prime" or Pseudo-Mersenne Prime in the litterature + ## + ## This reduces `a` from [0, 2ᵐ) to [0, 2ᵐ-c) + when UseASM_X86_32 and a.len in {3..6}: + a.reduceCrandallFinal_asm(p) + else: + a.reduceCrandallFinal_impl(p) + +# High-level API +# ------------------------------------------------------------ + +func mulCranPartialReduce[N: static int]( + r: var Limbs[N], + a, b: Limbs[N], + m: static int, c: static SecretWord) {.inline.} = + when UseASM_X86_64 and a.len in {3..6}: + # ADX implies BMI2 + if ({.noSideEffect.}: hasAdx()): + r.mulCranPartialReduce_asm_adx(a, b, m, c) + else: + r.mulCranPartialReduce_asm(a, b, m, c) + else: + var r2 {.noInit.}: Limbs[2*N] + r2.prod(a, b) + r.reduceCrandallPartial_impl(r2, m, c) + +func mulCran*[N: static int]( + r: var Limbs[N], + a, b: Limbs[N], + p: Limbs[N], + m: static int, c: static SecretWord, + lazyReduce: static bool = false) {.inline.} = + when lazyReduce: + r.mulCranPartialReduce(a, b, m, c) + elif UseASM_X86_64 and a.len in {3..6}: + # ADX implies BMI2 + if ({.noSideEffect.}: hasAdx()): + r.mulCran_asm_adx(a, b, p, m, c) + else: + r.mulCran_asm(a, b, p, m, c) + else: + var r2 {.noInit.}: Limbs[2*N] + r2.prod(a, b) + r.reduceCrandallPartial_impl(r2, m, c) + r.reduceCrandallFinal_impl(p) + +func squareCranPartialReduce[N: static int]( + r: var Limbs[N], + a: Limbs[N], + m: static int, c: static SecretWord) {.inline.} = + when UseASM_X86_64 and a.len in {3..6}: + # ADX implies BMI2 + if ({.noSideEffect.}: hasAdx()): + r.squareCranPartialReduce_asm_adx(a, m, c) + else: + r.squareCranPartialReduce_asm(a, m, c) + else: + var r2 {.noInit.}: Limbs[2*N] + r2.square(a) + r.reduceCrandallPartial_impl(r2, m, c) + +func 
squareCran*[N: static int]( + r: var Limbs[N], + a: Limbs[N], + p: Limbs[N], + m: static int, c: static SecretWord, + lazyReduce: static bool = false) {.inline.} = + when lazyReduce: + r.squareCranPartialReduce(a, m, c) + elif UseASM_X86_64 and a.len in {3..6}: + # ADX implies BMI2 + if ({.noSideEffect.}: hasAdx()): + r.squareCran_asm_adx(a, p, m, c) + else: + r.squareCran_asm(a, p, m, c) + else: + var r2 {.noInit.}: Limbs[2*N] + r2.square(a) + r.reduceCrandallPartial_impl(r2, m, c) + r.reduceCrandallFinal_impl(p) + +# Crandall Exponentiation +# ------------------------------------------------------------ +# We use fixed-window based exponentiation +# that is constant-time: i.e. the number of multiplications +# does not depend on the number of set bits in the exponents +# those are always done and conditionally copied. +# +# The exponent MUST NOT be private data (until audited otherwise) +# - Power attack on RSA, https://www.di.ens.fr/~fouque/pub/ches06.pdf +# - Flush-and-reload on Sliding window exponentiation: https://tutcris.tut.fi/portal/files/8966761/p1639_pereida_garcia.pdf +# - Sliding right into disaster, https://eprint.iacr.org/2017/627.pdf +# - Fixed window leak: https://www.scirp.org/pdf/JCC_2019102810331929.pdf +# - Constructing sliding-windows leak, https://easychair.org/publications/open/fBNC +# +# For pairing curves, this is the case since exponentiation is only +# used for inversion via the Little Fermat theorem. +# For RSA, some exponentiations uses private exponents. +# +# Note: +# - Implementation closely follows Thomas Pornin's BearSSL +# - Apache Milagro Crypto has an alternative implementation +# that is more straightforward however: +# - the exponent hamming weight is used as loop bounds +# - the baseᵏ is stored at each index of a temp table of size k +# - the baseᵏ to use is indexed by the hamming weight +# of the exponent, leaking this to cache attacks +# - in contrast BearSSL touches the whole table to +# hide the actual selection + +template checkPowScratchSpaceLen(len: int) = + ## Checks that there is a minimum of scratchspace to hold the temporaries + debug: + assert len >= 2, "Internal Error: the scratchspace for powmod should be equal or greater than 2" + +func getWindowLen(bufLen: int): uint = + ## Compute the maximum window size that fits in the scratchspace buffer + checkPowScratchSpaceLen(bufLen) + result = 5 + while (1 shl result) + 1 > bufLen: + dec result + +func powCranPrologue( + a: var Limbs, + scratchspace: var openarray[Limbs], + m: static int, c: static SecretWord): uint = + ## Setup the scratchspace + ## Returns the fixed-window size for exponentiation with window optimization. 
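+ ##
+ ## For example, per `getWindowLen` above, the returned window w is the largest
+ ## value (at most 5) such that 2ʷ + 1 ≤ scratchspace.len: a scratchspace of
+ ## 33 entries yields the full 5-bit window, one of 16 entries yields a 3-bit window.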
+ result = scratchspace.len.getWindowLen() + # Precompute window content, special case for window = 1 + # (i.e. scratchspace only has space for 2 temporaries) + # The content of scratchspace[1+k] is set to aᵏ + # with scratchspace[0] untouched + if result == 1: + scratchspace[1] = a + else: + scratchspace[2] = a + for k in 2 ..< 1 shl result: + scratchspace[k+1].mulCranPartialReduce(scratchspace[k], a, m, c) + + # Set a to one + a.setOne() + +func powCranSquarings( + a: var Limbs, + exponent: openarray[byte], + tmp: var Limbs, + window: uint, + acc, acc_len: var uint, + e: var int, + m: static int, c: static SecretWord + ): tuple[k, bits: uint] {.inline.}= + ## Squaring step of exponentiation by squaring + ## Get the next k bits in range [1, window) + ## Square k times + ## Returns the number of squarings done and the corresponding bits + ## + ## Updates iteration variables and accumulators + # Due to the high number of parameters, + # forcing this inline actually reduces the code size + # + # ⚠️: Extreme care should be used to not leak + # the exponent bits nor its real bitlength + # i.e. if the exponent is zero but encoded in a + # 256-bit integer, only "256" should leak + # as for some applications like RSA + # the exponent might be the user secret key. + + # Get the next bits + # acc/acc_len must be uint to avoid Nim runtime checks leaking bits + # e is public + var k = window + if acc_len < window: + if e < exponent.len: + acc = (acc shl 8) or exponent[e].uint + inc e + acc_len += 8 + else: # Drained all exponent bits + k = acc_len + + let bits = (acc shr (acc_len - k)) and ((1'u shl k) - 1) + acc_len -= k + + # We have k bits and can do k squarings, all with lazy (partial) reduction. + for i in 0 ..< k: + a.squareCranPartialReduce(a, m, c) + + return (k, bits) + +func powCran*( + a: var Limbs, + exponent: openarray[byte], + p: Limbs, + scratchspace: var openarray[Limbs], + m: static int, c: static SecretWord, + lazyReduce: static bool = false) = + ## Modular exponentiation a <- a^exponent (mod M) + ## + ## This uses fixed-window optimization if possible + ## + ## - On input ``a`` is the base, on output a = a^exponent (mod M) + ## - ``exponent`` is the exponent in big-endian canonical format (octet-string) + ## Use ``marshal`` for conversion + ## - ``scratchspace`` with k the window bitsize, k up to 5. + ## This is a buffer that can hold 2ᵏ + 1 big-ints. + ## A window of 1 bit (no window optimization) requires only 2 big-ints + ## + ## Note that the best window size requires benchmarking and is a tradeoff between + ## - performance + ## - stack usage + ## - precomputation + let window = powCranPrologue(a, scratchspace, m, c) + + # We process bits from most to least significant. + # At each loop iteration we have acc_len bits in acc. + # To maintain constant-time execution, the number of iterations, + # the number of operations and the memory accesses should be the same + # regardless of acc & acc_len + var + acc, acc_len: uint + e = 0 + while acc_len > 0 or e < exponent.len: + let (k, bits) = powCranSquarings( + a, exponent, + scratchspace[0], window, + acc, acc_len, e, + m, c) + + # Window lookup: we set scratchspace[1] to the lookup value. + # If the window length is 1, then it's already set. + if window > 1: + # otherwise we need a constant-time lookup + # in particular we need the same memory accesses, we can't + # just index the openarray with the bits to avoid cache attacks.
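+ # Illustration: with a 3-bit window and bits = 5, the loop below reads
+ # scratchspace[2] .. scratchspace[8] unconditionally; only the iteration
+ # with i == 5 actually copies (scratchspace[6] holds a⁵), so the memory
+ # access pattern does not depend on the secret value of `bits`.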
+ for i in 1 ..< 1 shl k: + let ctl = SecretWord(i) == SecretWord(bits) + scratchspace[1].ccopy(scratchspace[1+i], ctl) + + # Multiply with the looked-up value + # we keep the product only if the exponent bits are not all zeroes + scratchspace[0].mulCranPartialReduce(a, scratchspace[1], m, c) + a.ccopy(scratchspace[0], SecretWord(bits).isNonZero()) + + when not lazyReduce: + a.reduceCrandallFinal_impl(p) + +func powCran_vartime*( + a: var Limbs, + exponent: openarray[byte], + p: Limbs, + scratchspace: var openarray[Limbs], + m: static int, c: static SecretWord, + lazyReduce: static bool = false) = + ## Modular exponentiation a <- a^exponent (mod M) + ## + ## Warning ⚠️ : + ## This is an optimization for public exponent + ## Otherwise bits of the exponent can be retrieved with: + ## - memory access analysis + ## - power analysis + ## - timing analysis + + # TODO: scratchspace[1] is unused when window > 1 + + let window = powCranPrologue(a, scratchspace, m, c) + + var + acc, acc_len: uint + e = 0 + while acc_len > 0 or e < exponent.len: + let (_, bits) = powCranSquarings( + a, exponent, + scratchspace[0], window, + acc, acc_len, e, + m, c) + + ## Warning ⚠️: Exposes the exponent bits + if bits != 0: + if window > 1: + scratchspace[0].mulCranPartialReduce(a, scratchspace[1+bits], m, c) + else: + # scratchspace[1] holds the original `a` + scratchspace[0].mulCranPartialReduce(a, scratchspace[1], m, c) + a = scratchspace[0] + + when not lazyReduce: + a.reduceCrandallFinal_impl(p) + +# Lazily reduced arithmetic +# ------------------------------------------------------------ + +# We can use special lazily reduced arithmetic +# where reduction is only done when we overflow 2ʷⁿ +# with w the word bitwidth and n the number of words +# to represent p. +# For example for Curve25519, p = 2²⁵⁵-19 and 2ʷⁿ=2²⁵⁶ +# Hence reduction will only happen when overflowing 2²⁵⁶ bits +# +# However: +# - Restricting it to mul/squaring in addchain +# makes it consistent with generic Montgomery representation +# - We don't really gain something for addition and substraction +# as modular addition needs: +# 1. One pass of add-with-carry +# 2. One pass of sub-with-borrow +# 3. One pass of conditional mov +# And lazily reduced needs +# the same first 2 pass and replace the third with +# masking + single addition +# Total number of instruction doesn't seem to change +# and conditional moves can be issued 2 per cycle +# so we save ~1 clock cycle + +func sum_crandall_impl[N: static int]( + r: var Limbs[N], a, b: Limbs[N], + m: int, + c: SecretWord) {.used.} = + ## Lazily reduced addition + ## Proof-of-concept. Currently unused. + let S = (N*WordBitWidth - m) + let cs = c shl S + debug: doAssert 0 <= S and S < WordBitWidth + + let overflow1 = r.sum(a, b) + # If there is an overflow, substract 2ˢp = 2ʷⁿ - 2ˢc + # with w the word bitwidth and n the number of words + # to represent p. 
+ # For example for Curve25519, p = 2²⁵⁵-19 and 2ʷⁿ=2²⁵⁶ + + # 0x0000 if no overflow or 0xFFFF if overflow + let mask1 = -SecretWord(overflow1) + + # 2ˢp = 2ʷⁿ - 2ˢc ≡ 2ˢc (mod p) + let overflow2 = r.add(mask1 and cs) + let mask2 = -SecretWord(overflow2) + + # We may carry again, but we just did -2ˢc + # so adding back 2ˢc for the extra 2ʷⁿ bit cannot carry + # to higher limbs + r[0] += mask2 and cs diff --git a/constantine/math/arithmetic/limbs_montgomery.nim b/constantine/math/arithmetic/limbs_montgomery.nim index 90d10edc..9438f031 100644 --- a/constantine/math/arithmetic/limbs_montgomery.nim +++ b/constantine/math/arithmetic/limbs_montgomery.nim @@ -54,14 +54,14 @@ func redc2xMont_CIOS[N: static int]( r: var array[N, SecretWord], a: array[N*2, SecretWord], M: array[N, SecretWord], - m0ninv: BaseType, skipFinalSub: static bool = false) = + m0ninv: BaseType, lazyReduce: static bool = false) = ## Montgomery reduce a double-precision bigint modulo M ## ## This maps - ## - [0, 4p²) -> [0, 2p) with skipFinalSub + ## - [0, 4p²) -> [0, 2p) with lazyReduce ## - [0, 4p²) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # - Analyzing and Comparing Montgomery Multiplication Algorithms # Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. # http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf @@ -115,7 +115,7 @@ func redc2xMont_CIOS[N: static int]( addC(carry, res[i], a[i+N], res[i], carry) # Final substraction - when not skipFinalSub: + when not lazyReduce: discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M)) r = res @@ -123,14 +123,14 @@ func redc2xMont_Comba[N: static int]( r: var array[N, SecretWord], a: array[N*2, SecretWord], M: array[N, SecretWord], - m0ninv: BaseType, skipFinalSub: static bool = false) {.used.} = + m0ninv: BaseType, lazyReduce: static bool = false) {.used.} = ## Montgomery reduce a double-precision bigint modulo M ## ## This maps - ## - [0, 4p²) -> [0, 2p) with skipFinalSub + ## - [0, 4p²) -> [0, 2p) with lazyReduce ## - [0, 4p²) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # We use Product Scanning / Comba multiplication var t, u, v = Zero var carry: Carry @@ -166,14 +166,14 @@ func redc2xMont_Comba[N: static int]( addC(carry, z[N-1], v, a[2*N-1], Carry(0)) # Final substraction - when not skipFinalSub: + when not lazyReduce: discard z.csub(M, SecretBool(carry) or not(z < M)) r = z # Montgomery Multiplication # ------------------------------------------------------------ -func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = +func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) = ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS) ## and no-carry optimization. ## This requires the most significant word of the Modulus @@ -181,10 +181,10 @@ func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipF ## https://hackmd.io/@gnark/modular_multiplication ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipFinalSub + ## - [0, 2p) -> [0, 2p) with lazyReduce ## - [0, 2p) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # We want all the computation to be kept in registers # hence we use a temporary `t`, hoping that the compiler does it. 
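The rename from skipFinalSub to lazyReduce is mechanical, but the contract it documents is worth keeping in mind: a call with lazyReduce = true may leave its result in [0, 2p), and only a call with the default lazyReduce = false guarantees a canonical value in [0, p). A minimal field-level sketch of the intended chaining (the helper name, the choice of Fp[BN254_Snarks] and the import paths are illustrative assumptions, not taken from this patch):

    import constantine/math/arithmetic, constantine/named/algebras  # module paths assumed

    func mulSquareMul(a, b, c: Fp[BN254_Snarks]): Fp[BN254_Snarks] =
      ## Computes (a*b)²*c, keeping intermediates lazily reduced
      var t {.noInit.}: Fp[BN254_Snarks]
      t.prod(a, b, lazyReduce = true)   # intermediate may lie in [0, 2p)
      t.square(lazyReduce = true)       # still lazily reduced
      result.prod(t, c)                 # default lazyReduce = false: reduced to [0, p)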
@@ -208,11 +208,11 @@ func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipF t[N-1] = C + A - when not skipFinalSub: + when not lazyReduce: discard t.csub(M, not(t < M)) r = t -func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) {.used.} = +func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) {.used.} = ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS) # - Analyzing and Comparing Montgomery Multiplication Algorithms # Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. @@ -257,18 +257,18 @@ func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: # t[N+1] can only be non-zero in the intermediate computation # since it is immediately reduce to t[N] at the end of each "i" iteration # However if t[N] is non-zero we have t > M - when not skipFinalSub: + when not lazyReduce: discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)ʷ r = t -func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = +func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) = ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS) ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipFinalSub + ## - [0, 2p) -> [0, 2p) with lazyReduce ## - [0, 2p) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # - Architectural Enhancements for Montgomery # Multiplication on Embedded RISC Processors # Johann Großschädl and Guy-Armand Kamendje, 2003 @@ -301,22 +301,22 @@ func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: u = t t = Zero - when not skipFinalSub: + when not lazyReduce: discard z.csub(M, v.isNonZero() or not(z < M)) r = z func sumprodMont_CIOS_spare2bits[K: static int]( r: var Limbs, a, b: array[K, Limbs], M: Limbs, m0ninv: BaseType, - skipFinalSub: static bool = false) = - ## Compute r = ⅀aᵢ.bᵢ (mod M) (suim of products) + lazyReduce: static bool = false) = + ## Compute r = ⅀aᵢ.bᵢ (mod M) (sum of products) ## This requires 2 unused bits in the field element representation ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipFinalSub + ## - [0, 2p) -> [0, 2p) with lazyReduce ## - [0, 2p) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # We want all the computation to be kept in registers # hence we use a temporary `t`, hoping that the compiler does the right thing™. 
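For context, this fused sum-of-products kernel backs the field-level `sumprod` shown earlier in finite_fields.nim (where the Crandall branch is currently a compile-time "Not implemented" error, so only Montgomery-backed fields use it). A short usage sketch, reusing the assumptions of the previous snippet (illustrative helper name and curve):

    func dot2(a, b: array[2, Fp[BN254_Snarks]]): Fp[BN254_Snarks] =
      ## result <- a[0]*b[0] + a[1]*b[1] (mod p), in one fused pass
      result.sumprod(a, b)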
@@ -358,7 +358,7 @@ func sumprodMont_CIOS_spare2bits[K: static int]( # (_,t[N-1]) <- t[N] + C t[N-1] = tN + C - when not skipFinalSub: + when not lazyReduce: discard t.csub(M, not(t < M)) r = t @@ -452,33 +452,34 @@ func redc2xMont*[N: static int]( a: array[N*2, SecretWord], M: array[N, SecretWord], m0ninv: BaseType, - spareBits: static int, skipFinalSub: static bool = false) {.inline.} = + spareBits: static int, lazyReduce: static bool = false) {.inline.} = ## Montgomery reduce a double-precision bigint modulo M - const skipFinalSub = skipFinalSub and spareBits >= 2 + const lazyReduce = lazyReduce and spareBits >= 2 when UseASM_X86_64 and r.len <= 6: # ADX implies BMI2 if ({.noSideEffect.}: hasAdx()): - redcMont_asm_adx(r, a, M, m0ninv, spareBits, skipFinalSub) + redcMont_asm_adx(r, a, M, m0ninv, spareBits, lazyReduce) else: when r.len in {3..6}: - redcMont_asm(r, a, M, m0ninv, spareBits, skipFinalSub) + # TODO: Assembly faster than GCC but slower than Clang + redcMont_asm(r, a, M, m0ninv, spareBits, lazyReduce) else: - redc2xMont_CIOS(r, a, M, m0ninv, skipFinalSub) + redc2xMont_CIOS(r, a, M, m0ninv, lazyReduce) # redc2xMont_Comba(r, a, M, m0ninv) - elif UseASM_X86_64 and r.len in {3..6}: + elif UseASM_X86_32 and r.len in {3..6}: # TODO: Assembly faster than GCC but slower than Clang - redcMont_asm(r, a, M, m0ninv, spareBits, skipFinalSub) + redcMont_asm(r, a, M, m0ninv, spareBits, lazyReduce) else: - redc2xMont_CIOS(r, a, M, m0ninv, skipFinalSub) - # redc2xMont_Comba(r, a, M, m0ninv, skipFinalSub) + redc2xMont_CIOS(r, a, M, m0ninv, lazyReduce) + # redc2xMont_Comba(r, a, M, m0ninv, lazyReduce) func mulMont*( r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool = false) {.inline.} = + lazyReduce: static bool = false) {.inline.} = ## Compute r <- a*b (mod M) in the Montgomery domain ## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64 ## @@ -498,28 +499,28 @@ func mulMont*( # i.e. c'R <- a'R b'R * R^-1 (mod M) in the natural domain # as in the Montgomery domain all numbers are scaled by R - const skipFinalSub = skipFinalSub and spareBits >= 2 + const lazyReduce = lazyReduce and spareBits >= 2 when spareBits >= 1: when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling # ADX implies BMI2 if ({.noSideEffect.}: hasAdx()): - mulMont_CIOS_sparebit_asm_adx(r, a, b, M, m0ninv, skipFinalSub) + mulMont_CIOS_sparebit_asm_adx(r, a, b, M, m0ninv, lazyReduce) else: - mulMont_CIOS_sparebit_asm(r, a, b, M, m0ninv, skipFinalSub) + mulMont_CIOS_sparebit_asm(r, a, b, M, m0ninv, lazyReduce) else: - mulMont_CIOS_sparebit(r, a, b, M, m0ninv, skipFinalSub) + mulMont_CIOS_sparebit(r, a, b, M, m0ninv, lazyReduce) else: - mulMont_FIPS(r, a, b, M, m0ninv, skipFinalSub) + mulMont_FIPS(r, a, b, M, m0ninv, lazyReduce) func squareMont*[N](r: var Limbs[N], a, M: Limbs[N], m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool = false) {.inline.} = + lazyReduce: static bool = false) {.inline.} = ## Compute r <- a^2 (mod M) in the Montgomery domain ## `m0ninv` = -1/M (mod SecretWord). 
Our words are 2^31 or 2^63 - const skipFinalSub = skipFinalSub and spareBits >= 2 + const lazyReduce = lazyReduce and spareBits >= 2 when UseASM_X86_64 and a.len in {4, 6}: # ADX implies BMI2 @@ -528,37 +529,37 @@ func squareMont*[N](r: var Limbs[N], a, M: Limbs[N], # which uses unfused squaring then Montgomery reduction # is slightly slower than fused Montgomery multiplication when spareBits >= 1: - mulMont_CIOS_sparebit_asm_adx(r, a, a, M, m0ninv, skipFinalSub) + mulMont_CIOS_sparebit_asm_adx(r, a, a, M, m0ninv, lazyReduce) else: - squareMont_CIOS_asm_adx(r, a, M, m0ninv, spareBits, skipFinalSub) + squareMont_CIOS_asm_adx(r, a, M, m0ninv, spareBits, lazyReduce) else: - squareMont_CIOS_asm(r, a, M, m0ninv, spareBits, skipFinalSub) + squareMont_CIOS_asm(r, a, M, m0ninv, spareBits, lazyReduce) elif UseASM_X86_64: var r2x {.noInit.}: Limbs[2*N] r2x.square(a) - r.redc2xMont(r2x, M, m0ninv, spareBits, skipFinalSub) + r.redc2xMont(r2x, M, m0ninv, spareBits, lazyReduce) else: - mulMont(r, a, a, M, m0ninv, spareBits, skipFinalSub) + mulMont(r, a, a, M, m0ninv, spareBits, lazyReduce) func sumprodMont*[N: static int]( r: var Limbs, a, b: array[N, Limbs], M: Limbs, m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool = false) = + lazyReduce: static bool = false) = ## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products) when spareBits >= 2: when UseASM_X86_64 and r.len in {2 .. 6}: if ({.noSideEffect.}: hasAdx()): - r.sumprodMont_CIOS_spare2bits_asm_adx(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits_asm_adx(a, b, M, m0ninv, lazyReduce) else: - r.sumprodMont_CIOS_spare2bits_asm(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits_asm(a, b, M, m0ninv, lazyReduce) else: - r.sumprodMont_CIOS_spare2bits(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits(a, b, M, m0ninv, lazyReduce) else: - r.mulMont(a[0], b[0], M, m0ninv, spareBits, skipFinalSub = false) + r.mulMont(a[0], b[0], M, m0ninv, spareBits, lazyReduce = false) var ri {.noInit.}: Limbs for i in 1 ..< N: - ri.mulMont(a[i], b[i], M, m0ninv, spareBits, skipFinalSub = false) + ri.mulMont(a[i], b[i], M, m0ninv, spareBits, lazyReduce = false) var overflowed = SecretBool r.add(ri) overflowed = overflowed or not(r < M) discard r.csub(M, overflowed) @@ -616,7 +617,7 @@ func getMont*(r: var Limbs, a, M, r2modM: Limbs, # that range is not valid with the no-carry optimization, # hence an unreduced input that uses 256-bit while prime is 254-bit # can have an incorrect representation. - mulMont_FIPS(r, a, r2ModM, M, m0ninv, skipFinalSub = false) + mulMont_FIPS(r, a, r2ModM, M, m0ninv, lazyReduce = false) # Montgomery Modular Exponentiation # ------------------------------------------ @@ -726,7 +727,7 @@ func powMontSquarings( # We have k bits and can do k squaring, skip final substraction for first k-1 ones. 
for i in 0 ..< k: - a.squareMont(a, M, m0ninv, spareBits) # TODO: skipFinalSub + a.squareMont(a, M, m0ninv, spareBits) # TODO: lazyReduce return (k, bits) diff --git a/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim b/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim index 043b326b..e5ba14cd 100644 --- a/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim +++ b/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim @@ -490,7 +490,7 @@ func sparseVectorAddition[ECaff]( elif i == numScheduled-1: accumDen[i].prod(accumDen[i-1], lambdas[i].den) else: - accumDen[i].prod(accumDen[i-1], lambdas[i].den, skipFinalSub = true) + accumDen[i].prod(accumDen[i-1], lambdas[i].den, lazyReduce = true) # Step 3: Batch invert var accInv {.noInit.}: F @@ -516,8 +516,8 @@ func sparseVectorAddition[ECaff]( continue # Compute lambda - destroys accumDen[i] - accumDen[i].prod(accInv, accumDen[i-1], skipFinalSub = true) - accumDen[i].prod(accumDen[i], lambdas[i].num, skipFinalSub = true) + accumDen[i].prod(accInv, accumDen[i-1], lazyReduce = true) + accumDen[i].prod(accumDen[i], lambdas[i].num, lazyReduce = true) # Compute EC addition var r{.noInit.}: ECaff @@ -527,7 +527,7 @@ func sparseVectorAddition[ECaff]( buckets[sps[i].bucket] = r # Next iteration - accInv.prod(accInv, lambdas[i].den, skipFinalSub = true) + accInv.prod(accInv, lambdas[i].den, lazyReduce = true) block: # tail if specialCases[0] == kInfLhs: @@ -543,7 +543,7 @@ func sparseVectorAddition[ECaff]( bucketStatuses[sps[0].bucket].excl(kAffine) else: # Compute lambda - accumDen[0].prod(lambdas[0].num, accInv, skipFinalSub = true) + accumDen[0].prod(lambdas[0].num, accInv, lazyReduce = true) # Compute EC addition var r{.noInit.}: ECaff diff --git a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim index 9475e965..a863d7e3 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim @@ -54,9 +54,9 @@ func batchAffine*[F, G]( z.csetOne(zeroes[i]) if i != N-1: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = true) else: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = false) var accInv {.noInit.}: F accInv.inv(affs[N-1].x) @@ -64,7 +64,7 @@ func batchAffine*[F, G]( for i in countdown(N-1, 1): # Extract 1/Pᵢ var invi {.noInit.}: F - invi.prod(accInv, affs[i-1].x, skipFinalSub = true) + invi.prod(accInv, affs[i-1].x, lazyReduce = true) invi.csetZero(zeroes[i]) # Now convert Pᵢ to affine @@ -74,7 +74,7 @@ func batchAffine*[F, G]( # next iteration invi = projs[i].z invi.csetOne(zeroes[i]) - accInv.prod(accInv, invi, skipFinalSub = true) + accInv.prod(accInv, invi, lazyReduce = true) block: # tail accInv.csetZero(zeroes[0]) @@ -114,9 +114,9 @@ func batchAffine*[F, G]( z.csetOne(zeroes[i]) if i != N-1: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = true) else: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = false) var accInv {.noInit.}: F accInv.inv(affs[N-1].x) @@ -124,27 +124,27 @@ func batchAffine*[F, G]( for i in countdown(N-1, 1): # Extract 1/Pᵢ var invi {.noInit.}: F - invi.prod(accInv, affs[i-1].x, skipFinalSub = true) + invi.prod(accInv, affs[i-1].x, lazyReduce = true) invi.csetZero(zeroes[i]) # Now convert Pᵢ to affine var invi2 {.noinit.}: F - invi2.square(invi, 
skipFinalSub = true) + invi2.square(invi, lazyReduce = true) affs[i].x.prod(jacs[i].x, invi2) - invi.prod(invi, invi2, skipFinalSub = true) + invi.prod(invi, invi2, lazyReduce = true) affs[i].y.prod(jacs[i].y, invi) # next iteration invi = jacs[i].z invi.csetOne(zeroes[i]) - accInv.prod(accInv, invi, skipFinalSub = true) + accInv.prod(accInv, invi, lazyReduce = true) block: # tail var invi2 {.noinit.}: F accInv.csetZero(zeroes[0]) - invi2.square(accInv, skipFinalSub = true) + invi2.square(accInv, lazyReduce = true) affs[0].x.prod(jacs[0].x, invi2) - accInv.prod(accInv, invi2, skipFinalSub = true) + accInv.prod(accInv, invi2, lazyReduce = true) affs[0].y.prod(jacs[0].y, accInv) func batchAffine*[N: static int, F, G]( @@ -309,7 +309,7 @@ func accum_half_vartime[F; G: static Subgroup]( elif i == N-1: points[q].y.prod(points[q_prev].y, lambdas[i].den) else: - points[q].y.prod(points[q_prev].y, lambdas[i].den, skipFinalSub = true) + points[q].y.prod(points[q_prev].y, lambdas[i].den, lazyReduce = true) # Step 3: batch invert var accInv {.noInit.}: F @@ -338,8 +338,8 @@ func accum_half_vartime[F; G: static Subgroup]( continue # Compute lambda - points[q].y.prod(accInv, points[q_prev].y, skipFinalSub = true) - points[q].y.prod(points[q].y, lambdas[i].num, skipFinalSub = true) + points[q].y.prod(accInv, points[q_prev].y, lazyReduce = true) + points[q].y.prod(points[q].y, lambdas[i].num, lazyReduce = true) # Compute EC addition var r{.noInit.}: EC_ShortW_Aff[F, G] @@ -349,7 +349,7 @@ func accum_half_vartime[F; G: static Subgroup]( points[i] = r # Next iteration - accInv.prod(accInv, lambdas[i].den, skipFinalSub = true) + accInv.prod(accInv, lambdas[i].den, lazyReduce = true) block: # Tail let i = 0 @@ -360,7 +360,7 @@ func accum_half_vartime[F; G: static Subgroup]( recallSpecialCase(i, p, q) else: # Compute lambda - points[q].y.prod(lambdas[0].num, accInv, skipFinalSub = true) + points[q].y.prod(lambdas[0].num, accInv, lazyReduce = true) # Compute EC addition var r{.noInit.}: EC_ShortW_Aff[F, G] diff --git a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim index 682b5711..0810055d 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim @@ -72,15 +72,15 @@ func `==`*(P, Q: EC_ShortW_Jac): SecretBool {.meter.} = var z1z1 {.noInit.}, z2z2 {.noInit.}: F var a{.noInit.}, b{.noInit.}: F - z1z1.square(P.z, skipFinalSub = true) - z2z2.square(Q.z, skipFinalSub = true) + z1z1.square(P.z, lazyReduce = true) + z2z2.square(Q.z, lazyReduce = true) a.prod(P.x, z2z2) b.prod(Q.x, z1z1) result = a == b - a.prod(P.y, Q.z, skipFinalSub = true) - b.prod(Q.y, P.z, skipFinalSub = true) + a.prod(P.y, Q.z, lazyReduce = true) + b.prod(Q.y, P.z, lazyReduce = true) a *= z2z2 b *= z1z1 result = result and a == b @@ -120,9 +120,9 @@ func trySetFromCoordsXandZ*[F; G]( result = sqrt_if_square(P.y) var z2 {.noInit.}: F - z2.square(z, skipFinalSub = true) + z2.square(z, lazyReduce = true) P.x.prod(x, z2) - P.y.prod(P.y, z2, skipFinalSub = true) + P.y.prod(P.y, z2, lazyReduce = true) P.y *= z P.z = z @@ -231,13 +231,13 @@ template sumImpl[F; G: static Subgroup]( block: # Addition-only, check for exceptional cases var Z2Z2 {.noInit.}, U2 {.noInit.}, S2 {.noInit.}: F - Z2Z2.square(Q.z, skipFinalSub = true) - S1.prod(Q.z, Z2Z2, skipFinalSub = true) + Z2Z2.square(Q.z, lazyReduce = true) + S1.prod(Q.z, Z2Z2, lazyReduce = true) S1 *= P.y # S₁ = Y₁*Z₂³ U1.prod(P.x, Z2Z2) # U₁ = X₁*Z₂² - 
Z1Z1.square(P.z, skipFinalSub = not CoefA_eq_minus3) - S2.prod(P.z, Z1Z1, skipFinalSub = true) + Z1Z1.square(P.z, lazyReduce = not CoefA_eq_minus3) + S2.prod(P.z, Z1Z1, lazyReduce = true) S2 *= Q.y # S₂ = Y₂*Z₁³ U2.prod(Q.x, Z1Z1) # U₂ = X₂*Z₁² @@ -301,7 +301,7 @@ template sumImpl[F; G: static Subgroup]( HHH_or_Mpre.prod(a, b) # HHH or X₁² # Assuming doubling path - a.square(HHH_or_Mpre, skipFinalSub = true) + a.square(HHH_or_Mpre, lazyReduce = true) a *= HHH_or_Mpre # a = 3X₁² b.square(Z1Z1) b.mulCheckSparse(CoefA) # b = αZZ, with α the "a" coefficient of the curve @@ -453,8 +453,8 @@ func mixedSum*[F; G: static Subgroup]( U1 = P.x S1 = P.y - Z1Z1.square(P.z, skipFinalSub = not CoefA_eq_minus3) - S2.prod(P.z, Z1Z1, skipFinalSub = true) + Z1Z1.square(P.z, lazyReduce = not CoefA_eq_minus3) + S2.prod(P.z, Z1Z1, lazyReduce = true) S2 *= Q.y # S₂ = Y₂*Z₁³ U2.prod(Q.x, Z1Z1) # U₂ = X₂*Z₁² @@ -518,7 +518,7 @@ func mixedSum*[F; G: static Subgroup]( HHH_or_Mpre.prod(a, b) # HHH or X₁² # Assuming doubling path - a.square(HHH_or_Mpre, skipFinalSub = true) + a.square(HHH_or_Mpre, lazyReduce = true) a *= HHH_or_Mpre # a = 3X₁² b.square(Z1Z1) b.mulCheckSparse(CoefA) # b = αZZ, with α the "a" coefficient of the curve @@ -669,10 +669,10 @@ func affine*[F; G]( jac: EC_ShortW_Jac[F, G]) {.meter.} = var invZ {.noInit.}, invZ2{.noInit.}: F invZ.inv(jac.z) - invZ2.square(invZ, skipFinalSub = true) + invZ2.square(invZ, lazyReduce = true) aff.x.prod(jac.x, invZ2) - invZ.prod(invZ, invZ2, skipFinalSub = true) + invZ.prod(invZ, invZ2, lazyReduce = true) aff.y.prod(jac.y, invZ) func fromAffine*[F; G]( @@ -741,7 +741,7 @@ func sum_vartime*[F; G: static Subgroup]( var U {.noInit.}, S{.noInit.}, H{.noInit.}, R{.noInit.}: F if not isPz1: # case Z₁ != 1 - R.square(p.z, skipFinalSub = true) # Z₁Z₁ = Z₁² + R.square(p.z, lazyReduce = true) # Z₁Z₁ = Z₁² if isQz1: # case Z₂ = 1 U = p.x # U₁ = X₁*Z₂Z₂ if isPz1: # case Z₁ = Z₂ = 1 @@ -751,19 +751,19 @@ func sum_vartime*[F; G: static Subgroup]( H -= U # H = U₂-U₁ S = p.y # S₁ = Y₁*Z₂*Z₂Z₂ else: # case Z₂ != 1 - S.square(q.z, skipFinalSub = true) + S.square(q.z, lazyReduce = true) U.prod(p.x, S) # U₁ = X₁*Z₂Z₂ if isPz1: H = q.x else: H.prod(q.x, R) H -= U # H = U₂-U₁ - S.prod(S, q.z, skipFinalSub = true) + S.prod(S, q.z, lazyReduce = true) S *= p.y # S₁ = Y₁*Z₂*Z₂Z₂ if isPz1: R = q.y else: - R.prod(R, p.z, skipFinalSub = true) + R.prod(R, p.z, lazyReduce = true) R *= q.y # S₂ = Y₂*Z₁*Z₁Z₁ R -= S # R = S₂-S₁ @@ -778,7 +778,7 @@ func sum_vartime*[F; G: static Subgroup]( var HHH{.noInit.}: F template V: untyped = U - HHH.square(H, skipFinalSub = true) + HHH.square(H, lazyReduce = true) V *= HHH # V = U₁*HH HHH *= H # HHH = H*HH @@ -804,7 +804,7 @@ func sum_vartime*[F; G: static Subgroup]( if isQz1: r.z.prod(H, p.z) else: - r.z.prod(p.z, q.z, skipFinalSub = true) + r.z.prod(p.z, q.z, lazyReduce = true) r.z *= H func mixedSum_vartime*[F; G: static Subgroup]( @@ -857,7 +857,7 @@ func mixedSum_vartime*[F; G: static Subgroup]( var U {.noInit.}, S{.noInit.}, H{.noInit.}, R{.noInit.}: F if not isPz1: # case Z₁ != 1 - R.square(p.z, skipFinalSub = true) # Z₁Z₁ = Z₁² + R.square(p.z, lazyReduce = true) # Z₁Z₁ = Z₁² U = p.x # U₁ = X₁*Z₂Z₂ if isPz1: # case Z₁ = Z₂ = 1 @@ -870,7 +870,7 @@ func mixedSum_vartime*[F; G: static Subgroup]( if isPz1: R = q.y else: - R.prod(R, p.z, skipFinalSub = true) + R.prod(R, p.z, lazyReduce = true) R *= q.y # S₂ = Y₂*Z₁*Z₁Z₁ R -= S # R = S₂-S₁ @@ -885,7 +885,7 @@ func mixedSum_vartime*[F; G: static Subgroup]( var HHH{.noInit.}: F template V: untyped = U - 
HHH.square(H, skipFinalSub = true) + HHH.square(H, lazyReduce = true) V *= HHH # V = U₁*HH HHH *= H # HHH = H*HH diff --git a/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim b/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim index 8c37b492..dc2c54cd 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim @@ -333,8 +333,8 @@ func affine*[F; G]( jacext: EC_ShortW_JacExt[F, G]) {.meter.} = var invZZ {.noInit.}, invZZZ{.noInit.}: F invZZZ.inv(jacext.zzz) - invZZ.prod(jacext.zz, invZZZ, skipFinalSub = true) - invZZ.square(skipFinalSub = true) + invZZ.prod(jacext.zz, invZZZ, lazyReduce = true) + invZZ.square(lazyReduce = true) aff.x.prod(jacext.x, invZZ) aff.y.prod(jacext.y, invZZZ) diff --git a/constantine/math/elliptic/ec_shortweierstrass_projective.nim b/constantine/math/elliptic/ec_shortweierstrass_projective.nim index 912959f6..afe14c54 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_projective.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_projective.nim @@ -544,7 +544,7 @@ func sum_vartime*[F; G: static Subgroup]( var VVV{.noInit.}: F - VVV.square(V, skipFinalSub = true) + VVV.square(V, lazyReduce = true) R *= VVV VVV *= V @@ -564,7 +564,7 @@ func sum_vartime*[F; G: static Subgroup]( A.prod(U, q.z) r.z.prod(VVV, q.z) else: - r.z.prod(p.z, q.z, skipFinalSub = true) + r.z.prod(p.z, q.z, lazyReduce = true) A.prod(U, r.z) r.z *= VVV @@ -642,7 +642,7 @@ func mixedSum_vartime*[F; G: static Subgroup]( var VVV{.noInit.}: F - VVV.square(V, skipFinalSub = true) + VVV.square(V, lazyReduce = true) R *= VVV VVV *= V diff --git a/constantine/math/elliptic/ec_twistededwards_batch_ops.nim b/constantine/math/elliptic/ec_twistededwards_batch_ops.nim index 3e0ff206..b4566b02 100644 --- a/constantine/math/elliptic/ec_twistededwards_batch_ops.nim +++ b/constantine/math/elliptic/ec_twistededwards_batch_ops.nim @@ -50,9 +50,9 @@ func batchAffine*[F]( z.csetOne(zeroes[i]) if i != N-1: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = true) else: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = false) var accInv {.noInit.}: F accInv.inv(affs[N-1].x) @@ -60,7 +60,7 @@ func batchAffine*[F]( for i in countdown(N-1, 1): # Extract 1/Pᵢ var invi {.noInit.}: F - invi.prod(accInv, affs[i-1].x, skipFinalSub = true) + invi.prod(accInv, affs[i-1].x, lazyReduce = true) invi.csetZero(zeroes[i]) # Now convert Pᵢ to affine @@ -70,7 +70,7 @@ func batchAffine*[F]( # next iteration invi = projs[i].z invi.csetOne(zeroes[i]) - accInv.prod(accInv, invi, skipFinalSub = true) + accInv.prod(accInv, invi, lazyReduce = true) block: # tail accInv.csetZero(zeroes[0]) diff --git a/constantine/math/extension_fields/towers.nim b/constantine/math/extension_fields/towers.nim index 20fc0d51..c3c5c1c2 100644 --- a/constantine/math/extension_fields/towers.nim +++ b/constantine/math/extension_fields/towers.nim @@ -2185,21 +2185,21 @@ func inv*(a: var CubicExt) = # Convenience functions # ---------------------------------------------------------------------- -template square*(a: var ExtensionField, skipFinalSub: static bool) = +template square*(a: var ExtensionField, lazyReduce: static bool) = # Square alias, # this allows using the same code for # the base field and its extensions while benefitting from skipping # the final substraction on Fp a.square() -template square*(r: var ExtensionField, 
a: ExtensionField, skipFinalSub: static bool) = +template square*(r: var ExtensionField, a: ExtensionField, lazyReduce: static bool) = # Square alias, # this allows using the same code for # the base field and its extensions while benefitting from skipping # the final substraction on Fp r.square(a) -template prod*(r: var ExtensionField, a, b: ExtensionField, skipFinalSub: static bool) = +template prod*(r: var ExtensionField, a, b: ExtensionField, lazyReduce: static bool) = # Prod alias, # this allows using the same code for # the base field and its extensions while benefitting from skipping diff --git a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim index bf24eb6c..da6c7478 100644 --- a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim +++ b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim @@ -44,14 +44,14 @@ func mulMont_FIPS*( M: LimbsViewConst, m0ninv: SecretWord, mBits: int, - skipFinalSub: static bool = false) {.noInline, tags:[Alloca], meter.} = + lazyReduce: static bool = false) {.noInline, tags:[Alloca], meter.} = ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS) ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipFinalSub + ## - [0, 2p) -> [0, 2p) with lazyReduce ## - [0, 2p) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # - Architectural Enhancements for Montgomery # Multiplication on Embedded RISC Processors # Johann Großschädl and Guy-Armand Kamendje, 2003 @@ -86,7 +86,7 @@ func mulMont_FIPS*( u = t t = Zero - when not skipFinalSub: + when not lazyReduce: discard z.csub(M, v.isNonZero() or not(z.lt(M, L)), L) r.copyWords(0, z, 0, L) diff --git a/constantine/math_compiler/impl_fields_nvidia.nim b/constantine/math_compiler/impl_fields_nvidia.nim index 0ffbb5b1..414f2818 100644 --- a/constantine/math_compiler/impl_fields_nvidia.nim +++ b/constantine/math_compiler/impl_fields_nvidia.nim @@ -193,12 +193,12 @@ proc field_sub_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef return (subModTy, subModKernel) -proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub: bool): FnDef = +proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, lazyReduce: bool): FnDef = ## Generate an optimized modular multiplication kernel ## with parameters `a, b, modulus: Limbs -> Limbs` let procName = cm.genSymbol(block: - if skipFinalSub: + if lazyReduce: case field of fp: opFpMulSkipFinalSub of fr: opFrMulSkipFinalSub @@ -343,7 +343,7 @@ proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: else: t[0] = bld.mulhiadd(m, M[0], t[0]) - if not skipFinalSub: + if not lazyReduce: asy.finalSubNoOverflow(cm, field, t, t) bld.store(r, t) @@ -351,7 +351,7 @@ proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: return (mulModTy, mulModKernel) -proc field_mul_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub = false): FnDef = +proc field_mul_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, lazyReduce = false): FnDef = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub) \ No newline at end of file + return asy.field_mul_CIOS_sparebit_gen(cm, field, lazyReduce) diff --git 
a/constantine/named/config_fields_and_curves.nim b/constantine/named/config_fields_and_curves.nim index c3fc397a..330d4bf5 100644 --- a/constantine/named/config_fields_and_curves.nim +++ b/constantine/named/config_fields_and_curves.nim @@ -183,6 +183,7 @@ declareCurves: curve Edwards25519: # Bernstein curve bitwidth: 255 modulus: "0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffed" + modulusKind: Crandall(19) # Montgomery form: y² = x³ + 486662x² + x # Edwards form: x² + y² = 1+dx²y² with d = 121665/121666 @@ -221,6 +222,8 @@ declareCurves: curve Secp256k1: # Bitcoin curve bitwidth: 256 modulus: "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f" + modulusKind: Crandall(0x1000003D1'u64) + order: "0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141" orderBitwidth: 256 eq_form: ShortWeierstrass diff --git a/constantine/named/deriv/derive_constants.nim b/constantine/named/deriv/derive_constants.nim index f0aeac07..b021c172 100644 --- a/constantine/named/deriv/derive_constants.nim +++ b/constantine/named/deriv/derive_constants.nim @@ -66,7 +66,7 @@ macro genDerivedConstants*(mode: static DerivedConstantMode): untyped = M ) ) - # const MyCurve_R4modP = r4mod(MyCurve_Modulus) + # const MyCurve_R3modP = r3mod(MyCurve_Modulus) result.add newConstStmt( used(curve & ff & "_R3modP"), newCall( bindSym"r3mod", @@ -102,6 +102,13 @@ macro genDerivedConstants*(mode: static DerivedConstantMode): untyped = M ) ) + # const MyCurve_PrimeMinus1 = primeMinus1(MyCurve_Modulus) + result.add newConstStmt( + used(curve & ff & "_PrimeMinus1"), newCall( + bindSym"primeMinus1", + M + ) + ) # const MyCurve_PrimeMinus1div2 = primeMinus1div2(MyCurve_Modulus) result.add newConstStmt( used(curve & ff & "_PrimeMinus1div2"), newCall( diff --git a/constantine/named/deriv/parser_fields.nim b/constantine/named/deriv/parser_fields.nim index e21be82a..d6b14fdf 100644 --- a/constantine/named/deriv/parser_fields.nim +++ b/constantine/named/deriv/parser_fields.nim @@ -24,6 +24,10 @@ import # for example when using the `r2modP` constant in multiple overloads in the same module type + PrimeKind* = enum + kGeneric + kCrandall # Crandall Prime are in the form 2ᵐ-c (also called pseudo-Mersenne primes) + CurveFamily* = enum NoFamily BarretoNaehrig # BN curve @@ -95,6 +99,8 @@ type # Field parameters bitWidth*: NimNode # nnkIntLit modulus*: NimNode # nnkStrLit (hex) + modulusKind*: PrimeKind + modulusKindAssociatedValue*: BiggestInt # Towering nonresidue_fp*: NimNode # nnkIntLit @@ -170,13 +176,20 @@ proc parseCurveDecls*(defs: var seq[CurveParams], curves: NimNode) = curveParams[i][1].expectKind(nnkStmtList) let sectionVal = curveParams[i][1][0] + # Field if sectionId.eqIdent"bitwidth": params.bitWidth = sectionVal elif sectionId.eqident"modulus": params.modulus = sectionVal + elif sectionId.eqIdent"modulusKind": + sectionVal.expectKind(nnkCall) + sectionVal[0].expectIdent"Crandall" + params.modulusKind = kCrandall + params.modulusKindAssociatedValue = sectionVal[1].intVal + + # Curve elif sectionId.eqIdent"family": params.family = parseEnum[CurveFamily]($sectionVal) - elif sectionId.eqIdent"eq_form": params.eq_form = parseEnum[CurveEquationForm]($sectionVal) elif sectionId.eqIdent"coef_a": @@ -217,6 +230,7 @@ proc parseCurveDecls*(defs: var seq[CurveParams], curves: NimNode) = elif sectionId.eqIdent"nonresidue_fp2": params.nonresidue_fp2 = sectionVal + # Pairings elif sectionId.eqIdent"embedding_degree": params.embedding_degree = sectionVal.intVal.int elif sectionId.eqIdent"sexticTwist": @@ 
-242,6 +256,7 @@ proc genFieldsConstants(defs: seq[CurveParams]): NimNode = var MapCurveBitWidth = nnkBracket.newTree() var MapCurveOrderBitWidth = nnkBracket.newTree() var curveModStmts = newStmtList() + var crandallStmts = newStmtList() for curveDef in defs: @@ -271,6 +286,14 @@ proc genFieldsConstants(defs: seq[CurveParams]): NimNode = ) ) + crandallStmts.add newConstStmt( + exported($curve & "_fp_isCrandall"), + newLit(curveDef.modulusKind == kCrandall) + ) + crandallStmts.add newConstStmt( + exported($curve & "_fp_CrandallSubTerm"), + newCall(bindsym"uint64", newLit(curveDef.modulusKindAssociatedValue)) + ) # Field Fr if not curveDef.order.isNil: curveDef.orderBitwidth.expectKind(nnkIntLit) @@ -316,6 +339,7 @@ proc genFieldsConstants(defs: seq[CurveParams]): NimNode = exported("CurveBitWidth"), MapCurveBitWidth ) result.add curveModStmts + result.add crandallStmts # const CurveOrderBitSize: array[Curve, int] = ... result.add newConstStmt( exported("CurveOrderBitWidth"), MapCurveOrderBitWidth diff --git a/constantine/named/deriv/precompute.nim b/constantine/named/deriv/precompute.nim index f428d2a2..3f84852c 100644 --- a/constantine/named/deriv/precompute.nim +++ b/constantine/named/deriv/precompute.nim @@ -391,6 +391,13 @@ func primePlus1div2*(P: BigInt): BigInt = let carry = result.add(1) doAssert not carry +func primeMinus1*(P: BigInt): BigInt = + ## Compute P-1 + ## Warning ⚠️: Result is in the canonical domain (not Montgomery) + + result = P + discard result.sub(1) + func primeMinus1div2*(P: BigInt): BigInt = ## Compute (P-1)/2 ## For use in constant-time modular inversion diff --git a/constantine/named/properties_fields.nim b/constantine/named/properties_fields.nim index abc07b73..b357f9d5 100644 --- a/constantine/named/properties_fields.nim +++ b/constantine/named/properties_fields.nim @@ -97,6 +97,21 @@ template getBigInt*[Name: static Algebra](T: type FF[Name]): untyped = ## Get the underlying BigInt type. typeof(default(T).mres) +template isCrandallPrimeField*(F: type Fr): static bool = false + +macro getCrandallPrimeSubterm*[Name: static Algebra](F: type Fp[Name]): static SecretWord = + result = newcall(bindSym"SecretWord", bindSym($Name & "_fp_CrandallSubTerm")) + +macro isCrandallPrimeField*[Name: static Algebra](F: type Fp[Name]): static bool = + let rawCrandall = bindSym($Name & "_fp_isCrandall") + let subTerm = bindSym($Name & "_fp_CrandallSubTerm") + result = quote do: + when `rawCrandall`: + when log2_vartime(`subTerm`) < WordBitWidth: + true + else: false + else: false + func bits*[Name: static Algebra](T: type FF[Name]): static int = T.getBigInt().bits @@ -204,6 +219,11 @@ macro getPrimePlus1div2*(ff: type FF): untyped = ## Warning ⚠️: Result in canonical domain (not Montgomery) result = bindConstant(ff, "PrimePlus1div2") +macro getPrimeMinus1*(ff: type FF): untyped = + ## Get P-1 + ## Warning ⚠️: Result in canonical domain (not Montgomery) + result = bindConstant(ff, "PrimeMinus1") + macro getPrimeMinus1div2*(ff: type FF): untyped = ## Get (P-1) / 2 for an odd prime ## Warning ⚠️: Result in canonical domain (not Montgomery) diff --git a/constantine/platforms/x86/macro_assembler_x86_att.nim b/constantine/platforms/x86/macro_assembler_x86_att.nim index 20d8d0c1..e95f3eaf 100644 --- a/constantine/platforms/x86/macro_assembler_x86_att.nim +++ b/constantine/platforms/x86/macro_assembler_x86_att.nim @@ -7,8 +7,8 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
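The `skipFinalSub` to `lazyReduce` rename keeps the contract documented on `mulMont_FIPS` above: with `lazyReduce` the result is only guaranteed to lie in [0, 2p), and a single conditional subtraction is what brings it back into [0, p). Below is a single-word toy model of that trade-off, not Constantine's implementation; the 32-bit radix, the toy modulus (chosen below R/4 so every intermediate stays in [0, 2p)) and the helper names are all illustrative.

```nim
const
  R = 1'u64 shl 32        # Montgomery radix for a single 32-bit word
  M = 998244353'u64       # toy odd modulus, below R/4 ("spare bits" available)

proc negInvModR(m: uint64): uint64 =
  ## Returns -m⁻¹ mod 2³² for odd m (Hensel/Newton lifting).
  var x = m               # already an inverse of m modulo 8
  for _ in 0 ..< 5:
    x = (x * (2'u64 - m * x)) and (R - 1)
  result = (R - x) and (R - 1)

proc montMul(a, b: uint64, lazyReduce: bool): uint64 =
  ## Single-word Montgomery multiplication: returns a·b·R⁻¹ (mod M),
  ## in [0, 2M) when lazyReduce, in [0, M) otherwise.
  let t = a * b
  let q = ((t and (R - 1)) * negInvModR(M)) and (R - 1)
  result = (t + q * M) shr 32
  if not lazyReduce and result >= M:
    result -= M           # the "final subtraction" that lazyReduce skips

when isMainModule:
  let a = 123456789'u64
  let b = 987654321'u64
  # Chain two multiplications lazily, subtract once at the end ...
  var lazy = montMul(montMul(a, b, lazyReduce = true), b, lazyReduce = true)
  if lazy >= M: lazy -= M
  # ... and compare against the always-reduced chain.
  let strict = montMul(montMul(a, b, lazyReduce = false), b, lazyReduce = false)
  doAssert lazy == strict
```

Chaining the lazy form and subtracting once at the end matches the always-reduced chain, which is why intermediate field operations can safely skip the subtraction.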
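The new `modulusKind: Crandall(c)` declarations record the constant c of a pseudo-Mersenne prime p = 2ᵐ-c (c = 19 for Edwards25519, c = 0x1000003D1 for Secp256k1). The fast reduction they enable rests on 2ᵐ ≡ c (mod p): whatever sits above bit m is multiplied by c and folded back into the low part. A toy single-word sketch of that idea follows; the 2³²-5 modulus and the proc name are made up for illustration, the real code works on multi-limb 255/256-bit values.

```nim
const
  mBits = 32
  c     = 5'u64                      # toy Crandall constant
  p     = (1'u64 shl mBits) - c      # toy pseudo-Mersenne prime 2³² - 5

proc reduceCrandall(x: uint64): uint64 =
  ## Reduces x < p² modulo p = 2^mBits - c using 2^mBits ≡ c (mod p):
  ## write x = hi·2^mBits + lo, then x ≡ hi·c + lo (mod p).
  result = x
  while (result shr mBits) != 0:
    let hi = result shr mBits
    let lo = result and ((1'u64 shl mBits) - 1)
    result = hi * c + lo
  if result >= p:                    # at most one conditional subtraction left
    result -= p

when isMainModule:
  let a = 0xDEADBEEF'u64 mod p
  let b = 0xCAFEBABE'u64 mod p
  doAssert reduceCrandall(a * b) == (a * b) mod p
```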
import - std/[macros, strutils, sets, hashes, algorithm, sequtils], - ../config + std/[macros, strutils, sets, hashes, algorithm, sequtils, enumutils], + ../[config, bithacks] # A compile-time inline assembler @@ -484,7 +484,7 @@ func generate*(a: Assembler_x86): NimNode = newEmptyNode(), result) -func getStrOffset(a: Assembler_x86, op: Operand): string = +func getStrOffset(a: Assembler_x86, op: Operand, force32IfReg = false): string = if op.kind != kFromArray: if op.kind in {kArrayAddr, k2dArrayAddr}: # We are operating on an array pointer @@ -493,8 +493,13 @@ func getStrOffset(a: Assembler_x86, op: Operand): string = return "%%" & op.buf[0].desc.asmId else: return "%" & op.buf[0].desc.asmId + elif op.kind == kRegister: + if force32IfReg: + return "%k" & op.desc.asmId + else: + return "%" & op.desc.asmId else: - return "%" & op.desc.asmId + error "Unsupported: " & $op.kind # Beware GCC / Clang differences with displacements # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html @@ -556,7 +561,8 @@ func getStrOffset(a: Assembler_x86, op: Operand): string = return "0(%%" & op.desc.asmId & ')' return $(op.offset * a.wordSize) & "(%%" & op.desc.asmId & ')' else: - error "Unsupported: " & $op.desc.rm.ord + error "Unsupported: " & $op.desc.rm.symbolName() & "\n" & + op.repr func codeFragment(a: var Assembler_x86, instr: string, op: Operand) = # Generate a code fragment @@ -638,7 +644,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) = a.regClobbers.incl reg0 a.regClobbers.incl reg1 -func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) = +func codeFragment(a: var Assembler_x86, instr: string, imm: SomeInteger, op: Operand) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -653,6 +659,24 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) = if op.desc.constraint != asmClobberedRegister: a.operands.incl op.desc +func codeFragment(a: var Assembler_x86, instr: string, imm: SomeInteger, op0, op1: Operand) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + let off0 = a.getStrOffset(op0) + let off1 = a.getStrOffset(op1) + + if a.wordBitWidth == 64: + a.code &= instr & "q $" & $imm & ", " & off0 & ", " & off1 & '\n' + else: + a.code &= instr & "l $" & $imm & ", " & off0 & ", " & off1 & '\n' + + if op0.desc.constraint != asmClobberedRegister: + a.operands.incl op0.desc + if op1.desc.constraint != asmClobberedRegister: + a.operands.incl op1.desc + func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) = # Generate a code fragment # ⚠️ Warning: @@ -675,7 +699,7 @@ func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Re a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n' a.regClobbers.incl reg -func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) = +func codeFragment(a: var Assembler_x86, instr: string, imm: SomeInteger, reg: Register) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -686,7 +710,7 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) a.code &= instr & "l $" & $imm & ", %%" & $reg & '\n' a.regClobbers.incl reg -func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: OperandReuse) = +func codeFragment(a: var Assembler_x86, instr: string, imm: SomeInteger, reg: 
OperandReuse) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -773,6 +797,30 @@ func isOutput(op: Operand): bool = # Instructions # ------------------------------------------------------------------------------------------------------------ +const Reg8Low = [ + rbx: "%%bl", + rdx: "%%dl", + r8: "%%r8l", + rax: "%%al", + xmm0: "invalid", +] + +const Reg32 = [ # Allow using 32-bit reg on 64-bit for smaller instructions + rbx: "%%ebx", + rdx: "%%edx", + r8 : "%%r8d", + rax: "%%eax", + xmm0: "invalid", +] + +func setc*(a: var Assembler_x86, dst: Register) = + ## Set destination to 1 if carry flag is set + ## and 0 otherwise + + # This only works with 8-bit reg so we special-map them + a.code &= "setc " & Reg8Low[dst] & '\n' + # No flags affected + func add*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst + src doAssert dst.isOutput() @@ -809,7 +857,7 @@ func adc*(a: var Assembler_x86, dst, src: Register) = a.codeFragment("adc", src, dst) a.areFlagsClobbered = true -func adc*(a: var Assembler_x86, dst: Operand, imm: int) = +func adc*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- dst + imm + borrow doAssert dst.isOutput() doAssert dst.desc.rm notin {Mem, MemOffsettable}, @@ -824,7 +872,7 @@ func adc*(a: var Assembler_x86, dst: Operand, src: Register) = a.codeFragment("adc", src, dst) a.areFlagsClobbered = true -func adc*(a: var Assembler_x86, dst: Register, imm: int) = +func adc*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- dst + src a.codeFragment("adc", imm, dst) a.areFlagsClobbered = true @@ -835,6 +883,15 @@ func sub*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("sub", src, dst) a.areFlagsClobbered = true +func sub*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = + ## Does: dst <- dst - imm + doAssert dst.isOutput() + doAssert dst.desc.rm notin {Mem, MemOffsettable}, + "Using subborrow with a memory destination, this incurs significant performance penalties." 
+ + a.codeFragment("sub", imm, dst) + a.areFlagsClobbered = true + func sbb*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst - src - borrow doAssert dst.isOutput() @@ -844,7 +901,7 @@ func sbb*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("sbb", src, dst) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: Operand, imm: int) = +func sbb*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow doAssert dst.isOutput() doAssert dst.desc.rm notin {Mem, MemOffsettable}, @@ -853,7 +910,7 @@ func sbb*(a: var Assembler_x86, dst: Operand, imm: int) = a.codeFragment("sbb", imm, dst) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: Register, imm: int) = +func sbb*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow a.codeFragment("sbb", imm, dst) a.areFlagsClobbered = true @@ -863,7 +920,7 @@ func sbb*(a: var Assembler_x86, dst, src: Register) = a.codeFragment("sbb", src, dst) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: OperandReuse, imm: int) = +func sbb*(a: var Assembler_x86, dst: OperandReuse, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow a.codeFragment("sbb", imm, dst) a.areFlagsClobbered = true @@ -873,7 +930,13 @@ func sbb*(a: var Assembler_x86, dst, src: OperandReuse) = a.codeFragment("sbb", src, dst) a.areFlagsClobbered = true -func sar*(a: var Assembler_x86, dst: Operand, imm: int) = +func shld*(a: var Assembler_x86, inout: Operand, src: Operand, imm: SomeInteger) = + ## Does double precision left shift + ## inout <- (inout, src) << imm + a.codeFragment("shld", imm, src, inout) + a.areFlagsClobbered = true + +func sar*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does Arithmetic Right Shift (i.e. 
with sign extension) doAssert dst.isOutput() a.codeFragment("sar", imm, dst) @@ -885,7 +948,13 @@ func `and`*(a: var Assembler_x86, dst: Operand, src: Register) = a.codeFragment("and", src, dst) a.areFlagsClobbered = true -func `and`*(a: var Assembler_x86, dst: OperandReuse, imm: int) = +func `and`*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = + ## Compute the bitwise AND of x and y and + ## set the Sign, Zero and Parity flags + a.codeFragment("and", imm, dst) + a.areFlagsClobbered = true + +func `and`*(a: var Assembler_x86, dst: OperandReuse, imm: SomeInteger) = ## Compute the bitwise AND of x and y and ## set the Sign, Zero and Parity flags a.codeFragment("and", imm, dst) @@ -936,14 +1005,31 @@ func `or`*(a: var Assembler_x86, dst: OperandReuse, src: Operand) = func `xor`*(a: var Assembler_x86, dst, src: Operand) = ## Compute the bitwise xor of x and y and ## reset all flags - a.codeFragment("xor", src, dst) - a.areFlagsClobbered = true + if dst.desc == src.desc: + # Special case zero-ing so it uses 32-bit registers + # and rely on auto zero-clear of upper bits + let off = a.getStrOffset(dst, force32IfReg = true) + a.code &= "xorl " & off & ", " & off & '\n' + + if dst.desc.constraint != asmClobberedRegister: + a.operands.incl dst.desc + a.areFlagsClobbered = true + else: + a.codeFragment("xor", src, dst) + a.areFlagsClobbered = true func `xor`*(a: var Assembler_x86, dst, src: Register) = ## Compute the bitwise xor of x and y and ## reset all flags - a.codeFragment("xor", src, dst) - a.areFlagsClobbered = true + if dst == src: + # Special case zero-ing so it uses 32-bit registers + a.code &= "xorl " & Reg32[dst] & ", " & Reg32[dst] & '\n' + + a.regClobbers.incl dst + a.areFlagsClobbered = true + else: + a.codeFragment("xor", src, dst) + a.areFlagsClobbered = true func mov*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- src @@ -966,16 +1052,33 @@ func mov*(a: var Assembler_x86, dst: OperandReuse, src: Operand) = a.codeFragment("mov", src, dst) # No clobber -func mov*(a: var Assembler_x86, dst: Operand, imm: int) = +func mov*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- imm doAssert dst.isOutput(), $dst.repr - a.codeFragment("mov", imm, dst) + # We special-case register <- immediate mov + # as they can take up to 10 bytes (2 bytes REX instr + 8-byte data) + # on x86-64 even though most of the time we load small values + if log2_vartime(uint64 imm) >= 32: + a.codeFragment("mov", imm, dst) + else: + let off = a.getStrOffset(dst, force32IfReg = true) + a.code &= "movl $" & $imm & ", " & off & '\n' # No clobber -func mov*(a: var Assembler_x86, dst: Register, imm: int) = +func mov*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- src with dst a fixed register - a.codeFragment("mov", imm, dst) + + # We special-case register <- immediate mov + # as they can take up to 10 bytes (2 bytes REX instr + 8-byte data) + # on x86-64 even though + # most of the time we load small values + + if log2_vartime(uint64 imm) >= 32: + a.codeFragment("mov", imm, dst) + else: + a.code &= "movl $" & $imm & ", " & Reg32[dst] & '\n' + a.regClobbers.incl dst func mov*(a: var Assembler_x86, dst: Register, src: Operand) = ## Does: dst <- src with dst a fixed register @@ -1010,6 +1113,11 @@ func cmovnc*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("cmovnc", src, dst) # No clobber +func cmovnc*(a: var Assembler_x86, dst: Register, src: Operand) = + ## Does: dst <- src if the carry flag is not set + a.codeFragment("cmovnc", src, dst) + # 
No clobber + func cmovz*(a: var Assembler_x86, dst: Operand, src: Register) = ## Does: dst <- src if the zero flag is not set doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr @@ -1079,6 +1187,30 @@ func imul*(a: var Assembler_x86, dst: Register, src: Operand) = ## Does dst <- dst * src, keeping only the low half a.codeFragment("imul", src, dst) + a.areFlagsClobbered = true + +func imul*(a: var Assembler_x86, dst: Register, src0: Operand, imm: SomeInteger) = + ## Does dst <- a * imm, keeping only the low half + let off0 = a.getStrOffset(src0) + + if a.wordBitWidth == 64: + a.code &= "imulq $" & $imm & ", " & $off0 & ", %%" & $dst & '\n' + else: + a.code &= "imull $" & $imm & ", " & $off0 & ", %%" & $dst & '\n' + a.regClobbers.incl dst + a.areFlagsClobbered = true + a.operands.incl(src0.desc) + +func imul*(a: var Assembler_x86, dst: Register, src0: Register, imm: SomeInteger) = + ## Does dst <- a * imm, keeping only the low half + if a.wordBitWidth == 64: + a.code &= "imulq $" & $imm & ", %%" & $src0 & ", %%" & $dst '\n' + else: + a.code &= "imull $" & $imm & ", %%" & $src0 & ", %%" & $dst '\n' + a.regClobbers.incl {dst, src0} + a.areFlagsClobbered = true + + func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 doAssert src1 == rdx, "MULX requires the RDX register" @@ -1121,6 +1253,24 @@ func mulx*(a: var Assembler_x86, dHi: Operand, dLo: Register, src0: Operand, src a.operands.incl src0.desc a.regClobbers.incl dLo +func mulx*(a: var Assembler_x86, dHi: Operand, dLo, src0, src1: Register) = + ## Does (dHi, dLo) <- src0 * src1 + doAssert src1 == rdx, "MULX requires the RDX register" + a.regClobbers.incl rdx + + doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, + "The destination operand must be a register " & $dHi.repr + doAssert dHi.desc.constraint in OutputReg + + # Annoying AT&T syntax + if a.wordBitWidth == 64: + a.code &= "mulxq %%" & $src0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n' + else: + a.code &= "mulxl %%" & $src0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n' + + a.regClobbers.incl src0 + a.regClobbers.incl dLo + func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 doAssert src1 == rdx, "MULX requires the RDX register" diff --git a/constantine/platforms/x86/macro_assembler_x86_intel.nim b/constantine/platforms/x86/macro_assembler_x86_intel.nim index 357c0953..e9bb3c44 100644 --- a/constantine/platforms/x86/macro_assembler_x86_intel.nim +++ b/constantine/platforms/x86/macro_assembler_x86_intel.nim @@ -7,8 +7,8 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
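The DSL additions above (`setc`, `shld`, `sub`/`and`/`imul` with immediates, the register-only `mulx`) each emit a single instruction. As a reading aid, here are plain-Nim models of what the two less common ones compute, mirroring their doc comments; the `*Model` names are made up, this is not the macro assembler's API, and a 64-bit word size is assumed.

```nim
proc setcModel(carryFlag: bool): uint64 =
  ## `setc dst`: dst <- 1 if the carry flag is set, 0 otherwise.
  if carryFlag: 1'u64 else: 0'u64

proc shldModel(inout, src: uint64, imm: int): uint64 =
  ## `shld inout, src, imm` (1 <= imm < 64): double-precision left shift,
  ## inout <- (inout, src) << imm; the top bits of `src` fill the low
  ## bits of `inout`.
  (inout shl imm) or (src shr (64 - imm))

when isMainModule:
  doAssert setcModel(true) == 1'u64
  # Shifting the 128-bit pair (hi = 0b01, lo = 2⁶³) left by 1 moves lo's
  # top bit into hi's bottom bit:
  doAssert shldModel(0b01'u64, 0x8000_0000_0000_0000'u64, 1) == 0b11'u64
```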
import - std/[macros, strutils, sets, hashes, algorithm, sequtils], - ../config + std/[macros, strutils, sets, hashes, algorithm, sequtils, enumutils], + ../[config, bithacks] # A compile-time inline assembler @@ -306,8 +306,7 @@ func asmArray*(nimSymbol: NimNode, len: int, rm: RM, constraint: Constraint, mem desc: desc, kind: kFromArray, offset: i) - else: - # For ElemsInReg + elif rm == ElemsInReg: # We can't store an array in register so we create assign individual register # per array elements instead for i in 0 ..< len: @@ -320,6 +319,8 @@ func asmArray*(nimSymbol: NimNode, len: int, rm: RM, constraint: Constraint, mem result.buf[i] = Operand( desc: desc, kind: kRegister) + else: + error "Not implemented" func asArrayAddr*(op: Operand, memPointer: NimNode, len: int, memIndirect: MemIndirectAccess): Operand = ## Use the value stored in an operand as an array address @@ -432,7 +433,7 @@ func generate*(a: Assembler_x86): NimNode = params.add newLit(": ") & inOperands.foldl(a & newLit(", ") & b) & newLit("\n") else: params.add newLit(":\n") - + let clobbers = [(a.isStackClobbered, "sp"), (a.areFlagsClobbered, "cc"), (memClobbered, "memory")] @@ -484,7 +485,9 @@ func generate*(a: Assembler_x86): NimNode = newEmptyNode(), result) -func getStrOffset(a: Assembler_x86, op: Operand): string = +func getStrOffset(a: Assembler_x86, op: Operand, force32IfReg = false): string = + # force32IfReg forces uses of 32-bit registers (memory operand are not changed) + if op.kind != kFromArray: if op.kind in {kArrayAddr, k2dArrayAddr}: # We are operating on an array pointer @@ -493,8 +496,13 @@ func getStrOffset(a: Assembler_x86, op: Operand): string = return op.buf[0].desc.asmId else: return "%" & op.buf[0].desc.asmId + elif op.kind == kRegister: + if force32IfReg: + return "%k" & op.desc.asmId + else: + return "%" & op.desc.asmId else: - return "%" & op.desc.asmId + error "Unsupported: " & $op.kind # Beware GCC / Clang differences with displacements # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html @@ -564,7 +572,8 @@ func getStrOffset(a: Assembler_x86, op: Operand): string = return "DWORD ptr [" & op.desc.asmId & ']' return "DWORD ptr [" & op.desc.asmId & " + " & $(op.offset * a.wordSize) & ']' else: - error "Unsupported: " & $op.desc.rm.ord + error "Unsupported: " & $op.desc.rm.symbolName() & "\n" & + op.repr func codeFragment(a: var Assembler_x86, instr: string, op: Operand) = # Generate a code fragment @@ -628,7 +637,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) = a.regClobbers.incl reg0 a.regClobbers.incl reg1 -func codeFragment(a: var Assembler_x86, instr: string, op: Operand, imm: int) = +func codeFragment(a: var Assembler_x86, instr: string, op: Operand, imm: SomeInteger) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -640,6 +649,21 @@ func codeFragment(a: var Assembler_x86, instr: string, op: Operand, imm: int) = if op.desc.constraint != asmClobberedRegister: a.operands.incl op.desc +func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand, imm: SomeInteger) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits Intel Assembly + let off0 = a.getStrOffset(op0) + let off1 = a.getStrOffset(op1) + + a.code &= instr & " " & off0 & ", " & off1 & ", " & $imm & '\n' + + if op0.desc.constraint != asmClobberedRegister: + a.operands.incl op0.desc + if op1.desc.constraint != asmClobberedRegister: + a.operands.incl 
op1.desc + func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) = # Generate a code fragment # ⚠️ Warning: @@ -656,7 +680,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: Operan a.code &= instr & " " & $reg & ", %" & $op.asmId & '\n' a.regClobbers.incl reg -func codeFragment(a: var Assembler_x86, instr: string, reg: Register, imm: int) = +func codeFragment(a: var Assembler_x86, instr: string, reg: Register, imm: SomeInteger) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -664,7 +688,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg: Register, imm: int) a.code &= instr & " " & $reg & ", " & $imm & '\n' a.regClobbers.incl reg -func codeFragment(a: var Assembler_x86, instr: string, reg: OperandReuse, imm: int) = +func codeFragment(a: var Assembler_x86, instr: string, reg: OperandReuse, imm: SomeInteger) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -740,6 +764,30 @@ func isOutput(op: Operand): bool = # Instructions # ------------------------------------------------------------------------------------------------------------ +const Reg8Low = [ + rbx: "bl", + rdx: "dl", + r8: "r8l", + rax: "al", + xmm0: "invalid", +] + +const Reg32 = [ # Allow using 32-bit reg on 64-bit for smaller instructions + rbx: "ebx", + rdx: "edx", + r8 : "r8d", + rax: "eax", + xmm0: "invalid", +] + +func setc*(a: var Assembler_x86, dst: Register) = + ## Set destination to 1 if carry flag is set + ## and 0 otherwise + + # This only works with 8-bit reg so we special-map them + a.code &= "setc " & Reg8Low[dst] & '\n' + # No flags affected + func add*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst + src doAssert dst.isOutput() @@ -776,7 +824,7 @@ func adc*(a: var Assembler_x86, dst, src: Register) = a.codeFragment("adc", dst, src) a.areFlagsClobbered = true -func adc*(a: var Assembler_x86, dst: Operand, imm: int) = +func adc*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- dst + imm + borrow doAssert dst.isOutput() doAssert dst.desc.rm notin {Mem, MemOffsettable}, @@ -791,7 +839,7 @@ func adc*(a: var Assembler_x86, dst: Operand, src: Register) = a.codeFragment("adc", dst, src) a.areFlagsClobbered = true -func adc*(a: var Assembler_x86, dst: Register, imm: int) = +func adc*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- dst + src a.codeFragment("adc", dst, imm) a.areFlagsClobbered = true @@ -802,6 +850,15 @@ func sub*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("sub", dst, src) a.areFlagsClobbered = true +func sub*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = + ## Does: dst <- dst - imm + doAssert dst.isOutput() + doAssert dst.desc.rm notin {Mem, MemOffsettable}, + "Using subborrow with a memory destination, this incurs significant performance penalties." 
+ + a.codeFragment("sub", dst, imm) + a.areFlagsClobbered = true + func sbb*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst - src - borrow doAssert dst.isOutput() @@ -811,7 +868,7 @@ func sbb*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("sbb", dst, src) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: Operand, imm: int) = +func sbb*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow doAssert dst.isOutput() doAssert dst.desc.rm notin {Mem, MemOffsettable}, @@ -820,7 +877,7 @@ func sbb*(a: var Assembler_x86, dst: Operand, imm: int) = a.codeFragment("sbb", dst, imm) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: Register, imm: int) = +func sbb*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow a.codeFragment("sbb", dst, imm) a.areFlagsClobbered = true @@ -830,7 +887,7 @@ func sbb*(a: var Assembler_x86, dst, src: Register) = a.codeFragment("sbb", dst, src) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: OperandReuse, imm: int) = +func sbb*(a: var Assembler_x86, dst: OperandReuse, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow a.codeFragment("sbb", dst, imm) a.areFlagsClobbered = true @@ -840,7 +897,13 @@ func sbb*(a: var Assembler_x86, dst, src: OperandReuse) = a.codeFragment("sbb", dst, src) a.areFlagsClobbered = true -func sar*(a: var Assembler_x86, dst: Operand, imm: int) = +func shld*(a: var Assembler_x86, inout: Operand, src: Operand, imm: SomeInteger) = + ## Does double precision left shift + ## inout <- (inout, src) << imm + a.codeFragment("shld", inout, src, imm) + a.areFlagsClobbered = true + +func sar*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does Arithmetic Right Shift (i.e. 
with sign extension) doAssert dst.isOutput() a.codeFragment("sar", dst, imm) @@ -852,7 +915,13 @@ func `and`*(a: var Assembler_x86, dst: Operand, src: Register) = a.codeFragment("and", dst, src) a.areFlagsClobbered = true -func `and`*(a: var Assembler_x86, dst: OperandReuse, imm: int) = +func `and`*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = + ## Compute the bitwise AND of x and y and + ## set the Sign, Zero and Parity flags + a.codeFragment("and", dst, imm) + a.areFlagsClobbered = true + +func `and`*(a: var Assembler_x86, dst: OperandReuse, imm: SomeInteger) = ## Compute the bitwise AND of x and y and ## set the Sign, Zero and Parity flags a.codeFragment("and", dst, imm) @@ -903,14 +972,31 @@ func `or`*(a: var Assembler_x86, dst: OperandReuse, src: Operand) = func `xor`*(a: var Assembler_x86, dst, src: Operand) = ## Compute the bitwise xor of x and y and ## reset all flags - a.codeFragment("xor", dst, src) - a.areFlagsClobbered = true + if dst.desc == src.desc: + # Special case zero-ing so it uses 32-bit registers + # and rely on auto zero-clear of upper bits + let off = a.getStrOffset(dst, force32IfReg = true) + a.code &= "xor " & off & ", " & off & '\n' + + if dst.desc.constraint != asmClobberedRegister: + a.operands.incl dst.desc + a.areFlagsClobbered = true + else: + a.codeFragment("xor", src, dst) + a.areFlagsClobbered = true func `xor`*(a: var Assembler_x86, dst, src: Register) = ## Compute the bitwise xor of x and y and ## reset all flags - a.codeFragment("xor", dst, src) - a.areFlagsClobbered = true + if dst == src: + # Special case zero-ing so it uses 32-bit registers + a.code &= "xor " & Reg32[dst] & ", " & Reg32[dst] & '\n' + + a.regClobbers.incl dst + a.areFlagsClobbered = true + else: + a.codeFragment("xor", src, dst) + a.areFlagsClobbered = true func mov*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- src @@ -933,16 +1019,31 @@ func mov*(a: var Assembler_x86, dst: OperandReuse, src: Operand) = a.codeFragment("mov", dst, src) # No clobber -func mov*(a: var Assembler_x86, dst: Operand, imm: int) = +func mov*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- imm doAssert dst.isOutput(), $dst.repr - a.codeFragment("mov", dst, imm) + # We special-case register <- immediate mov + # as they can take up to 10 bytes (2 bytes REX instr + 8-byte data) + # on x86-64 even though most of the time we load small values + if log2_vartime(uint64 imm) >= 32: + a.codeFragment("mov", dst, imm) + else: + let off = a.getStrOffset(dst, force32IfReg = true) + a.code &= "mov " & off & ", " & $imm & '\n' # No clobber -func mov*(a: var Assembler_x86, dst: Register, imm: int) = +func mov*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- src with dst a fixed register - a.codeFragment("mov", dst, imm) + + # We special-case register <- immediate mov + # as they can take up to 10 bytes (2 bytes REX instr + 8-byte data) + # on x86-64 even though most of the time we load small values + if log2_vartime(uint64 imm) >= 32: + a.codeFragment("mov", dst, imm) + else: + a.code &= "mov " & Reg32[dst] & ", " & $imm & '\n' + a.regClobbers.incl dst func mov*(a: var Assembler_x86, dst: Register, src: Operand) = ## Does: dst <- src with dst a fixed register @@ -977,6 +1078,11 @@ func cmovnc*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("cmovnc", dst, src) # No clobber +func cmovnc*(a: var Assembler_x86, dst: Register, src: Operand) = + ## Does: dst <- src if the carry flag is not set + a.codeFragment("cmovnc", dst, src) + # No clobber + 
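The special-cased `xor` and `mov` paths above rely on an x86-64 rule: writing a 32-bit register zero-extends into the full 64-bit register, so the 32-bit forms are equivalent for values below 2³² while encoding shorter. A plain-Nim model of that rule, with illustrative names rather than the assembler's API:

```nim
proc write32(x: uint64): uint64 =
  ## Models writing a 32-bit register on x86-64: only the low 32 bits are
  ## kept and the upper 32 bits read back as zero.
  x and 0xFFFF_FFFF'u64

proc fitsIn32(imm: uint64): bool =
  ## The overloads above keep the short 32-bit form only for immediates
  ## below 2³² (the library checks this with a log2 helper).
  imm <= 0xFFFF_FFFF'u64

when isMainModule:
  doAssert write32(0) == 0'u64            # xor eax, eax clears the full register
  doAssert fitsIn32(19'u64)               # Edwards25519's c: the 32-bit mov suffices
  doAssert not fitsIn32(0x1000003D1'u64)  # Secp256k1's 33-bit c needs the 64-bit mov
```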
func cmovz*(a: var Assembler_x86, dst: Operand, src: Register) = ## Does: dst <- src if the zero flag is not set doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr @@ -1034,6 +1140,7 @@ func mul*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Registe a.regClobbers.incl rdx a.codeFragment("mul", src0) + a.areFlagsClobbered = true func imul*(a: var Assembler_x86, dst, src: Operand) = ## Does dst <- dst * src, keeping only the low half @@ -1041,10 +1148,27 @@ func imul*(a: var Assembler_x86, dst, src: Operand) = doAssert dst.isOutput(), $dst.repr a.codeFragment("imul", dst, src) + a.areFlagsClobbered = true func imul*(a: var Assembler_x86, dst: Register, src: Operand) = ## Does dst <- dst * src, keeping only the low half a.codeFragment("imul", dst, src) + a.areFlagsClobbered = true + +func imul*(a: var Assembler_x86, dst: Register, src0: Operand, imm: SomeInteger) = + ## Does dst <- a * imm, keeping only the low half + let off0 = a.getStrOffset(src0) + + a.code &= "imul " & $dst & ", " & $off0 & ", " & $imm & '\n' + a.regClobbers.incl dst + a.areFlagsClobbered = true + a.operands.incl(src0.desc) + +func imul*(a: var Assembler_x86, dst: Register, src0: Register, imm: SomeInteger) = + ## Does dst <- a * imm, keeping only the low half + a.code &= "imul " & $dst & ", " & $src0 & ", " & $imm '\n' + a.regClobbers.incl {dst, src0} + a.areFlagsClobbered = true func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 @@ -1063,6 +1187,7 @@ func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = a.code &= "mulx %" & $dHi.desc.asmId & ", %" & $dLo.desc.asmId & ", " & off0 & '\n' a.operands.incl src0.desc + # No clobber func mulx*(a: var Assembler_x86, dHi: Operand, dLo: Register, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 @@ -1080,6 +1205,20 @@ func mulx*(a: var Assembler_x86, dHi: Operand, dLo: Register, src0: Operand, src a.operands.incl src0.desc a.regClobbers.incl dLo +func mulx*(a: var Assembler_x86, dHi: Operand, dLo, src0, src1: Register) = + ## Does (dHi, dLo) <- src0 * src1 + doAssert src1 == rdx, "MULX requires the RDX register" + a.regClobbers.incl rdx + + doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, + "The destination operand must be a register " & $dHi.repr + doAssert dHi.desc.constraint in OutputReg + + a.code &= "mulx %" & $dHi.desc.asmId & ", " & $dLo & ", " & $src0 & '\n' + + a.regClobbers.incl src0 + a.regClobbers.incl dLo + func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 doAssert src1 == rdx, "MULX requires the RDX register" diff --git a/metering/eip2537.md b/metering/eip2537.md index 6450b4b9..dce863a5 100644 --- a/metering/eip2537.md +++ b/metering/eip2537.md @@ -86,8 +86,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | `-=`*(a: var FF; b: FF) | 4 | 20000000.000 | 0.200 | 0.050 | 0.132 | 0.033 | | diff*(r: var FF; a, b: FF) | 2 | 25000000.000 | 0.080 | 0.040 | 0.099 | 0.050 | | double*(r: var FF; a: FF) | 1 | 24390243.902 | 0.041 | 0.041 | 0.033 | 0.033 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 12 | 16597510.373 | 0.723 | 0.060 | 1.155 | 0.096 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 4 | 16666666.667 | 0.240 | 0.060 | 0.363 | 0.091 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... 
| 12 | 16597510.373 | 0.723 | 0.060 | 1.155 | 0.096 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 4 | 16666666.667 | 0.240 | 0.060 | 0.363 | 0.091 | | csub*(a: var FF; b: FF; ctl: SecretBool) | 1 | 5555555.556 | 0.180 | 0.180 | 0.495 | 0.495 | | div2*(a: var FF) | 1 | 25000000.000 | 0.040 | 0.040 | 0.066 | 0.066 | | `*=`*(a: var FF; b: FF) | 4 | 7797270.955 | 0.513 | 0.128 | 1.287 | 0.322 | @@ -108,7 +108,7 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sum*(r: var FF; a, b: FF) | 12 | 23483365.949 | 0.511 | 0.043 | 0.528 | 0.044 | | diff*(r: var FF; a, b: FF) | 2 | 22222222.222 | 0.090 | 0.045 | 0.099 | 0.050 | | double*(r: var FF; a: FF) | 5 | 25000000.000 | 0.200 | 0.040 | 0.198 | 0.040 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 12 | 16853932.584 | 0.712 | 0.059 | 1.188 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 12 | 16853932.584 | 0.712 | 0.059 | 1.188 | 0.099 | | `*=`*(a: var FF; b: FF) | 7 | 7856341.190 | 0.891 | 0.127 | 2.277 | 0.325 | | sum*(r: var ECP_ShortW_Prj[F, G]; P, Q: ECP_ShortW_Prj[F ... | 1 | 296208.531 | 3.376 | 3.376 | 11.022 | 11.022 | @@ -128,14 +128,14 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | double*(a: var FF) | 645 | 23402634.157 | 27.561 | 0.043 | 28.512 | 0.044 | | diff*(r: var FF; a, b: FF) | 273 | 23522316.043 | 11.606 | 0.043 | 11.913 | 0.044 | | double*(r: var FF; a: FF) | 201 | 23547328.960 | 8.536 | 0.042 | 8.745 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 976 | 16877345.277 | 57.829 | 0.059 | 96.096 | 0.098 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 877 | 16773453.189 | 52.285 | 0.060 | 86.790 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 976 | 16877345.277 | 57.829 | 0.059 | 96.096 | 0.098 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 877 | 16773453.189 | 52.285 | 0.060 | 86.790 | 0.099 | | cneg*(r: var FF; a: FF; ctl: SecretBool) | 66 | 5299927.728 | 12.453 | 0.189 | 34.386 | 0.521 | | cneg*(a: var FF; ctl: SecretBool) | 66 | 3931145.393 | 16.789 | 0.254 | 48.906 | 0.741 | | csub*(a: var FF; b: FF; ctl: SecretBool) | 72 | 5666168.254 | 12.707 | 0.176 | 35.046 | 0.487 | | div2*(a: var FF) | 72 | 22893481.717 | 3.145 | 0.044 | 3.201 | 0.044 | | `*=`*(a: var FF; b: FF) | 354 | 7933839.844 | 44.619 | 0.126 | 113.256 | 0.320 | -| square*(a: var FF; skipFinalSub: static bool = false) | 129 | 7914110.429 | 16.300 | 0.126 | 41.481 | 0.322 | +| square*(a: var FF; lazyReduce: static bool = false) | 129 | 7914110.429 | 16.300 | 0.126 | 41.481 | 0.322 | | sum*(r: var ECP_ShortW_Jac[F, G]; P, Q: ECP_ShortW_Jac[F ... | 8 | 291194.991 | 27.473 | 3.434 | 89.826 | 11.228 | | mixedSum*(r: var ECP_ShortW_Jac[F, G]; P: ECP_ShortW_Jac[F, ... | 64 | 337156.193 | 189.823 | 2.966 | 619.014 | 9.672 | | double*(r: var ECP_ShortW_Jac[F, G]; P: ECP_ShortW_Jac[F ... | 129 | 578076.127 | 223.154 | 1.730 | 722.271 | 5.599 | @@ -160,8 +160,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sum*(r: var FF; a, b: FF) | 610 | 23521246.240 | 25.934 | 0.043 | 26.466 | 0.043 | | diff*(r: var FF; a, b: FF) | 80 | 23289665.211 | 3.435 | 0.043 | 3.300 | 0.041 | | double*(r: var FF; a: FF) | 1005 | 23298945.172 | 43.135 | 0.043 | 44.187 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... 
| 1612 | 16807774.117 | 95.908 | 0.059 | 160.479 | 0.100 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 258 | 16701191.093 | 15.448 | 0.060 | 25.674 | 0.100 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 1612 | 16807774.117 | 95.908 | 0.059 | 160.479 | 0.100 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 258 | 16701191.093 | 15.448 | 0.060 | 25.674 | 0.100 | | cneg*(r: var FF; a: FF; ctl: SecretBool) | 66 | 5659406.620 | 11.662 | 0.177 | 31.977 | 0.484 | | cneg*(a: var FF; ctl: SecretBool) | 66 | 4113943.776 | 16.043 | 0.243 | 46.365 | 0.703 | | `*=`*(a: var FF; b: FF) | 506 | 7906867.724 | 63.995 | 0.126 | 161.898 | 0.320 | @@ -188,10 +188,10 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | double*(a: var FF) | 645 | 23330680.749 | 27.646 | 0.043 | 27.918 | 0.043 | | diff*(r: var FF; a, b: FF) | 239 | 23376369.327 | 10.224 | 0.043 | 10.494 | 0.044 | | double*(r: var FF; a: FF) | 129 | 23314657.509 | 5.533 | 0.043 | 5.577 | 0.043 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 768 | 16917044.804 | 45.398 | 0.059 | 75.669 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 824 | 16851060.349 | 48.899 | 0.059 | 81.246 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 768 | 16917044.804 | 45.398 | 0.059 | 75.669 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 824 | 16851060.349 | 48.899 | 0.059 | 81.246 | 0.099 | | `*=`*(a: var FF; b: FF) | 417 | 7897129.005 | 52.804 | 0.127 | 132.957 | 0.319 | -| square*(a: var FF; skipFinalSub: static bool = false) | 129 | 7915567.282 | 16.297 | 0.126 | 41.283 | 0.320 | +| square*(a: var FF; lazyReduce: static bool = false) | 129 | 7915567.282 | 16.297 | 0.126 | 41.283 | 0.320 | | double*(r: var ECP_ShortW_Jac[F, G]; P: ECP_ShortW_Jac[F ... | 129 | 577954.400 | 223.201 | 1.730 | 722.403 | 5.600 | | sum_vartime*(r: var ECP_ShortW_Jac[F, G]; p, q: ECP_Shor ... | 6 | 418205.897 | 14.347 | 2.391 | 46.728 | 7.788 | | mixedSum_vartime*(r: var ECP_ShortW_Jac[F, G]; p: ECP_ShortW ... | 49 | 558188.280 | 87.784 | 1.792 | 284.097 | 5.798 | @@ -216,8 +216,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sum*(r: var FF; a, b: FF) | 256 | 23374726.077 | 10.952 | 0.043 | 10.989 | 0.043 | | diff*(r: var FF; a, b: FF) | 53 | 23378914.865 | 2.267 | 0.043 | 2.145 | 0.040 | | double*(r: var FF; a: FF) | 640 | 23390103.063 | 27.362 | 0.043 | 28.116 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 1301 | 16744317.743 | 77.698 | 0.060 | 129.690 | 0.100 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 362 | 16813748.258 | 21.530 | 0.059 | 35.838 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 1301 | 16744317.743 | 77.698 | 0.060 | 129.690 | 0.100 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 362 | 16813748.258 | 21.530 | 0.059 | 35.838 | 0.099 | | `*=`*(a: var FF; b: FF) | 347 | 7342361.405 | 47.260 | 0.136 | 121.506 | 0.350 | | double*(r: var ECP_ShortW_Prj[F, G]; P: ECP_ShortW_Prj[F ... | 128 | 594950.359 | 215.144 | 1.681 | 696.036 | 5.438 | | sum_vartime*(r: var ECP_ShortW_Prj[F, G]; p, q: ECP_Shor ... | 6 | 472255.018 | 12.705 | 2.118 | 41.217 | 6.869 | @@ -283,8 +283,8 @@ The CPU Cycle Count is indicative only. 
It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 1694 | 23918108.013 | 70.825 | 0.042 | 69.465 | 0.041 | | diff*(r: var FF; a, b: FF) | 968 | 23438824.185 | 41.299 | 0.043 | 42.207 | 0.044 | | double*(r: var FF; a: FF) | 824 | 23427060.529 | 35.173 | 0.043 | 36.366 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 6 | 16620498.615 | 0.361 | 0.060 | 0.594 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 2 | 15384615.385 | 0.130 | 0.065 | 0.231 | 0.116 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 6 | 16620498.615 | 0.361 | 0.060 | 0.594 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 2 | 15384615.385 | 0.130 | 0.065 | 0.231 | 0.116 | | cneg*(r: var FF; a: FF; ctl: SecretBool) | 136 | 5648544.254 | 24.077 | 0.177 | 66.000 | 0.485 | | cneg*(a: var FF; ctl: SecretBool) | 136 | 4105288.578 | 33.128 | 0.244 | 95.700 | 0.704 | | csub*(a: var FF; b: FF; ctl: SecretBool) | 144 | 5656153.030 | 25.459 | 0.177 | 70.059 | 0.487 | @@ -315,8 +315,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 2446 | 23699253.948 | 103.210 | 0.042 | 101.277 | 0.041 | | diff*(r: var FF; a, b: FF) | 496 | 23511566.174 | 21.096 | 0.043 | 22.044 | 0.044 | | double*(r: var FF; a: FF) | 1488 | 23267086.767 | 63.953 | 0.043 | 64.779 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 6 | 16666666.667 | 0.360 | 0.060 | 0.627 | 0.104 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 2 | 15384615.385 | 0.130 | 0.065 | 0.231 | 0.116 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 6 | 16666666.667 | 0.360 | 0.060 | 0.627 | 0.104 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 2 | 15384615.385 | 0.130 | 0.065 | 0.231 | 0.116 | | cneg*(r: var FF; a: FF; ctl: SecretBool) | 136 | 5655591.134 | 24.047 | 0.177 | 65.835 | 0.484 | | cneg*(a: var FF; ctl: SecretBool) | 136 | 4113359.344 | 33.063 | 0.243 | 95.832 | 0.705 | | sum*(r: var ECP_ShortW_Prj[F, G]; P, Q: ECP_ShortW_Prj[F ... | 8 | 155315.679 | 51.508 | 6.439 | 168.828 | 21.104 | @@ -344,8 +344,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 1562 | 23860077.904 | 65.465 | 0.042 | 64.548 | 0.041 | | diff*(r: var FF; a, b: FF) | 992 | 23484292.512 | 42.241 | 0.043 | 44.319 | 0.045 | | double*(r: var FF; a: FF) | 700 | 23352793.995 | 29.975 | 0.043 | 29.997 | 0.043 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 6 | 16620498.615 | 0.361 | 0.060 | 0.594 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 2 | 16666666.667 | 0.120 | 0.060 | 0.198 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 6 | 16620498.615 | 0.361 | 0.060 | 0.594 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 2 | 16666666.667 | 0.120 | 0.060 | 0.198 | 0.099 | | double*(r: var ECP_ShortW_Jac[F, G]; P: ECP_ShortW_Jac[F ... | 67 | 278583.962 | 240.502 | 3.590 | 785.268 | 11.720 | | sum_vartime*(r: var ECP_ShortW_Jac[F, G]; p, q: ECP_Shor ... | 4 | 220337.116 | 18.154 | 4.538 | 59.367 | 14.842 | | mixedSum_vartime*(r: var ECP_ShortW_Jac[F, G]; p: ECP_ShortW ... | 69 | 296900.616 | 232.401 | 3.368 | 758.835 | 10.998 | @@ -371,8 +371,8 @@ The CPU Cycle Count is indicative only. 
It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 2094 | 23815210.345 | 87.927 | 0.042 | 85.602 | 0.041 | | diff*(r: var FF; a, b: FF) | 465 | 23317621.101 | 19.942 | 0.043 | 20.262 | 0.044 | | double*(r: var FF; a: FF) | 936 | 23199900.855 | 40.345 | 0.043 | 40.755 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 6 | 16666666.667 | 0.360 | 0.060 | 0.594 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 2 | 16666666.667 | 0.120 | 0.060 | 0.198 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 6 | 16666666.667 | 0.360 | 0.060 | 0.594 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 2 | 16666666.667 | 0.120 | 0.060 | 0.198 | 0.099 | | double*(r: var ECP_ShortW_Prj[F, G]; P: ECP_ShortW_Prj[F ... | 67 | 242212.156 | 276.617 | 4.129 | 904.695 | 13.503 | | sum_vartime*(r: var ECP_ShortW_Prj[F, G]; p, q: ECP_Shor ... | 4 | 247662.683 | 16.151 | 4.038 | 52.734 | 13.184 | | mixedSum_vartime*(r: var ECP_ShortW_Prj[F, G]; p: ECP_ShortW ... | 62 | 300657.081 | 206.215 | 3.326 | 672.903 | 10.853 | @@ -399,9 +399,9 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 11538 | 24022635.994 | 480.297 | 0.042 | 473.352 | 0.041 | | diff*(r: var FF; a, b: FF) | 5015 | 23495514.067 | 213.445 | 0.043 | 218.988 | 0.044 | | double*(r: var FF; a: FF) | 3332 | 23423385.424 | 142.251 | 0.043 | 144.870 | 0.043 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 304 | 16876700.161 | 18.013 | 0.059 | 30.063 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 12 | 16194331.984 | 0.741 | 0.062 | 1.254 | 0.104 | -| sumprod*(r: var FF; a, b: array[N, FF]; skipFinalSub: st ... | 36 | 7933010.137 | 4.538 | 0.126 | 10.758 | 0.299 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 304 | 16876700.161 | 18.013 | 0.059 | 30.063 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 12 | 16194331.984 | 0.741 | 0.062 | 1.254 | 0.104 | +| sumprod*(r: var FF; a, b: array[N, FF]; lazyReduce: st ... 
| 36 | 7933010.137 | 4.538 | 0.126 | 10.758 | 0.299 | | cadd*(a: var FF; b: FF; ctl: SecretBool) | 60 | 5248425.472 | 11.432 | 0.191 | 31.944 | 0.532 | | csub*(a: var FF; b: FF; ctl: SecretBool) | 30 | 5617977.528 | 5.340 | 0.178 | 14.619 | 0.487 | | div2*(a: var FF) | 252 | 23200147.303 | 10.862 | 0.043 | 11.583 | 0.046 | diff --git a/tests/math_fields/t_finite_fields.nim b/tests/math_fields/t_finite_fields.nim index 021c045c..79afa4b9 100644 --- a/tests/math_fields/t_finite_fields.nim +++ b/tests/math_fields/t_finite_fields.nim @@ -319,52 +319,55 @@ proc largeField() = check: bool r.isZero() - test "fromMont doesn't need a final substraction with 256-bit prime (full word used)": - block: - let a = Fp[Secp256k1].getMinusOne() - let expected = BigInt[256].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E" + # Outdated tests as Crandall primes / Pseudo-Mersenne primes + # don't use Montgomery representaiton anymore - var r: BigInt[256] - r.fromField(a) + # test "fromMont doesn't need a final substraction with 256-bit prime (full word used)": + # block: + # let a = Fp[Secp256k1].getMinusOne() + # let expected = BigInt[256].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E" - check: bool(r == expected) - block: - var a: Fp[Secp256k1] - var d: FpDbl[Secp256k1] + # var r: BigInt[256] + # r.fromField(a) - # Set Montgomery repr to the largest field element in Montgomery Residue form - a.mres = BigInt[256].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E" - d.limbs2x = (BigInt[512].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E").limbs + # check: bool(r == expected) + # block: + # var a: Fp[Secp256k1] + # var d: FpDbl[Secp256k1] - var r, expected: BigInt[256] + # # Set Montgomery repr to the largest field element in Montgomery Residue form + # a.mres = BigInt[256].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E" + # d.limbs2x = (BigInt[512].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E").limbs - r.fromField(a) - expected.limbs.redc2xMont(d.limbs2x, Fp[Secp256k1].getModulus().limbs, Fp[Secp256k1].getNegInvModWord(), Fp[Secp256k1].getSpareBits()) + # var r, expected: BigInt[256] - check: bool(r == expected) + # r.fromField(a) + # expected.limbs.redc2xMont(d.limbs2x, Fp[Secp256k1].getModulus().limbs, Fp[Secp256k1].getNegInvModWord(), Fp[Secp256k1].getSpareBits()) - test "fromMont doesn't need a final substraction with 255-bit prime (1 spare bit)": - block: - let a = Fp[Edwards25519].getMinusOne() - let expected = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec" + # check: bool(r == expected) - var r: BigInt[255] - r.fromField(a) + # test "fromMont doesn't need a final substraction with 255-bit prime (1 spare bit)": + # block: + # let a = Fp[Edwards25519].getMinusOne() + # let expected = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec" - check: bool(r == expected) - block: - var a: Fp[Edwards25519] - var d: FpDbl[Edwards25519] + # var r: BigInt[255] + # r.fromField(a) + + # check: bool(r == expected) + # block: + # var a: Fp[Edwards25519] + # var d: FpDbl[Edwards25519] - # Set Montgomery repr to the largest field element in Montgomery Residue form - a.mres = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec" - d.limbs2x = (BigInt[512].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec").limbs + # # Set 
Montgomery repr to the largest field element in Montgomery Residue form + # a.mres = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec" + # d.limbs2x = (BigInt[512].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec").limbs - var r, expected: BigInt[255] + # var r, expected: BigInt[255] - r.fromField(a) - expected.limbs.redc2xMont(d.limbs2x, Fp[Edwards25519].getModulus().limbs, Fp[Edwards25519].getNegInvModWord(), Fp[Edwards25519].getSpareBits()) + # r.fromField(a) + # expected.limbs.redc2xMont(d.limbs2x, Fp[Edwards25519].getModulus().limbs, Fp[Edwards25519].getNegInvModWord(), Fp[Edwards25519].getSpareBits()) - check: bool(r == expected) + # check: bool(r == expected) largeField() diff --git a/tests/math_fields/t_finite_fields_mulsquare.nim b/tests/math_fields/t_finite_fields_mulsquare.nim index 668ace72..87d62e1d 100644 --- a/tests/math_fields/t_finite_fields_mulsquare.nim +++ b/tests/math_fields/t_finite_fields_mulsquare.nim @@ -28,7 +28,7 @@ echo "test_finite_fields_mulsquare xoshiro512** seed: ", seed static: doAssert defined(CTT_TEST_CURVES), "This modules requires the -d:CTT_TEST_CURVES compile option" proc sanity(Name: static Algebra) = - test "Squaring 0,1,2 with " & $Name & " [FastSquaring = " & $(Fp[Name].getSpareBits() >= 2) & "]": + test "Squaring 0,1,2 with " & $Algebra(Name) & " [FastSquaring = " & $(Fp[Name].getSpareBits() >= 2) & "]": block: # 0² mod var n: Fp[Name] diff --git a/tests/math_fields/t_finite_fields_powinv.nim b/tests/math_fields/t_finite_fields_powinv.nim index 00df8a5a..70a7422f 100644 --- a/tests/math_fields/t_finite_fields_powinv.nim +++ b/tests/math_fields/t_finite_fields_powinv.nim @@ -234,13 +234,14 @@ proc main() = r.inv(x) let computed = r.toHex() - check: - computed == expected + check: computed == expected var r2: Fp[BLS12_381] r2.inv_vartime(x) let computed2 = r2.toHex() + check: computed2 == expected + test "Specific tests on Fp[BN254_Snarks]": block: var r, x: Fp[BN254_Snarks] @@ -275,6 +276,9 @@ proc main() = proc testRandomInv(name: static Algebra) = test "Random inversion testing on " & $Algebra(name): var aInv, r: Fp[name] + var aFLT, pm2: Fp[name] + pm2 = Fp[name].fromUint(2'u) + pm2.neg() for _ in 0 ..< Iters: let a = rng.random_unsafe(Fp[name]) @@ -290,6 +294,15 @@ proc main() = r.prod(aInv, a) check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow_vartime(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + for _ in 0 ..< Iters: let a = rng.randomHighHammingWeight(Fp[name]) aInv.inv(a) @@ -303,6 +316,16 @@ proc main() = check: bool r.isOne() or (a.isZero() and r.isZero()) r.prod(aInv, a) check: bool r.isOne() or (a.isZero() and r.isZero()) + + aFLT = a + aFLT.pow(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow_vartime(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + for _ in 0 ..< Iters: let a = rng.random_long01Seq(Fp[name]) aInv.inv(a) @@ -317,6 +340,16 @@ proc main() = r.prod(aInv, a) check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow_vartime(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + + testRandomInv P224 testRandomInv BN254_Nogami testRandomInv BN254_Snarks diff --git 
a/tests/math_fields/t_finite_fields_sqrt.nim b/tests/math_fields/t_finite_fields_sqrt.nim index 96d471fd..abe917b1 100644 --- a/tests/math_fields/t_finite_fields_sqrt.nim +++ b/tests/math_fields/t_finite_fields_sqrt.nim @@ -29,7 +29,7 @@ echo "test_finite_fields_sqrt xoshiro512** seed: ", seed static: doAssert defined(CTT_TEST_CURVES), "This modules requires the -d:CTT_TEST_CURVES compile option" proc exhaustiveCheck(Name: static Algebra, modulus: static int) = - test "Exhaustive square root check for " & $Algebra(C): + test "Exhaustive square root check for " & $Algebra(Name): var squares_to_roots: Table[uint16, set[uint16]] # Create all squares @@ -103,7 +103,7 @@ template testSqrtImpl(a: untyped): untyped {.dirty.} = bool(r == a or r == na) proc randomSqrtCheck(Name: static Algebra) = - test "Random square root check for " & $Algebra(C): + test "Random square root check for " & $Algebra(Name): for _ in 0 ..< Iters: let a = rng.random_unsafe(Fp[Name]) testSqrtImpl(a) @@ -129,7 +129,7 @@ template testSqrtRatioImpl(u, v: untyped): untyped {.dirty.} = check: bool(r == u_over_v) proc randomSqrtRatioCheck(Name: static Algebra) = - test "Random square root check for " & $Algebra(C): + test "Random square root check for " & $Algebra(Name): for _ in 0 ..< Iters: let u = rng.random_unsafe(Fp[Name]) let v = rng.random_unsafe(Fp[Name]) diff --git a/tests/math_fields/t_finite_fields_vs_gmp.nim b/tests/math_fields/t_finite_fields_vs_gmp.nim index 2da68470..218b727c 100644 --- a/tests/math_fields/t_finite_fields_vs_gmp.nim +++ b/tests/math_fields/t_finite_fields_vs_gmp.nim @@ -58,7 +58,7 @@ proc binary_prologue[Name: static Algebra, N: static int]( bTest = rng.random_unsafe(Fp[Name]) # Set modulus to curve modulus - let err = mpz_set_str(p, Fp[Name].getmodulus().toHex(), 0) + let err = mpz_set_str(p, Fp[Name].getModulus().toHex(), 0) doAssert err == 0, "Error on prime for curve " & $Name #########################################################
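The checks added to `t_finite_fields_powinv.nim` cross-validate inversion against Fermat's little theorem: for a prime p and a nonzero a, a^(p-2) ≡ a⁻¹ (mod p), so multiplying a by a.pow(p-2) must give one. A self-contained toy version of the same identity over a machine-word prime, using plain `uint64` arithmetic instead of `Fp` and made-up names:

```nim
proc powMod(a, e, m: uint64): uint64 =
  ## Square-and-multiply exponentiation modulo m
  ## (m small enough that products fit in 64 bits).
  var base = a mod m
  var exp = e
  result = 1
  while exp > 0'u64:
    if (exp and 1'u64) == 1'u64:
      result = result * base mod m
    base = base * base mod m
    exp = exp shr 1

when isMainModule:
  const p = 2147483647'u64            # 2³¹ - 1, a Mersenne prime
  let a = 123456789'u64
  let aInv = powMod(a, p - 2, p)      # Fermat inversion: a^(p-2) ≡ a⁻¹
  doAssert a * aInv mod p == 1'u64
```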