diff --git a/PLANNING.md b/PLANNING.md index cfeae65c..7725fc1e 100644 --- a/PLANNING.md +++ b/PLANNING.md @@ -92,13 +92,13 @@ Other tracks are stretch goals, contributions towards them are accepted. - ARM assembly - Finish Nvidia GPU codegenerator up to MSM -- Implement a backend for prime moduli of special form with fast reduction - that don't need Montgomery form +- Implement a backend for Solinas prime like P256 - Implement an unsaturated finite fields backend for Risc-V, WASM, WebGPU, AMD GPU, Apple Metal, Vulkan, ... - ideally in LLVM IR so that pristine Risc-V assembly can be generated and used in zkVMs without any risk of C stdlib or syscalls being used and without depending on the Nim compiler at build time. - introduce batchAffine_vartime +- Optimized square_repeated in assembly for Montgomery and Crandall/Pseudo-Mersenne primes ### User Experience track diff --git a/benchmarks/bench_elliptic_parallel_template.nim b/benchmarks/bench_elliptic_parallel_template.nim index a6a7c77b..ce048534 100644 --- a/benchmarks/bench_elliptic_parallel_template.nim +++ b/benchmarks/bench_elliptic_parallel_template.nim @@ -35,7 +35,7 @@ export bench_elliptic_template # # ############################################################ -proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) = +proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) {.noinline.} = var points = newSeq[EC_ShortW_Aff[EC.F, EC.G]](numInputs) for i in 0 ..< numInputs: @@ -59,7 +59,7 @@ type BenchMsmContext*[EC] = object coefs: seq[BigInt[64]] # seq[getBigInt(EC.getName(), kScalarField)] points: seq[affine(EC)] -proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] = +proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] {.noinline.} = result.tp = Threadpool.new() let maxNumInputs = inputSizes.max() @@ -103,7 +103,7 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC let stop = getMonotime() stdout.write &"in {float64(inNanoSeconds(stop-start)) / 1e6:6.3f} ms\n" -proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int) = +proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int) {.noinline.} = const bits = 64 # EC.getScalarField().bits() type ECaff = affine(EC) diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim index 13aca484..cbd20624 100644 --- a/benchmarks/bench_elliptic_template.nim +++ b/benchmarks/bench_elliptic_template.nim @@ -72,7 +72,7 @@ func `+=`[F; G: static Subgroup](P: var EC_ShortW_JacExt[F, G], Q: EC_ShortW_Jac func `+=`[F; G: static Subgroup](P: var EC_ShortW_JacExt[F, G], Q: EC_ShortW_Aff[F, G]) {.inline.}= P.mixedSum_vartime(P, Q) -proc addBench*(EC: typedesc, iters: int) = +proc addBench*(EC: typedesc, iters: int) {.noinline.} = var r {.noInit.}: EC let P = rng.random_unsafe(EC) let Q = rng.random_unsafe(EC) @@ -88,7 +88,7 @@ proc addBench*(EC: typedesc, iters: int) = bench("EC Add vartime " & $EC.G, EC, iters): r.sum_vartime(P, Q) -proc mixedAddBench*(EC: typedesc, iters: int) = +proc mixedAddBench*(EC: typedesc, iters: int) {.noinline.} = var r {.noInit.}: EC let P = rng.random_unsafe(EC) let Q = rng.random_unsafe(EC) @@ -106,25 +106,25 @@ proc mixedAddBench*(EC: typedesc, iters: int) = bench("EC Mixed Addition vartime " & $EC.G, EC, iters): r.mixedSum_vartime(P, Qaff) -proc doublingBench*(EC: typedesc, iters: int) = +proc doublingBench*(EC: typedesc, iters: int) {.noinline.} 
= var r {.noInit.}: EC let P = rng.random_unsafe(EC) bench("EC Double " & $EC.G, EC, iters): r.double(P) -proc affFromProjBench*(EC: typedesc, iters: int) = +proc affFromProjBench*(EC: typedesc, iters: int) {.noinline.} = var r {.noInit.}: EC_ShortW_Aff[EC.F, EC.G] let P = rng.random_unsafe(EC) bench("EC Projective to Affine " & $EC.G, EC, iters): r.affine(P) -proc affFromJacBench*(EC: typedesc, iters: int) = +proc affFromJacBench*(EC: typedesc, iters: int) {.noinline.} = var r {.noInit.}: EC_ShortW_Aff[EC.F, EC.G] let P = rng.random_unsafe(EC) bench("EC Jacobian to Affine " & $EC.G, EC, iters): r.affine(P) -proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) = +proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) {.noinline.} = var r = newSeq[affine(EC)](numPoints) var points = newSeq[EC](numPoints) @@ -139,7 +139,7 @@ proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, ite for i in 0 ..< numPoints: r[i].affine(points[i]) -proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) = +proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) {.noinline.} = var r = newSeq[affine(EC)](numPoints) var points = newSeq[EC](numPoints) @@ -154,7 +154,7 @@ proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iter for i in 0 ..< numPoints: r[i].affine(points[i]) -proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int) = +proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -165,7 +165,7 @@ proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int) r = P r.scalarMulGeneric(exponent, window) -proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) = +proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -176,7 +176,7 @@ proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) = r = P r.scalarMulEndo(exponent) -proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) = +proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -190,7 +190,7 @@ proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) = else: {.error: "Not implemented".} -proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) = +proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -201,7 +201,7 @@ proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) r = P r.scalarMul_doubleAdd_vartime(exponent) -proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static int, iters: int) = +proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -212,7 +212,7 @@ proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static i r = P r.scalarMul_jy00_vartime(exponent) -proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: int) = +proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} = var r 
{.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -223,7 +223,7 @@ proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: i r = P r.scalarMul_wNAF_vartime(exponent, window) -proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) = +proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -234,14 +234,14 @@ proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iter r = P r.scalarMulEndo_wNAF_vartime(exponent, window) -proc subgroupCheckBench*(EC: typedesc, iters: int) = +proc subgroupCheckBench*(EC: typedesc, iters: int) {.noinline.} = var P = rng.random_unsafe(EC) P.clearCofactor() bench("Subgroup check", EC, iters): discard P.isInSubgroup() -proc subgroupCheckScalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) = +proc subgroupCheckScalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} = var r {.noInit.}: EC var P = rng.random_unsafe(EC) P.clearCofactor() @@ -253,7 +253,7 @@ proc subgroupCheckScalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: sta discard r.isInSubgroup() r.scalarMulEndo_wNAF_vartime(exponent, window) -proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) = +proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) {.noinline.} = var points = newSeq[EC_ShortW_Aff[EC.F, EC.G]](numPoints) for i in 0 ..< numPoints: @@ -271,7 +271,7 @@ proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) r += points[i] -proc msmBench*(EC: typedesc, numPoints: int, iters: int) = +proc msmBench*(EC: typedesc, numPoints: int, iters: int) {.noinline.} = const bits = EC.getScalarField().bits() var points = newSeq[EC_ShortW_Aff[EC.F, EC.G]](numPoints) var scalars = newSeq[BigInt[bits]](numPoints) diff --git a/benchmarks/bench_fields_template.nim b/benchmarks/bench_fields_template.nim index 5ad318e2..f919c6fc 100644 --- a/benchmarks/bench_fields_template.nim +++ b/benchmarks/bench_fields_template.nim @@ -61,37 +61,46 @@ func random_unsafe(rng: var RngState, a: var ExtensionField2x) = for i in 0 ..< a.coords.len: rng.random_unsafe(a.coords[i]) -proc addBench*(T: typedesc, iters: int) = +proc addBench*(T: typedesc, iters: int) {.noinline.} = var x = rng.random_unsafe(T) let y = rng.random_unsafe(T) bench("Addition", T, iters): x += y -proc subBench*(T: typedesc, iters: int) = +proc add10Bench*(T: typedesc, iters: int) {.noinline.} = + var xs: array[10, T] + for x in xs.mitems(): + x = rng.random_unsafe(T) + let y = rng.random_unsafe(T) + bench("Additions (10)", T, iters): + staticFor i, 0, 10: + xs[i] += y + +proc subBench*(T: typedesc, iters: int) {.noinline.} = var x = rng.random_unsafe(T) let y = rng.random_unsafe(T) preventOptimAway(x) bench("Substraction", T, iters): x -= y -proc negBench*(T: typedesc, iters: int) = +proc negBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) bench("Negation", T, iters): r.neg(x) -proc ccopyBench*(T: typedesc, iters: int) = +proc ccopyBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) bench("Conditional Copy", T, iters): r.ccopy(x, CtFalse) -proc div2Bench*(T: typedesc, iters: int) = +proc div2Bench*(T: typedesc, iters: int) {.noinline.} = var x = rng.random_unsafe(T) bench("Division by 2", T, iters): x.div2() -proc mulBench*(T: 
typedesc, iters: int) = +proc mulBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) let y = rng.random_unsafe(T) @@ -99,14 +108,14 @@ proc mulBench*(T: typedesc, iters: int) = bench("Multiplication", T, iters): r.prod(x, y) -proc sqrBench*(T: typedesc, iters: int) = +proc sqrBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) preventOptimAway(r) bench("Squaring", T, iters): r.square(x) -proc mul2xUnrBench*(T: typedesc, iters: int) = +proc mul2xUnrBench*(T: typedesc, iters: int) {.noinline.} = var r: doublePrec(T) let x = rng.random_unsafe(T) let y = rng.random_unsafe(T) @@ -114,14 +123,14 @@ proc mul2xUnrBench*(T: typedesc, iters: int) = bench("Multiplication 2x unreduced", T, iters): r.prod2x(x, y) -proc sqr2xUnrBench*(T: typedesc, iters: int) = +proc sqr2xUnrBench*(T: typedesc, iters: int) {.noinline.} = var r: doublePrec(T) let x = rng.random_unsafe(T) preventOptimAway(r) bench("Squaring 2x unreduced", T, iters): r.square2x(x) -proc rdc2xBench*(T: typedesc, iters: int) = +proc rdc2xBench*(T: typedesc, iters: int) {.noinline.} = var r: T var t: doublePrec(T) rng.random_unsafe(t) @@ -129,7 +138,7 @@ proc rdc2xBench*(T: typedesc, iters: int) = bench("Redc 2x", T, iters): r.redc2x(t) -proc sumprodBench*(T: typedesc, iters: int) = +proc sumprodBench*(T: typedesc, iters: int) {.noinline.} = var r: T let a = rng.random_unsafe(T) let b = rng.random_unsafe(T) @@ -139,40 +148,40 @@ proc sumprodBench*(T: typedesc, iters: int) = bench("Linear combination", T, iters): r.sumprod([a, b], [u, v]) -proc toBigBench*(T: typedesc, iters: int) = +proc toBigBench*(T: typedesc, iters: int) {.noinline.} = var r: T.getBigInt() let x = rng.random_unsafe(T) preventOptimAway(r) bench("BigInt <- field conversion", T, iters): r.fromField(x) -proc toFieldBench*(T: typedesc, iters: int) = +proc toFieldBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T.getBigInt()) preventOptimAway(r) bench("BigInt -> field conversion", T, iters): r.fromBig(x) -proc invBench*(T: typedesc, iters: int) = +proc invBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) preventOptimAway(r) bench("Inversion (constant-time)", T, iters): r.inv(x) -proc invVartimeBench*(T: typedesc, iters: int) = +proc invVartimeBench*(T: typedesc, iters: int) {.noinline.} = var r: T let x = rng.random_unsafe(T) preventOptimAway(r) bench("Inversion (variable-time)", T, iters): r.inv_vartime(x) -proc isSquareBench*(T: typedesc, iters: int) = +proc isSquareBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) bench("isSquare (constant-time)", T, iters): let qrt = x.isSquare() -proc sqrtBench*(T: typedesc, iters: int) = +proc sqrtBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) const algoType = block: @@ -192,14 +201,14 @@ proc sqrtBench*(T: typedesc, iters: int) = var r = x discard r.sqrt_if_square() -proc sqrtRatioBench*(T: typedesc, iters: int) = +proc sqrtRatioBench*(T: typedesc, iters: int) {.noinline.} = var r: T let u = rng.random_unsafe(T) let v = rng.random_unsafe(T) bench("Fused SquareRoot+Division+isSquare sqrt(u/v)", T, iters): let isSquare = r.sqrt_ratio_if_square(u, v) -proc sqrtVartimeBench*(T: typedesc, iters: int) = +proc sqrtVartimeBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) const algoType = block: @@ -219,21 +228,21 @@ proc sqrtVartimeBench*(T: typedesc, iters: int) = var r = x discard r.sqrt_if_square_vartime() -proc 
sqrtRatioVartimeBench*(T: typedesc, iters: int) = +proc sqrtRatioVartimeBench*(T: typedesc, iters: int) {.noinline.} = var r: T let u = rng.random_unsafe(T) let v = rng.random_unsafe(T) bench("Fused SquareRoot+Division+isSquare sqrt_vartime(u/v)", T, iters): let isSquare = r.sqrt_ratio_if_square_vartime(u, v) -proc powBench*(T: typedesc, iters: int) = +proc powBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) let exponent = rng.random_unsafe(BigInt[Fr[T.Name].bits()]) var r = x bench("Exp curve order (constant-time) - " & $exponent.bits & "-bit", T, iters): r.pow(exponent) -proc powVartimeBench*(T: typedesc, iters: int) = +proc powVartimeBench*(T: typedesc, iters: int) {.noinline.} = let x = rng.random_unsafe(T) let exponent = rng.random_unsafe(BigInt[Fr[T.Name].bits()]) var r = x diff --git a/benchmarks/bench_fp.nim b/benchmarks/bench_fp.nim index 7678340b..4e818508 100644 --- a/benchmarks/bench_fp.nim +++ b/benchmarks/bench_fp.nim @@ -44,6 +44,7 @@ proc main() = staticFor i, 0, AvailableCurves.len: const curve = AvailableCurves[i] addBench(Fp[curve], Iters) + add10Bench(Fp[curve], Iters) subBench(Fp[curve], Iters) negBench(Fp[curve], Iters) ccopyBench(Fp[curve], Iters) @@ -55,8 +56,9 @@ proc main() = sqr2xUnrBench(Fp[curve], Iters) rdc2xBench(Fp[curve], Iters) smallSeparator() - sumprodBench(Fp[curve], Iters) - smallSeparator() + when not Fp[curve].isCrandallPrimeField(): + sumprodBench(Fp[curve], Iters) + smallSeparator() toBigBench(Fp[curve], Iters) toFieldBench(Fp[curve], Iters) smallSeparator() diff --git a/constantine.nimble b/constantine.nimble index 20d8c950..45bfeb15 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -402,10 +402,10 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ ("tests/math_fields/t_io_fields", false), # ("tests/math_fields/t_finite_fields.nim", false), # ("tests/math_fields/t_finite_fields_conditional_arithmetic.nim", false), - # ("tests/math_fields/t_finite_fields_mulsquare.nim", false), - # ("tests/math_fields/t_finite_fields_sqrt.nim", false), + ("tests/math_fields/t_finite_fields_mulsquare.nim", false), + ("tests/math_fields/t_finite_fields_sqrt.nim", false), ("tests/math_fields/t_finite_fields_powinv.nim", false), - # ("tests/math_fields/t_finite_fields_vs_gmp.nim", true), + ("tests/math_fields/t_finite_fields_vs_gmp.nim", true), # ("tests/math_fields/t_fp_cubic_root.nim", false), # Double-precision finite fields diff --git a/constantine/hash_to_curve/h2c_hash_to_field.nim b/constantine/hash_to_curve/h2c_hash_to_field.nim index 3d862815..4a1d68e9 100644 --- a/constantine/hash_to_curve/h2c_hash_to_field.nim +++ b/constantine/hash_to_curve/h2c_hash_to_field.nim @@ -229,12 +229,12 @@ func hashToField*[Field; count: static int]( output[i].redc2x(big2x) output[i].mres.mulMont( output[i].mres, - Fp[Field.Name].getR3ModP(), + Fp[Field.Name].getR3modP(), Fp[Field.Name]) else: output[i].coords[j].redc2x(big2x) output[i].coords[j].mres.mulMont( output[i].coords[j].mres, - Fp[Field.Name].getR3ModP(), + Fp[Field.Name].getR3modP(), Fp[Field.Name]) diff --git a/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86.nim new file mode 100644 index 00000000..ee4b730a --- /dev/null +++ b/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86.nim @@ -0,0 +1,266 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT 
license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + constantine/platforms/abstractions + +when UseASM_X86_64: + import ./limbs_asm_mul_x86 + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + + +static: doAssert UseASM_X86_32 + +# Crandall reductions +# ------------------------------------------------------------ + +macro reduceCrandallPartial_gen*[N: static int]( + r_PIR: var array[N, SecretWord], + a_MEM: array[N*2, SecretWord], + m: static int, c: static BaseType) = + + result = newStmtList() + var ctx = init(Assembler_x86, BaseType) + + ctx.comment "Crandall reduction - Partial" + ctx.comment "----------------------------" + + let + r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it). + a = asmArray(a_MEM, 2*N, MemOffsettable, asmInput) + + tSym = ident"t" + t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber) + + hiSym = ident"hi" + hi = asmValue(hiSym, Reg, asmOutputEarlyClobber) + + csSym = ident"cs" + cs = asmValue(csSym, Reg, asmOutputEarlyClobber) + + S =(N*WordBitWidth - m) + csImm = c shl S + + # Prologue + result.add quote do: + var `hiSym`{.noinit.}, `csSym`{.noinit.}: BaseType + var `tSym`{.noInit.}: typeof(`r_PIR`) + + ctx.`xor` hi, hi + ctx.mov cs, csImm + + # Algorithm + # Note: always accumulate into a register. + # add-carry into memory is huge pessimization + + ctx.comment "First reduction pass" + ctx.comment "--------------------" + ctx.comment "(hi, r₀) <- aₙ*cs + a₀" + ctx.mov rax, a[N] + ctx.mul rdx, rax, cs, rax + ctx.add rax, a[0] + ctx.mov t[0], rax + ctx.adc hi, rdx # hi = 0 + for i in 1 ..< N: + ctx.comment " (hi, rᵢ) <- aᵢ₊ₙ*cs + aᵢ + hi" + ctx.mov rax, a[i+N] + ctx.mul rdx, rax, cs, rax + ctx.add rax, hi + ctx.adc rdx, 0 + ctx.`xor` hi, hi + ctx.add rax, a[i] + ctx.adc hi, rdx + ctx.mov t[i], rax + + # The first reduction pass may carry in `hi` + # which would be hi*2ʷⁿ ≡ hi*2ʷⁿ⁻ᵐ*c (mod p) + # ≡ hi*cs (mod p) + ctx.shld hi, t[N-1], S + # High-bits have been "carried" to `hi`, cancel them in r[N-1]. + # Note: there might be up to `c` not reduced. + ctx.mov rax, BaseType(MaxWord) shr S + ctx.`and` t[N-1], rax + + # Partially reduce to up to `m` bits + # We need to fold what's beyond `m` bits + # by repeatedly multiplying it by cs + # We distinguish 2 cases: + # 1. Folding (N..2N)*cs onto 0..N + # may overflow and a 3rd folding is necessary + # 2. Folding (N..2N)*cs onto 0..N + # may not overflow and 2 foldings are all we need. 
+  # This is possible:
+  # - if we don't use the full 2ʷⁿ
+  #   for example we use 255 bits out of 256 available
+  # - And (2ʷⁿ⁻ᵐ*c)² < 2ʷ
+  #
+  # There is a 3rd case that we don't handle
+  # c > 2ʷ, for example secp256k1 on 32-bit
+
+  if N*WordBitWidth == m: # Secp256k1 only according to eprint/iacr 2018/985
+    # We shifted hi by 2ʷⁿ⁻ᵐ so no need for cs = 2ʷⁿ⁻ᵐc
+    # we just need c
+    ctx.mov cs, c
+
+    # Second pass
+    ctx.mov rax, hi
+    ctx.mul rdx, rax, cs, rax
+    ctx.add t[0], rax
+    ctx.mov rax, 0
+    ctx.adc t[1], rdx
+    for i in 2 ..< N:
+      ctx.adc t[i], 0
+      ctx.mov r[i], t[i]
+
+    # Third pass
+    ctx.setc rax # This only sets the low 8 bits, need extra zero-out
+    ctx.mul rdx, rax, cs, rax
+    ctx.add t[0], rax
+    ctx.mov r[0], t[0]
+    ctx.adc t[1], rdx
+    ctx.mov r[1], t[1]
+
+  # We want to ensure that cs² < 2³² on 64-bit or 2¹⁶ on 32-bit
+  # But doing unsigned cs² may overflow, so we sqrt the rhs instead
+  elif csImm.uint64 < (1'u64 shl (WordBitWidth shr 1)) - 1:
+
+    # Second pass
+
+    # hi < cs, and cs² < 2ʷ (2³² on 32-bit or 2⁶⁴ on 64-bit)
+    # hence hi *= c cannot overflow
+    ctx.imul rax, hi, c
+    ctx.add t[0], rax
+    ctx.mov r[0], t[0]
+    for i in 1 ..< N:
+      ctx.adc t[i], 0
+      ctx.mov r[i], t[i]
+
+  else:
+    error "Not implemented"
+
+  # Code generation
+  result.add ctx.generate()
+
+func reduceCrandallPartial_asm*[N: static int](
+    r: var Limbs[N],
+    a: array[2*N, SecretWord],
+    m: static int, c: static SecretWord) =
+  ## Partial Reduction modulo p
+  ## with p of special form 2ᵐ-c
+  ## called "Crandall prime" or Pseudo-Mersenne Prime in the literature
+  ##
+  ## This is a partial reduction that reduces down to
+  ## 2ᵐ, i.e. the result fits in the same number of words as p
+  ## but values may be up to p+c
+  ##
+  ## Crandall primes allow fast reduction from the fact that
+  ## 2ᵐ-c ≡ 0 (mod p)
+  ## <=> 2ᵐ ≡ c (mod p)
+  ## <=> a2ᵐ+b ≡ ac + b (mod p)
+  r.reduceCrandallPartial_gen(a, m, BaseType(c))
+
+macro reduceCrandallFinal_gen*[N: static int](
+    a_PIR: var Limbs[N], p_MEM: Limbs[N]) =
+  ## Final Reduction modulo p
+  ## with p of special form 2ᵐ-c
+  ## called "Crandall prime" or Pseudo-Mersenne Prime in the literature
+  ##
+  ## This reduces `a` from [0, 2ᵐ) to [0, 2ᵐ-c)
+
+  result = newStmtList()
+  var ctx = init(Assembler_x86, BaseType)
+
+  ctx.comment "Crandall reduction - Final subtraction"
+  ctx.comment "--------------------------------------"
+
+  let a = asmArray(a_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
+  let tSym = ident"t"
+  let t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
+  let p = asmArray(p_MEM, N, MemOffsettable, asmInput)
+
+  result.add quote do:
+    var `tSym`{.noInit.}: typeof(`a_PIR`)
+
+  # Subtract the modulus, and test a < p with the last borrow
+  ctx.mov t[0], a[0]
+  ctx.sub t[0], p[0]
+  for i in 1 ..< N:
+    ctx.mov t[i], a[i]
+    ctx.sbb t[i], p[i]
+
+  # If we borrowed it means that we were smaller than
+  # the modulus and we don't need "scratch".
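+  #
+  # Toy illustration (4-bit words, p = 13; illustration only, not tied to a real curve):
+  #   a = 11: 11 - 13 borrows       -> keep a (already < p)
+  #   a = 15: 15 - 13 = 2, no borrow -> keep t = a - p
+  # The selection below uses CMOVcc on every word so that the
+  # branch-free choice does not leak whether a < p.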
+  var r0 = rax
+  var r1 = rdx
+  for i in 0 ..< N:
+    ctx.mov r0, a[i]
+    ctx.cmovnc r0, t[i]
+    ctx.mov a[i], r0
+    swap(r0, r1)
+
+  # Codegen
+  result.add ctx.generate()
+
+func reduceCrandallFinal_asm*[N: static int](
+    a: var Limbs[N], p: Limbs[N]) =
+  ## Final Reduction modulo p
+  ## with p of special form 2ᵐ-c
+  ## called "Crandall prime" or Pseudo-Mersenne Prime in the literature
+  ##
+  ## This reduces `a` from [0, 2ᵐ) to [0, 2ᵐ-c)
+  a.reduceCrandallFinal_gen(p)
+
+# Crandall Multiplication and squaring
+# ------------------------------------------------------------
+
+func mulCranPartialReduce_asm*[N: static int](
+    r: var Limbs[N],
+    a, b: Limbs[N],
+    m: static int, c: static SecretWord) =
+  static: doAssert UseASM_X86_64, "x86-32 does not have enough registers for multiplication"
+  var r2 {.noInit.}: Limbs[2*N]
+  r2.mul_asm(a, b)
+  r.reduceCrandallPartial_asm(r2, m, c)
+
+func squareCranPartialReduce_asm*[N: static int](
+    r: var Limbs[N],
+    a: Limbs[N],
+    m: static int, c: static SecretWord) =
+  static: doAssert UseASM_X86_64, "x86-32 does not have enough registers for squaring"
+  var r2 {.noInit.}: Limbs[2*N]
+  r2.square_asm(a)
+  r.reduceCrandallPartial_asm(r2, m, c)
+
+func mulCran_asm*[N: static int](
+    r: var Limbs[N],
+    a, b, p: Limbs[N],
+    m: static int, c: static SecretWord) =
+  r.mulCranPartialReduce_asm(a, b, m, c)
+  r.reduceCrandallFinal_asm(p)
+
+func squareCran_asm*[N: static int](
+    r: var Limbs[N],
+    a, p: Limbs[N],
+    m: static int, c: static SecretWord) =
+  r.squareCranPartialReduce_asm(a, m, c)
+  r.reduceCrandallFinal_asm(p)
diff --git a/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86_adx_bmi2.nim b/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86_adx_bmi2.nim
new file mode 100644
index 00000000..d4256467
--- /dev/null
+++ b/constantine/math/arithmetic/assembly/limbs_asm_crandall_x86_adx_bmi2.nim
@@ -0,0 +1,209 @@
+# Constantine
+# Copyright (c) 2018-2019 Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
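+
+# Implementation note (illustrative only; the exact register allocation is decided by the assembler DSL below):
+# this variant targets the BMI2/ADX extensions.
+#   - MULX computes a full-width product without touching the flags,
+#     so multiplications can be interleaved with carry chains.
+#   - ADCX/ADOX add with carry through CF and OF respectively,
+#     allowing two independent carry chains to be maintained at once.
+# A single folding step  t[i] <- t[i] + cs*a[N+i] + carry  is then emitted roughly as
+#   mulx  hi, lo, a[N+i]        # rdx holds cs
+#   adox  t[i], lo              # low-word chain via OF
+#   adcx  t[i+1], hi            # high-word chain via CF
+# see reduceCrandallPartial_adx_gen below.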
+ +import + # Standard library + std/macros, + # Internal + constantine/platforms/abstractions, + ./limbs_asm_crandall_x86 + +when UseASM_X86_64: + import ./limbs_asm_mul_x86_adx_bmi2 + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + + +static: doAssert UseASM_X86_32 + +# Crandall reduction +# ------------------------------------------------------------ + +macro reduceCrandallPartial_adx_gen*[N: static int]( + r_PIR: var array[N, SecretWord], + a_MEM: array[N*2, SecretWord], + m: static int, c: static BaseType) = + + result = newStmtList() + var ctx = init(Assembler_x86, BaseType) + + ctx.comment "Crandall reduction - Partial" + ctx.comment "----------------------------" + + let + r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it). + a = asmArray(a_MEM, 2*N, MemOffsettable, asmInput) + + tSym = ident"t" + t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber) + + hiSym = ident"hi" + hi = asmValue(hiSym, Reg, asmOutputEarlyClobber) + + csSym = ident"cs" + cs = asmValue(csSym, Reg, asmOutputEarlyClobber) + + S =(N*WordBitWidth - m) + csImm = c shl S + + # Prologue + # --------- + result.add quote do: + var `hiSym`{.noinit.}, `csSym`{.noinit.}: BaseType + var `tSym`{.noInit.}: typeof(`r_PIR`) + + ctx.`xor` rax, rax + ctx.mov cs, csImm + + # Algorithm + # --------- + ctx.comment "First reduction pass" + ctx.comment "--------------------" + ctx.comment "(hi, r₀) <- aₙ*cs + a₀" + ctx.mov rdx, cs + ctx.mov t[0], a[0] + for i in 0 ..< N: + # TODO: should we alternate rax with another register? + # to deal with false dependencies? + ctx.comment " (hi, rᵢ) <- aᵢ₊ₙ*cs + aᵢ + hi" + if i != N-1: + ctx.mov t[i+1], a[i+1] + ctx.mulx hi, rax, a[N+i], rdx + ctx.adox t[i], rax + if i != N-1: + ctx.adcx t[i+1], hi + + # Final carries + ctx.mov rdx, 0 + ctx.adcx hi, rdx + ctx.adox hi, rdx + + # The first reduction pass may carry in `hi` + # which would be hi*2ʷⁿ ≡ hi*2ʷⁿ⁻ᵐ*c (mod p) + # ≡ hi*cs (mod p) + ctx.shld hi, t[N-1], S + # High-bits have been "carried" to `hi`, cancel them in r[N-1]. + # Note: there might be up to `c` not reduced. + ctx.mov rax, BaseType(MaxWord) shr S + ctx.`and` t[N-1], rax + + # Partially reduce to up to `m` bits + # We need to fold what's beyond `m` bits + # by repeatedly multiplying it by cs + # We distinguish 2 cases: + # 1. Folding (N..2N)*cs onto 0..N + # may overflow and a 3rd folding is necessary + # 2. Folding (N..2N)*cs onto 0..N + # may not overflow and 2 foldings are all we need. 
+ # This is possible: + # - if we don't use the full 2ʷⁿ + # for example we use 255 bits out of 256 available + # - And (2ʷⁿ⁻ᵐ*c)² < 2ʷ + # + # There is a 3rd case that we don't handle + # c > 2ʷ, for example secp256k1 on 32-bit + + if N*WordBitWidth == m: # Secp256k1 only according to eprint/iacr 2018/985 + # We shifted hi by 2ʷⁿ⁻ᵐ so no need for cs = 2ʷⁿ⁻ᵐc + # we just need c + ctx.mov rdx, c + + # Second pass + ctx.mulx hi, rax, hi, rdx + ctx.add t[0], rax + ctx.mov rax, 0 + ctx.adc t[1], hi + for i in 2 ..< N: + ctx.adc t[i], 0 + ctx.mov r[i], t[i] + + # Third pass + ctx.setc rax # This only sets the low 8 bits, need extra zero-out + ctx.mulx hi, rax, rax, rdx + ctx.add t[0], rax + ctx.mov r[0], t[0] + ctx.adc t[1], hi + ctx.mov r[1], t[1] + + # We want to ensure that cs² < 2³² on 64-bit or 2¹⁶ on 32-bit + # But doing unsigned cs² may overflow, so we sqrt the rhs instead + elif csImm.uint64 < (1'u64 shl (WordBitWidth shr 1)) - 1: + + # Second pass + + # hi < cs, and cs² < 2ʷ (2³² on 32-bit or 2⁶⁴ on 64-bit) + # hence hi *= c cannot overflow + ctx.imul rax, hi, c + ctx.add t[0], rax + ctx.mov r[0], t[0] + for i in 1 ..< N: + ctx.adc t[i], 0 + ctx.mov r[i], t[i] + + else: + error "Not implemented" + + # Code generation + result.add ctx.generate() + +func reduceCrandallPartial_asm_adx*[N: static int]( + r: var Limbs[N], + a: array[2*N, SecretWord], + m: static int, c: static SecretWord) = + ## Partial Reduction modulo p + ## with p with special form 2ᵐ-c + ## called "Crandall prime" or Pseudo-Mersenne Prime in the litterature + ## + ## This is a partial reduction that reduces down to + ## 2ᵐ, i.e. it fits in the same amount of word by p + ## but values my be up to p+c + ## + ## Crandal primes allow fast reduction from the fact that + ## 2ᵐ-c ≡ 0 (mod p) + ## <=> 2ᵐ ≡ c (mod p) + ## <=> a2ᵐ+b ≡ ac + b (mod p) + r.reduceCrandallPartial_adx_gen(a, m, BaseType(c)) + +# Crandall Multiplication and squaring +# ------------------------------------------------------------ + +func mulCranPartialReduce_asm_adx*[N: static int]( + r: var Limbs[N], + a, b: Limbs[N], + m: static int, c: static SecretWord) = + static: doAssert UseASM_X86_64, "x86-32 does not have enough registers for squaring" + var r2 {.noInit.}: Limbs[2*N] + r2.mul_asm_adx(a, b) + r.reduceCrandallPartial_asm_adx(r2, m, c) + +func squareCranPartialReduce_asm_adx*[N: static int]( + r: var Limbs[N], + a: Limbs[N], + m: static int, c: static SecretWord) = + static: doAssert UseASM_X86_64, "x86-32 does not have enough registers for squaring" + var r2 {.noInit.}: Limbs[2*N] + r2.square_asm_adx(a) + r.reduceCrandallPartial_asm_adx(r2, m, c) + +func mulCran_asm_adx*[N: static int]( + r: var Limbs[N], + a, b, p: Limbs[N], + m: static int, c: static SecretWord) = + r.mulCranPartialReduce_asm_adx(a, b, m, c) + r.reduceCrandallFinal_asm(p) + +func squareCran_asm_adx*[N: static int]( + r: var Limbs[N], + a, p: Limbs[N], + m: static int, c: static SecretWord) = + r.squareCranPartialReduce_asm_adx(a, m, c) + r.reduceCrandallFinal_asm(p) diff --git a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim index 797a4306..14544d47 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim @@ -33,7 +33,7 @@ static: doAssert UseASM_X86_64 macro mulMont_CIOS_sparebit_gen[N: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N], m0ninv_REG: BaseType, - skipFinalSub: static bool): 
untyped = + lazyReduce: static bool): untyped = ## Generate an optimized Montgomery Multiplication kernel ## using the CIOS method ## @@ -161,21 +161,21 @@ macro mulMont_CIOS_sparebit_gen[N: static int]( ctx.mov rax, r # move r away from scratchspace that will be used for final substraction let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite) - if skipFinalSub: + if lazyReduce: for i in 0 ..< N: ctx.mov r2[i], t[i] else: ctx.finalSubNoOverflowImpl(r2, t, M, scratch) result.add ctx.generate() -func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = +func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) = ## Constant-time Montgomery multiplication - ## If "skipFinalSub" is set + ## If "lazyReduce" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipFinalSub) + r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, lazyReduce) # Montgomery Squaring # ------------------------------------------------------------ @@ -184,11 +184,11 @@ func squareMont_CIOS_asm*[N]( r: var Limbs[N], a, M: Limbs[N], m0ninv: BaseType, - spareBits: static int, skipFinalSub: static bool) = + spareBits: static int, lazyReduce: static bool) = ## Constant-time modular squaring var r2x {.noInit.}: Limbs[2*N] square_asm(r2x, a) - r.redcMont_asm(r2x, M, m0ninv, spareBits, skipFinalSub) + r.redcMont_asm(r2x, M, m0ninv, spareBits, lazyReduce) # Montgomery Sum of Products # ------------------------------------------------------------ @@ -196,7 +196,7 @@ func squareMont_CIOS_asm*[N]( macro sumprodMont_CIOS_spare2bits_gen[N, K: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]], M_MEM: Limbs[N], m0ninv_REG: BaseType, - skipFinalSub: static bool): untyped = + lazyReduce: static bool): untyped = ## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel ## using the CIOS method ## @@ -353,7 +353,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int]( ctx.mov rax, r # move r away from scratchspace that will be used for final substraction let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite) - if skipFinalSub: + if lazyReduce: ctx.comment " Copy result" for i in 0 ..< N: ctx.mov r2[i], t[i] @@ -367,11 +367,11 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int]( func sumprodMont_CIOS_spare2bits_asm*[N, K: static int]( r: var Limbs[N], a, b: array[K, Limbs[N]], M: Limbs[N], m0ninv: BaseType, - skipFinalSub: static bool) = + lazyReduce: static bool) = ## Sum of products ⅀aᵢ.bᵢ in the Montgomery domain - ## If "skipFinalSub" is set + ## If "lazyReduce" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.sumprodMont_CIOS_spare2bits_gen(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits_gen(a, b, M, m0ninv, lazyReduce) diff --git a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim index 12298f1c..3bbd0644 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim @@ -62,6 +62,8 @@ proc 
mulx_by_word( # Steady state for j in 1 ..< N-1: + # TODO: should we alternate lo with another register? + # to deal with false dependencies? ctx.mulx t[j+1], lo, a[j], rdx if j == 1: ctx.add t[j], lo @@ -172,7 +174,7 @@ proc partialRedx( macro mulMont_CIOS_sparebit_adx_gen[N: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N], m0ninv_REG: BaseType, - skipFinalSub: static bool): untyped = + lazyReduce: static bool): untyped = ## Generate an optimized Montgomery Multiplication kernel ## using the CIOS method ## This requires the most significant word of the Modulus @@ -258,7 +260,7 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int]( M, m0ninv, lo, C) - if skipFinalSub: + if lazyReduce: for i in 0 ..< N: ctx.mov r[i], t[i] else: @@ -268,14 +270,14 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int]( result.add ctx.generate() -func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = +func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) = ## Constant-time Montgomery multiplication - ## If "skipFinalSub" is set + ## If "lazyReduce" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipFinalSub) + r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, lazyReduce) # Montgomery Squaring # ------------------------------------------------------------ @@ -284,11 +286,11 @@ func squareMont_CIOS_asm_adx*[N]( r: var Limbs[N], a, M: Limbs[N], m0ninv: BaseType, - spareBits: static int, skipFinalSub: static bool) = + spareBits: static int, lazyReduce: static bool) = ## Constant-time modular squaring var r2x {.noInit.}: Limbs[2*N] r2x.square_asm_adx(a) - r.redcMont_asm_adx(r2x, M, m0ninv, spareBits, skipFinalSub) + r.redcMont_asm_adx(r2x, M, m0ninv, spareBits, lazyReduce) # Montgomery Sum of Products # ------------------------------------------------------------ @@ -296,7 +298,7 @@ func squareMont_CIOS_asm_adx*[N]( macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]], M_MEM: Limbs[N], m0ninv_REG: BaseType, - skipFinalSub: static bool): untyped = + lazyReduce: static bool): untyped = ## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel ## using the CIOS method ## @@ -440,7 +442,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int]( ctx.mov rax, r # move r away from scratchspace that will be used for final substraction let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite) - if skipFinalSub: + if lazyReduce: ctx.comment " Copy result" for i in 0 ..< N: ctx.mov r2[i], t[i] @@ -452,11 +454,11 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int]( func sumprodMont_CIOS_spare2bits_asm_adx*[N, K: static int]( r: var Limbs[N], a, b: array[K, Limbs[N]], M: Limbs[N], m0ninv: BaseType, - skipFinalSub: static bool) = + lazyReduce: static bool) = ## Sum of products ⅀aᵢ.bᵢ in the Montgomery domain - ## If "skipFinalSub" is set + ## If "lazyReduce" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.sumprodMont_CIOS_spare2bits_adx_gen(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits_adx_gen(a, 
b, M, m0ninv, lazyReduce) diff --git a/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim b/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim index 50cad165..d7d6aabd 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim @@ -52,6 +52,8 @@ proc mulx_by_word( ctx.mov r0, rax # Steady state + # TODO: should we alternate rax with another register? + # to deal with false dependencies? for j in 1 ..< N: ctx.mulx t[j], rax, a[j], rdx if j == 1: diff --git a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim index bce6e0ea..c4743aed 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim @@ -33,7 +33,7 @@ macro redc2xMont_gen*[N: static int]( a_PIR: array[N*2, SecretWord], M_MEM: array[N, SecretWord], m0ninv_REG: BaseType, - spareBits: static int, skipFinalSub: static bool) = + spareBits: static int, lazyReduce: static bool) = # No register spilling handling doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs." doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs." @@ -134,7 +134,7 @@ macro redc2xMont_gen*[N: static int]( # Second part - Final substraction # --------------------------------------------- - if not(spareBits >= 2 and skipFinalSub): + if not(spareBits >= 2 and lazyReduce): ctx.mov rdx, r_temp let r = rdx.asArrayAddr(r_PIR, len = N, memIndirect = memWrite) @@ -150,7 +150,7 @@ macro redc2xMont_gen*[N: static int]( # v is invalidated from now on let t = repackRegisters(v, u[N], u[N+1]) - if spareBits >= 2 and skipFinalSub: + if spareBits >= 2 and lazyReduce: for i in 0 ..< N: ctx.mov r_temp[i], u[i] elif spareBits >= 1: @@ -167,10 +167,10 @@ func redcMont_asm*[N: static int]( M: array[N, SecretWord], m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool) = + lazyReduce: static bool) = ## Constant-time Montgomery reduction static: doAssert UseASM_X86_64, "This requires x86-64." - redc2xMont_gen(r, a, M, m0ninv, spareBits, skipFinalSub) + redc2xMont_gen(r, a, M, m0ninv, spareBits, lazyReduce) # Montgomery conversion # ---------------------------------------------------------- diff --git a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim index 41f21773..e814939e 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim @@ -37,7 +37,7 @@ macro redc2xMont_adx_gen[N: static int]( a_PIR: array[N*2, SecretWord], M_MEM: array[N, SecretWord], m0ninv_REG: BaseType, - spareBits: static int, skipFinalSub: static bool) = + spareBits: static int, lazyReduce: static bool) = # No register spilling handling doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." 
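The `skipFinalSub` -> `lazyReduce` rename in these reduction kernels keeps the same contract: with `lazyReduce` the output is only guaranteed to lie in [0, 2M) rather than [0, M). As the guards in these hunks show, the Montgomery reduction kernels only honor it when `spareBits >= 2`, i.e. when the modulus leaves enough headroom in its limb representation for the unreduced value to fit (for example a 381-bit modulus stored in 6x64 = 384 bits has 3 spare bits). A minimal sketch of how such lazily-reduced values are typically chained, modelled on the `square_repeated` update later in this diff (the field type `FF` and its `square` overload are the ones touched by this patch):

    func squareChainSketch(a: var FF, num: int) =
      ## Repeated squarings: intermediate results stay in [0, 2M),
      ## only the last squaring pays for the final conditional subtraction.
      for _ in 0 ..< num-1:
        a.square(lazyReduce = true)
      a.square(lazyReduce = false)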
@@ -127,7 +127,7 @@ macro redc2xMont_adx_gen[N: static int]( let t = repackRegisters(v, u[N]) - if spareBits >= 2 and skipFinalSub: + if spareBits >= 2 and lazyReduce: for i in 0 ..< N: ctx.mov r[i], t[i] elif spareBits >= 1: @@ -144,12 +144,12 @@ func redcMont_asm_adx*[N: static int]( M: array[N, SecretWord], m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool = false) = + lazyReduce: static bool = false) = ## Constant-time Montgomery reduction # Inlining redcMont_asm_adx twice in mul_fp2_complex_asm_adx # causes GCC to miscompile with -Os (--opt:size) # see https://github.com/mratsim/constantine/issues/229 - redc2xMont_adx_gen(r, a, M, m0ninv, spareBits, skipFinalSub) + redc2xMont_adx_gen(r, a, M, m0ninv, spareBits, lazyReduce) # Montgomery conversion # ---------------------------------------------------------- diff --git a/constantine/math/arithmetic/bigints_crandall.nim b/constantine/math/arithmetic/bigints_crandall.nim new file mode 100644 index 00000000..7ad51cf6 --- /dev/null +++ b/constantine/math/arithmetic/bigints_crandall.nim @@ -0,0 +1,128 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + constantine/platforms/abstractions, + ./limbs, + ./limbs_crandall, + ./bigints + +# No exceptions allowed +{.push raises: [].} +{.push inline.} + +# ############################################################ +# +# Crandall Arithmetic +# +# ############################################################ + +func mulCran*[m: static int]( + r: var BigInt[m], a, b: BigInt[m], + p: BigInt[m], + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute r <- a*b (2ᵐ-c) + mulCran(r.limbs, a.limbs, b.limbs, p.limbs, m, c, lazyReduce) + +func squareCran*[m: static int]( + r: var BigInt[m], a: BigInt[m], + p: BigInt[m], + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute r <- a² (2ᵐ-c), m = bits + squareCran(r.limbs, a.limbs, p.limbs, m, c, lazyReduce) + +func powCran*[m: static int]( + a: var BigInt[m], exponent: openarray[byte], + p: BigInt[m], + windowSize: static int, + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute a <- a^exponent (mod M) + ## ``exponent`` is a BigInt in canonical big-endian representation + ## + ## This uses fixed window optimization + ## A window size in the range [1, 5] must be chosen + ## + ## This is constant-time: the window optimization does + ## not reveal the exponent bits or hamming weight + const scratchLen = if windowSize == 1: 2 + else: (1 shl windowSize) + 1 + var scratchSpace {.noInit.}: array[scratchLen, Limbs[m.wordsRequired()]] + powCran(a.limbs, exponent, p.limbs, scratchSpace, m, c, lazyReduce) + +func powCran_vartime*[m: static int]( + a: var BigInt[m], exponent: openarray[byte], + p: BigInt[m], + windowSize: static int, + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute a <- a^exponent (mod M) + ## ``exponent`` is a BigInt in canonical big-endian representation + ## + ## Warning ⚠️ : + ## This is an optimization for public exponent + ## Otherwise bits of the exponent can be retrieved with: + ## - memory access analysis 
+ ## - power analysis + ## - timing analysis + ## + ## This uses fixed window optimization + ## A window size in the range [1, 5] must be chosen + + const scratchLen = if windowSize == 1: 2 + else: (1 shl windowSize) + 1 + var scratchSpace {.noInit.}: array[scratchLen, Limbs[m.wordsRequired()]] + powCran_vartime(a.limbs, exponent, p.limbs, scratchSpace, m, c, lazyReduce) + +func powCran*[m, eBits: static int]( + a: var BigInt[m], exponent: BigInt[eBits], + p: BigInt[m], + windowSize: static int, + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute a <- a^exponent (mod M) + ## ``exponent`` is any BigInt, in the canonical domain + ## + ## This uses fixed window optimization + ## A window size in the range [1, 5] must be chosen + ## + ## This is constant-time: the window optimization does + ## not reveal the exponent bits or hamming weight + var expBE {.noInit.}: array[ebits.ceilDiv_vartime(8), byte] + expBE.marshal(exponent, bigEndian) + + powCran(a, expBE, p, windowSize, c, lazyReduce) + +func powCran_vartime*[m, eBits: static int]( + a: var BigInt[m], exponent: BigInt[eBits], + p: BigInt[m], + windowSize: static int, + c: static SecretWord, + lazyReduce: static bool = false) = + ## Compute a <- a^exponent (mod M) + ## ``a`` in the Montgomery domain + ## ``exponent`` is any BigInt, in the canonical domain + ## + ## Warning ⚠️ : + ## This is an optimization for public exponent + ## Otherwise bits of the exponent can be retrieved with: + ## - memory access analysis + ## - power analysis + ## - timing analysis + ## + ## This uses fixed window optimization + ## A window size in the range [1, 5] must be chosen + var expBE {.noInit.}: array[ebits.ceilDiv_vartime(8), byte] + expBE.marshal(exponent, bigEndian) + + powCran_vartime(a, expBE, p, windowSize, c, lazyReduce) + +{.pop.} # inline +{.pop.} # raises no exceptions diff --git a/constantine/math/arithmetic/bigints_montgomery.nim b/constantine/math/arithmetic/bigints_montgomery.nim index 4836ed96..27dd4bca 100644 --- a/constantine/math/arithmetic/bigints_montgomery.nim +++ b/constantine/math/arithmetic/bigints_montgomery.nim @@ -51,64 +51,58 @@ func fromMont*(r: var BigInt, a, M: BigInt, m0ninv: BaseType, spareBits: static fromMont(r.limbs, a.limbs, M.limbs, m0ninv, spareBits) func mulMont*(r: var BigInt, a, b, M: BigInt, negInvModWord: BaseType, - spareBits: static int, skipFinalSub: static bool = false) = + spareBits: static int, lazyReduce: static bool = false) = ## Compute r <- a*b (mod M) in the Montgomery domain ## ## This resets r to zero before processing. Use {.noInit.} ## to avoid duplicating with Nim zero-init policy - mulMont(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits, skipFinalSub) + mulMont(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits, lazyReduce) func squareMont*(r: var BigInt, a, M: BigInt, negInvModWord: BaseType, - spareBits: static int, skipFinalSub: static bool = false) = + spareBits: static int, lazyReduce: static bool = false) = ## Compute r <- a^2 (mod M) in the Montgomery domain ## ## This resets r to zero before processing. 
Use {.noInit.} ## to avoid duplicating with Nim zero-init policy - squareMont(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits, skipFinalSub) + squareMont(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits, lazyReduce) func sumprodMont*[N: static int]( r: var BigInt, a, b: array[N, BigInt], M: BigInt, negInvModWord: BaseType, - spareBits: static int, skipFinalSub: static bool = false) = + spareBits: static int, lazyReduce: static bool = false) = ## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products) in the Montgomery domain # We rely on BigInt and Limbs having the same repr to avoid array copies sumprodMont( r.limbs, cast[ptr array[N, typeof(a[0].limbs)]](a.unsafeAddr)[], cast[ptr array[N, typeof(b[0].limbs)]](b.unsafeAddr)[], - M.limbs, negInvModWord, spareBits, skipFinalSub + M.limbs, negInvModWord, spareBits, lazyReduce ) func powMont*[mBits: static int]( a: var BigInt[mBits], exponent: openarray[byte], M, one: BigInt[mBits], negInvModWord: BaseType, windowSize: static int, - spareBits: static int - ) = + spareBits: static int) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is a BigInt in canonical big-endian representation ## - ## Warning ⚠️ : - ## This is an optimization for public exponent - ## Otherwise bits of the exponent can be retrieved with: - ## - memory access analysis - ## - power analysis - ## - timing analysis - ## ## This uses fixed window optimization ## A window size in the range [1, 5] must be chosen + ## + ## This is constant-time: the window optimization does + ## not reveal the exponent bits or hamming weight const scratchLen = if windowSize == 1: 2 else: (1 shl windowSize) + 1 - var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]] + var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired()]] powMont(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits) func powMont_vartime*[mBits: static int]( a: var BigInt[mBits], exponent: openarray[byte], M, one: BigInt[mBits], negInvModWord: BaseType, windowSize: static int, - spareBits: static int - ) = + spareBits: static int) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is a BigInt in canonical big-endian representation @@ -125,14 +119,13 @@ func powMont_vartime*[mBits: static int]( const scratchLen = if windowSize == 1: 2 else: (1 shl windowSize) + 1 - var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]] + var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired()]] powMont_vartime(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits) func powMont*[mBits, eBits: static int]( a: var BigInt[mBits], exponent: BigInt[eBits], M, one: BigInt[mBits], negInvModWord: BaseType, windowSize: static int, - spareBits: static int - ) = + spareBits: static int) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is any BigInt, in the canonical domain @@ -150,8 +143,7 @@ func powMont*[mBits, eBits: static int]( func powMont_vartime*[mBits, eBits: static int]( a: var BigInt[mBits], exponent: BigInt[eBits], M, one: BigInt[mBits], negInvModWord: BaseType, windowSize: static int, - spareBits: static int - ) = + spareBits: static int) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is any BigInt, in the canonical domain diff --git a/constantine/math/arithmetic/finite_fields.nim b/constantine/math/arithmetic/finite_fields.nim index b9a6542c..dd1b44ea 100644 --- 
a/constantine/math/arithmetic/finite_fields.nim +++ b/constantine/math/arithmetic/finite_fields.nim @@ -30,7 +30,9 @@ import constantine/platforms/abstractions, constantine/serialization/endians, constantine/named/properties_fields, - ./bigints, ./bigints_montgomery + ./bigints, + ./bigints_montgomery, + ./bigints_crandall when UseASM_X86_64: import ./assembly/limbs_asm_modular_x86 @@ -54,10 +56,13 @@ export Fp, Fr, FF func fromBig*(dst: var FF, src: BigInt) = ## Convert a BigInt to its Montgomery form - when nimvm: - dst.mres.montyResidue_precompute(src, FF.getModulus(), FF.getR2modP(), FF.getNegInvModWord()) + when FF.isCrandallPrimeField(): + dst.mres = src else: - dst.mres.getMont(src, FF.getModulus(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits()) + when nimvm: + dst.mres.montyResidue_precompute(src, FF.getModulus(), FF.getR2modP(), FF.getNegInvModWord()) + else: + dst.mres.getMont(src, FF.getModulus(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits()) func fromBig*[Name: static Algebra](T: type FF[Name], src: BigInt): FF[Name] {.noInit.} = ## Convert a BigInt to its Montgomery form @@ -65,7 +70,10 @@ func fromBig*[Name: static Algebra](T: type FF[Name], src: BigInt): FF[Name] {.n func fromField*(dst: var BigInt, src: FF) {.inline.} = ## Convert a finite-field element to a BigInt in natural representation - dst.fromMont(src.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits()) + when FF.isCrandallPrimeField(): + dst = src.mres + else: + dst.fromMont(src.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits()) func toBig*(src: FF): auto {.noInit, inline.} = ## Convert a finite-field element to a BigInt in natural representation @@ -121,11 +129,17 @@ func isZero*(a: FF): SecretBool = func isOne*(a: FF): SecretBool = ## Constant-time check if one - a.mres == FF.getMontyOne() + when FF.isCrandallPrimeField(): + a.mres.isOne() + else: + a.mres == FF.getMontyOne() func isMinusOne*(a: FF): SecretBool = ## Constant-time check if -1 (mod p) - a.mres == FF.getMontyPrimeMinus1() + when FF.isCrandallPrimeField(): + a.mres == FF.getPrimeMinus1() + else: + a.mres == FF.getMontyPrimeMinus1() func isOdd*(a: FF): SecretBool {. 
error: "Do you need the actual value to be odd\n" & @@ -141,14 +155,20 @@ func setOne*(a: var FF) = # Note: we need 1 in Montgomery residue form # TODO: Nim codegen is not optimal it uses a temporary # Check if the compiler optimizes it away - a.mres = FF.getMontyOne() + when FF.isCrandallPrimeField(): + a.mres.setOne() + else: + a.mres = FF.getMontyOne() func setMinusOne*(a: var FF) = ## Set ``a`` to -1 (mod p) # Note: we need -1 in Montgomery residue form # TODO: Nim codegen is not optimal it uses a temporary # Check if the compiler optimizes it away - a.mres = FF.getMontyPrimeMinus1() + when FF.isCrandallPrimeField(): + a.mres = FF.getPrimeMinus1() + else: + a.mres = FF.getMontyPrimeMinus1() func neg*(r: var FF, a: FF) {.meter.} = ## Negate modulo p @@ -234,22 +254,31 @@ func double*(r: var FF, a: FF) {.meter.} = overflowed = overflowed or not(r.mres < FF.getModulus()) discard csub(r.mres, FF.getModulus(), overflowed) -func prod*(r: var FF, a, b: FF, skipFinalSub: static bool = false) {.meter.} = +func prod*(r: var FF, a, b: FF, lazyReduce: static bool = false) {.meter.} = ## Store the product of ``a`` by ``b`` modulo p into ``r`` ## ``r`` is initialized / overwritten - r.mres.mulMont(a.mres, b.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub) + when FF.isCrandallPrimeField(): + r.mres.mulCran(a.mres, b.mres, FF.getModulus(), FF.getCrandallPrimeSubterm(), lazyReduce) + else: + r.mres.mulMont(a.mres, b.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), lazyReduce) -func square*(r: var FF, a: FF, skipFinalSub: static bool = false) {.meter.} = +func square*(r: var FF, a: FF, lazyReduce: static bool = false) {.meter.} = ## Squaring modulo p - r.mres.squareMont(a.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub) + when FF.isCrandallPrimeField(): + r.mres.squareCran(a.mres, FF.getModulus(), FF.getCrandallPrimeSubterm(), lazyReduce) + else: + r.mres.squareMont(a.mres, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), lazyReduce) -func sumprod*[N: static int](r: var FF, a, b: array[N, FF], skipFinalSub: static bool = false) {.meter.} = +func sumprod*[N: static int](r: var FF, a, b: array[N, FF], lazyReduce: static bool = false) {.meter.} = ## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products) # We rely on FF and Bigints having the same repr to avoid array copies - r.mres.sumprodMont( - cast[ptr array[N, typeof(a[0].mres)]](a.unsafeAddr)[], - cast[ptr array[N, typeof(b[0].mres)]](b.unsafeAddr)[], - FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub) + when FF.isCrandallPrimeField(): + {.error: "Not implemented".} + else: + r.mres.sumprodMont( + cast[ptr array[N, typeof(a[0].mres)]](a.unsafeAddr)[], + cast[ptr array[N, typeof(b[0].mres)]](b.unsafeAddr)[], + FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits(), lazyReduce) # ############################################################ # @@ -329,7 +358,10 @@ func inv*(r: var FF, a: FF) = ## Incidentally this avoids extra check ## to convert Jacobian and Projective coordinates ## to affine for elliptic curve - r.mres.invmod(a.mres, FF.getR2modP(), FF.getModulus()) + when FF.isCrandallPrimeField(): + r.mres.invmod(a.mres, FF.getModulus()) + else: + r.mres.invmod(a.mres, FF.getR2modP(), FF.getModulus()) func inv*(a: var FF) = ## Inversion modulo p @@ -347,7 +379,10 @@ func inv_vartime*(r: var FF, a: FF) {.tags: [VarTime].} = ## Incidentally this avoids extra check ## to convert Jacobian and Projective coordinates ## to affine for elliptic curve - 
r.mres.invmod_vartime(a.mres, FF.getR2modP(), FF.getModulus()) + when FF.isCrandallPrimeField(): + r.mres.invmod_vartime(a.mres, FF.getModulus()) + else: + r.mres.invmod_vartime(a.mres, FF.getR2modP(), FF.getModulus()) func inv_vartime*(a: var FF) {.tags: [VarTime].} = ## Variable-time Inversion modulo p @@ -370,23 +405,23 @@ func `*=`*(a: var FF, b: FF) {.meter.} = ## Multiplication modulo p a.prod(a, b) -func square*(a: var FF, skipFinalSub: static bool = false) {.meter.} = +func square*(a: var FF, lazyReduce: static bool = false) {.meter.} = ## Squaring modulo p - a.square(a, skipFinalSub) + a.square(a, lazyReduce) -func square_repeated*(a: var FF, num: int, skipFinalSub: static bool = false) {.meter.} = +func square_repeated*(a: var FF, num: int, lazyReduce: static bool = false) {.meter.} = ## Repeated squarings ## Assumes at least 1 squaring for _ in 0 ..< num-1: - a.square(skipFinalSub = true) - a.square(skipFinalSub) + a.square(lazyReduce = true) + a.square(lazyReduce) -func square_repeated*(r: var FF, a: FF, num: int, skipFinalSub: static bool = false) {.meter.} = +func square_repeated*(r: var FF, a: FF, num: int, lazyReduce: static bool = false) {.meter.} = ## Repeated squarings - r.square(a, skipFinalSub = true) + r.square(a, lazyReduce = true) for _ in 1 ..< num-1: - r.square(skipFinalSub = true) - r.square(skipFinalSub) + r.square(lazyReduce = true) + r.square(lazyReduce) func `*=`*(a: var FF, b: static int) = ## Multiplication by a small integer known at compile-time @@ -518,31 +553,48 @@ func pow*(a: var FF, exponent: BigInt) = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a big integer - const windowSize = 5 # TODO: find best window size for each curves - a.mres.powMont( - exponent, - FF.getModulus(), FF.getMontyOne(), - FF.getNegInvModWord(), windowSize, - FF.getSpareBits() - ) + when FF.isCrandallPrimeField(): + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powCran( + exponent, + FF.getModulus(), + windowSize, + FF.getCrandallPrimeSubterm() + ) + else: + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powMont( + exponent, + FF.getModulus(), FF.getMontyOne(), + FF.getNegInvModWord(), windowSize, + FF.getSpareBits() + ) func pow*(a: var FF, exponent: openarray[byte]) = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a big integer in canonical big endian representation - const windowSize = 5 # TODO: find best window size for each curves - a.mres.powMont( - exponent, - FF.getModulus(), FF.getMontyOne(), - FF.getNegInvModWord(), windowSize, - FF.getSpareBits() - ) + when FF.isCrandallPrimeField(): + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powCran( + exponent, + FF.getModulus(), + windowSize, + FF.getCrandallPrimeSubterm() + ) + else: + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powMont( + exponent, + FF.getModulus(), FF.getMontyOne(), + FF.getNegInvModWord(), windowSize, + FF.getSpareBits() + ) func pow*(a: var FF, exponent: FF) = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a finite field element - const windowSize = 5 # TODO: find best window size for each curves a.pow(exponent.toBig()) func pow*(r: var FF, a: FF, exponent: BigInt or openArray[byte] or FF) = @@ -566,13 +618,23 @@ func pow_vartime*(a: var FF, exponent: BigInt) = ## - memory access analysis ## - power analysis ## - timing analysis - const windowSize = 5 # 
TODO: find best window size for each curves - a.mres.powMont_vartime( - exponent, - FF.getModulus(), FF.getMontyOne(), - FF.getNegInvModWord(), windowSize, - FF.getSpareBits() - ) + + when FF.isCrandallPrimeField(): + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powCran_vartime( + exponent, + FF.getModulus(), + windowSize, + FF.getCrandallPrimeSubterm() + ) + else: + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powMont_vartime( + exponent, + FF.getModulus(), FF.getMontyOne(), + FF.getNegInvModWord(), windowSize, + FF.getSpareBits() + ) func pow_vartime*(a: var FF, exponent: openarray[byte]) = ## Exponentiation modulo p @@ -585,19 +647,28 @@ func pow_vartime*(a: var FF, exponent: openarray[byte]) = ## - memory access analysis ## - power analysis ## - timing analysis - const windowSize = 5 # TODO: find best window size for each curves - a.mres.powMont_vartime( - exponent, - FF.getModulus(), FF.getMontyOne(), - FF.getNegInvModWord(), windowSize, - FF.getSpareBits() - ) + + when FF.isCrandallPrimeField(): + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powCran_vartime( + exponent, + FF.getModulus(), + windowSize, + FF.getCrandallPrimeSubterm() + ) + else: + const windowSize = 5 # TODO: find best window size for each curves + a.mres.powMont_vartime( + exponent, + FF.getModulus(), FF.getMontyOne(), + FF.getNegInvModWord(), windowSize, + FF.getSpareBits() + ) func pow_vartime*(a: var FF, exponent: FF) = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a finite field element - const windowSize = 5 # TODO: find best window size for each curves a.pow_vartime(exponent.toBig()) func pow_vartime*(r: var FF, a: FF, exponent: BigInt or openArray[byte] or FF) = @@ -636,27 +707,27 @@ func pow_squareMultiply_vartime(a: var FF, exponent: SomeUnsignedInt) {.tags:[Va for e in 0 ..< eBytes.len-1: let e = eBytes[e] for i in countdown(7, 0): - a.square(skipFinalSub = true) + a.square(lazyReduce = true) let bit = bool((e shr i) and 1) if bit: - a.prod(a, aa, skipFinalSub = true) + a.prod(a, aa, lazyReduce = true) let e = eBytes[eBytes.len-1] block: # Epilogue, byte-level for i in countdown(7, 1): - a.square(skipFinalSub = true) + a.square(lazyReduce = true) let bit = bool((e shr i) and 1) if bit: - a.prod(a, aa, skipFinalSub = true) + a.prod(a, aa, lazyReduce = true) block: # Epilogue, bit-level # for the very last bit we can't skip final substraction let bit = bool(e and 1) if bit: - a.square(skipFinalSub = true) - a.prod(a, aa, skipFinalSub = false) + a.square(lazyReduce = true) + a.prod(a, aa, lazyReduce = false) else: - a.square(skipFinalSub = false) + a.square(lazyReduce = false) func pow_addchain_4bit_vartime(a: var FF, exponent: SomeUnsignedInt) {.tags:[VarTime], meter.} = ## **Variable-time** Exponentiation @@ -670,69 +741,69 @@ func pow_addchain_4bit_vartime(a: var FF, exponent: SomeUnsignedInt) {.tags:[Var a.square() of 3: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) + t.square(a, lazyReduce = true) a *= t of 4: a.square_repeated(2) of 5: var t {.noInit.}: typeof(a) - t.square_repeated(a, 2, skipFinalSub = true) + t.square_repeated(a, 2, lazyReduce = true) a *= t of 6: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) - t.prod(t, a, skipFinalSub = true) # 3 + t.square(a, lazyReduce = true) + t.prod(t, a, lazyReduce = true) # 3 a.square(t) of 7: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) - a.prod(a, t, skipFinalSub = true) # 
3 - t.square(skipFinalSub = true) # 4 + t.square(a, lazyReduce = true) + a.prod(a, t, lazyReduce = true) # 3 + t.square(lazyReduce = true) # 4 a *= t of 8: a.square_repeated(3) of 9: var t {.noInit.}: typeof(a) - t.square_repeated(a, 3, skipFinalSub = true) + t.square_repeated(a, 3, lazyReduce = true) a *= t of 10: var t {.noInit.}: typeof(a) - t.square_repeated(a, 2, skipFinalSub = true) # 4 - a.prod(a, t, skipFinalSub = true) # 5 + t.square_repeated(a, 2, lazyReduce = true) # 4 + a.prod(a, t, lazyReduce = true) # 5 a.square() of 11: var t {.noInit.}: typeof(a) - t.square_repeated(a, 2, skipFinalSub = true) # 4 - t.prod(t, a, skipFinalSub = true) # 5 - t.square(skipFinalSub = true) # 10 + t.square_repeated(a, 2, lazyReduce = true) # 4 + t.prod(t, a, lazyReduce = true) # 5 + t.square(lazyReduce = true) # 10 a *= t of 12: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) - t.prod(t, a, skipFinalSub = true) # 3 - t.square(skipFinalSub = true) # 6 + t.square(a, lazyReduce = true) + t.prod(t, a, lazyReduce = true) # 3 + t.square(lazyReduce = true) # 6 a.square(t) # 12 of 13: var t1 {.noInit.}, t2 {.noInit.}: typeof(a) - t1.square_repeated(a, 2, skipFinalSub = true) # 4 - t2.square(t1, skipFinalSub = true) # 8 - t1.prod(t1, t2, skipFinalSub = true) # 12 + t1.square_repeated(a, 2, lazyReduce = true) # 4 + t2.square(t1, lazyReduce = true) # 8 + t1.prod(t1, t2, lazyReduce = true) # 12 a *= t1 # 13 of 14: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) # 2 + t.square(a, lazyReduce = true) # 2 a *= t # 3 - t.square_repeated(2, skipFinalSub = true) # 8 - a.square(skipFinalSub = true) # 6 + t.square_repeated(2, lazyReduce = true) # 8 + a.square(lazyReduce = true) # 6 a *= t # 14 of 15: var t {.noInit.}: typeof(a) - t.square(a, skipFinalSub = true) - t.prod(t, a, skipFinalSub = true) # 3 - a.square_repeated(t, 2, skipFinalSub = true) # 12 + t.square(a, lazyReduce = true) + t.prod(t, a, lazyReduce = true) # 3 + a.square_repeated(t, 2, lazyReduce = true) # 12 a *= t # 15 of 16: - a.square_repeated(4, skipFinalSub = true) + a.square_repeated(4, lazyReduce = true) else: doAssert false, "exponentiation by this small int '" & $exponent & "' is not implemented" @@ -781,7 +852,7 @@ import std/macros macro addchain*(fn: untyped): untyped = ## Modify all prod, `*=`, square, square_repeated calls - ## to skipFinalSub except the very last call. + ## to lazyReduce except the very last call. ## This assumes straight-line code. 
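##
## Illustrative example (hypothetical helper, assuming some base field Fp):
##
##   func pow3(r: var Fp, a: Fp) {.addchain.} =
##     var t {.noInit.}: Fp
##     t.square(a)
##     r.prod(t, a)
##
## Here `t.square(a)` is rewritten to `t.square(a, lazyReduce = true)` while the
## last call `r.prod(t, a)` keeps the full reduction, so `r` ends up in [0, p).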
fn.expectKind(nnkFuncDef) @@ -846,9 +917,9 @@ func batchInv*[F]( dst[i] = acc if i != N-1: - acc.prod(acc, z, skipFinalSub = true) + acc.prod(acc, z, lazyReduce = true) else: - acc.prod(acc, z, skipFinalSub = false) + acc.prod(acc, z, lazyReduce = false) acc.inv() @@ -860,7 +931,7 @@ func batchInv*[F]( # next iteration var eli = elements[i] eli.csetOne(zeros[i]) - acc.prod(acc, eli, skipFinalSub = true) + acc.prod(acc, eli, lazyReduce = true) func batchInv_vartime*[F]( dst: ptr UncheckedArray[F], @@ -889,9 +960,9 @@ func batchInv_vartime*[F]( dst[i] = acc if i != N-1: - acc.prod(acc, elements[i], skipFinalSub = true) + acc.prod(acc, elements[i], lazyReduce = true) else: - acc.prod(acc, elements[i], skipFinalSub = false) + acc.prod(acc, elements[i], lazyReduce = false) acc.inv_vartime() @@ -899,7 +970,7 @@ func batchInv_vartime*[F]( if zeros[i] == true: continue dst[i] *= acc - acc.prod(acc, elements[i], skipFinalSub = true) + acc.prod(acc, elements[i], lazyReduce = true) func batchInv*[F](dst: var openArray[F], source: openArray[F]) {.inline.} = debug: doAssert dst.len == source.len diff --git a/constantine/math/arithmetic/limbs_crandall.nim b/constantine/math/arithmetic/limbs_crandall.nim new file mode 100644 index 00000000..83915d6c --- /dev/null +++ b/constantine/math/arithmetic/limbs_crandall.nim @@ -0,0 +1,515 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + constantine/platforms/abstractions, + ./limbs, ./limbs_extmul + +when UseASM_X86_32: + import + ./assembly/limbs_asm_crandall_x86, + ./assembly/limbs_asm_crandall_x86_adx_bmi2 + +# No exceptions allowed +{.push raises: [], checks: off.} + +# ############################################################ +# +# Multiprecision Crandall prime / +# Pseudo-Mersenne Prime Arithmetic +# +# ############################################################ +# +# Crandall primes have the form p = 2ᵐ-c + +# Fast reduction +# ------------------------------------------------------------ + +func reduceCrandallPartial_impl[N: static int]( + r: var Limbs[N], + a: array[2*N, SecretWord], # using Limbs lead to type mismatch or ICE + m: static int, + c: static SecretWord) = + ## Partial Reduction modulo p + ## with p with special form 2ᵐ-c + ## called "Crandall prime" or Pseudo-Mersenne Prime in the litterature + ## + ## This is a partial reduction that reduces down to + ## 2ᵐ, i.e. 
it fits in the same number of words as p + ## but values may be up to p+c + ## + ## Crandall primes allow fast reduction from the fact that + ## 2ᵐ-c ≡ 0 (mod p) + ## <=> 2ᵐ ≡ c (mod p) + ## <=> a2ᵐ+b ≡ ac + b (mod p) + + # In our case we split at 2ʷⁿ with w the word size (32 or 64-bit) + # and N the number of words needed to represent the prime + # hence 2ʷⁿ ≡ 2ʷⁿ⁻ᵐc (mod p), we call this cs (c shifted) + # so a2ʷⁿ+b ≡ a2ʷⁿ⁻ᵐc + b (mod p) + # + # With concrete instantiations: + # for p = 2²⁵⁵-19 (Curve25519) + # 2²⁵⁵ ≡ 19 (mod p) + # 2²⁵⁶ ≡ 2*19 (mod p) + # We rewrite the 510-bit multiplication result as + # a2²⁵⁶+b ≡ a*2*19 + b (mod p) + # + # For Bitcoin/Ethereum, p = 2²⁵⁶-0x1000003D1 = + # p = 2²⁵⁶ - (2³²+2⁹+2⁸+2⁷+2⁶+2⁴+1) + # 2²⁵⁶ ≡ 0x1000003D1 (mod p) + # We rewrite the 512-bit multiplication result as + # a2²⁵⁶+b ≡ a*0x1000003D1 + b (mod p) + # + # Note: on a w-bit architecture, c MUST be less than w bits + # This is not the case for secp256k1 on 32-bit + # as it's c = 2³²+2⁹+2⁸+2⁷+2⁶+2⁴+1 + # Though as multiplying by 2³² is free + # we can special-case the problem, if a 32-bit + # platform with add-with-carry is still a valuable target. + # (otherwise unsaturated arithmetic is superior) + + const S = (N*WordBitWidth - m) + const cs = c shl S + static: doAssert 0 <= S and S < WordBitWidth + + var hi: SecretWord + + # First reduction pass + # multiply high-words by c shifted and accumulate in low-words + # assumes cs fits in a single word. + + # (hi, r₀) <- aₙ*cs + a₀ + muladd1(hi, r[0], a[N], cs, a[0]) + staticFor i, 1, N: + # (hi, rᵢ) <- aᵢ₊ₙ*cs + aᵢ + hi + muladd2(hi, r[i], a[i+N], cs, a[i], hi) + + # The first reduction pass may leave a carry in `hi` + # which would be hi*2ʷⁿ ≡ hi*2ʷⁿ⁻ᵐ*c (mod p) + # ≡ hi*cs (mod p) + + # Move all extra bits to hi, i.e. double-word shift + hi = (hi shl S) or (r[N-1] shr (WordBitWidth-S)) + + # High-bits have been "carried" to `hi`, cancel them in r[N-1]. + # Note: up to `c` may remain unreduced. + r[N-1] = r[N-1] and (MaxWord shr S) + + # Partially reduce to up to `m` bits + # We need to fold what's beyond `m` bits + # by repeatedly multiplying it by cs + # We distinguish 2 cases: + # 1. Folding (N..2N)*cs onto 0..N + # may overflow and a 3rd folding is necessary + # 2. Folding (N..2N)*cs onto 0..N + # may not overflow and 2 foldings are all we need.
+ # This is possible: + # - if we don't use the full 2ʷⁿ + # for example we use 255 bits out of 256 available + # - And (2ʷⁿ⁻ᵐ*c)² < 2ʷ + # + # There is a 3rd case that we don't handle + # c > 2ʷ, for example secp256k1 on 32-bit + + when N*WordBitWidth == m: # Secp256k1 only according to eprint/iacr 2018/985 + var t0, t1: SecretWord + var carry: Carry + + # Second pass + mul(t1, t0, hi, c) + addC(carry, r[0], r[0], t0, Carry(0)) + addC(carry, r[1], r[1], t1, carry) + staticFor i, 2, N: + addC(carry, r[i], r[i], Zero, carry) + + # Third pass - the high-word to fold can only be m-bits+1 + mul(t1, t0, SecretWord(carry), c) + addC(carry, r[0], r[0], t0, Carry(0)) + addC(carry, r[1], r[1], t1, carry) + + # We want to ensure that cs² < 2³² on 64-bit or 2¹⁶ on 32-bit + # But doing unsigned cs² may overflow, so we sqrt the rhs instead + elif uint64(cs) < (1'u64 shl (WordBitWidth shr 1)) - 1: + var carry: Carry + + # Second pass + + # hi < cs, and cs² < 2ʷ (2³² on 32-bit or 2⁶⁴ on 64-bit) + # hence hi *= c cannot overflow + hi *= c + addC(carry, r[0], r[0], hi, Carry(0)) + staticFor i, 1, N: + addC(carry, r[i], r[i], Zero, carry) + + else: + {.error: "Not implemented".} + +func reduceCrandallFinal_impl[N: static int]( + a: var Limbs[N], + p: Limbs[N]) = + ## Final Reduction modulo p + ## with p with special form 2ᵐ-c + ## called "Crandall prime" or Pseudo-Mersenne Prime in the litterature + ## + ## This reduces `a` from [0, 2ᵐ) to [0, 2ᵐ-c) + + # 1. Substract p = 2ᵐ-c + var t {.noInit.}: Limbs[N] + let underflow = t.diff(a, p) + + # 2. If underflow, a has the proper reduced result + # otherwise t has the proper reduced result + a.ccopy(t, not SecretBool(underflow)) + +func reduceCrandallFinal[N: static int]( + a: var Limbs[N], + p: Limbs[N]) {.inline.} = + ## Final Reduction modulo p + ## with p with special form 2ᵐ-c + ## called "Crandall prime" or Pseudo-Mersenne Prime in the litterature + ## + ## This reduces `a` from [0, 2ᵐ) to [0, 2ᵐ-c) + when UseASM_X86_32 and a.len in {3..6}: + a.reduceCrandallFinal_asm(p) + else: + a.reduceCrandallFinal_impl(p) + +# High-level API +# ------------------------------------------------------------ + +func mulCranPartialReduce[N: static int]( + r: var Limbs[N], + a, b: Limbs[N], + m: static int, c: static SecretWord) {.inline.} = + when UseASM_X86_64 and a.len in {3..6}: + # ADX implies BMI2 + if ({.noSideEffect.}: hasAdx()): + r.mulCranPartialReduce_asm_adx(a, b, m, c) + else: + r.mulCranPartialReduce_asm(a, b, m, c) + else: + var r2 {.noInit.}: Limbs[2*N] + r2.prod(a, b) + r.reduceCrandallPartial_impl(r2, m, c) + +func mulCran*[N: static int]( + r: var Limbs[N], + a, b: Limbs[N], + p: Limbs[N], + m: static int, c: static SecretWord, + lazyReduce: static bool = false) {.inline.} = + when lazyReduce: + r.mulCranPartialReduce(a, b, m, c) + elif UseASM_X86_64 and a.len in {3..6}: + # ADX implies BMI2 + if ({.noSideEffect.}: hasAdx()): + r.mulCran_asm_adx(a, b, p, m, c) + else: + r.mulCran_asm(a, b, p, m, c) + else: + var r2 {.noInit.}: Limbs[2*N] + r2.prod(a, b) + r.reduceCrandallPartial_impl(r2, m, c) + r.reduceCrandallFinal_impl(p) + +func squareCranPartialReduce[N: static int]( + r: var Limbs[N], + a: Limbs[N], + m: static int, c: static SecretWord) {.inline.} = + when UseASM_X86_64 and a.len in {3..6}: + # ADX implies BMI2 + if ({.noSideEffect.}: hasAdx()): + r.squareCranPartialReduce_asm_adx(a, m, c) + else: + r.squareCranPartialReduce_asm(a, m, c) + else: + var r2 {.noInit.}: Limbs[2*N] + r2.square(a) + r.reduceCrandallPartial_impl(r2, m, c) + +func 
squareCran*[N: static int]( + r: var Limbs[N], + a: Limbs[N], + p: Limbs[N], + m: static int, c: static SecretWord, + lazyReduce: static bool = false) {.inline.} = + when lazyReduce: + r.squareCranPartialReduce(a, m, c) + elif UseASM_X86_64 and a.len in {3..6}: + # ADX implies BMI2 + if ({.noSideEffect.}: hasAdx()): + r.squareCran_asm_adx(a, p, m, c) + else: + r.squareCran_asm(a, p, m, c) + else: + var r2 {.noInit.}: Limbs[2*N] + r2.square(a) + r.reduceCrandallPartial_impl(r2, m, c) + r.reduceCrandallFinal_impl(p) + +# Crandall Exponentiation +# ------------------------------------------------------------ +# We use fixed-window based exponentiation +# that is constant-time: i.e. the number of multiplications +# does not depend on the number of set bits in the exponents +# those are always done and conditionally copied. +# +# The exponent MUST NOT be private data (until audited otherwise) +# - Power attack on RSA, https://www.di.ens.fr/~fouque/pub/ches06.pdf +# - Flush-and-reload on Sliding window exponentiation: https://tutcris.tut.fi/portal/files/8966761/p1639_pereida_garcia.pdf +# - Sliding right into disaster, https://eprint.iacr.org/2017/627.pdf +# - Fixed window leak: https://www.scirp.org/pdf/JCC_2019102810331929.pdf +# - Constructing sliding-windows leak, https://easychair.org/publications/open/fBNC +# +# For pairing curves, this is the case since exponentiation is only +# used for inversion via the Little Fermat theorem. +# For RSA, some exponentiations uses private exponents. +# +# Note: +# - Implementation closely follows Thomas Pornin's BearSSL +# - Apache Milagro Crypto has an alternative implementation +# that is more straightforward however: +# - the exponent hamming weight is used as loop bounds +# - the baseᵏ is stored at each index of a temp table of size k +# - the baseᵏ to use is indexed by the hamming weight +# of the exponent, leaking this to cache attacks +# - in contrast BearSSL touches the whole table to +# hide the actual selection + +template checkPowScratchSpaceLen(len: int) = + ## Checks that there is a minimum of scratchspace to hold the temporaries + debug: + assert len >= 2, "Internal Error: the scratchspace for powmod should be equal or greater than 2" + +func getWindowLen(bufLen: int): uint = + ## Compute the maximum window size that fits in the scratchspace buffer + checkPowScratchSpaceLen(bufLen) + result = 5 + while (1 shl result) + 1 > bufLen: + dec result + +func powCranPrologue( + a: var Limbs, + scratchspace: var openarray[Limbs], + m: static int, c: static SecretWord): uint = + ## Setup the scratchspace + ## Returns the fixed-window size for exponentiation with window optimization. 
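+ ##
+ ## For example, per `getWindowLen` above, the returned window w is the largest
+ ## value (at most 5) such that 2ʷ + 1 ≤ scratchspace.len: a scratchspace of
+ ## 33 entries yields the full 5-bit window, one of 16 entries yields a 3-bit window.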
+ result = scratchspace.len.getWindowLen() + # Precompute window content, special case for window = 1 + # (i.e. scratchspace only has space for 2 temporaries) + # The content of scratchspace[1+k] is set to aᵏ + # with scratchspace[0] untouched + if result == 1: + scratchspace[1] = a + else: + scratchspace[2] = a + for k in 2 ..< 1 shl result: + scratchspace[k+1].mulCranPartialReduce(scratchspace[k], a, m, c) + + # Set a to one + a.setOne() + +func powCranSquarings( + a: var Limbs, + exponent: openarray[byte], + tmp: var Limbs, + window: uint, + acc, acc_len: var uint, + e: var int, + m: static int, c: static SecretWord + ): tuple[k, bits: uint] {.inline.}= + ## Squaring step of exponentiation by squaring + ## Get the next k bits in range [1, window) + ## Square k times + ## Returns the number of squarings done and the corresponding bits + ## + ## Updates iteration variables and accumulators + # Due to the high number of parameters, + # forcing this inline actually reduces the code size + # + # ⚠️: Extreme care should be used to not leak + # the exponent bits nor its real bitlength + # i.e. if the exponent is zero but encoded in a + # 256-bit integer, only "256" should leak + # as for some applications like RSA + # the exponent might be the user secret key. + + # Get the next bits + # acc/acc_len must be uint to avoid Nim runtime checks leaking bits + # e is public + var k = window + if acc_len < window: + if e < exponent.len: + acc = (acc shl 8) or exponent[e].uint + inc e + acc_len += 8 + else: # Drained all exponent bits + k = acc_len + + let bits = (acc shr (acc_len - k)) and ((1'u shl k) - 1) + acc_len -= k + + # We have k bits and can do k squarings, all with lazy (partial) reduction. + for i in 0 ..< k: + a.squareCranPartialReduce(a, m, c) + + return (k, bits) + +func powCran*( + a: var Limbs, + exponent: openarray[byte], + p: Limbs, + scratchspace: var openarray[Limbs], + m: static int, c: static SecretWord, + lazyReduce: static bool = false) = + ## Modular exponentiation a <- a^exponent (mod M) + ## + ## This uses fixed-window optimization if possible + ## + ## - On input ``a`` is the base, on output a = a^exponent (mod M) + ## - ``exponent`` is the exponent in big-endian canonical format (octet-string) + ## Use ``marshal`` for conversion + ## - ``scratchspace`` with k the window bitsize, k up to 5. + ## This is a buffer that can hold 2ᵏ + 1 big-ints. + ## A window of 1 bit (no window optimization) requires only 2 big-ints + ## + ## Note that the best window size requires benchmarking and is a tradeoff between + ## - performance + ## - stack usage + ## - precomputation + let window = powCranPrologue(a, scratchspace, m, c) + + # We process bits from most to least significant. + # At each loop iteration we have acc_len bits in acc. + # To maintain constant-time execution, the number of iterations, + # the number of operations and the memory accesses should be the same + # regardless of acc & acc_len + var + acc, acc_len: uint + e = 0 + while acc_len > 0 or e < exponent.len: + let (k, bits) = powCranSquarings( + a, exponent, + scratchspace[0], window, + acc, acc_len, e, + m, c) + + # Window lookup: we set scratchspace[1] to the lookup value. + # If the window length is 1, then it's already set. + if window > 1: + # otherwise we need a constant-time lookup + # in particular we need the same memory accesses, we can't + # just index the openarray with the bits to avoid cache attacks.
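+ # Illustration: with a 3-bit window and bits = 5, the loop below reads
+ # scratchspace[2] .. scratchspace[8] unconditionally; only the iteration
+ # with i == 5 actually copies (scratchspace[6] holds a⁵), so the memory
+ # access pattern does not depend on the secret value of `bits`.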
+ for i in 1 ..< 1 shl k: + let ctl = SecretWord(i) == SecretWord(bits) + scratchspace[1].ccopy(scratchspace[1+i], ctl) + + # Multiply with the looked-up value + # we keep the product only if the exponent bits are not all zeroes + scratchspace[0].mulCranPartialReduce(a, scratchspace[1], m, c) + a.ccopy(scratchspace[0], SecretWord(bits).isNonZero()) + + when not lazyReduce: + a.reduceCrandallFinal_impl(p) + +func powCran_vartime*( + a: var Limbs, + exponent: openarray[byte], + p: Limbs, + scratchspace: var openarray[Limbs], + m: static int, c: static SecretWord, + lazyReduce: static bool = false) = + ## Modular exponentiation a <- a^exponent (mod M) + ## + ## Warning ⚠️ : + ## This is an optimization for public exponent + ## Otherwise bits of the exponent can be retrieved with: + ## - memory access analysis + ## - power analysis + ## - timing analysis + + # TODO: scratchspace[1] is unused when window > 1 + + let window = powCranPrologue(a, scratchspace, m, c) + + var + acc, acc_len: uint + e = 0 + while acc_len > 0 or e < exponent.len: + let (_, bits) = powCranSquarings( + a, exponent, + scratchspace[0], window, + acc, acc_len, e, + m, c) + + ## Warning ⚠️: Exposes the exponent bits + if bits != 0: + if window > 1: + scratchspace[0].mulCranPartialReduce(a, scratchspace[1+bits], m, c) + else: + # scratchspace[1] holds the original `a` + scratchspace[0].mulCranPartialReduce(a, scratchspace[1], m, c) + a = scratchspace[0] + + when not lazyReduce: + a.reduceCrandallFinal_impl(p) + +# Lazily reduced arithmetic +# ------------------------------------------------------------ + +# We can use special lazily reduced arithmetic +# where reduction is only done when we overflow 2ʷⁿ +# with w the word bitwidth and n the number of words +# to represent p. +# For example for Curve25519, p = 2²⁵⁵-19 and 2ʷⁿ=2²⁵⁶ +# Hence reduction will only happen when overflowing 2²⁵⁶ bits +# +# However: +# - Restricting it to mul/squaring in addchain +# makes it consistent with generic Montgomery representation +# - We don't really gain something for addition and substraction +# as modular addition needs: +# 1. One pass of add-with-carry +# 2. One pass of sub-with-borrow +# 3. One pass of conditional mov +# And lazily reduced needs +# the same first 2 pass and replace the third with +# masking + single addition +# Total number of instruction doesn't seem to change +# and conditional moves can be issued 2 per cycle +# so we save ~1 clock cycle + +func sum_crandall_impl[N: static int]( + r: var Limbs[N], a, b: Limbs[N], + m: int, + c: SecretWord) {.used.} = + ## Lazily reduced addition + ## Proof-of-concept. Currently unused. + let S = (N*WordBitWidth - m) + let cs = c shl S + debug: doAssert 0 <= S and S < WordBitWidth + + let overflow1 = r.sum(a, b) + # If there is an overflow, substract 2ˢp = 2ʷⁿ - 2ˢc + # with w the word bitwidth and n the number of words + # to represent p. 
+ # For example for Curve25519, p = 2²⁵⁵-19 and 2ʷⁿ=2²⁵⁶ + + # 0x0000 if no overflow or 0xFFFF if overflow + let mask1 = -SecretWord(overflow1) + + # 2ˢp = 2ʷⁿ - 2ˢc ≡ 2ˢc (mod p) + let overflow2 = r.add(mask1 and cs) + let mask2 = -SecretWord(overflow2) + + # We may carry again, but we just did -2ˢc + # so adding back 2ˢc for the extra 2ʷⁿ bit cannot carry + # to higher limbs + r[0] += mask2 and cs diff --git a/constantine/math/arithmetic/limbs_montgomery.nim b/constantine/math/arithmetic/limbs_montgomery.nim index 90d10edc..9438f031 100644 --- a/constantine/math/arithmetic/limbs_montgomery.nim +++ b/constantine/math/arithmetic/limbs_montgomery.nim @@ -54,14 +54,14 @@ func redc2xMont_CIOS[N: static int]( r: var array[N, SecretWord], a: array[N*2, SecretWord], M: array[N, SecretWord], - m0ninv: BaseType, skipFinalSub: static bool = false) = + m0ninv: BaseType, lazyReduce: static bool = false) = ## Montgomery reduce a double-precision bigint modulo M ## ## This maps - ## - [0, 4p²) -> [0, 2p) with skipFinalSub + ## - [0, 4p²) -> [0, 2p) with lazyReduce ## - [0, 4p²) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # - Analyzing and Comparing Montgomery Multiplication Algorithms # Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. # http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf @@ -115,7 +115,7 @@ func redc2xMont_CIOS[N: static int]( addC(carry, res[i], a[i+N], res[i], carry) # Final substraction - when not skipFinalSub: + when not lazyReduce: discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M)) r = res @@ -123,14 +123,14 @@ func redc2xMont_Comba[N: static int]( r: var array[N, SecretWord], a: array[N*2, SecretWord], M: array[N, SecretWord], - m0ninv: BaseType, skipFinalSub: static bool = false) {.used.} = + m0ninv: BaseType, lazyReduce: static bool = false) {.used.} = ## Montgomery reduce a double-precision bigint modulo M ## ## This maps - ## - [0, 4p²) -> [0, 2p) with skipFinalSub + ## - [0, 4p²) -> [0, 2p) with lazyReduce ## - [0, 4p²) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # We use Product Scanning / Comba multiplication var t, u, v = Zero var carry: Carry @@ -166,14 +166,14 @@ func redc2xMont_Comba[N: static int]( addC(carry, z[N-1], v, a[2*N-1], Carry(0)) # Final substraction - when not skipFinalSub: + when not lazyReduce: discard z.csub(M, SecretBool(carry) or not(z < M)) r = z # Montgomery Multiplication # ------------------------------------------------------------ -func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = +func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) = ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS) ## and no-carry optimization. ## This requires the most significant word of the Modulus @@ -181,10 +181,10 @@ func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipF ## https://hackmd.io/@gnark/modular_multiplication ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipFinalSub + ## - [0, 2p) -> [0, 2p) with lazyReduce ## - [0, 2p) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # We want all the computation to be kept in registers # hence we use a temporary `t`, hoping that the compiler does it. 
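The rename from skipFinalSub to lazyReduce is mechanical, but the contract it documents is worth keeping in mind: a call with lazyReduce = true may leave its result in [0, 2p), and only a call with the default lazyReduce = false guarantees a canonical value in [0, p). A minimal field-level sketch of the intended chaining (the helper name, the choice of Fp[BN254_Snarks] and the import paths are illustrative assumptions, not taken from this patch):

    import constantine/math/arithmetic, constantine/named/algebras  # module paths assumed

    func mulSquareMul(a, b, c: Fp[BN254_Snarks]): Fp[BN254_Snarks] =
      ## Computes (a*b)²*c, keeping intermediates lazily reduced
      var t {.noInit.}: Fp[BN254_Snarks]
      t.prod(a, b, lazyReduce = true)   # intermediate may lie in [0, 2p)
      t.square(lazyReduce = true)       # still lazily reduced
      result.prod(t, c)                 # default lazyReduce = false: reduced to [0, p)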
@@ -208,11 +208,11 @@ func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipF t[N-1] = C + A - when not skipFinalSub: + when not lazyReduce: discard t.csub(M, not(t < M)) r = t -func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) {.used.} = +func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) {.used.} = ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS) # - Analyzing and Comparing Montgomery Multiplication Algorithms # Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. @@ -257,18 +257,18 @@ func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: # t[N+1] can only be non-zero in the intermediate computation # since it is immediately reduce to t[N] at the end of each "i" iteration # However if t[N] is non-zero we have t > M - when not skipFinalSub: + when not lazyReduce: discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)ʷ r = t -func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = +func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, lazyReduce: static bool = false) = ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS) ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipFinalSub + ## - [0, 2p) -> [0, 2p) with lazyReduce ## - [0, 2p) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # - Architectural Enhancements for Montgomery # Multiplication on Embedded RISC Processors # Johann Großschädl and Guy-Armand Kamendje, 2003 @@ -301,22 +301,22 @@ func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: u = t t = Zero - when not skipFinalSub: + when not lazyReduce: discard z.csub(M, v.isNonZero() or not(z < M)) r = z func sumprodMont_CIOS_spare2bits[K: static int]( r: var Limbs, a, b: array[K, Limbs], M: Limbs, m0ninv: BaseType, - skipFinalSub: static bool = false) = - ## Compute r = ⅀aᵢ.bᵢ (mod M) (suim of products) + lazyReduce: static bool = false) = + ## Compute r = ⅀aᵢ.bᵢ (mod M) (sum of products) ## This requires 2 unused bits in the field element representation ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipFinalSub + ## - [0, 2p) -> [0, 2p) with lazyReduce ## - [0, 2p) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # We want all the computation to be kept in registers # hence we use a temporary `t`, hoping that the compiler does the right thing™. 
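For context, this fused sum-of-products kernel backs the field-level `sumprod` shown earlier in finite_fields.nim (where the Crandall branch is currently a compile-time "Not implemented" error, so only Montgomery-backed fields use it). A short usage sketch, reusing the assumptions of the previous snippet (illustrative helper name and curve):

    func dot2(a, b: array[2, Fp[BN254_Snarks]]): Fp[BN254_Snarks] =
      ## result <- a[0]*b[0] + a[1]*b[1] (mod p), in one fused pass
      result.sumprod(a, b)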
@@ -358,7 +358,7 @@ func sumprodMont_CIOS_spare2bits[K: static int]( # (_,t[N-1]) <- t[N] + C t[N-1] = tN + C - when not skipFinalSub: + when not lazyReduce: discard t.csub(M, not(t < M)) r = t @@ -452,33 +452,34 @@ func redc2xMont*[N: static int]( a: array[N*2, SecretWord], M: array[N, SecretWord], m0ninv: BaseType, - spareBits: static int, skipFinalSub: static bool = false) {.inline.} = + spareBits: static int, lazyReduce: static bool = false) {.inline.} = ## Montgomery reduce a double-precision bigint modulo M - const skipFinalSub = skipFinalSub and spareBits >= 2 + const lazyReduce = lazyReduce and spareBits >= 2 when UseASM_X86_64 and r.len <= 6: # ADX implies BMI2 if ({.noSideEffect.}: hasAdx()): - redcMont_asm_adx(r, a, M, m0ninv, spareBits, skipFinalSub) + redcMont_asm_adx(r, a, M, m0ninv, spareBits, lazyReduce) else: when r.len in {3..6}: - redcMont_asm(r, a, M, m0ninv, spareBits, skipFinalSub) + # TODO: Assembly faster than GCC but slower than Clang + redcMont_asm(r, a, M, m0ninv, spareBits, lazyReduce) else: - redc2xMont_CIOS(r, a, M, m0ninv, skipFinalSub) + redc2xMont_CIOS(r, a, M, m0ninv, lazyReduce) # redc2xMont_Comba(r, a, M, m0ninv) - elif UseASM_X86_64 and r.len in {3..6}: + elif UseASM_X86_32 and r.len in {3..6}: # TODO: Assembly faster than GCC but slower than Clang - redcMont_asm(r, a, M, m0ninv, spareBits, skipFinalSub) + redcMont_asm(r, a, M, m0ninv, spareBits, lazyReduce) else: - redc2xMont_CIOS(r, a, M, m0ninv, skipFinalSub) - # redc2xMont_Comba(r, a, M, m0ninv, skipFinalSub) + redc2xMont_CIOS(r, a, M, m0ninv, lazyReduce) + # redc2xMont_Comba(r, a, M, m0ninv, lazyReduce) func mulMont*( r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool = false) {.inline.} = + lazyReduce: static bool = false) {.inline.} = ## Compute r <- a*b (mod M) in the Montgomery domain ## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64 ## @@ -498,28 +499,28 @@ func mulMont*( # i.e. c'R <- a'R b'R * R^-1 (mod M) in the natural domain # as in the Montgomery domain all numbers are scaled by R - const skipFinalSub = skipFinalSub and spareBits >= 2 + const lazyReduce = lazyReduce and spareBits >= 2 when spareBits >= 1: when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling # ADX implies BMI2 if ({.noSideEffect.}: hasAdx()): - mulMont_CIOS_sparebit_asm_adx(r, a, b, M, m0ninv, skipFinalSub) + mulMont_CIOS_sparebit_asm_adx(r, a, b, M, m0ninv, lazyReduce) else: - mulMont_CIOS_sparebit_asm(r, a, b, M, m0ninv, skipFinalSub) + mulMont_CIOS_sparebit_asm(r, a, b, M, m0ninv, lazyReduce) else: - mulMont_CIOS_sparebit(r, a, b, M, m0ninv, skipFinalSub) + mulMont_CIOS_sparebit(r, a, b, M, m0ninv, lazyReduce) else: - mulMont_FIPS(r, a, b, M, m0ninv, skipFinalSub) + mulMont_FIPS(r, a, b, M, m0ninv, lazyReduce) func squareMont*[N](r: var Limbs[N], a, M: Limbs[N], m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool = false) {.inline.} = + lazyReduce: static bool = false) {.inline.} = ## Compute r <- a^2 (mod M) in the Montgomery domain ## `m0ninv` = -1/M (mod SecretWord). 
Our words are 2^31 or 2^63 - const skipFinalSub = skipFinalSub and spareBits >= 2 + const lazyReduce = lazyReduce and spareBits >= 2 when UseASM_X86_64 and a.len in {4, 6}: # ADX implies BMI2 @@ -528,37 +529,37 @@ func squareMont*[N](r: var Limbs[N], a, M: Limbs[N], # which uses unfused squaring then Montgomery reduction # is slightly slower than fused Montgomery multiplication when spareBits >= 1: - mulMont_CIOS_sparebit_asm_adx(r, a, a, M, m0ninv, skipFinalSub) + mulMont_CIOS_sparebit_asm_adx(r, a, a, M, m0ninv, lazyReduce) else: - squareMont_CIOS_asm_adx(r, a, M, m0ninv, spareBits, skipFinalSub) + squareMont_CIOS_asm_adx(r, a, M, m0ninv, spareBits, lazyReduce) else: - squareMont_CIOS_asm(r, a, M, m0ninv, spareBits, skipFinalSub) + squareMont_CIOS_asm(r, a, M, m0ninv, spareBits, lazyReduce) elif UseASM_X86_64: var r2x {.noInit.}: Limbs[2*N] r2x.square(a) - r.redc2xMont(r2x, M, m0ninv, spareBits, skipFinalSub) + r.redc2xMont(r2x, M, m0ninv, spareBits, lazyReduce) else: - mulMont(r, a, a, M, m0ninv, spareBits, skipFinalSub) + mulMont(r, a, a, M, m0ninv, spareBits, lazyReduce) func sumprodMont*[N: static int]( r: var Limbs, a, b: array[N, Limbs], M: Limbs, m0ninv: BaseType, spareBits: static int, - skipFinalSub: static bool = false) = + lazyReduce: static bool = false) = ## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products) when spareBits >= 2: when UseASM_X86_64 and r.len in {2 .. 6}: if ({.noSideEffect.}: hasAdx()): - r.sumprodMont_CIOS_spare2bits_asm_adx(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits_asm_adx(a, b, M, m0ninv, lazyReduce) else: - r.sumprodMont_CIOS_spare2bits_asm(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits_asm(a, b, M, m0ninv, lazyReduce) else: - r.sumprodMont_CIOS_spare2bits(a, b, M, m0ninv, skipFinalSub) + r.sumprodMont_CIOS_spare2bits(a, b, M, m0ninv, lazyReduce) else: - r.mulMont(a[0], b[0], M, m0ninv, spareBits, skipFinalSub = false) + r.mulMont(a[0], b[0], M, m0ninv, spareBits, lazyReduce = false) var ri {.noInit.}: Limbs for i in 1 ..< N: - ri.mulMont(a[i], b[i], M, m0ninv, spareBits, skipFinalSub = false) + ri.mulMont(a[i], b[i], M, m0ninv, spareBits, lazyReduce = false) var overflowed = SecretBool r.add(ri) overflowed = overflowed or not(r < M) discard r.csub(M, overflowed) @@ -616,7 +617,7 @@ func getMont*(r: var Limbs, a, M, r2modM: Limbs, # that range is not valid with the no-carry optimization, # hence an unreduced input that uses 256-bit while prime is 254-bit # can have an incorrect representation. - mulMont_FIPS(r, a, r2ModM, M, m0ninv, skipFinalSub = false) + mulMont_FIPS(r, a, r2ModM, M, m0ninv, lazyReduce = false) # Montgomery Modular Exponentiation # ------------------------------------------ @@ -726,7 +727,7 @@ func powMontSquarings( # We have k bits and can do k squaring, skip final substraction for first k-1 ones. 
for i in 0 ..< k: - a.squareMont(a, M, m0ninv, spareBits) # TODO: skipFinalSub + a.squareMont(a, M, m0ninv, spareBits) # TODO: lazyReduce return (k, bits) diff --git a/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim b/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim index 043b326b..e5ba14cd 100644 --- a/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim +++ b/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim @@ -490,7 +490,7 @@ func sparseVectorAddition[ECaff]( elif i == numScheduled-1: accumDen[i].prod(accumDen[i-1], lambdas[i].den) else: - accumDen[i].prod(accumDen[i-1], lambdas[i].den, skipFinalSub = true) + accumDen[i].prod(accumDen[i-1], lambdas[i].den, lazyReduce = true) # Step 3: Batch invert var accInv {.noInit.}: F @@ -516,8 +516,8 @@ func sparseVectorAddition[ECaff]( continue # Compute lambda - destroys accumDen[i] - accumDen[i].prod(accInv, accumDen[i-1], skipFinalSub = true) - accumDen[i].prod(accumDen[i], lambdas[i].num, skipFinalSub = true) + accumDen[i].prod(accInv, accumDen[i-1], lazyReduce = true) + accumDen[i].prod(accumDen[i], lambdas[i].num, lazyReduce = true) # Compute EC addition var r{.noInit.}: ECaff @@ -527,7 +527,7 @@ func sparseVectorAddition[ECaff]( buckets[sps[i].bucket] = r # Next iteration - accInv.prod(accInv, lambdas[i].den, skipFinalSub = true) + accInv.prod(accInv, lambdas[i].den, lazyReduce = true) block: # tail if specialCases[0] == kInfLhs: @@ -543,7 +543,7 @@ func sparseVectorAddition[ECaff]( bucketStatuses[sps[0].bucket].excl(kAffine) else: # Compute lambda - accumDen[0].prod(lambdas[0].num, accInv, skipFinalSub = true) + accumDen[0].prod(lambdas[0].num, accInv, lazyReduce = true) # Compute EC addition var r{.noInit.}: ECaff diff --git a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim index 9475e965..a863d7e3 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim @@ -54,9 +54,9 @@ func batchAffine*[F, G]( z.csetOne(zeroes[i]) if i != N-1: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = true) else: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = false) var accInv {.noInit.}: F accInv.inv(affs[N-1].x) @@ -64,7 +64,7 @@ func batchAffine*[F, G]( for i in countdown(N-1, 1): # Extract 1/Pᵢ var invi {.noInit.}: F - invi.prod(accInv, affs[i-1].x, skipFinalSub = true) + invi.prod(accInv, affs[i-1].x, lazyReduce = true) invi.csetZero(zeroes[i]) # Now convert Pᵢ to affine @@ -74,7 +74,7 @@ func batchAffine*[F, G]( # next iteration invi = projs[i].z invi.csetOne(zeroes[i]) - accInv.prod(accInv, invi, skipFinalSub = true) + accInv.prod(accInv, invi, lazyReduce = true) block: # tail accInv.csetZero(zeroes[0]) @@ -114,9 +114,9 @@ func batchAffine*[F, G]( z.csetOne(zeroes[i]) if i != N-1: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = true) else: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = false) var accInv {.noInit.}: F accInv.inv(affs[N-1].x) @@ -124,27 +124,27 @@ func batchAffine*[F, G]( for i in countdown(N-1, 1): # Extract 1/Pᵢ var invi {.noInit.}: F - invi.prod(accInv, affs[i-1].x, skipFinalSub = true) + invi.prod(accInv, affs[i-1].x, lazyReduce = true) invi.csetZero(zeroes[i]) # Now convert Pᵢ to affine var invi2 {.noinit.}: F - invi2.square(invi, 
skipFinalSub = true) + invi2.square(invi, lazyReduce = true) affs[i].x.prod(jacs[i].x, invi2) - invi.prod(invi, invi2, skipFinalSub = true) + invi.prod(invi, invi2, lazyReduce = true) affs[i].y.prod(jacs[i].y, invi) # next iteration invi = jacs[i].z invi.csetOne(zeroes[i]) - accInv.prod(accInv, invi, skipFinalSub = true) + accInv.prod(accInv, invi, lazyReduce = true) block: # tail var invi2 {.noinit.}: F accInv.csetZero(zeroes[0]) - invi2.square(accInv, skipFinalSub = true) + invi2.square(accInv, lazyReduce = true) affs[0].x.prod(jacs[0].x, invi2) - accInv.prod(accInv, invi2, skipFinalSub = true) + accInv.prod(accInv, invi2, lazyReduce = true) affs[0].y.prod(jacs[0].y, accInv) func batchAffine*[N: static int, F, G]( @@ -309,7 +309,7 @@ func accum_half_vartime[F; G: static Subgroup]( elif i == N-1: points[q].y.prod(points[q_prev].y, lambdas[i].den) else: - points[q].y.prod(points[q_prev].y, lambdas[i].den, skipFinalSub = true) + points[q].y.prod(points[q_prev].y, lambdas[i].den, lazyReduce = true) # Step 3: batch invert var accInv {.noInit.}: F @@ -338,8 +338,8 @@ func accum_half_vartime[F; G: static Subgroup]( continue # Compute lambda - points[q].y.prod(accInv, points[q_prev].y, skipFinalSub = true) - points[q].y.prod(points[q].y, lambdas[i].num, skipFinalSub = true) + points[q].y.prod(accInv, points[q_prev].y, lazyReduce = true) + points[q].y.prod(points[q].y, lambdas[i].num, lazyReduce = true) # Compute EC addition var r{.noInit.}: EC_ShortW_Aff[F, G] @@ -349,7 +349,7 @@ func accum_half_vartime[F; G: static Subgroup]( points[i] = r # Next iteration - accInv.prod(accInv, lambdas[i].den, skipFinalSub = true) + accInv.prod(accInv, lambdas[i].den, lazyReduce = true) block: # Tail let i = 0 @@ -360,7 +360,7 @@ func accum_half_vartime[F; G: static Subgroup]( recallSpecialCase(i, p, q) else: # Compute lambda - points[q].y.prod(lambdas[0].num, accInv, skipFinalSub = true) + points[q].y.prod(lambdas[0].num, accInv, lazyReduce = true) # Compute EC addition var r{.noInit.}: EC_ShortW_Aff[F, G] diff --git a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim index 682b5711..0810055d 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim @@ -72,15 +72,15 @@ func `==`*(P, Q: EC_ShortW_Jac): SecretBool {.meter.} = var z1z1 {.noInit.}, z2z2 {.noInit.}: F var a{.noInit.}, b{.noInit.}: F - z1z1.square(P.z, skipFinalSub = true) - z2z2.square(Q.z, skipFinalSub = true) + z1z1.square(P.z, lazyReduce = true) + z2z2.square(Q.z, lazyReduce = true) a.prod(P.x, z2z2) b.prod(Q.x, z1z1) result = a == b - a.prod(P.y, Q.z, skipFinalSub = true) - b.prod(Q.y, P.z, skipFinalSub = true) + a.prod(P.y, Q.z, lazyReduce = true) + b.prod(Q.y, P.z, lazyReduce = true) a *= z2z2 b *= z1z1 result = result and a == b @@ -120,9 +120,9 @@ func trySetFromCoordsXandZ*[F; G]( result = sqrt_if_square(P.y) var z2 {.noInit.}: F - z2.square(z, skipFinalSub = true) + z2.square(z, lazyReduce = true) P.x.prod(x, z2) - P.y.prod(P.y, z2, skipFinalSub = true) + P.y.prod(P.y, z2, lazyReduce = true) P.y *= z P.z = z @@ -231,13 +231,13 @@ template sumImpl[F; G: static Subgroup]( block: # Addition-only, check for exceptional cases var Z2Z2 {.noInit.}, U2 {.noInit.}, S2 {.noInit.}: F - Z2Z2.square(Q.z, skipFinalSub = true) - S1.prod(Q.z, Z2Z2, skipFinalSub = true) + Z2Z2.square(Q.z, lazyReduce = true) + S1.prod(Q.z, Z2Z2, lazyReduce = true) S1 *= P.y # S₁ = Y₁*Z₂³ U1.prod(P.x, Z2Z2) # U₁ = X₁*Z₂² - 
Z1Z1.square(P.z, skipFinalSub = not CoefA_eq_minus3) - S2.prod(P.z, Z1Z1, skipFinalSub = true) + Z1Z1.square(P.z, lazyReduce = not CoefA_eq_minus3) + S2.prod(P.z, Z1Z1, lazyReduce = true) S2 *= Q.y # S₂ = Y₂*Z₁³ U2.prod(Q.x, Z1Z1) # U₂ = X₂*Z₁² @@ -301,7 +301,7 @@ template sumImpl[F; G: static Subgroup]( HHH_or_Mpre.prod(a, b) # HHH or X₁² # Assuming doubling path - a.square(HHH_or_Mpre, skipFinalSub = true) + a.square(HHH_or_Mpre, lazyReduce = true) a *= HHH_or_Mpre # a = 3X₁² b.square(Z1Z1) b.mulCheckSparse(CoefA) # b = αZZ, with α the "a" coefficient of the curve @@ -453,8 +453,8 @@ func mixedSum*[F; G: static Subgroup]( U1 = P.x S1 = P.y - Z1Z1.square(P.z, skipFinalSub = not CoefA_eq_minus3) - S2.prod(P.z, Z1Z1, skipFinalSub = true) + Z1Z1.square(P.z, lazyReduce = not CoefA_eq_minus3) + S2.prod(P.z, Z1Z1, lazyReduce = true) S2 *= Q.y # S₂ = Y₂*Z₁³ U2.prod(Q.x, Z1Z1) # U₂ = X₂*Z₁² @@ -518,7 +518,7 @@ func mixedSum*[F; G: static Subgroup]( HHH_or_Mpre.prod(a, b) # HHH or X₁² # Assuming doubling path - a.square(HHH_or_Mpre, skipFinalSub = true) + a.square(HHH_or_Mpre, lazyReduce = true) a *= HHH_or_Mpre # a = 3X₁² b.square(Z1Z1) b.mulCheckSparse(CoefA) # b = αZZ, with α the "a" coefficient of the curve @@ -669,10 +669,10 @@ func affine*[F; G]( jac: EC_ShortW_Jac[F, G]) {.meter.} = var invZ {.noInit.}, invZ2{.noInit.}: F invZ.inv(jac.z) - invZ2.square(invZ, skipFinalSub = true) + invZ2.square(invZ, lazyReduce = true) aff.x.prod(jac.x, invZ2) - invZ.prod(invZ, invZ2, skipFinalSub = true) + invZ.prod(invZ, invZ2, lazyReduce = true) aff.y.prod(jac.y, invZ) func fromAffine*[F; G]( @@ -741,7 +741,7 @@ func sum_vartime*[F; G: static Subgroup]( var U {.noInit.}, S{.noInit.}, H{.noInit.}, R{.noInit.}: F if not isPz1: # case Z₁ != 1 - R.square(p.z, skipFinalSub = true) # Z₁Z₁ = Z₁² + R.square(p.z, lazyReduce = true) # Z₁Z₁ = Z₁² if isQz1: # case Z₂ = 1 U = p.x # U₁ = X₁*Z₂Z₂ if isPz1: # case Z₁ = Z₂ = 1 @@ -751,19 +751,19 @@ func sum_vartime*[F; G: static Subgroup]( H -= U # H = U₂-U₁ S = p.y # S₁ = Y₁*Z₂*Z₂Z₂ else: # case Z₂ != 1 - S.square(q.z, skipFinalSub = true) + S.square(q.z, lazyReduce = true) U.prod(p.x, S) # U₁ = X₁*Z₂Z₂ if isPz1: H = q.x else: H.prod(q.x, R) H -= U # H = U₂-U₁ - S.prod(S, q.z, skipFinalSub = true) + S.prod(S, q.z, lazyReduce = true) S *= p.y # S₁ = Y₁*Z₂*Z₂Z₂ if isPz1: R = q.y else: - R.prod(R, p.z, skipFinalSub = true) + R.prod(R, p.z, lazyReduce = true) R *= q.y # S₂ = Y₂*Z₁*Z₁Z₁ R -= S # R = S₂-S₁ @@ -778,7 +778,7 @@ func sum_vartime*[F; G: static Subgroup]( var HHH{.noInit.}: F template V: untyped = U - HHH.square(H, skipFinalSub = true) + HHH.square(H, lazyReduce = true) V *= HHH # V = U₁*HH HHH *= H # HHH = H*HH @@ -804,7 +804,7 @@ func sum_vartime*[F; G: static Subgroup]( if isQz1: r.z.prod(H, p.z) else: - r.z.prod(p.z, q.z, skipFinalSub = true) + r.z.prod(p.z, q.z, lazyReduce = true) r.z *= H func mixedSum_vartime*[F; G: static Subgroup]( @@ -857,7 +857,7 @@ func mixedSum_vartime*[F; G: static Subgroup]( var U {.noInit.}, S{.noInit.}, H{.noInit.}, R{.noInit.}: F if not isPz1: # case Z₁ != 1 - R.square(p.z, skipFinalSub = true) # Z₁Z₁ = Z₁² + R.square(p.z, lazyReduce = true) # Z₁Z₁ = Z₁² U = p.x # U₁ = X₁*Z₂Z₂ if isPz1: # case Z₁ = Z₂ = 1 @@ -870,7 +870,7 @@ func mixedSum_vartime*[F; G: static Subgroup]( if isPz1: R = q.y else: - R.prod(R, p.z, skipFinalSub = true) + R.prod(R, p.z, lazyReduce = true) R *= q.y # S₂ = Y₂*Z₁*Z₁Z₁ R -= S # R = S₂-S₁ @@ -885,7 +885,7 @@ func mixedSum_vartime*[F; G: static Subgroup]( var HHH{.noInit.}: F template V: untyped = U - 
HHH.square(H, skipFinalSub = true) + HHH.square(H, lazyReduce = true) V *= HHH # V = U₁*HH HHH *= H # HHH = H*HH diff --git a/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim b/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim index 8c37b492..dc2c54cd 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim @@ -333,8 +333,8 @@ func affine*[F; G]( jacext: EC_ShortW_JacExt[F, G]) {.meter.} = var invZZ {.noInit.}, invZZZ{.noInit.}: F invZZZ.inv(jacext.zzz) - invZZ.prod(jacext.zz, invZZZ, skipFinalSub = true) - invZZ.square(skipFinalSub = true) + invZZ.prod(jacext.zz, invZZZ, lazyReduce = true) + invZZ.square(lazyReduce = true) aff.x.prod(jacext.x, invZZ) aff.y.prod(jacext.y, invZZZ) diff --git a/constantine/math/elliptic/ec_shortweierstrass_projective.nim b/constantine/math/elliptic/ec_shortweierstrass_projective.nim index 912959f6..afe14c54 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_projective.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_projective.nim @@ -544,7 +544,7 @@ func sum_vartime*[F; G: static Subgroup]( var VVV{.noInit.}: F - VVV.square(V, skipFinalSub = true) + VVV.square(V, lazyReduce = true) R *= VVV VVV *= V @@ -564,7 +564,7 @@ func sum_vartime*[F; G: static Subgroup]( A.prod(U, q.z) r.z.prod(VVV, q.z) else: - r.z.prod(p.z, q.z, skipFinalSub = true) + r.z.prod(p.z, q.z, lazyReduce = true) A.prod(U, r.z) r.z *= VVV @@ -642,7 +642,7 @@ func mixedSum_vartime*[F; G: static Subgroup]( var VVV{.noInit.}: F - VVV.square(V, skipFinalSub = true) + VVV.square(V, lazyReduce = true) R *= VVV VVV *= V diff --git a/constantine/math/elliptic/ec_twistededwards_batch_ops.nim b/constantine/math/elliptic/ec_twistededwards_batch_ops.nim index 3e0ff206..b4566b02 100644 --- a/constantine/math/elliptic/ec_twistededwards_batch_ops.nim +++ b/constantine/math/elliptic/ec_twistededwards_batch_ops.nim @@ -50,9 +50,9 @@ func batchAffine*[F]( z.csetOne(zeroes[i]) if i != N-1: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = true) else: - affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false) + affs[i].x.prod(affs[i-1].x, z, lazyReduce = false) var accInv {.noInit.}: F accInv.inv(affs[N-1].x) @@ -60,7 +60,7 @@ func batchAffine*[F]( for i in countdown(N-1, 1): # Extract 1/Pᵢ var invi {.noInit.}: F - invi.prod(accInv, affs[i-1].x, skipFinalSub = true) + invi.prod(accInv, affs[i-1].x, lazyReduce = true) invi.csetZero(zeroes[i]) # Now convert Pᵢ to affine @@ -70,7 +70,7 @@ func batchAffine*[F]( # next iteration invi = projs[i].z invi.csetOne(zeroes[i]) - accInv.prod(accInv, invi, skipFinalSub = true) + accInv.prod(accInv, invi, lazyReduce = true) block: # tail accInv.csetZero(zeroes[0]) diff --git a/constantine/math/extension_fields/towers.nim b/constantine/math/extension_fields/towers.nim index 20fc0d51..c3c5c1c2 100644 --- a/constantine/math/extension_fields/towers.nim +++ b/constantine/math/extension_fields/towers.nim @@ -2185,21 +2185,21 @@ func inv*(a: var CubicExt) = # Convenience functions # ---------------------------------------------------------------------- -template square*(a: var ExtensionField, skipFinalSub: static bool) = +template square*(a: var ExtensionField, lazyReduce: static bool) = # Square alias, # this allows using the same code for # the base field and its extensions while benefitting from skipping # the final substraction on Fp a.square() -template square*(r: var ExtensionField, 
a: ExtensionField, skipFinalSub: static bool) = +template square*(r: var ExtensionField, a: ExtensionField, lazyReduce: static bool) = # Square alias, # this allows using the same code for # the base field and its extensions while benefitting from skipping # the final substraction on Fp r.square(a) -template prod*(r: var ExtensionField, a, b: ExtensionField, skipFinalSub: static bool) = +template prod*(r: var ExtensionField, a, b: ExtensionField, lazyReduce: static bool) = # Prod alias, # this allows using the same code for # the base field and its extensions while benefitting from skipping diff --git a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim index bf24eb6c..da6c7478 100644 --- a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim +++ b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim @@ -44,14 +44,14 @@ func mulMont_FIPS*( M: LimbsViewConst, m0ninv: SecretWord, mBits: int, - skipFinalSub: static bool = false) {.noInline, tags:[Alloca], meter.} = + lazyReduce: static bool = false) {.noInline, tags:[Alloca], meter.} = ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS) ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipFinalSub + ## - [0, 2p) -> [0, 2p) with lazyReduce ## - [0, 2p) -> [0, p) without ## - ## skipFinalSub skips the final substraction step. + ## lazyReduce skips the final substraction step. # - Architectural Enhancements for Montgomery # Multiplication on Embedded RISC Processors # Johann Großschädl and Guy-Armand Kamendje, 2003 @@ -86,7 +86,7 @@ func mulMont_FIPS*( u = t t = Zero - when not skipFinalSub: + when not lazyReduce: discard z.csub(M, v.isNonZero() or not(z.lt(M, L)), L) r.copyWords(0, z, 0, L) diff --git a/constantine/math_compiler/impl_fields_nvidia.nim b/constantine/math_compiler/impl_fields_nvidia.nim index 0ffbb5b1..414f2818 100644 --- a/constantine/math_compiler/impl_fields_nvidia.nim +++ b/constantine/math_compiler/impl_fields_nvidia.nim @@ -193,12 +193,12 @@ proc field_sub_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef return (subModTy, subModKernel) -proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub: bool): FnDef = +proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, lazyReduce: bool): FnDef = ## Generate an optimized modular multiplication kernel ## with parameters `a, b, modulus: Limbs -> Limbs` let procName = cm.genSymbol(block: - if skipFinalSub: + if lazyReduce: case field of fp: opFpMulSkipFinalSub of fr: opFrMulSkipFinalSub @@ -343,7 +343,7 @@ proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: else: t[0] = bld.mulhiadd(m, M[0], t[0]) - if not skipFinalSub: + if not lazyReduce: asy.finalSubNoOverflow(cm, field, t, t) bld.store(r, t) @@ -351,7 +351,7 @@ proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: return (mulModTy, mulModKernel) -proc field_mul_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub = false): FnDef = +proc field_mul_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, lazyReduce = false): FnDef = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub) \ No newline at end of file + return asy.field_mul_CIOS_sparebit_gen(cm, field, lazyReduce) diff --git 
a/constantine/named/config_fields_and_curves.nim b/constantine/named/config_fields_and_curves.nim index c3fc397a..330d4bf5 100644 --- a/constantine/named/config_fields_and_curves.nim +++ b/constantine/named/config_fields_and_curves.nim @@ -183,6 +183,7 @@ declareCurves: curve Edwards25519: # Bernstein curve bitwidth: 255 modulus: "0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffed" + modulusKind: Crandall(19) # Montgomery form: y² = x³ + 486662x² + x # Edwards form: x² + y² = 1+dx²y² with d = 121665/121666 @@ -221,6 +222,8 @@ declareCurves: curve Secp256k1: # Bitcoin curve bitwidth: 256 modulus: "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f" + modulusKind: Crandall(0x1000003D1'u64) + order: "0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141" orderBitwidth: 256 eq_form: ShortWeierstrass diff --git a/constantine/named/deriv/derive_constants.nim b/constantine/named/deriv/derive_constants.nim index f0aeac07..b021c172 100644 --- a/constantine/named/deriv/derive_constants.nim +++ b/constantine/named/deriv/derive_constants.nim @@ -66,7 +66,7 @@ macro genDerivedConstants*(mode: static DerivedConstantMode): untyped = M ) ) - # const MyCurve_R4modP = r4mod(MyCurve_Modulus) + # const MyCurve_R3modP = r3mod(MyCurve_Modulus) result.add newConstStmt( used(curve & ff & "_R3modP"), newCall( bindSym"r3mod", @@ -102,6 +102,13 @@ macro genDerivedConstants*(mode: static DerivedConstantMode): untyped = M ) ) + # const MyCurve_PrimeMinus1 = primeMinus1(MyCurve_Modulus) + result.add newConstStmt( + used(curve & ff & "_PrimeMinus1"), newCall( + bindSym"primeMinus1", + M + ) + ) # const MyCurve_PrimeMinus1div2 = primeMinus1div2(MyCurve_Modulus) result.add newConstStmt( used(curve & ff & "_PrimeMinus1div2"), newCall( diff --git a/constantine/named/deriv/parser_fields.nim b/constantine/named/deriv/parser_fields.nim index e21be82a..d6b14fdf 100644 --- a/constantine/named/deriv/parser_fields.nim +++ b/constantine/named/deriv/parser_fields.nim @@ -24,6 +24,10 @@ import # for example when using the `r2modP` constant in multiple overloads in the same module type + PrimeKind* = enum + kGeneric + kCrandall # Crandall Prime are in the form 2ᵐ-c (also called pseudo-Mersenne primes) + CurveFamily* = enum NoFamily BarretoNaehrig # BN curve @@ -95,6 +99,8 @@ type # Field parameters bitWidth*: NimNode # nnkIntLit modulus*: NimNode # nnkStrLit (hex) + modulusKind*: PrimeKind + modulusKindAssociatedValue*: BiggestInt # Towering nonresidue_fp*: NimNode # nnkIntLit @@ -170,13 +176,20 @@ proc parseCurveDecls*(defs: var seq[CurveParams], curves: NimNode) = curveParams[i][1].expectKind(nnkStmtList) let sectionVal = curveParams[i][1][0] + # Field if sectionId.eqIdent"bitwidth": params.bitWidth = sectionVal elif sectionId.eqident"modulus": params.modulus = sectionVal + elif sectionId.eqIdent"modulusKind": + sectionVal.expectKind(nnkCall) + sectionVal[0].expectIdent"Crandall" + params.modulusKind = kCrandall + params.modulusKindAssociatedValue = sectionVal[1].intVal + + # Curve elif sectionId.eqIdent"family": params.family = parseEnum[CurveFamily]($sectionVal) - elif sectionId.eqIdent"eq_form": params.eq_form = parseEnum[CurveEquationForm]($sectionVal) elif sectionId.eqIdent"coef_a": @@ -217,6 +230,7 @@ proc parseCurveDecls*(defs: var seq[CurveParams], curves: NimNode) = elif sectionId.eqIdent"nonresidue_fp2": params.nonresidue_fp2 = sectionVal + # Pairings elif sectionId.eqIdent"embedding_degree": params.embedding_degree = sectionVal.intVal.int elif sectionId.eqIdent"sexticTwist": @@ 
-242,6 +256,7 @@ proc genFieldsConstants(defs: seq[CurveParams]): NimNode = var MapCurveBitWidth = nnkBracket.newTree() var MapCurveOrderBitWidth = nnkBracket.newTree() var curveModStmts = newStmtList() + var crandallStmts = newStmtList() for curveDef in defs: @@ -271,6 +286,14 @@ proc genFieldsConstants(defs: seq[CurveParams]): NimNode = ) ) + crandallStmts.add newConstStmt( + exported($curve & "_fp_isCrandall"), + newLit(curveDef.modulusKind == kCrandall) + ) + crandallStmts.add newConstStmt( + exported($curve & "_fp_CrandallSubTerm"), + newCall(bindsym"uint64", newLit(curveDef.modulusKindAssociatedValue)) + ) # Field Fr if not curveDef.order.isNil: curveDef.orderBitwidth.expectKind(nnkIntLit) @@ -316,6 +339,7 @@ proc genFieldsConstants(defs: seq[CurveParams]): NimNode = exported("CurveBitWidth"), MapCurveBitWidth ) result.add curveModStmts + result.add crandallStmts # const CurveOrderBitSize: array[Curve, int] = ... result.add newConstStmt( exported("CurveOrderBitWidth"), MapCurveOrderBitWidth diff --git a/constantine/named/deriv/precompute.nim b/constantine/named/deriv/precompute.nim index f428d2a2..3f84852c 100644 --- a/constantine/named/deriv/precompute.nim +++ b/constantine/named/deriv/precompute.nim @@ -391,6 +391,13 @@ func primePlus1div2*(P: BigInt): BigInt = let carry = result.add(1) doAssert not carry +func primeMinus1*(P: BigInt): BigInt = + ## Compute P-1 + ## Warning ⚠️: Result is in the canonical domain (not Montgomery) + + result = P + discard result.sub(1) + func primeMinus1div2*(P: BigInt): BigInt = ## Compute (P-1)/2 ## For use in constant-time modular inversion diff --git a/constantine/named/properties_fields.nim b/constantine/named/properties_fields.nim index abc07b73..b357f9d5 100644 --- a/constantine/named/properties_fields.nim +++ b/constantine/named/properties_fields.nim @@ -97,6 +97,21 @@ template getBigInt*[Name: static Algebra](T: type FF[Name]): untyped = ## Get the underlying BigInt type. typeof(default(T).mres) +template isCrandallPrimeField*(F: type Fr): static bool = false + +macro getCrandallPrimeSubterm*[Name: static Algebra](F: type Fp[Name]): static SecretWord = + result = newcall(bindSym"SecretWord", bindSym($Name & "_fp_CrandallSubTerm")) + +macro isCrandallPrimeField*[Name: static Algebra](F: type Fp[Name]): static bool = + let rawCrandall = bindSym($Name & "_fp_isCrandall") + let subTerm = bindSym($Name & "_fp_CrandallSubTerm") + result = quote do: + when `rawCrandall`: + when log2_vartime(`subTerm`) < WordBitWidth: + true + else: false + else: false + func bits*[Name: static Algebra](T: type FF[Name]): static int = T.getBigInt().bits @@ -204,6 +219,11 @@ macro getPrimePlus1div2*(ff: type FF): untyped = ## Warning ⚠️: Result in canonical domain (not Montgomery) result = bindConstant(ff, "PrimePlus1div2") +macro getPrimeMinus1*(ff: type FF): untyped = + ## Get P-1 + ## Warning ⚠️: Result in canonical domain (not Montgomery) + result = bindConstant(ff, "PrimeMinus1") + macro getPrimeMinus1div2*(ff: type FF): untyped = ## Get (P-1) / 2 for an odd prime ## Warning ⚠️: Result in canonical domain (not Montgomery) diff --git a/constantine/platforms/x86/macro_assembler_x86_att.nim b/constantine/platforms/x86/macro_assembler_x86_att.nim index 20d8d0c1..e95f3eaf 100644 --- a/constantine/platforms/x86/macro_assembler_x86_att.nim +++ b/constantine/platforms/x86/macro_assembler_x86_att.nim @@ -7,8 +7,8 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
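The `skipFinalSub` to `lazyReduce` rename keeps the contract documented on `mulMont_FIPS` above: with `lazyReduce` the result is only guaranteed to lie in [0, 2p), and a single conditional subtraction is what brings it back into [0, p). Below is a single-word toy model of that trade-off, not Constantine's implementation; the 32-bit radix, the toy modulus (chosen below R/4 so every intermediate stays in [0, 2p)) and the helper names are all illustrative.

```nim
const
  R = 1'u64 shl 32        # Montgomery radix for a single 32-bit word
  M = 998244353'u64       # toy odd modulus, below R/4 ("spare bits" available)

proc negInvModR(m: uint64): uint64 =
  ## Returns -m⁻¹ mod 2³² for odd m (Hensel/Newton lifting).
  var x = m               # already an inverse of m modulo 8
  for _ in 0 ..< 5:
    x = (x * (2'u64 - m * x)) and (R - 1)
  result = (R - x) and (R - 1)

proc montMul(a, b: uint64, lazyReduce: bool): uint64 =
  ## Single-word Montgomery multiplication: returns a·b·R⁻¹ (mod M),
  ## in [0, 2M) when lazyReduce, in [0, M) otherwise.
  let t = a * b
  let q = ((t and (R - 1)) * negInvModR(M)) and (R - 1)
  result = (t + q * M) shr 32
  if not lazyReduce and result >= M:
    result -= M           # the "final subtraction" that lazyReduce skips

when isMainModule:
  let a = 123456789'u64
  let b = 987654321'u64
  # Chain two multiplications lazily, subtract once at the end ...
  var lazy = montMul(montMul(a, b, lazyReduce = true), b, lazyReduce = true)
  if lazy >= M: lazy -= M
  # ... and compare against the always-reduced chain.
  let strict = montMul(montMul(a, b, lazyReduce = false), b, lazyReduce = false)
  doAssert lazy == strict
```

Chaining the lazy form and subtracting once at the end matches the always-reduced chain, which is why intermediate field operations can safely skip the subtraction.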
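The new `modulusKind: Crandall(c)` declarations record the constant c of a pseudo-Mersenne prime p = 2ᵐ-c (c = 19 for Edwards25519, c = 0x1000003D1 for Secp256k1). The fast reduction they enable rests on 2ᵐ ≡ c (mod p): whatever sits above bit m is multiplied by c and folded back into the low part. A toy single-word sketch of that idea follows; the 2³²-5 modulus and the proc name are made up for illustration, the real code works on multi-limb 255/256-bit values.

```nim
const
  mBits = 32
  c     = 5'u64                      # toy Crandall constant
  p     = (1'u64 shl mBits) - c      # toy pseudo-Mersenne prime 2³² - 5

proc reduceCrandall(x: uint64): uint64 =
  ## Reduces x < p² modulo p = 2^mBits - c using 2^mBits ≡ c (mod p):
  ## write x = hi·2^mBits + lo, then x ≡ hi·c + lo (mod p).
  result = x
  while (result shr mBits) != 0:
    let hi = result shr mBits
    let lo = result and ((1'u64 shl mBits) - 1)
    result = hi * c + lo
  if result >= p:                    # at most one conditional subtraction left
    result -= p

when isMainModule:
  let a = 0xDEADBEEF'u64 mod p
  let b = 0xCAFEBABE'u64 mod p
  doAssert reduceCrandall(a * b) == (a * b) mod p
```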
import - std/[macros, strutils, sets, hashes, algorithm, sequtils], - ../config + std/[macros, strutils, sets, hashes, algorithm, sequtils, enumutils], + ../[config, bithacks] # A compile-time inline assembler @@ -484,7 +484,7 @@ func generate*(a: Assembler_x86): NimNode = newEmptyNode(), result) -func getStrOffset(a: Assembler_x86, op: Operand): string = +func getStrOffset(a: Assembler_x86, op: Operand, force32IfReg = false): string = if op.kind != kFromArray: if op.kind in {kArrayAddr, k2dArrayAddr}: # We are operating on an array pointer @@ -493,8 +493,13 @@ func getStrOffset(a: Assembler_x86, op: Operand): string = return "%%" & op.buf[0].desc.asmId else: return "%" & op.buf[0].desc.asmId + elif op.kind == kRegister: + if force32IfReg: + return "%k" & op.desc.asmId + else: + return "%" & op.desc.asmId else: - return "%" & op.desc.asmId + error "Unsupported: " & $op.kind # Beware GCC / Clang differences with displacements # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html @@ -556,7 +561,8 @@ func getStrOffset(a: Assembler_x86, op: Operand): string = return "0(%%" & op.desc.asmId & ')' return $(op.offset * a.wordSize) & "(%%" & op.desc.asmId & ')' else: - error "Unsupported: " & $op.desc.rm.ord + error "Unsupported: " & $op.desc.rm.symbolName() & "\n" & + op.repr func codeFragment(a: var Assembler_x86, instr: string, op: Operand) = # Generate a code fragment @@ -638,7 +644,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) = a.regClobbers.incl reg0 a.regClobbers.incl reg1 -func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) = +func codeFragment(a: var Assembler_x86, instr: string, imm: SomeInteger, op: Operand) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -653,6 +659,24 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) = if op.desc.constraint != asmClobberedRegister: a.operands.incl op.desc +func codeFragment(a: var Assembler_x86, instr: string, imm: SomeInteger, op0, op1: Operand) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + let off0 = a.getStrOffset(op0) + let off1 = a.getStrOffset(op1) + + if a.wordBitWidth == 64: + a.code &= instr & "q $" & $imm & ", " & off0 & ", " & off1 & '\n' + else: + a.code &= instr & "l $" & $imm & ", " & off0 & ", " & off1 & '\n' + + if op0.desc.constraint != asmClobberedRegister: + a.operands.incl op0.desc + if op1.desc.constraint != asmClobberedRegister: + a.operands.incl op1.desc + func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) = # Generate a code fragment # ⚠️ Warning: @@ -675,7 +699,7 @@ func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Re a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n' a.regClobbers.incl reg -func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) = +func codeFragment(a: var Assembler_x86, instr: string, imm: SomeInteger, reg: Register) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -686,7 +710,7 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) a.code &= instr & "l $" & $imm & ", %%" & $reg & '\n' a.regClobbers.incl reg -func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: OperandReuse) = +func codeFragment(a: var Assembler_x86, instr: string, imm: SomeInteger, reg: 
OperandReuse) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -773,6 +797,30 @@ func isOutput(op: Operand): bool = # Instructions # ------------------------------------------------------------------------------------------------------------ +const Reg8Low = [ + rbx: "%%bl", + rdx: "%%dl", + r8: "%%r8l", + rax: "%%al", + xmm0: "invalid", +] + +const Reg32 = [ # Allow using 32-bit reg on 64-bit for smaller instructions + rbx: "%%ebx", + rdx: "%%edx", + r8 : "%%r8d", + rax: "%%eax", + xmm0: "invalid", +] + +func setc*(a: var Assembler_x86, dst: Register) = + ## Set destination to 1 if carry flag is set + ## and 0 otherwise + + # This only works with 8-bit reg so we special-map them + a.code &= "setc " & Reg8Low[dst] & '\n' + # No flags affected + func add*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst + src doAssert dst.isOutput() @@ -809,7 +857,7 @@ func adc*(a: var Assembler_x86, dst, src: Register) = a.codeFragment("adc", src, dst) a.areFlagsClobbered = true -func adc*(a: var Assembler_x86, dst: Operand, imm: int) = +func adc*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- dst + imm + borrow doAssert dst.isOutput() doAssert dst.desc.rm notin {Mem, MemOffsettable}, @@ -824,7 +872,7 @@ func adc*(a: var Assembler_x86, dst: Operand, src: Register) = a.codeFragment("adc", src, dst) a.areFlagsClobbered = true -func adc*(a: var Assembler_x86, dst: Register, imm: int) = +func adc*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- dst + src a.codeFragment("adc", imm, dst) a.areFlagsClobbered = true @@ -835,6 +883,15 @@ func sub*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("sub", src, dst) a.areFlagsClobbered = true +func sub*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = + ## Does: dst <- dst - imm + doAssert dst.isOutput() + doAssert dst.desc.rm notin {Mem, MemOffsettable}, + "Using subborrow with a memory destination, this incurs significant performance penalties." 
+ + a.codeFragment("sub", imm, dst) + a.areFlagsClobbered = true + func sbb*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst - src - borrow doAssert dst.isOutput() @@ -844,7 +901,7 @@ func sbb*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("sbb", src, dst) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: Operand, imm: int) = +func sbb*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow doAssert dst.isOutput() doAssert dst.desc.rm notin {Mem, MemOffsettable}, @@ -853,7 +910,7 @@ func sbb*(a: var Assembler_x86, dst: Operand, imm: int) = a.codeFragment("sbb", imm, dst) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: Register, imm: int) = +func sbb*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow a.codeFragment("sbb", imm, dst) a.areFlagsClobbered = true @@ -863,7 +920,7 @@ func sbb*(a: var Assembler_x86, dst, src: Register) = a.codeFragment("sbb", src, dst) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: OperandReuse, imm: int) = +func sbb*(a: var Assembler_x86, dst: OperandReuse, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow a.codeFragment("sbb", imm, dst) a.areFlagsClobbered = true @@ -873,7 +930,13 @@ func sbb*(a: var Assembler_x86, dst, src: OperandReuse) = a.codeFragment("sbb", src, dst) a.areFlagsClobbered = true -func sar*(a: var Assembler_x86, dst: Operand, imm: int) = +func shld*(a: var Assembler_x86, inout: Operand, src: Operand, imm: SomeInteger) = + ## Does double precision left shift + ## inout <- (inout, src) << imm + a.codeFragment("shld", imm, src, inout) + a.areFlagsClobbered = true + +func sar*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does Arithmetic Right Shift (i.e. 
with sign extension) doAssert dst.isOutput() a.codeFragment("sar", imm, dst) @@ -885,7 +948,13 @@ func `and`*(a: var Assembler_x86, dst: Operand, src: Register) = a.codeFragment("and", src, dst) a.areFlagsClobbered = true -func `and`*(a: var Assembler_x86, dst: OperandReuse, imm: int) = +func `and`*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = + ## Compute the bitwise AND of x and y and + ## set the Sign, Zero and Parity flags + a.codeFragment("and", imm, dst) + a.areFlagsClobbered = true + +func `and`*(a: var Assembler_x86, dst: OperandReuse, imm: SomeInteger) = ## Compute the bitwise AND of x and y and ## set the Sign, Zero and Parity flags a.codeFragment("and", imm, dst) @@ -936,14 +1005,31 @@ func `or`*(a: var Assembler_x86, dst: OperandReuse, src: Operand) = func `xor`*(a: var Assembler_x86, dst, src: Operand) = ## Compute the bitwise xor of x and y and ## reset all flags - a.codeFragment("xor", src, dst) - a.areFlagsClobbered = true + if dst.desc == src.desc: + # Special case zero-ing so it uses 32-bit registers + # and rely on auto zero-clear of upper bits + let off = a.getStrOffset(dst, force32IfReg = true) + a.code &= "xorl " & off & ", " & off & '\n' + + if dst.desc.constraint != asmClobberedRegister: + a.operands.incl dst.desc + a.areFlagsClobbered = true + else: + a.codeFragment("xor", src, dst) + a.areFlagsClobbered = true func `xor`*(a: var Assembler_x86, dst, src: Register) = ## Compute the bitwise xor of x and y and ## reset all flags - a.codeFragment("xor", src, dst) - a.areFlagsClobbered = true + if dst == src: + # Special case zero-ing so it uses 32-bit registers + a.code &= "xorl " & Reg32[dst] & ", " & Reg32[dst] & '\n' + + a.regClobbers.incl dst + a.areFlagsClobbered = true + else: + a.codeFragment("xor", src, dst) + a.areFlagsClobbered = true func mov*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- src @@ -966,16 +1052,33 @@ func mov*(a: var Assembler_x86, dst: OperandReuse, src: Operand) = a.codeFragment("mov", src, dst) # No clobber -func mov*(a: var Assembler_x86, dst: Operand, imm: int) = +func mov*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- imm doAssert dst.isOutput(), $dst.repr - a.codeFragment("mov", imm, dst) + # We special-case register <- immediate mov + # as they can take up to 10 bytes (2 bytes REX instr + 8-byte data) + # on x86-64 even though most of the time we load small values + if log2_vartime(uint64 imm) >= 32: + a.codeFragment("mov", imm, dst) + else: + let off = a.getStrOffset(dst, force32IfReg = true) + a.code &= "movl $" & $imm & ", " & off & '\n' # No clobber -func mov*(a: var Assembler_x86, dst: Register, imm: int) = +func mov*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- src with dst a fixed register - a.codeFragment("mov", imm, dst) + + # We special-case register <- immediate mov + # as they can take up to 10 bytes (2 bytes REX instr + 8-byte data) + # on x86-64 even though + # most of the time we load small values + + if log2_vartime(uint64 imm) >= 32: + a.codeFragment("mov", imm, dst) + else: + a.code &= "movl $" & $imm & ", " & Reg32[dst] & '\n' + a.regClobbers.incl dst func mov*(a: var Assembler_x86, dst: Register, src: Operand) = ## Does: dst <- src with dst a fixed register @@ -1010,6 +1113,11 @@ func cmovnc*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("cmovnc", src, dst) # No clobber +func cmovnc*(a: var Assembler_x86, dst: Register, src: Operand) = + ## Does: dst <- src if the carry flag is not set + a.codeFragment("cmovnc", src, dst) + # 
No clobber + func cmovz*(a: var Assembler_x86, dst: Operand, src: Register) = ## Does: dst <- src if the zero flag is not set doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr @@ -1079,6 +1187,30 @@ func imul*(a: var Assembler_x86, dst: Register, src: Operand) = ## Does dst <- dst * src, keeping only the low half a.codeFragment("imul", src, dst) + a.areFlagsClobbered = true + +func imul*(a: var Assembler_x86, dst: Register, src0: Operand, imm: SomeInteger) = + ## Does dst <- a * imm, keeping only the low half + let off0 = a.getStrOffset(src0) + + if a.wordBitWidth == 64: + a.code &= "imulq $" & $imm & ", " & $off0 & ", %%" & $dst & '\n' + else: + a.code &= "imull $" & $imm & ", " & $off0 & ", %%" & $dst & '\n' + a.regClobbers.incl dst + a.areFlagsClobbered = true + a.operands.incl(src0.desc) + +func imul*(a: var Assembler_x86, dst: Register, src0: Register, imm: SomeInteger) = + ## Does dst <- a * imm, keeping only the low half + if a.wordBitWidth == 64: + a.code &= "imulq $" & $imm & ", %%" & $src0 & ", %%" & $dst '\n' + else: + a.code &= "imull $" & $imm & ", %%" & $src0 & ", %%" & $dst '\n' + a.regClobbers.incl {dst, src0} + a.areFlagsClobbered = true + + func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 doAssert src1 == rdx, "MULX requires the RDX register" @@ -1121,6 +1253,24 @@ func mulx*(a: var Assembler_x86, dHi: Operand, dLo: Register, src0: Operand, src a.operands.incl src0.desc a.regClobbers.incl dLo +func mulx*(a: var Assembler_x86, dHi: Operand, dLo, src0, src1: Register) = + ## Does (dHi, dLo) <- src0 * src1 + doAssert src1 == rdx, "MULX requires the RDX register" + a.regClobbers.incl rdx + + doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, + "The destination operand must be a register " & $dHi.repr + doAssert dHi.desc.constraint in OutputReg + + # Annoying AT&T syntax + if a.wordBitWidth == 64: + a.code &= "mulxq %%" & $src0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n' + else: + a.code &= "mulxl %%" & $src0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n' + + a.regClobbers.incl src0 + a.regClobbers.incl dLo + func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 doAssert src1 == rdx, "MULX requires the RDX register" diff --git a/constantine/platforms/x86/macro_assembler_x86_intel.nim b/constantine/platforms/x86/macro_assembler_x86_intel.nim index 357c0953..e9bb3c44 100644 --- a/constantine/platforms/x86/macro_assembler_x86_intel.nim +++ b/constantine/platforms/x86/macro_assembler_x86_intel.nim @@ -7,8 +7,8 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
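The DSL additions above (`setc`, `shld`, `sub`/`and`/`imul` with immediates, the register-only `mulx`) each emit a single instruction. As a reading aid, here are plain-Nim models of what the two less common ones compute, mirroring their doc comments; the `*Model` names are made up, this is not the macro assembler's API, and a 64-bit word size is assumed.

```nim
proc setcModel(carryFlag: bool): uint64 =
  ## `setc dst`: dst <- 1 if the carry flag is set, 0 otherwise.
  if carryFlag: 1'u64 else: 0'u64

proc shldModel(inout, src: uint64, imm: int): uint64 =
  ## `shld inout, src, imm` (1 <= imm < 64): double-precision left shift,
  ## inout <- (inout, src) << imm; the top bits of `src` fill the low
  ## bits of `inout`.
  (inout shl imm) or (src shr (64 - imm))

when isMainModule:
  doAssert setcModel(true) == 1'u64
  # Shifting the 128-bit pair (hi = 0b01, lo = 2⁶³) left by 1 moves lo's
  # top bit into hi's bottom bit:
  doAssert shldModel(0b01'u64, 0x8000_0000_0000_0000'u64, 1) == 0b11'u64
```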
import - std/[macros, strutils, sets, hashes, algorithm, sequtils], - ../config + std/[macros, strutils, sets, hashes, algorithm, sequtils, enumutils], + ../[config, bithacks] # A compile-time inline assembler @@ -306,8 +306,7 @@ func asmArray*(nimSymbol: NimNode, len: int, rm: RM, constraint: Constraint, mem desc: desc, kind: kFromArray, offset: i) - else: - # For ElemsInReg + elif rm == ElemsInReg: # We can't store an array in register so we create assign individual register # per array elements instead for i in 0 ..< len: @@ -320,6 +319,8 @@ func asmArray*(nimSymbol: NimNode, len: int, rm: RM, constraint: Constraint, mem result.buf[i] = Operand( desc: desc, kind: kRegister) + else: + error "Not implemented" func asArrayAddr*(op: Operand, memPointer: NimNode, len: int, memIndirect: MemIndirectAccess): Operand = ## Use the value stored in an operand as an array address @@ -432,7 +433,7 @@ func generate*(a: Assembler_x86): NimNode = params.add newLit(": ") & inOperands.foldl(a & newLit(", ") & b) & newLit("\n") else: params.add newLit(":\n") - + let clobbers = [(a.isStackClobbered, "sp"), (a.areFlagsClobbered, "cc"), (memClobbered, "memory")] @@ -484,7 +485,9 @@ func generate*(a: Assembler_x86): NimNode = newEmptyNode(), result) -func getStrOffset(a: Assembler_x86, op: Operand): string = +func getStrOffset(a: Assembler_x86, op: Operand, force32IfReg = false): string = + # force32IfReg forces uses of 32-bit registers (memory operand are not changed) + if op.kind != kFromArray: if op.kind in {kArrayAddr, k2dArrayAddr}: # We are operating on an array pointer @@ -493,8 +496,13 @@ func getStrOffset(a: Assembler_x86, op: Operand): string = return op.buf[0].desc.asmId else: return "%" & op.buf[0].desc.asmId + elif op.kind == kRegister: + if force32IfReg: + return "%k" & op.desc.asmId + else: + return "%" & op.desc.asmId else: - return "%" & op.desc.asmId + error "Unsupported: " & $op.kind # Beware GCC / Clang differences with displacements # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html @@ -564,7 +572,8 @@ func getStrOffset(a: Assembler_x86, op: Operand): string = return "DWORD ptr [" & op.desc.asmId & ']' return "DWORD ptr [" & op.desc.asmId & " + " & $(op.offset * a.wordSize) & ']' else: - error "Unsupported: " & $op.desc.rm.ord + error "Unsupported: " & $op.desc.rm.symbolName() & "\n" & + op.repr func codeFragment(a: var Assembler_x86, instr: string, op: Operand) = # Generate a code fragment @@ -628,7 +637,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) = a.regClobbers.incl reg0 a.regClobbers.incl reg1 -func codeFragment(a: var Assembler_x86, instr: string, op: Operand, imm: int) = +func codeFragment(a: var Assembler_x86, instr: string, op: Operand, imm: SomeInteger) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -640,6 +649,21 @@ func codeFragment(a: var Assembler_x86, instr: string, op: Operand, imm: int) = if op.desc.constraint != asmClobberedRegister: a.operands.incl op.desc +func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand, imm: SomeInteger) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits Intel Assembly + let off0 = a.getStrOffset(op0) + let off1 = a.getStrOffset(op1) + + a.code &= instr & " " & off0 & ", " & off1 & ", " & $imm & '\n' + + if op0.desc.constraint != asmClobberedRegister: + a.operands.incl op0.desc + if op1.desc.constraint != asmClobberedRegister: + a.operands.incl 
op1.desc + func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) = # Generate a code fragment # ⚠️ Warning: @@ -656,7 +680,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: Operan a.code &= instr & " " & $reg & ", %" & $op.asmId & '\n' a.regClobbers.incl reg -func codeFragment(a: var Assembler_x86, instr: string, reg: Register, imm: int) = +func codeFragment(a: var Assembler_x86, instr: string, reg: Register, imm: SomeInteger) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -664,7 +688,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg: Register, imm: int) a.code &= instr & " " & $reg & ", " & $imm & '\n' a.regClobbers.incl reg -func codeFragment(a: var Assembler_x86, instr: string, reg: OperandReuse, imm: int) = +func codeFragment(a: var Assembler_x86, instr: string, reg: OperandReuse, imm: SomeInteger) = # Generate a code fragment # ⚠️ Warning: # The caller should deal with destination/source operand @@ -740,6 +764,30 @@ func isOutput(op: Operand): bool = # Instructions # ------------------------------------------------------------------------------------------------------------ +const Reg8Low = [ + rbx: "bl", + rdx: "dl", + r8: "r8l", + rax: "al", + xmm0: "invalid", +] + +const Reg32 = [ # Allow using 32-bit reg on 64-bit for smaller instructions + rbx: "ebx", + rdx: "edx", + r8 : "r8d", + rax: "eax", + xmm0: "invalid", +] + +func setc*(a: var Assembler_x86, dst: Register) = + ## Set destination to 1 if carry flag is set + ## and 0 otherwise + + # This only works with 8-bit reg so we special-map them + a.code &= "setc " & Reg8Low[dst] & '\n' + # No flags affected + func add*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst + src doAssert dst.isOutput() @@ -776,7 +824,7 @@ func adc*(a: var Assembler_x86, dst, src: Register) = a.codeFragment("adc", dst, src) a.areFlagsClobbered = true -func adc*(a: var Assembler_x86, dst: Operand, imm: int) = +func adc*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- dst + imm + borrow doAssert dst.isOutput() doAssert dst.desc.rm notin {Mem, MemOffsettable}, @@ -791,7 +839,7 @@ func adc*(a: var Assembler_x86, dst: Operand, src: Register) = a.codeFragment("adc", dst, src) a.areFlagsClobbered = true -func adc*(a: var Assembler_x86, dst: Register, imm: int) = +func adc*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- dst + src a.codeFragment("adc", dst, imm) a.areFlagsClobbered = true @@ -802,6 +850,15 @@ func sub*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("sub", dst, src) a.areFlagsClobbered = true +func sub*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = + ## Does: dst <- dst - imm + doAssert dst.isOutput() + doAssert dst.desc.rm notin {Mem, MemOffsettable}, + "Using subborrow with a memory destination, this incurs significant performance penalties." 
+ + a.codeFragment("sub", dst, imm) + a.areFlagsClobbered = true + func sbb*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst - src - borrow doAssert dst.isOutput() @@ -811,7 +868,7 @@ func sbb*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("sbb", dst, src) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: Operand, imm: int) = +func sbb*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow doAssert dst.isOutput() doAssert dst.desc.rm notin {Mem, MemOffsettable}, @@ -820,7 +877,7 @@ func sbb*(a: var Assembler_x86, dst: Operand, imm: int) = a.codeFragment("sbb", dst, imm) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: Register, imm: int) = +func sbb*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow a.codeFragment("sbb", dst, imm) a.areFlagsClobbered = true @@ -830,7 +887,7 @@ func sbb*(a: var Assembler_x86, dst, src: Register) = a.codeFragment("sbb", dst, src) a.areFlagsClobbered = true -func sbb*(a: var Assembler_x86, dst: OperandReuse, imm: int) = +func sbb*(a: var Assembler_x86, dst: OperandReuse, imm: SomeInteger) = ## Does: dst <- dst - imm - borrow a.codeFragment("sbb", dst, imm) a.areFlagsClobbered = true @@ -840,7 +897,13 @@ func sbb*(a: var Assembler_x86, dst, src: OperandReuse) = a.codeFragment("sbb", dst, src) a.areFlagsClobbered = true -func sar*(a: var Assembler_x86, dst: Operand, imm: int) = +func shld*(a: var Assembler_x86, inout: Operand, src: Operand, imm: SomeInteger) = + ## Does double precision left shift + ## inout <- (inout, src) << imm + a.codeFragment("shld", inout, src, imm) + a.areFlagsClobbered = true + +func sar*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does Arithmetic Right Shift (i.e. 
with sign extension) doAssert dst.isOutput() a.codeFragment("sar", dst, imm) @@ -852,7 +915,13 @@ func `and`*(a: var Assembler_x86, dst: Operand, src: Register) = a.codeFragment("and", dst, src) a.areFlagsClobbered = true -func `and`*(a: var Assembler_x86, dst: OperandReuse, imm: int) = +func `and`*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = + ## Compute the bitwise AND of x and y and + ## set the Sign, Zero and Parity flags + a.codeFragment("and", dst, imm) + a.areFlagsClobbered = true + +func `and`*(a: var Assembler_x86, dst: OperandReuse, imm: SomeInteger) = ## Compute the bitwise AND of x and y and ## set the Sign, Zero and Parity flags a.codeFragment("and", dst, imm) @@ -903,14 +972,31 @@ func `or`*(a: var Assembler_x86, dst: OperandReuse, src: Operand) = func `xor`*(a: var Assembler_x86, dst, src: Operand) = ## Compute the bitwise xor of x and y and ## reset all flags - a.codeFragment("xor", dst, src) - a.areFlagsClobbered = true + if dst.desc == src.desc: + # Special case zero-ing so it uses 32-bit registers + # and rely on auto zero-clear of upper bits + let off = a.getStrOffset(dst, force32IfReg = true) + a.code &= "xor " & off & ", " & off & '\n' + + if dst.desc.constraint != asmClobberedRegister: + a.operands.incl dst.desc + a.areFlagsClobbered = true + else: + a.codeFragment("xor", src, dst) + a.areFlagsClobbered = true func `xor`*(a: var Assembler_x86, dst, src: Register) = ## Compute the bitwise xor of x and y and ## reset all flags - a.codeFragment("xor", dst, src) - a.areFlagsClobbered = true + if dst == src: + # Special case zero-ing so it uses 32-bit registers + a.code &= "xor " & Reg32[dst] & ", " & Reg32[dst] & '\n' + + a.regClobbers.incl dst + a.areFlagsClobbered = true + else: + a.codeFragment("xor", src, dst) + a.areFlagsClobbered = true func mov*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- src @@ -933,16 +1019,31 @@ func mov*(a: var Assembler_x86, dst: OperandReuse, src: Operand) = a.codeFragment("mov", dst, src) # No clobber -func mov*(a: var Assembler_x86, dst: Operand, imm: int) = +func mov*(a: var Assembler_x86, dst: Operand, imm: SomeInteger) = ## Does: dst <- imm doAssert dst.isOutput(), $dst.repr - a.codeFragment("mov", dst, imm) + # We special-case register <- immediate mov + # as they can take up to 10 bytes (2 bytes REX instr + 8-byte data) + # on x86-64 even though most of the time we load small values + if log2_vartime(uint64 imm) >= 32: + a.codeFragment("mov", dst, imm) + else: + let off = a.getStrOffset(dst, force32IfReg = true) + a.code &= "mov " & off & ", " & $imm & '\n' # No clobber -func mov*(a: var Assembler_x86, dst: Register, imm: int) = +func mov*(a: var Assembler_x86, dst: Register, imm: SomeInteger) = ## Does: dst <- src with dst a fixed register - a.codeFragment("mov", dst, imm) + + # We special-case register <- immediate mov + # as they can take up to 10 bytes (2 bytes REX instr + 8-byte data) + # on x86-64 even though most of the time we load small values + if log2_vartime(uint64 imm) >= 32: + a.codeFragment("mov", dst, imm) + else: + a.code &= "mov " & Reg32[dst] & ", " & $imm & '\n' + a.regClobbers.incl dst func mov*(a: var Assembler_x86, dst: Register, src: Operand) = ## Does: dst <- src with dst a fixed register @@ -977,6 +1078,11 @@ func cmovnc*(a: var Assembler_x86, dst, src: Operand) = a.codeFragment("cmovnc", dst, src) # No clobber +func cmovnc*(a: var Assembler_x86, dst: Register, src: Operand) = + ## Does: dst <- src if the carry flag is not set + a.codeFragment("cmovnc", dst, src) + # No clobber + 
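The special-cased `xor` and `mov` paths above rely on an x86-64 rule: writing a 32-bit register zero-extends into the full 64-bit register, so the 32-bit forms are equivalent for values below 2³² while encoding shorter. A plain-Nim model of that rule, with illustrative names rather than the assembler's API:

```nim
proc write32(x: uint64): uint64 =
  ## Models writing a 32-bit register on x86-64: only the low 32 bits are
  ## kept and the upper 32 bits read back as zero.
  x and 0xFFFF_FFFF'u64

proc fitsIn32(imm: uint64): bool =
  ## The overloads above keep the short 32-bit form only for immediates
  ## below 2³² (the library checks this with a log2 helper).
  imm <= 0xFFFF_FFFF'u64

when isMainModule:
  doAssert write32(0) == 0'u64            # xor eax, eax clears the full register
  doAssert fitsIn32(19'u64)               # Edwards25519's c: the 32-bit mov suffices
  doAssert not fitsIn32(0x1000003D1'u64)  # Secp256k1's 33-bit c needs the 64-bit mov
```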
func cmovz*(a: var Assembler_x86, dst: Operand, src: Register) = ## Does: dst <- src if the zero flag is not set doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr @@ -1034,6 +1140,7 @@ func mul*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Registe a.regClobbers.incl rdx a.codeFragment("mul", src0) + a.areFlagsClobbered = true func imul*(a: var Assembler_x86, dst, src: Operand) = ## Does dst <- dst * src, keeping only the low half @@ -1041,10 +1148,27 @@ func imul*(a: var Assembler_x86, dst, src: Operand) = doAssert dst.isOutput(), $dst.repr a.codeFragment("imul", dst, src) + a.areFlagsClobbered = true func imul*(a: var Assembler_x86, dst: Register, src: Operand) = ## Does dst <- dst * src, keeping only the low half a.codeFragment("imul", dst, src) + a.areFlagsClobbered = true + +func imul*(a: var Assembler_x86, dst: Register, src0: Operand, imm: SomeInteger) = + ## Does dst <- a * imm, keeping only the low half + let off0 = a.getStrOffset(src0) + + a.code &= "imul " & $dst & ", " & $off0 & ", " & $imm & '\n' + a.regClobbers.incl dst + a.areFlagsClobbered = true + a.operands.incl(src0.desc) + +func imul*(a: var Assembler_x86, dst: Register, src0: Register, imm: SomeInteger) = + ## Does dst <- a * imm, keeping only the low half + a.code &= "imul " & $dst & ", " & $src0 & ", " & $imm '\n' + a.regClobbers.incl {dst, src0} + a.areFlagsClobbered = true func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 @@ -1063,6 +1187,7 @@ func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = a.code &= "mulx %" & $dHi.desc.asmId & ", %" & $dLo.desc.asmId & ", " & off0 & '\n' a.operands.incl src0.desc + # No clobber func mulx*(a: var Assembler_x86, dHi: Operand, dLo: Register, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 @@ -1080,6 +1205,20 @@ func mulx*(a: var Assembler_x86, dHi: Operand, dLo: Register, src0: Operand, src a.operands.incl src0.desc a.regClobbers.incl dLo +func mulx*(a: var Assembler_x86, dHi: Operand, dLo, src0, src1: Register) = + ## Does (dHi, dLo) <- src0 * src1 + doAssert src1 == rdx, "MULX requires the RDX register" + a.regClobbers.incl rdx + + doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, + "The destination operand must be a register " & $dHi.repr + doAssert dHi.desc.constraint in OutputReg + + a.code &= "mulx %" & $dHi.desc.asmId & ", " & $dLo & ", " & $src0 & '\n' + + a.regClobbers.incl src0 + a.regClobbers.incl dLo + func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 doAssert src1 == rdx, "MULX requires the RDX register" diff --git a/metering/eip2537.md b/metering/eip2537.md index 6450b4b9..dce863a5 100644 --- a/metering/eip2537.md +++ b/metering/eip2537.md @@ -86,8 +86,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | `-=`*(a: var FF; b: FF) | 4 | 20000000.000 | 0.200 | 0.050 | 0.132 | 0.033 | | diff*(r: var FF; a, b: FF) | 2 | 25000000.000 | 0.080 | 0.040 | 0.099 | 0.050 | | double*(r: var FF; a: FF) | 1 | 24390243.902 | 0.041 | 0.041 | 0.033 | 0.033 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 12 | 16597510.373 | 0.723 | 0.060 | 1.155 | 0.096 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 4 | 16666666.667 | 0.240 | 0.060 | 0.363 | 0.091 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... 
| 12 | 16597510.373 | 0.723 | 0.060 | 1.155 | 0.096 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 4 | 16666666.667 | 0.240 | 0.060 | 0.363 | 0.091 | | csub*(a: var FF; b: FF; ctl: SecretBool) | 1 | 5555555.556 | 0.180 | 0.180 | 0.495 | 0.495 | | div2*(a: var FF) | 1 | 25000000.000 | 0.040 | 0.040 | 0.066 | 0.066 | | `*=`*(a: var FF; b: FF) | 4 | 7797270.955 | 0.513 | 0.128 | 1.287 | 0.322 | @@ -108,7 +108,7 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sum*(r: var FF; a, b: FF) | 12 | 23483365.949 | 0.511 | 0.043 | 0.528 | 0.044 | | diff*(r: var FF; a, b: FF) | 2 | 22222222.222 | 0.090 | 0.045 | 0.099 | 0.050 | | double*(r: var FF; a: FF) | 5 | 25000000.000 | 0.200 | 0.040 | 0.198 | 0.040 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 12 | 16853932.584 | 0.712 | 0.059 | 1.188 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 12 | 16853932.584 | 0.712 | 0.059 | 1.188 | 0.099 | | `*=`*(a: var FF; b: FF) | 7 | 7856341.190 | 0.891 | 0.127 | 2.277 | 0.325 | | sum*(r: var ECP_ShortW_Prj[F, G]; P, Q: ECP_ShortW_Prj[F ... | 1 | 296208.531 | 3.376 | 3.376 | 11.022 | 11.022 | @@ -128,14 +128,14 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | double*(a: var FF) | 645 | 23402634.157 | 27.561 | 0.043 | 28.512 | 0.044 | | diff*(r: var FF; a, b: FF) | 273 | 23522316.043 | 11.606 | 0.043 | 11.913 | 0.044 | | double*(r: var FF; a: FF) | 201 | 23547328.960 | 8.536 | 0.042 | 8.745 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 976 | 16877345.277 | 57.829 | 0.059 | 96.096 | 0.098 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 877 | 16773453.189 | 52.285 | 0.060 | 86.790 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 976 | 16877345.277 | 57.829 | 0.059 | 96.096 | 0.098 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 877 | 16773453.189 | 52.285 | 0.060 | 86.790 | 0.099 | | cneg*(r: var FF; a: FF; ctl: SecretBool) | 66 | 5299927.728 | 12.453 | 0.189 | 34.386 | 0.521 | | cneg*(a: var FF; ctl: SecretBool) | 66 | 3931145.393 | 16.789 | 0.254 | 48.906 | 0.741 | | csub*(a: var FF; b: FF; ctl: SecretBool) | 72 | 5666168.254 | 12.707 | 0.176 | 35.046 | 0.487 | | div2*(a: var FF) | 72 | 22893481.717 | 3.145 | 0.044 | 3.201 | 0.044 | | `*=`*(a: var FF; b: FF) | 354 | 7933839.844 | 44.619 | 0.126 | 113.256 | 0.320 | -| square*(a: var FF; skipFinalSub: static bool = false) | 129 | 7914110.429 | 16.300 | 0.126 | 41.481 | 0.322 | +| square*(a: var FF; lazyReduce: static bool = false) | 129 | 7914110.429 | 16.300 | 0.126 | 41.481 | 0.322 | | sum*(r: var ECP_ShortW_Jac[F, G]; P, Q: ECP_ShortW_Jac[F ... | 8 | 291194.991 | 27.473 | 3.434 | 89.826 | 11.228 | | mixedSum*(r: var ECP_ShortW_Jac[F, G]; P: ECP_ShortW_Jac[F, ... | 64 | 337156.193 | 189.823 | 2.966 | 619.014 | 9.672 | | double*(r: var ECP_ShortW_Jac[F, G]; P: ECP_ShortW_Jac[F ... | 129 | 578076.127 | 223.154 | 1.730 | 722.271 | 5.599 | @@ -160,8 +160,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sum*(r: var FF; a, b: FF) | 610 | 23521246.240 | 25.934 | 0.043 | 26.466 | 0.043 | | diff*(r: var FF; a, b: FF) | 80 | 23289665.211 | 3.435 | 0.043 | 3.300 | 0.041 | | double*(r: var FF; a: FF) | 1005 | 23298945.172 | 43.135 | 0.043 | 44.187 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... 
| 1612 | 16807774.117 | 95.908 | 0.059 | 160.479 | 0.100 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 258 | 16701191.093 | 15.448 | 0.060 | 25.674 | 0.100 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 1612 | 16807774.117 | 95.908 | 0.059 | 160.479 | 0.100 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 258 | 16701191.093 | 15.448 | 0.060 | 25.674 | 0.100 | | cneg*(r: var FF; a: FF; ctl: SecretBool) | 66 | 5659406.620 | 11.662 | 0.177 | 31.977 | 0.484 | | cneg*(a: var FF; ctl: SecretBool) | 66 | 4113943.776 | 16.043 | 0.243 | 46.365 | 0.703 | | `*=`*(a: var FF; b: FF) | 506 | 7906867.724 | 63.995 | 0.126 | 161.898 | 0.320 | @@ -188,10 +188,10 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | double*(a: var FF) | 645 | 23330680.749 | 27.646 | 0.043 | 27.918 | 0.043 | | diff*(r: var FF; a, b: FF) | 239 | 23376369.327 | 10.224 | 0.043 | 10.494 | 0.044 | | double*(r: var FF; a: FF) | 129 | 23314657.509 | 5.533 | 0.043 | 5.577 | 0.043 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 768 | 16917044.804 | 45.398 | 0.059 | 75.669 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 824 | 16851060.349 | 48.899 | 0.059 | 81.246 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 768 | 16917044.804 | 45.398 | 0.059 | 75.669 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 824 | 16851060.349 | 48.899 | 0.059 | 81.246 | 0.099 | | `*=`*(a: var FF; b: FF) | 417 | 7897129.005 | 52.804 | 0.127 | 132.957 | 0.319 | -| square*(a: var FF; skipFinalSub: static bool = false) | 129 | 7915567.282 | 16.297 | 0.126 | 41.283 | 0.320 | +| square*(a: var FF; lazyReduce: static bool = false) | 129 | 7915567.282 | 16.297 | 0.126 | 41.283 | 0.320 | | double*(r: var ECP_ShortW_Jac[F, G]; P: ECP_ShortW_Jac[F ... | 129 | 577954.400 | 223.201 | 1.730 | 722.403 | 5.600 | | sum_vartime*(r: var ECP_ShortW_Jac[F, G]; p, q: ECP_Shor ... | 6 | 418205.897 | 14.347 | 2.391 | 46.728 | 7.788 | | mixedSum_vartime*(r: var ECP_ShortW_Jac[F, G]; p: ECP_ShortW ... | 49 | 558188.280 | 87.784 | 1.792 | 284.097 | 5.798 | @@ -216,8 +216,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sum*(r: var FF; a, b: FF) | 256 | 23374726.077 | 10.952 | 0.043 | 10.989 | 0.043 | | diff*(r: var FF; a, b: FF) | 53 | 23378914.865 | 2.267 | 0.043 | 2.145 | 0.040 | | double*(r: var FF; a: FF) | 640 | 23390103.063 | 27.362 | 0.043 | 28.116 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 1301 | 16744317.743 | 77.698 | 0.060 | 129.690 | 0.100 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 362 | 16813748.258 | 21.530 | 0.059 | 35.838 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 1301 | 16744317.743 | 77.698 | 0.060 | 129.690 | 0.100 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 362 | 16813748.258 | 21.530 | 0.059 | 35.838 | 0.099 | | `*=`*(a: var FF; b: FF) | 347 | 7342361.405 | 47.260 | 0.136 | 121.506 | 0.350 | | double*(r: var ECP_ShortW_Prj[F, G]; P: ECP_ShortW_Prj[F ... | 128 | 594950.359 | 215.144 | 1.681 | 696.036 | 5.438 | | sum_vartime*(r: var ECP_ShortW_Prj[F, G]; p, q: ECP_Shor ... | 6 | 472255.018 | 12.705 | 2.118 | 41.217 | 6.869 | @@ -283,8 +283,8 @@ The CPU Cycle Count is indicative only. 
It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 1694 | 23918108.013 | 70.825 | 0.042 | 69.465 | 0.041 | | diff*(r: var FF; a, b: FF) | 968 | 23438824.185 | 41.299 | 0.043 | 42.207 | 0.044 | | double*(r: var FF; a: FF) | 824 | 23427060.529 | 35.173 | 0.043 | 36.366 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 6 | 16620498.615 | 0.361 | 0.060 | 0.594 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 2 | 15384615.385 | 0.130 | 0.065 | 0.231 | 0.116 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 6 | 16620498.615 | 0.361 | 0.060 | 0.594 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 2 | 15384615.385 | 0.130 | 0.065 | 0.231 | 0.116 | | cneg*(r: var FF; a: FF; ctl: SecretBool) | 136 | 5648544.254 | 24.077 | 0.177 | 66.000 | 0.485 | | cneg*(a: var FF; ctl: SecretBool) | 136 | 4105288.578 | 33.128 | 0.244 | 95.700 | 0.704 | | csub*(a: var FF; b: FF; ctl: SecretBool) | 144 | 5656153.030 | 25.459 | 0.177 | 70.059 | 0.487 | @@ -315,8 +315,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 2446 | 23699253.948 | 103.210 | 0.042 | 101.277 | 0.041 | | diff*(r: var FF; a, b: FF) | 496 | 23511566.174 | 21.096 | 0.043 | 22.044 | 0.044 | | double*(r: var FF; a: FF) | 1488 | 23267086.767 | 63.953 | 0.043 | 64.779 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 6 | 16666666.667 | 0.360 | 0.060 | 0.627 | 0.104 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 2 | 15384615.385 | 0.130 | 0.065 | 0.231 | 0.116 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 6 | 16666666.667 | 0.360 | 0.060 | 0.627 | 0.104 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 2 | 15384615.385 | 0.130 | 0.065 | 0.231 | 0.116 | | cneg*(r: var FF; a: FF; ctl: SecretBool) | 136 | 5655591.134 | 24.047 | 0.177 | 65.835 | 0.484 | | cneg*(a: var FF; ctl: SecretBool) | 136 | 4113359.344 | 33.063 | 0.243 | 95.832 | 0.705 | | sum*(r: var ECP_ShortW_Prj[F, G]; P, Q: ECP_ShortW_Prj[F ... | 8 | 155315.679 | 51.508 | 6.439 | 168.828 | 21.104 | @@ -344,8 +344,8 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 1562 | 23860077.904 | 65.465 | 0.042 | 64.548 | 0.041 | | diff*(r: var FF; a, b: FF) | 992 | 23484292.512 | 42.241 | 0.043 | 44.319 | 0.045 | | double*(r: var FF; a: FF) | 700 | 23352793.995 | 29.975 | 0.043 | 29.997 | 0.043 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 6 | 16620498.615 | 0.361 | 0.060 | 0.594 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 2 | 16666666.667 | 0.120 | 0.060 | 0.198 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 6 | 16620498.615 | 0.361 | 0.060 | 0.594 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 2 | 16666666.667 | 0.120 | 0.060 | 0.198 | 0.099 | | double*(r: var ECP_ShortW_Jac[F, G]; P: ECP_ShortW_Jac[F ... | 67 | 278583.962 | 240.502 | 3.590 | 785.268 | 11.720 | | sum_vartime*(r: var ECP_ShortW_Jac[F, G]; p, q: ECP_Shor ... | 4 | 220337.116 | 18.154 | 4.538 | 59.367 | 14.842 | | mixedSum_vartime*(r: var ECP_ShortW_Jac[F, G]; p: ECP_ShortW ... | 69 | 296900.616 | 232.401 | 3.368 | 758.835 | 10.998 | @@ -371,8 +371,8 @@ The CPU Cycle Count is indicative only. 
It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 2094 | 23815210.345 | 87.927 | 0.042 | 85.602 | 0.041 | | diff*(r: var FF; a, b: FF) | 465 | 23317621.101 | 19.942 | 0.043 | 20.262 | 0.044 | | double*(r: var FF; a: FF) | 936 | 23199900.855 | 40.345 | 0.043 | 40.755 | 0.044 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 6 | 16666666.667 | 0.360 | 0.060 | 0.594 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 2 | 16666666.667 | 0.120 | 0.060 | 0.198 | 0.099 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 6 | 16666666.667 | 0.360 | 0.060 | 0.594 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 2 | 16666666.667 | 0.120 | 0.060 | 0.198 | 0.099 | | double*(r: var ECP_ShortW_Prj[F, G]; P: ECP_ShortW_Prj[F ... | 67 | 242212.156 | 276.617 | 4.129 | 904.695 | 13.503 | | sum_vartime*(r: var ECP_ShortW_Prj[F, G]; p, q: ECP_Shor ... | 4 | 247662.683 | 16.151 | 4.038 | 52.734 | 13.184 | | mixedSum_vartime*(r: var ECP_ShortW_Prj[F, G]; p: ECP_ShortW ... | 62 | 300657.081 | 206.215 | 3.326 | 672.903 | 10.853 | @@ -399,9 +399,9 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst | sumUnr*(r: var FF; a, b: FF) | 11538 | 24022635.994 | 480.297 | 0.042 | 473.352 | 0.041 | | diff*(r: var FF; a, b: FF) | 5015 | 23495514.067 | 213.445 | 0.043 | 218.988 | 0.044 | | double*(r: var FF; a: FF) | 3332 | 23423385.424 | 142.251 | 0.043 | 144.870 | 0.043 | -| prod*(r: var FF; a, b: FF; skipFinalSub: static bool = f ... | 304 | 16876700.161 | 18.013 | 0.059 | 30.063 | 0.099 | -| square*(r: var FF; a: FF; skipFinalSub: static bool = false) | 12 | 16194331.984 | 0.741 | 0.062 | 1.254 | 0.104 | -| sumprod*(r: var FF; a, b: array[N, FF]; skipFinalSub: st ... | 36 | 7933010.137 | 4.538 | 0.126 | 10.758 | 0.299 | +| prod*(r: var FF; a, b: FF; lazyReduce: static bool = f ... | 304 | 16876700.161 | 18.013 | 0.059 | 30.063 | 0.099 | +| square*(r: var FF; a: FF; lazyReduce: static bool = false) | 12 | 16194331.984 | 0.741 | 0.062 | 1.254 | 0.104 | +| sumprod*(r: var FF; a, b: array[N, FF]; lazyReduce: st ... 
| 36 | 7933010.137 | 4.538 | 0.126 | 10.758 | 0.299 | | cadd*(a: var FF; b: FF; ctl: SecretBool) | 60 | 5248425.472 | 11.432 | 0.191 | 31.944 | 0.532 | | csub*(a: var FF; b: FF; ctl: SecretBool) | 30 | 5617977.528 | 5.340 | 0.178 | 14.619 | 0.487 | | div2*(a: var FF) | 252 | 23200147.303 | 10.862 | 0.043 | 11.583 | 0.046 | diff --git a/tests/math_fields/t_finite_fields.nim b/tests/math_fields/t_finite_fields.nim index 021c045c..79afa4b9 100644 --- a/tests/math_fields/t_finite_fields.nim +++ b/tests/math_fields/t_finite_fields.nim @@ -319,52 +319,55 @@ proc largeField() = check: bool r.isZero() - test "fromMont doesn't need a final substraction with 256-bit prime (full word used)": - block: - let a = Fp[Secp256k1].getMinusOne() - let expected = BigInt[256].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E" + # Outdated tests as Crandall primes / Pseudo-Mersenne primes + # don't use Montgomery representaiton anymore - var r: BigInt[256] - r.fromField(a) + # test "fromMont doesn't need a final substraction with 256-bit prime (full word used)": + # block: + # let a = Fp[Secp256k1].getMinusOne() + # let expected = BigInt[256].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E" - check: bool(r == expected) - block: - var a: Fp[Secp256k1] - var d: FpDbl[Secp256k1] + # var r: BigInt[256] + # r.fromField(a) - # Set Montgomery repr to the largest field element in Montgomery Residue form - a.mres = BigInt[256].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E" - d.limbs2x = (BigInt[512].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E").limbs + # check: bool(r == expected) + # block: + # var a: Fp[Secp256k1] + # var d: FpDbl[Secp256k1] - var r, expected: BigInt[256] + # # Set Montgomery repr to the largest field element in Montgomery Residue form + # a.mres = BigInt[256].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E" + # d.limbs2x = (BigInt[512].fromHex"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2E").limbs - r.fromField(a) - expected.limbs.redc2xMont(d.limbs2x, Fp[Secp256k1].getModulus().limbs, Fp[Secp256k1].getNegInvModWord(), Fp[Secp256k1].getSpareBits()) + # var r, expected: BigInt[256] - check: bool(r == expected) + # r.fromField(a) + # expected.limbs.redc2xMont(d.limbs2x, Fp[Secp256k1].getModulus().limbs, Fp[Secp256k1].getNegInvModWord(), Fp[Secp256k1].getSpareBits()) - test "fromMont doesn't need a final substraction with 255-bit prime (1 spare bit)": - block: - let a = Fp[Edwards25519].getMinusOne() - let expected = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec" + # check: bool(r == expected) - var r: BigInt[255] - r.fromField(a) + # test "fromMont doesn't need a final substraction with 255-bit prime (1 spare bit)": + # block: + # let a = Fp[Edwards25519].getMinusOne() + # let expected = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec" - check: bool(r == expected) - block: - var a: Fp[Edwards25519] - var d: FpDbl[Edwards25519] + # var r: BigInt[255] + # r.fromField(a) + + # check: bool(r == expected) + # block: + # var a: Fp[Edwards25519] + # var d: FpDbl[Edwards25519] - # Set Montgomery repr to the largest field element in Montgomery Residue form - a.mres = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec" - d.limbs2x = (BigInt[512].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec").limbs + # # Set 
Montgomery repr to the largest field element in Montgomery Residue form + # a.mres = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec" + # d.limbs2x = (BigInt[512].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec").limbs - var r, expected: BigInt[255] + # var r, expected: BigInt[255] - r.fromField(a) - expected.limbs.redc2xMont(d.limbs2x, Fp[Edwards25519].getModulus().limbs, Fp[Edwards25519].getNegInvModWord(), Fp[Edwards25519].getSpareBits()) + # r.fromField(a) + # expected.limbs.redc2xMont(d.limbs2x, Fp[Edwards25519].getModulus().limbs, Fp[Edwards25519].getNegInvModWord(), Fp[Edwards25519].getSpareBits()) - check: bool(r == expected) + # check: bool(r == expected) largeField() diff --git a/tests/math_fields/t_finite_fields_mulsquare.nim b/tests/math_fields/t_finite_fields_mulsquare.nim index 668ace72..87d62e1d 100644 --- a/tests/math_fields/t_finite_fields_mulsquare.nim +++ b/tests/math_fields/t_finite_fields_mulsquare.nim @@ -28,7 +28,7 @@ echo "test_finite_fields_mulsquare xoshiro512** seed: ", seed static: doAssert defined(CTT_TEST_CURVES), "This modules requires the -d:CTT_TEST_CURVES compile option" proc sanity(Name: static Algebra) = - test "Squaring 0,1,2 with " & $Name & " [FastSquaring = " & $(Fp[Name].getSpareBits() >= 2) & "]": + test "Squaring 0,1,2 with " & $Algebra(Name) & " [FastSquaring = " & $(Fp[Name].getSpareBits() >= 2) & "]": block: # 0² mod var n: Fp[Name] diff --git a/tests/math_fields/t_finite_fields_powinv.nim b/tests/math_fields/t_finite_fields_powinv.nim index 00df8a5a..70a7422f 100644 --- a/tests/math_fields/t_finite_fields_powinv.nim +++ b/tests/math_fields/t_finite_fields_powinv.nim @@ -234,13 +234,14 @@ proc main() = r.inv(x) let computed = r.toHex() - check: - computed == expected + check: computed == expected var r2: Fp[BLS12_381] r2.inv_vartime(x) let computed2 = r2.toHex() + check: computed2 == expected + test "Specific tests on Fp[BN254_Snarks]": block: var r, x: Fp[BN254_Snarks] @@ -275,6 +276,9 @@ proc main() = proc testRandomInv(name: static Algebra) = test "Random inversion testing on " & $Algebra(name): var aInv, r: Fp[name] + var aFLT, pm2: Fp[name] + pm2 = Fp[name].fromUint(2'u) + pm2.neg() for _ in 0 ..< Iters: let a = rng.random_unsafe(Fp[name]) @@ -290,6 +294,15 @@ proc main() = r.prod(aInv, a) check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow_vartime(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + for _ in 0 ..< Iters: let a = rng.randomHighHammingWeight(Fp[name]) aInv.inv(a) @@ -303,6 +316,16 @@ proc main() = check: bool r.isOne() or (a.isZero() and r.isZero()) r.prod(aInv, a) check: bool r.isOne() or (a.isZero() and r.isZero()) + + aFLT = a + aFLT.pow(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow_vartime(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + for _ in 0 ..< Iters: let a = rng.random_long01Seq(Fp[name]) aInv.inv(a) @@ -317,6 +340,16 @@ proc main() = r.prod(aInv, a) check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + aFLT = a + aFLT.pow_vartime(pm2) + r.prod(a, aFLT) + check: bool r.isOne() or (a.isZero() and r.isZero()) + + testRandomInv P224 testRandomInv BN254_Nogami testRandomInv BN254_Snarks diff --git 
a/tests/math_fields/t_finite_fields_sqrt.nim b/tests/math_fields/t_finite_fields_sqrt.nim index 96d471fd..abe917b1 100644 --- a/tests/math_fields/t_finite_fields_sqrt.nim +++ b/tests/math_fields/t_finite_fields_sqrt.nim @@ -29,7 +29,7 @@ echo "test_finite_fields_sqrt xoshiro512** seed: ", seed static: doAssert defined(CTT_TEST_CURVES), "This modules requires the -d:CTT_TEST_CURVES compile option" proc exhaustiveCheck(Name: static Algebra, modulus: static int) = - test "Exhaustive square root check for " & $Algebra(C): + test "Exhaustive square root check for " & $Algebra(Name): var squares_to_roots: Table[uint16, set[uint16]] # Create all squares @@ -103,7 +103,7 @@ template testSqrtImpl(a: untyped): untyped {.dirty.} = bool(r == a or r == na) proc randomSqrtCheck(Name: static Algebra) = - test "Random square root check for " & $Algebra(C): + test "Random square root check for " & $Algebra(Name): for _ in 0 ..< Iters: let a = rng.random_unsafe(Fp[Name]) testSqrtImpl(a) @@ -129,7 +129,7 @@ template testSqrtRatioImpl(u, v: untyped): untyped {.dirty.} = check: bool(r == u_over_v) proc randomSqrtRatioCheck(Name: static Algebra) = - test "Random square root check for " & $Algebra(C): + test "Random square root check for " & $Algebra(Name): for _ in 0 ..< Iters: let u = rng.random_unsafe(Fp[Name]) let v = rng.random_unsafe(Fp[Name]) diff --git a/tests/math_fields/t_finite_fields_vs_gmp.nim b/tests/math_fields/t_finite_fields_vs_gmp.nim index 2da68470..218b727c 100644 --- a/tests/math_fields/t_finite_fields_vs_gmp.nim +++ b/tests/math_fields/t_finite_fields_vs_gmp.nim @@ -58,7 +58,7 @@ proc binary_prologue[Name: static Algebra, N: static int]( bTest = rng.random_unsafe(Fp[Name]) # Set modulus to curve modulus - let err = mpz_set_str(p, Fp[Name].getmodulus().toHex(), 0) + let err = mpz_set_str(p, Fp[Name].getModulus().toHex(), 0) doAssert err == 0, "Error on prime for curve " & $Name #########################################################
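The checks added to `t_finite_fields_powinv.nim` cross-validate inversion against Fermat's little theorem: for a prime p and a nonzero a, a^(p-2) ≡ a⁻¹ (mod p), so multiplying a by a.pow(p-2) must give one. A self-contained toy version of the same identity over a machine-word prime, using plain `uint64` arithmetic instead of `Fp` and made-up names:

```nim
proc powMod(a, e, m: uint64): uint64 =
  ## Square-and-multiply exponentiation modulo m
  ## (m small enough that products fit in 64 bits).
  var base = a mod m
  var exp = e
  result = 1
  while exp > 0'u64:
    if (exp and 1'u64) == 1'u64:
      result = result * base mod m
    base = base * base mod m
    exp = exp shr 1

when isMainModule:
  const p = 2147483647'u64            # 2³¹ - 1, a Mersenne prime
  let a = 123456789'u64
  let aInv = powMod(a, p - 2, p)      # Fermat inversion: a^(p-2) ≡ a⁻¹
  doAssert a * aInv mod p == 1'u64
```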