GLV impl for all SW curves except MNT (#26)
* add glv for all SW curves (except MNT)

* fix suboptimal lattice basis generation

* Rigorous bounds -> justify impl, update bw6 params to use general script

* fix formatting issues

* improve comments, glv checks, swap voracious for core::unstable_sort..

* add glv mul test

* remove unused comment

Co-authored-by: Kobi Gurkan <[email protected]>
jon-chuang and kobigurk authored Nov 6, 2020
1 parent c075ce0 commit c894564
Showing 23 changed files with 596 additions and 304 deletions.
3 changes: 1 addition & 2 deletions algebra-core/Cargo.toml
@@ -30,7 +30,6 @@ rand = { version = "0.7", default-features = false }
rayon = { version = "1", optional = true }
unroll = { version = "=0.1.4" }
itertools = { version = "0.9.0", default-features = false }
voracious_radix_sort = { version = "0.1.0", optional = true }
either = { version = "1.6.0", default-features = false }
thread-id = { version = "3.3.0", optional = true }
backtrace = { version = "0.3", optional = true }
@@ -46,7 +45,7 @@ rand_xorshift = "0.2"

[features]
default = [ "std", "rand/default" ]
std = [ "voracious_radix_sort" ]
std = []
parallel = [ "std", "rayon", "rand/default" ]
derive = [ "algebra-core-derive" ]
prefetch = [ "std" ]
20 changes: 12 additions & 8 deletions algebra-core/src/curves/batch_arith.rs
@@ -7,6 +7,9 @@ use num_traits::Zero;
/// inversion close to zero while not straining the CPU cache by generating and
/// fetching from large w-NAF tables and slices [G]
pub const BATCH_SIZE: usize = 4096;

/// The endomorphism to apply is encoded in the second operand for the
/// `batch_add_in_place_read_only` method used in the
/// `batch_scalar_mul_in_place` method:
/// 0 == Identity; 1 == Neg; 2 == GLV; 3 == GLV + Neg
pub const ENDO_CODING_BITS: usize = 2;
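
As a hedged sketch of this convention (the helpers below are hypothetical, not crate APIs, and assume the two coding bits sit in the top of a `u32` index):

const INDEX_SHIFT: usize = 32 - ENDO_CODING_BITS;

/// Pack an endomorphism code (0..=3) into the high bits of an index.
fn encode_endo(index: u32, endo: u32) -> u32 {
    debug_assert!(endo < 4 && index < (1u32 << INDEX_SHIFT));
    (endo << INDEX_SHIFT) | index
}

/// Recover (endo_code, index) from a packed operand.
fn decode_endo(packed: u32) -> (u32, u32) {
    (packed >> INDEX_SHIFT, packed & ((1u32 << INDEX_SHIFT) - 1))
}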

@@ -29,8 +32,9 @@
// Refer to e.g. Improved Techniques for Fast Exponentiation, Section 4
// Bodo Möller, 2002. https://www.bmoeller.de/pdf/fastexp-icisc2002.pdf

/// Computes [[p, 3 * p, ..., (2^w - 1) * p], ..., [q, 3* q, ..., ]]
/// We need to manipulate the offsets when using the table
/// Computes [[p_1, 3 * p_1, ..., (2^w - 1) * p_1], ..., [p_n, 3*p_n, ...,
/// (2^w - 1) p_n]] We need to manipulate the offsets when using the
/// table
fn batch_wnaf_tables(bases: &[Self], w: usize) -> Vec<Self> {
let half_size = 1 << (w - 1);
let batch_size = bases.len();
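
A sketch of the offset manipulation the doc comment mentions, assuming the base-major layout described above (`table_offset` is a hypothetical helper, not a crate API):

// Entry j of base i holds (2j + 1) * p_i, for j in 0..half_size.
fn table_offset(base_idx: usize, odd_multiple: usize, w: usize) -> usize {
    let half_size = 1 << (w - 1); // number of odd multiples stored per base
    debug_assert!(odd_multiple % 2 == 1 && odd_multiple < (1 << w));
    base_idx * half_size + (odd_multiple - 1) / 2
}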
@@ -156,9 +160,9 @@
/// or simply writes them to new_elems, using scratch space to store
/// intermediate values. Scratch space is always cleared after use.

/// No-ops, or copies of the elem in the slice `lookup` in the position of
/// the index of the first operand to the new_elems vector, are encoded
/// as !0u32 in the index for the second operand.
fn batch_add_write(
lookup: &[Self],
index: &[(u32, u32)],
@@ -169,9 +173,9 @@
/// Similar to batch_add_write, only that the lookup for the first operand
/// is performed in new_elems rather than lookup

/// No-ops, or copies of the elem in the slice `lookup` in the position of
/// the index of the first operand to the new_elems vector, are encoded
/// as !0u32 in the index for the second operand.
fn batch_add_write_read_self(
lookup: &[Self],
index: &[(u32, u32)],
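A hedged sketch of the sentinel convention shared by `batch_add_write` and `batch_add_write_read_self` (`push_add_or_copy` is illustrative, not a crate API): the pair (i, j) requests `lookup[i] + lookup[j]`, while (i, !0u32) requests a plain copy of `lookup[i]` into `new_elems`:

fn push_add_or_copy(instr: &mut Vec<(u32, u32)>, first: u32, second: Option<u32>) {
    match second {
        Some(j) => instr.push((first, j)),  // add two elements
        None => instr.push((first, !0u32)), // no-op: copy `lookup[first]`
    }
}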
157 changes: 50 additions & 107 deletions algebra-core/src/curves/bucketed_add.rs
@@ -3,42 +3,32 @@ use crate::{
AffineCurve, Vec,
};

#[cfg(feature = "std")]
use {core::cmp::Ordering, voracious_radix_sort::*};

#[cfg(not(feature = "std"))]
use crate::log2;

#[derive(Copy, Clone, Debug)]
pub struct BucketPosition {
pub bucket: u32,
pub position: u32,
}

#[cfg(feature = "std")]
impl PartialOrd for BucketPosition {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
self.bucket.partial_cmp(&other.bucket)
}
}

#[cfg(feature = "std")]
impl Radixable<u32> for BucketPosition {
type Key = u32;
#[inline]
fn key(&self) -> Self::Key {
self.bucket
}
}

impl PartialEq for BucketPosition {
fn eq(&self, other: &Self) -> bool {
self.bucket == other.bucket
}
}

/// The objective of this function is to identify an addition tree of
/// independent elliptic curve group additions for each bucket, and to batch the
/// independent additions using the batch affine inversion method.

/// The strategy taken is to sort a list of bucket assignments of all the
/// elements (which we can, for most intents and purposes, think of as being
/// uniformly random) by bucket, so that indices corresponding to elements that
/// must be added together are physically collocated in memory. Then, in the
/// first round, we proceed to perform independent additions producing
/// intermediate results at the greatest depth for each addition tree (each
/// corresponding to a bucket), and write the result to a new vector. We do so
/// to improve cache locality for future rounds, and take advantage of the
/// CPU-intensive nature of elliptic curve operations along with prefetching to
/// hide the latency of reading from essentially random locations in memory.

/// Subsequently, we perform the additions in place, and the second operands
/// become junk data. Finally, when we only have the buckets left (no more
/// additions left to perform), we copy the result into a destination `res`
/// slice.
#[inline]
#[cfg(feature = "std")]
pub fn batch_bucketed_add<C: AffineCurve>(
buckets: usize,
elems: &[C],
@@ -48,8 +38,11 @@
assert!(elems.len() > 0);

let _now = timer!();
dlsd_radixsort(bucket_positions, 8);
timer_println!(_now, "radixsort");
// We sort the bucket positions so that indices of elements assigned
// to the same bucket are contiguous. This way, we can easily identify
// how to construct the addition tree for that bucket.
bucket_positions.sort_unstable_by_key(|x| x.bucket);
timer_println!(_now, "sort");

let mut len = bucket_positions.len();
let mut all_ones = true;
@@ -68,29 +61,45 @@
// Subsequently, we perform all the operations in place
while glob < len {
let current_bucket = bucket_positions[glob].bucket;
// We are iterating over elements using a global `glob` counter, and counting
// how many in a row are being assigned to the same bucket, using the `loc`
// counter.
while glob + 1 < len && bucket_positions[glob + 1].bucket == current_bucket {
glob += 1;
loc += 1;
}
// If the current bucket is out of range (i.e. >= buckets), it encodes a no-op
if current_bucket >= buckets as u32 {
loc = 1;
} else if loc > 1 {
// `all_ones` becomes false if some addition tree has more than 2
// elements in the current round.

// In other words, we have not yet reached the terminating condition
// that, after the current round of additions, there is only one
// element left in each addition tree. That condition holds exactly
// when each addition tree had at most 2 elements in this round.
if loc > 2 {
all_ones = false;
}
let is_odd = loc % 2 == 1;
let half = loc / 2;
// We encode instructions to add adjacent elements
for i in 0..half {
instr.push((
bucket_positions[glob - (loc - 1) + 2 * i].position,
bucket_positions[glob - (loc - 1) + 2 * i + 1].position,
));
// Compactification of buckets
bucket_positions[new_len + i] = BucketPosition {
bucket: current_bucket,
position: (new_len + i) as u32,
};
}
// If there is an odd number of elements, the lone element
// without a partner is copied over to the `new_elems`
// vector, a no-op which is encoded as !0u32.
if is_odd {
instr.push((bucket_positions[glob].position, !0u32));
bucket_positions[new_len + half] = BucketPosition {
@@ -99,6 +108,13 @@
};
}
// Reset the local counter and update state.

// We compactify the `bucket_positions` data by shifting left;
// `new_len` is the length of the current compactified vector.

// We also update the `batch` counter to decide when it is
// optimal to invoke the batch inversion, i.e. when we have
// accumulated enough independent additions.
new_len += half + (loc % 2);
batch += half;
loc = 1;
@@ -131,6 +147,9 @@
len = new_len;
new_len = 0;

// We repeat the above procedure, except, since we are performing the addition
// trees in place, we do not need to encode no-ops to force a copy to a new
// vector.
while !all_ones {
all_ones = true;
while glob < len {
@@ -197,79 +216,3 @@
timer_println!(_now, "reassign");
res
}
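
The "batch affine inversion method" named above is Montgomery's trick: the field inversion in each affine addition is amortised into one inversion plus a handful of multiplications per batch. A minimal self-contained sketch, using `f64` as a stand-in for a nonzero field element purely so the snippet runs on its own (the crate applies the same idea to its `Fp` types):

// Montgomery's trick: invert n nonzero elements with one division and
// 3(n - 1) multiplications instead of n divisions.
fn batch_inverse(elems: &mut [f64]) {
    if elems.is_empty() {
        return;
    }
    // Forward pass: prefix[i] = elems[0] * ... * elems[i].
    let mut prefix = Vec::with_capacity(elems.len());
    let mut acc = 1.0;
    for &e in elems.iter() {
        acc *= e;
        prefix.push(acc);
    }
    // A single inversion of the full running product.
    let mut inv = 1.0 / acc;
    // Backward pass: peel one factor off at a time.
    for i in (1..elems.len()).rev() {
        let e = elems[i];
        elems[i] = inv * prefix[i - 1]; // = 1 / elems[i]
        inv *= e; // now inv = 1 / (elems[0] * ... * elems[i - 1])
    }
    elems[0] = inv;
}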

#[cfg(not(feature = "std"))]
pub fn batch_bucketed_add<C: AffineCurve>(
buckets: usize,
elems: &[C],
bucket_assign: &[BucketPosition],
) -> Vec<C> {
let mut elems = elems.to_vec();
let num_split = 2i32.pow(log2(buckets) / 2 + 2) as usize;
let split_size = (buckets - 1) / num_split + 1;
let ratio = elems.len() / buckets * 2;
// Get the inverted index for the positions assigning to each bucket
let mut bucket_split = vec![vec![]; num_split];
let mut index = vec![Vec::with_capacity(ratio); buckets];

for bucket_pos in bucket_assign.iter() {
let (bucket, position) = (bucket_pos.bucket as usize, bucket_pos.position as usize);
// Check the bucket assignment is valid
if bucket < buckets {
// index[bucket].push(position);
bucket_split[bucket / split_size].push((bucket, position));
}
}

for split in bucket_split {
for (bucket, position) in split {
index[bucket].push(position as u32);
}
}

// Instructions for indexes for the in place addition tree
let mut instr: Vec<Vec<(u32, u32)>> = vec![];
// Find the maximum depth of the addition tree
let max_depth = index.iter()
// log_2
.map(|x| log2(x.len()))
.max().unwrap();

// Generate in-place addition instructions that implement the addition tree
// for each bucket from the leaves to the root
for i in 0..max_depth {
let mut instr_row = Vec::<(u32, u32)>::with_capacity(buckets);
for to_add in index.iter_mut() {
if to_add.len() > 1 << (max_depth - i - 1) {
let mut new_to_add = vec![];
for j in 0..(to_add.len() / 2) {
new_to_add.push(to_add[2 * j]);
instr_row.push((to_add[2 * j], to_add[2 * j + 1]));
}
if to_add.len() % 2 == 1 {
new_to_add.push(*to_add.last().unwrap());
}
*to_add = new_to_add;
}
}
instr.push(instr_row);
}

for instr_row in instr.iter() {
for instr in C::get_chunked_instr::<(u32, u32)>(&instr_row[..], BATCH_SIZE).iter() {
elems[..].batch_add_in_place_same_slice(&instr[..]);
}
}

let zero = C::zero();
let mut res = vec![zero; buckets];

for (i, to_add) in index.iter().enumerate() {
if to_add.len() == 1 {
res[i] = elems[to_add[0] as usize];
} else if to_add.len() > 1 {
debug_assert!(false, "Did not successfully reduce to_add");
}
}
res
}
39 changes: 34 additions & 5 deletions algebra-core/src/curves/glv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use core::ops::Neg;

/// The GLV parameters here require the following conditions to be satisfied:
/// 1. MODULUS_BITS < NUM_LIMBS * 64 - 1. So 2 * n < 1 << (64 * NUM_LIMBS)
/// We also assume that |b1| * |b2| < 2 * n
/// We also assume that (|b1| + 2) * (|b2| + 2) < 2 * n
/// We also know that either B1 or B2 is negative.
pub trait GLVParameters: Send + Sync + 'static + ModelParameters {
type WideBigInt: BigInteger;
@@ -53,9 +53,12 @@ pub trait GLVParameters: Send + Sync + 'static + ModelParameters {
let c2 = &c2_wide.as_ref()[..limbs];

// We first assume that the final 2 bits of the representation for the modulus
// are not set, so that 2 * n < R = 1 << (64 * NUM_LIMBS). Then, since we
// know that |b_i| < \sqrt{2n}, wlog k|b1|/n * |b2| < 2 * k < 2 * n < R.
// are not set, so that 2 * n < R = 1 << (64 * NUM_LIMBS).

// wlog c1 = round(k * round(|b_1| * R / n) / R) < ceil(k * ceil(|b_1| * R / n) / R)
// < k * (|b_1| * R / n + 1) / R + 1 < |b_1| * k / n + 2 < |b_1| + 2, so a
// bound like (|b1| + 2) * (|b2| + 2) < 2 * n is good enough for wlog
// d1 < 2 * n.
let mut d1 =
<Self::ScalarField as PrimeField>::BigInt::mul_no_reduce_lo(&c1, Self::B1.as_ref());
if d1 > modulus {
@@ -66,7 +69,8 @@ pub trait GLVParameters: Send + Sync + 'static + ModelParameters {
if d2 > modulus {
d2.sub_noborrow(&modulus);
}
// We compute k_2 = -(c1.b1 + c2.b2) = sign(b1) * (c2|b2| - c1|b1|)
// = sign(b1) * (d2 - d1)
let k2_field = if !Self::B1_IS_NEG {
Self::ScalarField::from(d2) - &Self::ScalarField::from(d1)
} else {
@@ -91,3 +95,28 @@
((neg1, k1), (neg2, k2))
}
}
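
For orientation, the decomposition the comments above justify can be written out as follows — a sketch in standard GLV notation rather than a verbatim transcription of the code, with R = 1 << (64 * NUM_LIMBS), lattice vectors (a_1, b_1), (a_2, b_2) satisfying a_i + b_i \lambda \equiv 0 \pmod{n}, and one of b_1, b_2 negative:

c_i \approx \Big\lfloor \frac{k}{R} \Big\lfloor \frac{|b_i| \, R}{n} \Big\rceil \Big\rceil,
\qquad
k_2 = -(c_1 b_1 + c_2 b_2),
\qquad
k_1 \equiv k - k_2 \lambda \pmod{n},

so that k \equiv k_1 + k_2 \lambda \pmod{n} with both |k_1| and |k_2| on the order of \sqrt{n}, halving the scalar length handled by each of the two multiplications.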

#[macro_export]
macro_rules! impl_glv_for_sw {
() => {
#[inline(always)]
fn has_glv() -> bool {
true
}

#[inline(always)]
fn glv_endomorphism_in_place(elem: &mut Self::BaseField) {
*elem *= &<Self as GLVParameters>::OMEGA;
}

#[inline]
fn glv_scalar_decomposition(
k: <Self::ScalarField as PrimeField>::BigInt,
) -> (
(bool, <Self::ScalarField as PrimeField>::BigInt),
(bool, <Self::ScalarField as PrimeField>::BigInt),
) {
<Self as GLVParameters>::glv_scalar_decomposition_inner(k)
}
};
}
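
A hypothetical usage sketch (the name `FooCurveParameters` is made up; real invocations live in the per-curve parameter files): once a curve supplies a `GLVParameters` impl with `OMEGA`, `B1`, `B2` and friends, the macro forwards the model's GLV hooks to it:

impl SWModelParameters for FooCurveParameters {
    // ... COEFF_A, COEFF_B, cofactor, generator, etc.

    // Wires has_glv, glv_endomorphism_in_place and
    // glv_scalar_decomposition to the GLVParameters impl.
    impl_glv_for_sw!();
}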
3 changes: 3 additions & 0 deletions algebra-core/src/curves/mod.rs
@@ -20,6 +20,7 @@ pub use self::batch_arith::*;
pub mod bucketed_add;
pub use self::bucketed_add::*;

#[macro_use]
pub mod glv;
pub use self::glv::*;

@@ -202,6 +203,8 @@ pub trait ProjectiveCurve:
self = res;
self
}

fn get_x(&mut self) -> &mut Self::BaseField;
}

/// Affine representation of an elliptic curve point guaranteed to be
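To see why the new `get_x` accessor earns its place on the trait, a hedged sketch (`apply_glv_endo` is hypothetical, not a crate function): the GLV endomorphism used here scales only the x-coordinate by `OMEGA`, typically a nontrivial cube root of unity, so batched code can apply it to a projective point in place:

// Illustrative only. In Jacobian coordinates (X, Y, Z), affine x = X / Z^2,
// so scaling X by omega realises (x, y) -> (omega * x, y) on the curve
// without touching y or z.
fn apply_glv_endo<G: ProjectiveCurve>(p: &mut G, omega: &G::BaseField) {
    *p.get_x() *= omega;
}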
4 changes: 4 additions & 0 deletions algebra-core/src/curves/models/short_weierstrass_jacobian.rs
@@ -367,6 +367,10 @@ impl<P: Parameters> ProjectiveCurve for GroupProjective<P> {
self
}
}

fn get_x(&mut self) -> &mut Self::BaseField {
&mut self.x
}
}

impl<P: Parameters> Neg for GroupProjective<P> {
@@ -141,6 +141,10 @@ impl<P: Parameters> ProjectiveCurve for GroupProjective<P> {
type ScalarField = P::ScalarField;
type Affine = GroupAffine<P>;

fn get_x(&mut self) -> &mut Self::BaseField {
&mut self.x
}

#[inline]
fn prime_subgroup_generator() -> Self {
GroupAffine::prime_subgroup_generator().into()