From 7c518fd419c3ebed471314473a4752952162262f Mon Sep 17 00:00:00 2001 From: jon-chuang <9093549+jon-chuang@users.noreply.github.com> Date: Wed, 11 Nov 2020 00:57:36 +0800 Subject: [PATCH] CUDA Scalar Mul (#17) * First draft affine batch ops & wnaf * changes to mutability and lifetimes * delete superfluous files * crazy direction: Passing a FnMut to generate an iterator locally * unsuccessful further attempts * compile sucess using index approach * fixes for mutable borrows * Successfully passed scalar mul test * benchmarks + prefetching * stash * generic impl of batch arith for all affinecurves * batched affine formulas for TE - too expensive * improved TE affine * cleanup batch inversion * fmt... * fix minor error * remove debugging scaffolding * fmt... * delete batch arith bench as not suitable for criterion or bench * fix bench removal errors * fmt... * added missing coeff_a * refactor BatchGroupArithmetic to be separate trait * Batch verification with radix sort * Cache-locality & parallelisation * Successfully impl batch verify * added tests and bench for batch_ver, parallel_random_gen, ^ thread util * fmt * enabled missing test * remove voracious_radix_sort * commented unneeded Instant::now() * Fixed batch_ver tests for curves of small or unit cofactor * split recursive and non-recursive, tidy up shared functionality * reduce max_logn * adjust max_logn further * Batch MSM, speedup only for bw6 due to poor cache performance * fmt... * GLV iBiginteger * stash * stash * GLV with Parameter-based specialisation * GLV lattice basis script success * Successfully passed tests and benched * Improvments to MSM with and bucketed adds using lightweight index sort * changed rng to be external parameter for non-parallel batch veri * remove bench print scaffolding * remove old batch_bucketed_add using vectors instead of fixed offsets * retain parallel batch_add_split * Comments for batch arith * remove need for hashmap for no std for batch_bucketed_add * minor changes * cleanup * cleanup * fmt + use no_std Vec * removed std:: * add scratch space * Add GLV for non-batched SW mul * fix for glv_scalar_decomposition when k == MODULUS (subgroup check) * Fixed performance BUG: unnecessary table generation * GLV -> has_glv(), bigint slice bd check, refactor batch loops, u32 index * clean remove of batch_verify * fix mistake with elems indexing, unused arg for future recursion PR * trivial errors * more minor fixes * fix issues with batch_ver (.is_zero(), TE affine->proj mul) * fix issue with batch_bucketed_add_split * misname * Success in test and bench \(*v*)/ * tmp commit to cache experimental batch_add_write_shift_.. * remove batch_add_write_shift.. * optional dep, fmt... * undo accidental deletion of dlsd sort * fmt... * cleanup batch bucket add, unify impl * no std... * fixed tests * fixed unimplemented for TE, swapped wnaf table row/col for batchaddwrite * wnaf table generation uses fewer copies, remove timing instrumentation * Minor Cleanup * Add feature-activated timing instrumentation, reduce code bloat (wnaf) * unused var, no_std * Make timing macros defined globally, instrument more code * instrument w/ tid, better num_rounds est. 
f64, timing black/whitelisting * Minor changes * refactor tests, generic MSM test * 2D test matrix :) * batchaffine * tests * additive features * big_n feature for test-benching * prefetch unroll * minor adjustments * extension(s -> "")_fields * remove artifacts, fix asm * uncomment subgroup checks, glv param sources * gpu scalar mul * fix dependency issues * Extend GPU scalar mul to all curves * refactor * CPU + GPU coprocessing * With suboptimal BW6 assembly * add static partitioning * profiling-based static partitioining * statically partition between multiple gpus * comments * BBaseField -> BaseFieldForBatch * Outline of basic traits * Remove sw_proj, add gpu support for all sw projective curves * impl gpu kernels for all curves * feature-gate with "cuda" * rename curves/gpu directory to curves/cuda * Fix merge errors * Use github rather than local jon-chuang/accel * again * again * update README * feature = "cuda" * gpu_standalone (good for non-generic), feature gate under cuda too * fix merging errors * make helpers a same-file module * remove cancerous --all-features from github yml * Use dummy accel_dummy crate for when not compiling as CUDA * feature gate accel import * fix no_std * fix gpu-standalone does not depend algebra-core/cuda * lazy static optional * kernel-specific static profile data * cuda test, cached profile data (in OS cache dir) for all curves * rectify omission of NAMESPACE, minor errors * fix no_std, group size in bits too large for 2 groups (mnt6, cp6 - Fq3) * toml fixes * update README * remove extraneous file * bake in check for oversized group elems * typo * remove boilerplate/compactify * remove standalone * fmt * fix println and comments * fix: typo * Update README.md Co-authored-by: Kobi Gurkan * Make GPUScalarMulInternal APIs, only expose two APIs exposing more APIs is future work * add ci to test cuda compilation/link and cuda scalar mul when no gpu * change kernel accel compile branch to master * fix ci * use unreachable instead of empty implementation * install required toolchain * Empty commit to get CI working * try to fix ci * fmt * fix ci * safer error handling in gpu code * fix ci * handle dirs crate not available without cuda * don't check early intermediate results * fix no_std and nightly * fix remaining errors * No for_tests * Feature gate clear profile data * install cuda library to successfully link * change the order of CI jobs * change the order of CI again * cd .. 
* Get rid of cacheing * Never all features * Put back cacheing * Remove cuda .deb to save disk space * Increase max-parallel * check examples with all features Co-authored-by: Kobi Gurkan --- .github/workflows/ci.yml | 47 +- Cargo.toml | 2 +- README.md | 7 + algebra-benches/Cargo.toml | 3 +- algebra-core/Cargo.toml | 14 +- algebra-core/algebra-core-derive/Cargo.toml | 2 +- algebra-core/mince/Cargo.toml | 2 +- algebra-core/src/bytes.rs | 2 +- algebra-core/src/curves/batch_arith.rs | 4 +- algebra-core/src/curves/cuda/accel_dummy.rs | 9 + algebra-core/src/curves/cuda/mod.rs | 6 + .../curves/cuda/scalar_mul/cpu_gpu_macros.rs | 298 +++++++++++++ .../curves/cuda/scalar_mul/kernel_macros.rs | 176 ++++++++ .../src/curves/cuda/scalar_mul/mod.rs | 357 +++++++++++++++ .../cuda/scalar_mul/run_kernel_macros.rs | 100 +++++ algebra-core/src/curves/glv.rs | 1 + algebra-core/src/curves/mod.rs | 10 +- algebra-core/src/curves/models/mod.rs | 68 +-- .../curves/models/short_weierstrass_affine.rs | 64 ++- .../models/short_weierstrass_jacobian.rs | 162 +++++-- .../models/short_weierstrass_projective.rs | 415 ------------------ .../src/curves/models/sw_batch_affine.rs | 12 +- .../curves/models/twisted_edwards_extended.rs | 187 +++++--- algebra-core/src/fields/arithmetic.rs | 5 +- algebra-core/src/lib.rs | 1 + algebra/Cargo.toml | 8 +- algebra/src/bls12_377/curves/g1.rs | 12 +- algebra/src/bls12_377/curves/g2.rs | 13 +- algebra/src/bls12_377/curves/mod.rs | 13 +- algebra/src/bls12_381/curves/g1.rs | 6 +- algebra/src/bls12_381/curves/g2.rs | 6 +- algebra/src/bn254/curves/g1.rs | 16 +- algebra/src/bn254/curves/g2.rs | 16 +- algebra/src/bn254/curves/mod.rs | 13 +- algebra/src/bw6_761/curves/g1.rs | 5 +- algebra/src/bw6_761/curves/g2.rs | 5 +- algebra/src/cp6_782/curves/g1.rs | 6 +- algebra/src/cp6_782/curves/g2.rs | 6 +- algebra/src/ed_on_bls12_377/curves/mod.rs | 6 +- algebra/src/ed_on_bls12_381/curves/mod.rs | 6 +- algebra/src/ed_on_bn254/curves/mod.rs | 5 +- algebra/src/ed_on_cp6_782/curves/mod.rs | 5 +- algebra/src/ed_on_mnt4_298/curves/mod.rs | 6 +- algebra/src/ed_on_mnt4_753/curves/mod.rs | 5 +- algebra/src/mnt4_298/curves/g1.rs | 6 +- algebra/src/mnt4_298/curves/g2.rs | 6 +- algebra/src/mnt4_753/curves/g1.rs | 6 +- algebra/src/mnt4_753/curves/g2.rs | 6 +- algebra/src/mnt6_298/curves/g1.rs | 7 +- algebra/src/mnt6_298/curves/g2.rs | 6 +- algebra/src/mnt6_753/curves/g1.rs | 6 +- algebra/src/mnt6_753/curves/g2.rs | 6 +- algebra/src/tests/cuda.rs | 61 +++ algebra/src/tests/macros.rs | 24 +- algebra/src/tests/mod.rs | 1 + 55 files changed, 1565 insertions(+), 682 deletions(-) create mode 100644 algebra-core/src/curves/cuda/accel_dummy.rs create mode 100644 algebra-core/src/curves/cuda/mod.rs create mode 100644 algebra-core/src/curves/cuda/scalar_mul/cpu_gpu_macros.rs create mode 100644 algebra-core/src/curves/cuda/scalar_mul/kernel_macros.rs create mode 100644 algebra-core/src/curves/cuda/scalar_mul/mod.rs create mode 100644 algebra-core/src/curves/cuda/scalar_mul/run_kernel_macros.rs delete mode 100644 algebra-core/src/curves/models/short_weierstrass_projective.rs create mode 100644 algebra/src/tests/cuda.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cb26016f1..d6ff89852 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: toolchain: stable override: true components: rustfmt - + default: true - name: cargo fmt --check uses: actions-rs/cargo@v1 with: @@ -35,6 +35,7 @@ jobs: env: RUSTFLAGS: -Dwarnings strategy: + max-parallel: 6 matrix: rust: - 
stable @@ -50,14 +51,38 @@ jobs: toolchain: ${{ matrix.rust }} override: true - - uses: actions/cache@v2 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target + - name: Install CUDA toolchains + run: | + wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin + sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 + wget -q https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda-repo-ubuntu1804-11-1-local_11.1.1-455.32.00-1_amd64.deb + sudo dpkg -i cuda-repo-ubuntu1804-11-1-local_11.1.1-455.32.00-1_amd64.deb + sudo apt-key add /var/cuda-repo-ubuntu1804-11-1-local/7fa2af80.pub + sudo apt-get update + sudo apt-get -y install cuda + rm cuda-repo-ubuntu* + curl -sSL https://github.com/jon-chuang/accel/raw/master/setup_nvptx_toolchain.sh | bash + + - uses: actions/cache@v2 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + - name: Test algebra with CUDA + run: | + cd algebra + cargo test --features "all_curves cuda cuda_test" + cd .. + + - name: Test algebra + run: | + cd algebra + cargo test --features full + cd .. + - name: Check examples uses: actions-rs/cargo@v1 with: @@ -68,7 +93,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: check - args: --examples --all-features --all + args: --all-features --examples --all if: matrix.rust == 'stable' - name: Check benchmarks on nightly @@ -88,12 +113,6 @@ jobs: --exclude ff-fft-benches \ -- --skip dpc --skip integration_test" - - name: Test algebra - run: | - cd algebra - cargo test --features full - cd .. - - name: Test algebra with assembly run: | cd algebra diff --git a/Cargo.toml b/Cargo.toml index b4b593c4a..525a093e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ members = [ "r1cs-core", "r1cs-std", "algebra-core/algebra-core-derive", - "scripts/glv_lattice_basis" + "scripts/glv_lattice_basis", ] [profile.release] diff --git a/README.md b/README.md index 5ce72e364..e1f1bc3c0 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,13 @@ To bench `algebra-benches` with greater accuracy, especially for functions with cargo +nightly bench --features "n_fold bls12_381" ``` +CUDA support is available for a limited set of functions. To allow compilation for CUDA on Linux, first run the script +``` +curl -sSL https://github.com/jon-chuang/accel/raw/master/setup_nvptx_toolchain.sh | bash +``` +or run the equivalent commands for your OS. Then, pass the `cuda` feature to rustc or cargo when compiling, and import the relevant traits (e.g. GPUScalarMulSlice) wherever the functions are called. + +When the `cuda` feature is not activated, Zexe will still compile. However, when either the `cuda` feature is not activated during compilation or CUDA is not detected on your system at runtime, Zexe will default to a CPU-only implementation of the same functionality. 
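As a rough sketch of the intended call pattern (not taken verbatim from this patch): the `GPUScalarMulSlice` trait and its `cpu_gpu_scalar_mul(exps, cuda_group_size, cpu_chunk_size)` method are the ones added below, while the import paths, the BW6-761 curve choice, the parameter values and the `rand` dependency are illustrative assumptions.

```rust
use algebra::bw6_761::G1Affine;
use algebra_core::{
    curves::{cuda::scalar_mul::GPUScalarMulSlice, AffineCurve, ProjectiveCurve},
    fields::PrimeField,
    UniformRand,
};

fn main() -> Result<(), algebra_core::CudaScalarMulError> {
    let mut rng = rand::thread_rng();
    // Random bases and scalars stand in for real inputs.
    let mut bases: Vec<G1Affine> = (0..1 << 10)
        .map(|_| <G1Affine as AffineCurve>::Projective::rand(&mut rng).into_affine())
        .collect();
    let exps: Vec<_> = (0..bases.len())
        .map(|_| <G1Affine as AffineCurve>::ScalarField::rand(&mut rng).into_repr())
        .collect();
    // Runs on the GPU(s) when built with `cuda` and a device initialises,
    // otherwise falls back to the batched CPU path.
    bases.cpu_gpu_scalar_mul(&exps, 1 << 5, 1 << 12)?;
    Ok(())
}
```

Here `cuda_group_size` is the CUDA block size handed to the kernel and `cpu_chunk_size` is the batch size given to each CPU worker; both values above are placeholders.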
## License diff --git a/algebra-benches/Cargo.toml b/algebra-benches/Cargo.toml index 0aeafe760..9d009beae 100644 --- a/algebra-benches/Cargo.toml +++ b/algebra-benches/Cargo.toml @@ -31,9 +31,10 @@ rand_xorshift = { version = "0.2" } paste = "1.0" [features] +bw6_asm = [ "algebra/bw6_asm"] asm = [ "algebra/asm"] prefetch = [ "algebra/prefetch"] -bw6_asm = [ "algebra/bw6_asm"] +cuda = [ "algebra/cuda" ] n_fold = [] mnt4_298 = [ "algebra/mnt4_298"] mnt6_298 = [ "algebra/mnt6_298"] diff --git a/algebra-core/Cargo.toml b/algebra-core/Cargo.toml index d17b113e6..77c3b0fc5 100644 --- a/algebra-core/Cargo.toml +++ b/algebra-core/Cargo.toml @@ -27,32 +27,40 @@ algebra-core-derive = { path = "algebra-core-derive", optional = true } derivative = { version = "2", features = ["use_core"] } num-traits = { version = "0.2", default-features = false } rand = { version = "0.7", default-features = false } -rayon = { version = "1", optional = true } +rayon = { version = "1.3.0", optional = true } unroll = { version = "=0.1.4" } itertools = { version = "0.9.0", default-features = false } either = { version = "1.6.0", default-features = false } thread-id = { version = "3.3.0", optional = true } backtrace = { version = "0.3", optional = true } +accel = { git = "https://github.com/jon-chuang/accel", package = "accel", optional = true } +peekmore = "0.5.6" +closure = { version = "0.3.0", optional = true } +lazy_static = { version = "1.4.0", optional = true } +serde_json = { version = "1.0.58", optional = true } +dirs = { version = "1.0.5", optional = true } +log = { version = "0.4.11", optional = true } paste = "0.1" [build-dependencies] field-assembly = { path = "./field-assembly", optional = true } -cc = "1.0" rustc_version = "0.2" +cc = "1.0" [dev-dependencies] rand_xorshift = "0.2" [features] +bw6_asm = [] default = [ "std", "rand/default" ] std = [] parallel = [ "std", "rayon", "rand/default" ] derive = [ "algebra-core-derive" ] prefetch = [ "std" ] +cuda = [ "std", "parallel", "accel", "lazy_static", "serde_json", "dirs", "closure", "log" ] timing = [ "std", "backtrace" ] timing_detailed = [ "std", "backtrace" ] timing_thread_id = [ "thread-id" ] llvm_asm = [ "field-assembly" ] -bw6_asm = [] diff --git a/algebra-core/algebra-core-derive/Cargo.toml b/algebra-core/algebra-core-derive/Cargo.toml index 4a0f5afc1..8075ed093 100644 --- a/algebra-core/algebra-core-derive/Cargo.toml +++ b/algebra-core/algebra-core-derive/Cargo.toml @@ -27,4 +27,4 @@ proc-macro = true [dependencies] proc-macro2 = "1.0" syn = "1.0" -quote = "1.0" +quote = "1.0.7" diff --git a/algebra-core/mince/Cargo.toml b/algebra-core/mince/Cargo.toml index 3e92abcce..b9aaa90d1 100644 --- a/algebra-core/mince/Cargo.toml +++ b/algebra-core/mince/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -quote = "1.0" +quote = "1.0.7" syn = {version = "1.0.17", features = ["full"]} [lib] diff --git a/algebra-core/src/bytes.rs b/algebra-core/src/bytes.rs index 76ff7304d..cb5469cb9 100644 --- a/algebra-core/src/bytes.rs +++ b/algebra-core/src/bytes.rs @@ -316,7 +316,7 @@ mod test { fn test_macro_empty() { let array: Vec = vec![]; let bytes: Vec = to_bytes![array].unwrap(); - assert_eq!(&bytes, &[]); + assert_eq!(bytes, Vec::::new()); assert_eq!(bytes.len(), 0); } diff --git a/algebra-core/src/curves/batch_arith.rs b/algebra-core/src/curves/batch_arith.rs index 07c4cf630..8fafc26da 100644 --- a/algebra-core/src/curves/batch_arith.rs +++ 
b/algebra-core/src/curves/batch_arith.rs @@ -25,7 +25,7 @@ pub trait BatchGroupArithmetic where Self: Sized + Clone + Copy + Zero + Neg, { - type BBaseField: Field; + type BaseFieldForBatch: Field; // We use the w-NAF method, achieving point density of approximately 1/(w + 1) // and requiring storage of only 2^(w - 1). @@ -136,7 +136,7 @@ where fn batch_double_in_place( bases: &mut [Self], index: &[u32], - scratch_space: Option<&mut Vec>, + scratch_space: Option<&mut Vec>, ); /// Mutates bases in place and stores result in the first operand. diff --git a/algebra-core/src/curves/cuda/accel_dummy.rs b/algebra-core/src/curves/cuda/accel_dummy.rs new file mode 100644 index 000000000..27d3c3d8a --- /dev/null +++ b/algebra-core/src/curves/cuda/accel_dummy.rs @@ -0,0 +1,9 @@ +#[cfg(not(feature = "std"))] +use alloc::vec::Vec; +pub mod error { + pub type Result = T; +} + +pub struct Context {} + +pub type DeviceMemory = Vec; diff --git a/algebra-core/src/curves/cuda/mod.rs b/algebra-core/src/curves/cuda/mod.rs new file mode 100644 index 000000000..f2dc0829d --- /dev/null +++ b/algebra-core/src/curves/cuda/mod.rs @@ -0,0 +1,6 @@ +#[macro_use] +pub mod scalar_mul; +pub use scalar_mul::*; + +#[cfg(not(feature = "cuda"))] +pub mod accel_dummy; diff --git a/algebra-core/src/curves/cuda/scalar_mul/cpu_gpu_macros.rs b/algebra-core/src/curves/cuda/scalar_mul/cpu_gpu_macros.rs new file mode 100644 index 000000000..6a4000683 --- /dev/null +++ b/algebra-core/src/curves/cuda/scalar_mul/cpu_gpu_macros.rs @@ -0,0 +1,298 @@ +// TODO: make this more generic +#[macro_export] +macro_rules! impl_gpu_cpu_run_kernel { + () => { + #[allow(unused_qualifications)] + fn init_gpu_cache_dir() -> Result { + #[cfg(feature = "cuda")] + { + let dir = dirs::cache_dir() + .unwrap() + .join("zexe-algebra") + .join("cuda-scalar-mul-profiler") + .join(P::namespace()); + std::fs::create_dir_all(&dir)?; + Ok(dir.to_str().unwrap().to_string()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + #[allow(unused_qualifications)] + fn read_profile_data() -> Result { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + let data = std::fs::read_to_string(&dir.join("profile_data.txt"))?; + Ok(data) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + fn clear_gpu_profiling_data() -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + std::fs::File::create(&dir.join("profile_data.txt"))?; + Ok(()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + #[allow(unused_variables)] + fn write_profile_data(profile_data: &str) -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + let mut file = std::fs::File::create(&dir.join("profile_data.txt"))?; + file.write_all(profile_data.as_bytes())?; + file.sync_all()?; + Ok(()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + /// We split up the job statically between the CPU and GPUs + /// based on continuous profiling stored both in a static location in memory + /// that is lost the moment the progam stops running. + /// and also a txt file in the OS' cache dir. + + /// Only one such procedure should be running at any time. 
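+        /// Sketch of the update rule used in the body below: with measured throughputs
+        /// `t_i = n_i / time_i` for every GPU and for the CPU, GPU i's share of the next
+        /// batch becomes `t_i / (t_cpu + sum_j t_j)`, folded into the running average held
+        /// in `profile_data` (weighted by the number of samples recorded so far) and then
+        /// persisted with `Self::write_profile_data`.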
+ #[allow(unused_variables)] + fn cpu_gpu_static_partition_run_kernel( + bases_h: &mut [::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + if !Device::init() { + panic!("Do not call this function unless the device has been checked to initialise successfully"); + } + let n_devices = Device::get_count().unwrap(); + let n = bases_h.len(); + // Create references so we can split the slices + let mut res_ref = &mut bases_h[..]; + let mut exps_h_ref = exps_h; + + let _now = timer!(); + // Get data for proportion of total throughput achieved by each device + let _ = Self::init_gpu_cache_dir()?; + + let arc_mutex = P::scalar_mul_static_profiler(); + let mut profile_data = arc_mutex.lock().unwrap(); + let mut proportions: Vec = profile_data.0.clone(); + + // If the program has just been initialised, we must check for the existence of existing + // cached profile data. If it does not exist, we create a new file + if proportions.is_empty() { + let _ = Self::read_profile_data() + .and_then(|s| { let res = serde_json::from_str(&s).map_err(|_| crate::CudaScalarMulError::ProfilingDeserializationError)?; Ok(res) }) + .and_then(|cached_data| { + *profile_data = cached_data; + proportions = profile_data.0.clone(); + Ok(()) + } + ); + } + + if proportions.is_empty() { + // By default we split the work evenly between devices and host + proportions = vec![1.0 / (n_devices as f64 + 1.0); n_devices]; + } + timer_println!(_now, "prepare profiling"); + + let _now = timer!(); + assert_eq!(proportions.len(), n_devices); + // Allocate the number of elements in the job to each device/host + let n_gpus = proportions.iter().map(|r| (r * n as f64).round() as usize).collect::>(); + let n_cpu = n - n_gpus.iter().sum::(); + + // Create storage for buffers and contexts for variable number of devices + let mut bases_split = Vec::with_capacity(n_devices); + let mut tables = Vec::with_capacity(n_devices); + let mut exps = Vec::with_capacity(n_devices); + let mut ctxs = Vec::with_capacity(n_devices); + let (mut time_cpu, mut times_gpu) = (0, vec![0; n_devices]); + + // Split data and generate tables and u8 scalar encoding in device memory + for (i, &num) in n_gpus.iter().enumerate() { + let device = Device::nth(i).unwrap(); + let ctx = device.create_context(); + + let (lower, upper) = res_ref.split_at_mut(num); + res_ref = upper; + let lower_exps = &exps_h_ref[..num]; + exps_h_ref = &exps_h_ref[num..]; + + let mut table = DeviceMemory::::zeros(&ctx, num * Self::table_size()); + let mut exp = DeviceMemory::::zeros(&ctx, num * Self::num_u8()); + + Self::generate_tables_and_recoding(lower, &mut table[..], lower_exps, &mut exp[..]); + + ctxs.push((device, ctx)); + bases_split.push(lower); + tables.push(table); + exps.push(exp); + }; + timer_println!(_now, "precomp and allocate on device"); + + let jobs_result: std::sync::Arc>> = std::sync::Arc::new(Mutex::new(Ok(()))); + + rayon::scope(|s| { + // Run jobs on GPUs + for (i, (bases_gpu, time_gpu)) in bases_split.iter_mut().zip(times_gpu.iter_mut()).enumerate() { + let n_gpu = n_gpus[i]; + let ctx = &ctxs[i].1; + let table = &tables[i]; + let exp = &exps[i]; + + let jobs_result_inner = jobs_result.clone(); + + s.spawn(move |_| { + let now = std::time::Instant::now(); + let _now = timer!(); + + let mut out = DeviceMemory::::zeros(ctx, n_gpu); + let result = P::scalar_mul_kernel( + ctx, + (n_gpu - 1) / 
cuda_group_size + 1, // grid + cuda_group_size, // block + table.as_ptr(), exp.as_ptr(), out.as_mut_ptr(), n_gpu as isize + ).map_err(|_| crate::CudaScalarMulError::KernelFailedError); + if result.is_err() { + *jobs_result_inner.lock().unwrap() = result; + return; + } + Self::batch_normalization(&mut out[..]); + bases_gpu.clone_from_slice(&out.par_iter().map(|p| p.into_affine()).collect::>()[..]); + *time_gpu = now.elapsed().as_micros(); + + timer_println!(_now, format!("gpu {} done", i)); + }); + } + + // Run on CPU + s.spawn(|_| { + let now = std::time::Instant::now(); + let _now = timer!(); + + let exps_mut = &mut exps_h_ref.to_vec()[..]; + rayon::scope(|t| { + for (b, s) in res_ref.chunks_mut(cpu_chunk_size).zip(exps_mut.chunks_mut(cpu_chunk_size)) { + t.spawn(move |_| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + } + }); + + time_cpu = now.elapsed().as_micros(); + timer_println!(_now, "cpu done"); + }); + }); + + // It's safe to do this, since after the rayon scope we only have one reference. + std::sync::Arc::try_unwrap(jobs_result).unwrap().into_inner().unwrap()?; + + // Update global microbenchmarking state + debug!("CUDA old profile_data: {:?}", profile_data); + let cpu_throughput = n_cpu as f64 / time_cpu as f64; + let gpu_throughputs = n_gpus + .iter() + .zip(times_gpu.iter()) + .map(|(n_gpu, time_gpu)| { + *n_gpu as f64 / *time_gpu as f64 + }) + .collect::>(); + let total_throughput = cpu_throughput + gpu_throughputs.iter().sum::(); + let n_data_points = profile_data.1 as f64; + profile_data.1 += 1; + let new_proportions = gpu_throughputs.iter().map(|t| t / total_throughput); + + if !profile_data.0.is_empty() { + profile_data.0 = new_proportions.zip(profile_data.0.clone()).map(|(new, old)| { + (new + n_data_points * old) / profile_data.1 as f64 + }).collect(); + } else { + profile_data.0 = new_proportions.collect(); + } + + // Update cached profiling data on disk + let _now = timer!(); + let s: String = serde_json::to_string(&(*profile_data)).map_err(|_| crate::CudaScalarMulError::ProfilingSerializationError)?; + Self::write_profile_data(&s)?; + + timer_println!(_now, "write data"); + + debug!("CUDA new profile_data: {:?}", profile_data); + } + + Ok(()) + } + + #[allow(unused_variables)] + fn cpu_gpu_load_balance_run_kernel( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of a single job in the queue e.g. 
2 << 14 + job_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Vec<::Affine> { + #[cfg(feature = "cuda")] + { + let mut bases_res = bases_h.to_vec(); + let queue = Mutex::new(bases_res.chunks_mut(job_size).zip(exps_h.chunks(job_size)).peekmore()); + + rayon::scope(|s| { + // We launch two concurrent GPU threads that block on waiting for GPU to hide latency + for i in 0..2 { + s.spawn(closure!(move i, ref queue, |_| { + std::thread::sleep(std::time::Duration::from_millis(i * 500)); + let mut iter = queue.lock().unwrap(); + while let Some((bases, exps)) = iter.next() { + iter.peek(); + if iter.peek().is_none() { break; } + let mut proj_res = Self::par_run_kernel_sync(ctx, bases, exps, cuda_group_size, iter); + Self::batch_normalization(&mut proj_res[..]); + bases.clone_from_slice(&proj_res.par_iter().map(|p| p.into_affine()).collect::>()[..]); + iter = queue.lock().unwrap(); + } + })); + } + + s.spawn(|_| { + std::thread::sleep(std::time::Duration::from_millis(20)); + let mut iter = queue.lock().unwrap(); + debug!("CUDA acquired cpu"); + while let Some((bases, exps)) = iter.next() { + let exps_mut = &mut exps.to_vec()[..]; + rayon::scope(|t| { + for (b, s) in bases.chunks_mut(cpu_chunk_size).zip(exps_mut.chunks_mut(cpu_chunk_size)) { + t.spawn(move |_| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + } + }); + // Sleep to allow other threads to unlock + drop(iter); + debug!("CUDA unlocked cpu"); + std::thread::sleep(std::time::Duration::from_millis(20)); + iter = queue.lock().unwrap(); + debug!("CUDA acquired cpu"); + } + debug!("CUDA cpu finish"); + }); + }); + drop(queue); + bases_res + } + + #[cfg(not(feature = "cuda"))] + Vec::new() + } + } +} diff --git a/algebra-core/src/curves/cuda/scalar_mul/kernel_macros.rs b/algebra-core/src/curves/cuda/scalar_mul/kernel_macros.rs new file mode 100644 index 000000000..cb04b94f0 --- /dev/null +++ b/algebra-core/src/curves/cuda/scalar_mul/kernel_macros.rs @@ -0,0 +1,176 @@ +#[macro_export] +macro_rules! impl_scalar_mul_kernel { + ($curve: ident, $curve_string:expr, $type: expr, $ProjCurve: ident) => { + paste::item! { + #[cfg(feature = "cuda")] + use {accel::*, std::sync::{Arc, Mutex}}; + + #[cfg(not(feature = "cuda"))] + use algebra_core::accel_dummy::*; + + use algebra_core::curves::cuda::scalar_mul::ScalarMulProfiler; + + #[cfg(feature = "cuda")] + lazy_static::lazy_static! 
{ + pub static ref MICROBENCH_CPU_GPU_AVG_RATIO: + Arc, usize)>> = Arc::new(Mutex::new((vec![], 0))); + } + + #[cfg(not(feature = "cuda"))] + static MICROBENCH_CPU_GPU_AVG_RATIO: () = (); + + const NAMESPACE: &'static str = stringify!([<$curve _ $type _cuda_namespace>]); + + #[cfg(feature = "cuda")] + #[kernel_mod(transparent)] + #[name([<$curve _ $type _cuda_namespace>])] + #[dependencies("accel-core" = { git = "https://github.com/jon-chuang/accel", package = "accel-core" })] + #[dependencies("algebra-core" = { git = "https://github.com/celo-org/zexe", branch = "master", package = "algebra-core", default_features = false})] + #[dependencies("algebra" = { git = "https://github.com/celo-org/zexe", branch = "master", package = "algebra", default_features = false, features = [$curve_string]})] + pub mod scalar_mul { + use algebra::{$curve::$ProjCurve}; + use algebra_core::{curves::ProjectiveCurve, fields::PrimeField, FpParameters, Zero}; + + const NUM_BITS: isize = + <<<$ProjCurve as ProjectiveCurve>::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as isize; + const LOG2_W: isize = 5; + const TABLE_SIZE: isize = 1 << LOG2_W; + const NUM_U8: isize = (NUM_BITS - 1) / LOG2_W + 1; + + #[kernel_func] + pub unsafe fn scalar_mul( + #[type_substitute(*const super::$ProjCurve)] + table: *const $ProjCurve, + exps: *const u8, + #[type_substitute(*mut super::$ProjCurve)] + out: *mut $ProjCurve, + n: isize, + ) { + let i = accel_core::index(); + if i < n { + let mut res = $ProjCurve::zero(); + res += &(*table.offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8) as isize)); + + for j in 1..NUM_U8 as isize { + for _ in 0..LOG2_W { + res.double_in_place(); + } + res += &(*table + .offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8 + j) as isize)); + } + *out.offset(i) = res; + } + } + } + } + } +} + +#[macro_export] +macro_rules! impl_scalar_mul_kernel_glv { + ($curve: ident, $curve_string:expr, $type: expr, $ProjCurve: ident) => { + paste::item! { + #[cfg(feature = "cuda")] + use {accel::*, std::sync::{Arc, Mutex}}; + + #[cfg(not(feature = "cuda"))] + use algebra_core::accel_dummy::*; + + use algebra_core::curves::cuda::scalar_mul::ScalarMulProfiler; + + #[cfg(feature = "cuda")] + lazy_static::lazy_static! 
{ + pub static ref MICROBENCH_CPU_GPU_AVG_RATIO: + Arc, usize)>> = Arc::new(Mutex::new((vec![], 0))); + } + + #[cfg(not(feature = "cuda"))] + static MICROBENCH_CPU_GPU_AVG_RATIO: () = (); + + const NAMESPACE: &'static str = stringify!([<$curve _ $type _cuda_namespace>]); + + #[cfg(feature = "cuda")] + #[kernel_mod(transparent)] + #[name([<$curve _ $type _cuda_namespace>])] + #[dependencies("accel-core" = { git = "https://github.com/jon-chuang/accel", package = "accel-core" })] + #[dependencies("algebra-core" = { git = "https://github.com/celo-org/zexe", branch = "master", package = "algebra-core", default_features = false})] + #[dependencies("algebra" = { git = "https://github.com/celo-org/zexe", branch = "master", package = "algebra", default_features = false, features = [$curve_string]})] + pub mod scalar_mul { + use algebra::{$curve::$ProjCurve}; + use algebra_core::{curves::ProjectiveCurve, fields::PrimeField, FpParameters, Zero}; + + const NUM_BITS: isize = + <<<$ProjCurve as ProjectiveCurve>::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as isize; + const LOG2_W: isize = 5; + const TABLE_SIZE: isize = 1 << LOG2_W; + const HALF_TABLE_SIZE: isize = 1 << (LOG2_W - 1); + const NUM_U8: isize = 2 * ((NUM_BITS - 1) / (2 * (LOG2_W - 1)) + 2); + + #[kernel_func] + pub unsafe fn scalar_mul( + #[type_substitute(*const super::$ProjCurve)] + table: *const $ProjCurve, + exps: *const u8, + #[type_substitute(*mut super::$ProjCurve)] + out: *mut $ProjCurve, + n: isize, + ) { + let i = accel_core::index(); + if i < n { + let mut res = $ProjCurve::zero(); + + res += &(*table.offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8) as isize)); + res += &(*table.offset( + i * TABLE_SIZE + HALF_TABLE_SIZE + *exps.offset(i * NUM_U8 + 1) as isize, + )); + + for j in 1..NUM_U8 as isize / 2 { + for _ in 0..(LOG2_W - 1) { + res.double_in_place(); + } + res += &(*table + .offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8 + 2 * j) as isize)); + res += &(*table.offset( + i * TABLE_SIZE + + HALF_TABLE_SIZE + + *exps.offset(i * NUM_U8 + 2 * j + 1) as isize, + )); + } + *out.offset(i) = res; + } + } + } + } + } +} + +#[macro_export] +macro_rules! 
impl_scalar_mul_parameters { + ($ProjCurve:ident) => { + #[allow(unused_variables)] + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const $ProjCurve, + exps: *const u8, + out: *mut $ProjCurve, + n: isize, + ) -> error::Result<()> { + #[cfg(feature = "cuda")] + scalar_mul(ctx, grid, block, (table, exps, out, n)) + } + + fn scalar_mul_static_profiler() -> ScalarMulProfiler { + #[cfg(feature = "cuda")] + return (*MICROBENCH_CPU_GPU_AVG_RATIO).clone(); + + #[cfg(not(feature = "cuda"))] + MICROBENCH_CPU_GPU_AVG_RATIO + } + + fn namespace() -> &'static str { + NAMESPACE + } + }; +} diff --git a/algebra-core/src/curves/cuda/scalar_mul/mod.rs b/algebra-core/src/curves/cuda/scalar_mul/mod.rs new file mode 100644 index 000000000..e96f4b0f9 --- /dev/null +++ b/algebra-core/src/curves/cuda/scalar_mul/mod.rs @@ -0,0 +1,357 @@ +#[macro_use] +mod kernel_macros; +pub use kernel_macros::*; + +#[macro_use] +mod cpu_gpu_macros; + +#[macro_use] +mod run_kernel_macros; + +#[cfg(feature = "cuda")] +use std::sync::{Arc, Mutex}; + +use core::fmt; + +use crate::{ + cfg_chunks_mut, + curves::{AffineCurve, BatchGroupArithmeticSlice}, + fields::PrimeField, +}; +use internal::GPUScalarMulInternal; + +#[cfg(feature = "cuda")] +pub type ScalarMulProfiler = Arc, usize)>>; +#[cfg(not(feature = "cuda"))] +pub type ScalarMulProfiler = (); + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +pub const MAX_GROUP_ELEM_BYTES: usize = 400; + +#[derive(Debug)] +pub enum CudaScalarMulError { + CudaDisabledError, + IoError, + KernelFailedError, + ProfilingSerializationError, + ProfilingDeserializationError, +} + +#[cfg(feature = "std")] +impl std::error::Error for CudaScalarMulError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + None + } +} + +#[cfg(feature = "std")] +impl From for CudaScalarMulError { + fn from(_: std::io::Error) -> Self { + CudaScalarMulError::IoError + } +} + +impl fmt::Display for CudaScalarMulError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + match self { + CudaScalarMulError::CudaDisabledError => write!(f, "CUDA is disabled"), + CudaScalarMulError::IoError => write!(f, "IO error"), + CudaScalarMulError::KernelFailedError => write!(f, "Failed running kernel"), + CudaScalarMulError::ProfilingSerializationError => { + write!(f, "Failed serlializing profiling data") + } + CudaScalarMulError::ProfilingDeserializationError => { + write!(f, "Failed deserializing profiling data") + } + } + } +} + +pub trait GPUScalarMul: GPUScalarMulInternal { + fn clear_gpu_profiling_data() { + #[cfg(feature = "cuda")] + >::clear_gpu_profiling_data() + .expect("Should have cleared GPU profiling data"); + } + + #[allow(unused_variables)] + fn cpu_gpu_scalar_mul( + elems: &mut [G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + // CUDA will return ILLEGAL_ADRESS if group elem size is too large. 
+ if accel::Device::init() && core::mem::size_of::() < MAX_GROUP_ELEM_BYTES { + ::Projective::cpu_gpu_static_partition_run_kernel( + elems, + exps_h, + cuda_group_size, + cpu_chunk_size, + )?; + } else { + let mut exps_mut = exps_h.to_vec(); + cfg_chunks_mut!(elems, cpu_chunk_size) + .zip(cfg_chunks_mut!(exps_mut, cpu_chunk_size)) + .for_each(|(b, s)| { + b[..].batch_scalar_mul_in_place(&mut s[..], 4); + }); + } + } + + #[cfg(not(feature = "cuda"))] + { + let mut exps_mut = exps_h.to_vec(); + cfg_chunks_mut!(elems, cpu_chunk_size) + .zip(cfg_chunks_mut!(exps_mut, cpu_chunk_size)) + .for_each(|(b, s)| { + b[..].batch_scalar_mul_in_place(&mut s[..], 4); + }); + } + + Ok(()) + } +} + +impl GPUScalarMul for G::Projective {} + +pub(crate) mod internal { + #[cfg(feature = "cuda")] + use accel::*; + + #[cfg(not(feature = "cuda"))] + use crate::accel_dummy::*; + + #[cfg(not(feature = "std"))] + use alloc::{string::String, vec::Vec}; + + use crate::{curves::AffineCurve, fields::PrimeField, CudaScalarMulError}; + + #[allow(unused_variables)] + pub trait GPUScalarMulInternal: Sized { + const NUM_BITS: usize; + const LOG2_W: usize; + + fn table_size() -> usize { + 1 << Self::LOG2_W + } + + fn num_u8() -> usize; + + fn init_gpu_cache_dir() -> Result; + fn read_profile_data() -> Result; + fn write_profile_data(profile_data: &str) -> Result<(), CudaScalarMulError>; + fn clear_gpu_profiling_data() -> Result<(), CudaScalarMulError>; + + fn par_run_kernel( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + ) -> DeviceMemory; + + fn par_run_kernel_sync( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + lock: T, + ) -> DeviceMemory; + + fn generate_tables_and_recoding( + bases_h: &[G], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ); + + fn cpu_gpu_load_balance_run_kernel( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of a single job in the queue e.g. 2 << 14 + job_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Vec; + + fn cpu_gpu_static_partition_run_kernel( + bases_h: &mut [G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError>; + } +} + +#[macro_export] +macro_rules! impl_gpu_sw_projective { + ($Parameters:ident) => { + impl GPUScalarMulInternal> for GroupProjective
<P>
{ + const NUM_BITS: usize = + <<::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as usize; + const LOG2_W: usize = 5; + + fn num_u8() -> usize { + if P::has_glv() { + 2 * ((Self::NUM_BITS - 1) / (2 * (Self::LOG2_W - 1)) + 2) + } else { + (Self::NUM_BITS - 1) / Self::LOG2_W + 1 + } + } + + fn generate_tables_and_recoding( + bases_h: &[::Affine], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ) { + if P::has_glv() { + let scalar_recode_glv = + |k1: &mut <::ScalarField as PrimeField>::BigInt, k2: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let table_size_glv: u64 = 1u64 << (Self::LOG2_W - 1); + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8() / 2).rev() { + out[2 * i] = (k1.as_ref()[0] % table_size_glv) as u8; + out[2 * i + 1] = (k2.as_ref()[0] % table_size_glv) as u8; + k1.divn(Self::LOG2_W as u32 - 1); + k2.divn(Self::LOG2_W as u32 - 1); + } + assert!(k1.is_zero()); + assert!(k2.is_zero()); + out + }; + + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let ((k1_neg, mut k1), (k2_neg, mut k2)) = + P::glv_scalar_decomposition(*k); + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode_glv(&mut k1, &mut k2)); + + table[0] = Self::zero(); + table[Self::table_size() / 2] = Self::zero(); + + for i in 1..Self::table_size() / 2 { + let mut res = if k1_neg { + table[i - 1] - base + } else { + table[i - 1] + base + }; + table[i] = res; + + P::glv_endomorphism_in_place(&mut res.x); + table[Self::table_size() / 2 + i] = + if k2_neg != k1_neg { res.neg() } else { res }; + } + }); + } else { + let scalar_recode = |k: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8()).rev() { + out[i] = (k.as_ref()[0] % Self::table_size() as u64) as u8; + k.divn(Self::LOG2_W as u32); + } + assert!(k.is_zero()); + out + }; + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode(&mut k.clone())[..]); + + table[0] = Self::zero(); + for i in 1..Self::table_size() { + table[i] = table[i - 1] + base; + } + }); + } + } + + impl_run_kernel!(); + impl_gpu_cpu_run_kernel!(); + } + }; +} + +#[macro_export] +macro_rules! impl_gpu_te_projective { + ($Parameters:ident) => { + impl GPUScalarMulInternal> for GroupProjective
<P>
{ + const NUM_BITS: usize = + <<::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as usize; + const LOG2_W: usize = 5; + + fn generate_tables_and_recoding( + bases_h: &[::Affine], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ) { + let scalar_recode = |k: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8()).rev() { + out[i] = (k.as_ref()[0] % Self::table_size() as u64) as u8; + k.divn(Self::LOG2_W as u32); + } + assert!(k.is_zero()); + out + }; + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode(&mut k.clone())[..]); + + table[0] = Self::zero(); + for i in 1..Self::table_size() { + table[i] = table[i - 1] + base; + } + } + ); + } + + fn num_u8() -> usize { + (Self::NUM_BITS - 1) / Self::LOG2_W + 1 + } + + impl_run_kernel!(); + impl_gpu_cpu_run_kernel!(); + } + }; +} + +pub trait GPUScalarMulSlice { + #[allow(unused_variables)] + fn cpu_gpu_scalar_mul( + &mut self, + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError>; +} + +impl GPUScalarMulSlice for [G] { + fn cpu_gpu_scalar_mul( + &mut self, + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError> { + G::Projective::cpu_gpu_scalar_mul(self, exps_h, cuda_group_size, cpu_chunk_size) + } +} diff --git a/algebra-core/src/curves/cuda/scalar_mul/run_kernel_macros.rs b/algebra-core/src/curves/cuda/scalar_mul/run_kernel_macros.rs new file mode 100644 index 000000000..031533064 --- /dev/null +++ b/algebra-core/src/curves/cuda/scalar_mul/run_kernel_macros.rs @@ -0,0 +1,100 @@ +#[macro_export] +macro_rules! 
impl_run_kernel { + () => { + // We drop a lock only after the parallel portion has been handled + #[allow(unused_variables)] + fn par_run_kernel_sync( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + lock: T, + ) -> DeviceMemory { + #[cfg(feature = "cuda")] + { + assert_eq!(bases_h.len(), exps_h.len()); + let n = bases_h.len(); + + let mut tables_h = vec![Self::zero(); n * Self::table_size()]; + let mut exps_recode_h = vec![0u8; n * Self::num_u8()]; + + let _now = timer!(); + Self::generate_tables_and_recoding( + bases_h, + &mut tables_h[..], + exps_h, + &mut exps_recode_h[..], + ); + drop(lock); + timer_println!(_now, "generated tables & recode"); + + let _now = timer!(); + let mut out = DeviceMemory::::zeros(&ctx, n); + let mut tables = DeviceMemory::::zeros(&ctx, n * Self::table_size()); + let mut exps = DeviceMemory::::zeros(&ctx, n * Self::num_u8()); + timer_println!(_now, "allocate device memory"); + + let _now = timer!(); + tables.copy_from_slice(&tables_h); + exps.copy_from_slice(&exps_recode_h); + timer_println!(_now, "copy data to device"); + + let _now = timer!(); + P::scalar_mul_kernel( + &ctx, + n / cuda_group_size, // grid + cuda_group_size, // block + tables.as_ptr(), + exps.as_ptr(), + out.as_mut_ptr(), + n as isize, + ) + .expect("Kernel call failed"); + timer_println!(_now, "run kernel"); + out + } + #[cfg(not(feature = "cuda"))] + unreachable!(); + } + + #[allow(unused_variables)] + fn par_run_kernel( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + ) -> DeviceMemory { + #[cfg(feature = "cuda")] + { + assert_eq!(bases_h.len(), exps_h.len()); + let n = bases_h.len(); + + let _now = timer!(); + let mut tables = DeviceMemory::::zeros(&ctx, n * Self::table_size()); + let mut exps = DeviceMemory::::zeros(&ctx, n * Self::num_u8()); + let mut out = DeviceMemory::::zeros(&ctx, n); + timer_println!(_now, "allocate device memory"); + + let _now = timer!(); + Self::generate_tables_and_recoding(bases_h, &mut tables[..], exps_h, &mut exps[..]); + timer_println!(_now, "generated tables & recode"); + + let _now = timer!(); + P::scalar_mul_kernel( + &ctx, + n / cuda_group_size, // grid + cuda_group_size, // block + tables.as_ptr(), + exps.as_ptr(), + out.as_mut_ptr(), + n as isize, + ) + .expect("Kernel call failed"); + timer_println!(_now, "run kernel"); + out + } + #[cfg(not(feature = "cuda"))] + unreachable!(); + } + }; +} diff --git a/algebra-core/src/curves/glv.rs b/algebra-core/src/curves/glv.rs index eb4af4a35..bf46c213b 100644 --- a/algebra-core/src/curves/glv.rs +++ b/algebra-core/src/curves/glv.rs @@ -15,6 +15,7 @@ pub trait GLVParameters: Send + Sync + 'static + ModelParameters { const B1: ::BigInt; // |b1| const B2: ::BigInt; // |b2| const B1_IS_NEG: bool; + const R_BITS: u32; #[inline] diff --git a/algebra-core/src/curves/mod.rs b/algebra-core/src/curves/mod.rs index 1ba08682d..ade771000 100644 --- a/algebra-core/src/curves/mod.rs +++ b/algebra-core/src/curves/mod.rs @@ -26,6 +26,10 @@ pub use self::glv::*; pub mod models; +#[macro_use] +pub mod cuda; +pub use cuda::*; + pub use self::models::*; pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + PartialEq { @@ -36,6 +40,7 @@ pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + Par type G1Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// The 
affine representation of an element in G1. @@ -51,6 +56,7 @@ pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + Par type G2Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// The affine representation of an element in G2. @@ -134,6 +140,7 @@ pub trait ProjectiveCurve: + core::iter::Sum + for<'a> core::iter::Sum<&'a Self> + From<::Affine> + + GPUScalarMul<::Affine> { const COFACTOR: &'static [u64]; type ScalarField: PrimeField + SquareRootField; @@ -229,7 +236,7 @@ pub trait AffineCurve: + Zero + Neg + From<::Projective> - + BatchGroupArithmetic::BaseField> + + BatchGroupArithmetic::BaseField> { const COFACTOR: &'static [u64]; type ScalarField: PrimeField + SquareRootField + Into<::BigInt>; @@ -237,6 +244,7 @@ pub trait AffineCurve: type Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// Returns a fixed generator of unknown exponent. diff --git a/algebra-core/src/curves/models/mod.rs b/algebra-core/src/curves/models/mod.rs index 5a7f51270..0c0329973 100644 --- a/algebra-core/src/curves/models/mod.rs +++ b/algebra-core/src/curves/models/mod.rs @@ -12,9 +12,11 @@ pub(crate) mod sw_batch_affine; pub mod short_weierstrass_affine; #[macro_use] pub mod short_weierstrass_jacobian; -pub mod short_weierstrass_projective; pub mod twisted_edwards_extended; +pub use short_weierstrass_jacobian::SWModelParameters; +pub use twisted_edwards_extended::TEModelParameters; + pub trait ModelParameters: Send + Sync + 'static { type BaseField: Field + SquareRootField; type ScalarField: PrimeField @@ -23,70 +25,6 @@ pub trait ModelParameters: Send + Sync + 'static { + From<::BigInt>; } -pub trait SWModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_B: Self::BaseField; - const COFACTOR: &'static [u64]; - const COFACTOR_INV: Self::ScalarField; - const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); - - #[inline(always)] - fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy *= &Self::COEFF_A; - copy - } - - #[inline(always)] - fn add_b(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy += &Self::COEFF_B; - copy - } - - #[inline(always)] - fn has_glv() -> bool { - false - } - - #[inline(always)] - fn glv_endomorphism_in_place(_elem: &mut Self::BaseField) { - unimplemented!() - } - - #[inline(always)] - fn glv_scalar_decomposition( - _k: ::BigInt, - ) -> ( - (bool, ::BigInt), - (bool, ::BigInt), - ) { - unimplemented!() - } - - #[inline(always)] - fn glv_window_size() -> usize { - 4 - } -} - -pub trait TEModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_D: Self::BaseField; - const COFACTOR: &'static [u64]; - const COFACTOR_INV: Self::ScalarField; - const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); - - type MontgomeryModelParameters: MontgomeryModelParameters; - - #[inline(always)] - fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy *= &Self::COEFF_A; - copy - } -} - pub trait MontgomeryModelParameters: ModelParameters { const COEFF_A: Self::BaseField; const COEFF_B: Self::BaseField; diff --git a/algebra-core/src/curves/models/short_weierstrass_affine.rs b/algebra-core/src/curves/models/short_weierstrass_affine.rs index 1be242d3d..995ee2977 100644 --- a/algebra-core/src/curves/models/short_weierstrass_affine.rs +++ 
b/algebra-core/src/curves/models/short_weierstrass_affine.rs @@ -10,15 +10,15 @@ macro_rules! specialise_affine_to_proj { #[derive(Derivative)] #[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - PartialEq(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") + Copy(bound = "P: SWModelParameters"), + Clone(bound = "P: SWModelParameters"), + PartialEq(bound = "P: SWModelParameters"), + Eq(bound = "P: SWModelParameters"), + Debug(bound = "P: SWModelParameters"), + Hash(bound = "P: SWModelParameters") )] #[repr(C)] - pub struct GroupAffine { + pub struct GroupAffine { pub infinity: bool, pub x: P::BaseField, pub y: P::BaseField, @@ -26,7 +26,35 @@ macro_rules! specialise_affine_to_proj { _params: PhantomData

<P>, } - impl<P: Parameters> AffineCurve for GroupAffine<P> { + impl<P: SWModelParameters> GroupAffine<P>
{ + #[inline(always)] + pub fn has_glv() -> bool { + P::has_glv() + } + + #[inline(always)] + pub fn glv_endomorphism_in_place(elem: &mut ::BaseField) { + P::glv_endomorphism_in_place(elem); + } + + #[inline] + pub fn glv_scalar_decomposition( + k: <::ScalarField as PrimeField>::BigInt, + ) -> ( + ( + bool, + <::ScalarField as PrimeField>::BigInt, + ), + ( + bool, + <::ScalarField as PrimeField>::BigInt, + ), + ) { + P::glv_scalar_decomposition(k) + } + } + + impl AffineCurve for GroupAffine
<P>
{ const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; @@ -81,7 +109,7 @@ macro_rules! specialise_affine_to_proj { } } - impl GroupAffine

<P> { + impl<P: SWModelParameters> GroupAffine<P>
{ pub fn new(x: P::BaseField, y: P::BaseField, infinity: bool) -> Self { Self { x, @@ -147,7 +175,7 @@ macro_rules! specialise_affine_to_proj { } } - impl Display for GroupAffine

<P> { + impl<P: SWModelParameters> Display for GroupAffine<P>
{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { if self.infinity { write!(f, "GroupAffine(Infinity)") @@ -157,7 +185,7 @@ macro_rules! specialise_affine_to_proj { } } - impl Zero for GroupAffine

<P> { + impl<P: SWModelParameters> Zero for GroupAffine<P>
{ fn zero() -> Self { Self::new(P::BaseField::zero(), P::BaseField::one(), true) } @@ -167,7 +195,7 @@ macro_rules! specialise_affine_to_proj { } } - impl Add for GroupAffine

<P> { + impl<P: SWModelParameters> Add for GroupAffine<P>
{ type Output = Self; fn add(self, other: Self) -> Self { let mut copy = self; @@ -176,7 +204,7 @@ macro_rules! specialise_affine_to_proj { } } - impl<'a, P: Parameters> AddAssign<&'a Self> for GroupAffine

<P> { + impl<'a, P: SWModelParameters> AddAssign<&'a Self> for GroupAffine<P>
{ fn add_assign(&mut self, other: &'a Self) { let mut s_proj = ::Projective::from(*self); s_proj.add_assign_mixed(other); @@ -184,7 +212,7 @@ macro_rules! specialise_affine_to_proj { } } - impl Neg for GroupAffine

<P> { + impl<P: SWModelParameters> Neg for GroupAffine<P>
{ type Output = Self; #[inline] @@ -199,7 +227,7 @@ macro_rules! specialise_affine_to_proj { impl_sw_batch_affine!(GroupAffine); - impl ToBytes for GroupAffine

<P> { + impl<P: SWModelParameters> ToBytes for GroupAffine<P>
{ #[inline] fn write(&self, mut writer: W) -> IoResult<()> { self.x.write(&mut writer)?; @@ -208,7 +236,7 @@ macro_rules! specialise_affine_to_proj { } } - impl FromBytes for GroupAffine

<P> { + impl<P: SWModelParameters> FromBytes for GroupAffine<P>
{ #[inline] fn read(mut reader: R) -> IoResult { let x = P::BaseField::read(&mut reader)?; @@ -218,14 +246,14 @@ macro_rules! specialise_affine_to_proj { } } - impl Default for GroupAffine

<P> { + impl<P: SWModelParameters> Default for GroupAffine<P>
{ #[inline] fn default() -> Self { Self::zero() } } - impl_sw_curve_serializer!(Parameters); + impl_sw_curve_serializer!(SWModelParameters); }; } diff --git a/algebra-core/src/curves/models/short_weierstrass_jacobian.rs b/algebra-core/src/curves/models/short_weierstrass_jacobian.rs index 3b06ff835..7ecd95982 100644 --- a/algebra-core/src/curves/models/short_weierstrass_jacobian.rs +++ b/algebra-core/src/curves/models/short_weierstrass_jacobian.rs @@ -1,5 +1,4 @@ use crate::{ - curves::models::SWModelParameters as Parameters, io::{Read, Result as IoResult, Write}, serialize::{Flags, SWFlags}, UniformRand, Vec, @@ -15,10 +14,26 @@ use rand::{ Rng, }; +#[cfg(not(feature = "cuda"))] +use crate::accel_dummy::*; +#[cfg(feature = "cuda")] +use accel::*; + +#[cfg(feature = "cuda")] +use { + crate::curves::BatchGroupArithmeticSlice, closure::closure, log::debug, peekmore::PeekMore, + std::sync::Mutex, +}; + use crate::{ bytes::{FromBytes, ToBytes}, - curves::{AffineCurve, BatchGroupArithmetic, ProjectiveCurve}, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, + cfg_chunks_mut, cfg_iter, + curves::{ + cuda::scalar_mul::{internal::GPUScalarMulInternal, ScalarMulProfiler}, + AffineCurve, BatchGroupArithmetic, ModelParameters, ProjectiveCurve, + }, + fields::{BitIteratorBE, Field, FpParameters, PrimeField, SquareRootField}, + impl_gpu_cpu_run_kernel, impl_gpu_sw_projective, impl_run_kernel, }; use crate::{ @@ -31,30 +46,119 @@ specialise_affine_to_proj!(GroupProjective); #[cfg(feature = "parallel")] use rayon::prelude::*; +pub trait SWModelParameters: ModelParameters + Sized { + const COEFF_A: Self::BaseField; + const COEFF_B: Self::BaseField; + const COFACTOR: &'static [u64]; + const COFACTOR_INV: Self::ScalarField; + const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); + + #[inline(always)] + fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy *= &Self::COEFF_A; + copy + } + + #[inline(always)] + fn glv_window_size() -> usize { + 4 + } + + #[inline(always)] + fn add_b(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy += &Self::COEFF_B; + copy + } + + #[inline(always)] + fn has_glv() -> bool { + false + } + + #[inline(always)] + fn glv_endomorphism_in_place(_elem: &mut Self::BaseField) { + unimplemented!() + } + + #[inline(always)] + fn glv_scalar_decomposition( + _k: ::BigInt, + ) -> ( + (bool, ::BigInt), + (bool, ::BigInt), + ) { + unimplemented!() + } + + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const GroupProjective, + exps: *const u8, + out: *mut GroupProjective, + n: isize, + ) -> error::Result<()>; + + fn scalar_mul_static_profiler() -> ScalarMulProfiler; + + fn namespace() -> &'static str; +} + +impl_gpu_sw_projective!(SWModelParameters); + #[derive(Derivative)] #[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") + Copy(bound = "P: SWModelParameters"), + Clone(bound = "P: SWModelParameters"), + Eq(bound = "P: SWModelParameters"), + Debug(bound = "P: SWModelParameters"), + Hash(bound = "P: SWModelParameters") )] -#[must_use] -pub struct GroupProjective { +pub struct GroupProjective { pub x: P::BaseField, pub y: P::BaseField, pub z: P::BaseField, - #[derivative(Debug = "ignore")] _params: PhantomData

<P>, } -impl<P: Parameters> Display for GroupProjective<P> { +impl<P: SWModelParameters> GroupProjective<P>
{ + #[inline(always)] + pub fn has_glv() -> bool { + P::has_glv() + } + + #[inline(always)] + pub fn glv_endomorphism_in_place(elem: &mut ::BaseField) { + P::glv_endomorphism_in_place(elem); + } + + #[inline] + pub fn glv_scalar_decomposition( + k: <::ScalarField as PrimeField>::BigInt, + ) -> ( + ( + bool, + <::ScalarField as PrimeField>::BigInt, + ), + ( + bool, + <::ScalarField as PrimeField>::BigInt, + ), + ) { + P::glv_scalar_decomposition(k) + } +} + +impl Display for GroupProjective

{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { write!(f, "{}", GroupAffine::from(*self)) } } -impl PartialEq for GroupProjective
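The GLV helpers above only forward to the curve parameters; the multiplication routine itself is generated by the impl_glv_mul! macro. As a rough, illustrative sketch of how a decomposition k = k1 + lambda * k2 is consumed (simplified u128 scalar halves instead of BigInt limbs, no wNAF windowing, and the sign convention assumed to be true = negative), an interleaved double-and-add over this file's GroupProjective<P> would look like:

fn glv_mul_sketch<P: SWModelParameters>(
    p: GroupProjective<P>,
    endo_p: GroupProjective<P>, // image of `p` under the GLV endomorphism
    k1: (bool, u128),           // (sign, magnitude) halves, standing in for the
    k2: (bool, u128),           // output of glv_scalar_decomposition
) -> GroupProjective<P> {
    let mut acc = GroupProjective::<P>::zero();
    for i in (0..128).rev() {
        acc.double_in_place();
        if (k1.1 >> i) & 1 == 1 {
            if k1.0 { acc -= p; } else { acc += p; }
        }
        if (k2.1 >> i) & 1 == 1 {
            if k2.0 { acc -= endo_p; } else { acc += endo_p; }
        }
    }
    acc
}

The crate's generated code additionally builds tables of width glv_window_size(), so both half-width scalars are consumed with far fewer additions than this plain bit-by-bit loop.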

{ +impl PartialEq for GroupProjective

{ fn eq(&self, other: &Self) -> bool { if self.is_zero() { return other.is_zero(); @@ -78,7 +182,7 @@ impl PartialEq for GroupProjective

{ } } -impl Distribution> for Standard { +impl Distribution> for Standard { #[inline] fn sample(&self, rng: &mut R) -> GroupProjective

{ let mut res = GroupProjective::prime_subgroup_generator(); @@ -88,7 +192,7 @@ impl Distribution> for Standard { } } -impl ToBytes for GroupProjective

{ +impl ToBytes for GroupProjective

{ #[inline] fn write(&self, mut writer: W) -> IoResult<()> { self.x.write(&mut writer)?; @@ -97,7 +201,7 @@ impl ToBytes for GroupProjective

{ } } -impl FromBytes for GroupProjective

{ +impl FromBytes for GroupProjective

{ #[inline] fn read(mut reader: R) -> IoResult { let x = P::BaseField::read(&mut reader)?; @@ -107,14 +211,14 @@ impl FromBytes for GroupProjective

{ } } -impl Default for GroupProjective

{ +impl Default for GroupProjective

{ #[inline] fn default() -> Self { Self::zero() } } -impl GroupProjective

{ +impl GroupProjective

{ pub fn new(x: P::BaseField, y: P::BaseField, z: P::BaseField) -> Self { Self { x, @@ -125,7 +229,7 @@ impl GroupProjective

{ } } -impl Zero for GroupProjective

{ +impl Zero for GroupProjective

{ // The point at infinity is always represented by // Z = 0. #[inline] @@ -145,7 +249,7 @@ impl Zero for GroupProjective

{ } } -impl ProjectiveCurve for GroupProjective

{ +impl ProjectiveCurve for GroupProjective

{ const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; @@ -373,7 +477,7 @@ impl ProjectiveCurve for GroupProjective

{ } } -impl Neg for GroupProjective

{ +impl Neg for GroupProjective

{ type Output = Self; #[inline] @@ -386,9 +490,9 @@ impl Neg for GroupProjective

{ } } -crate::impl_additive_ops_from_ref!(GroupProjective, Parameters); +crate::impl_additive_ops_from_ref!(GroupProjective, SWModelParameters); -impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ +impl<'a, P: SWModelParameters> Add<&'a Self> for GroupProjective

{ type Output = Self; #[inline] @@ -399,7 +503,7 @@ impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ +impl<'a, P: SWModelParameters> AddAssign<&'a Self> for GroupProjective

{ fn add_assign(&mut self, other: &'a Self) { if self.is_zero() { *self = *other; @@ -464,7 +568,7 @@ impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ +impl<'a, P: SWModelParameters> Sub<&'a Self> for GroupProjective

{ type Output = Self; #[inline] @@ -475,13 +579,13 @@ impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> SubAssign<&'a Self> for GroupProjective

{ +impl<'a, P: SWModelParameters> SubAssign<&'a Self> for GroupProjective

{ fn sub_assign(&mut self, other: &'a Self) { *self += &(-(*other)); } } -impl MulAssign for GroupProjective

{ +impl MulAssign for GroupProjective

{ fn mul_assign(&mut self, other: P::ScalarField) { *self = self.mul(other.into_repr()) } @@ -489,7 +593,7 @@ impl MulAssign for GroupProjective

{ // The affine point X, Y is represented in the Jacobian // coordinates with Z = 1. -impl From> for GroupProjective

{ +impl From> for GroupProjective

{ #[inline] fn from(p: GroupAffine

) -> GroupProjective

{ if p.is_zero() { @@ -502,7 +606,7 @@ impl From> for GroupProjective

{ // The projective point X, Y, Z is represented in the affine // coordinates as X/Z^2, Y/Z^3. -impl From> for GroupAffine
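As the comment states, a Jacobian point (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3), with Z = 0 reserved for the point at infinity. A minimal standalone version of that map (not the crate's From impl; import path approximate):

use algebra_core::fields::Field;

fn jacobian_to_affine<F: Field>(x: F, y: F, z: F) -> Option<(F, F)> {
    // `inverse` returns None exactly when Z = 0, i.e. the point at infinity.
    let z_inv = z.inverse()?;
    let z_inv2 = z_inv.square();
    Some((x * &z_inv2, y * &(z_inv2 * &z_inv)))
}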

{ +impl From> for GroupAffine

{ #[inline] fn from(p: GroupProjective

) -> GroupAffine

{ if p.is_zero() { diff --git a/algebra-core/src/curves/models/short_weierstrass_projective.rs b/algebra-core/src/curves/models/short_weierstrass_projective.rs deleted file mode 100644 index 854268ee8..000000000 --- a/algebra-core/src/curves/models/short_weierstrass_projective.rs +++ /dev/null @@ -1,415 +0,0 @@ -use crate::{ - curves::models::SWModelParameters as Parameters, - io::{Read, Result as IoResult, Write}, - serialize::{Flags, SWFlags}, - UniformRand, Vec, -}; -use core::{ - fmt::{Display, Formatter, Result as FmtResult}, - marker::PhantomData, - ops::{Add, AddAssign, MulAssign, Neg, Sub, SubAssign}, -}; -use num_traits::{One, Zero}; -use rand::{ - distributions::{Distribution, Standard}, - Rng, -}; - -use crate::{ - bytes::{FromBytes, ToBytes}, - curves::{AffineCurve, BatchGroupArithmetic, ProjectiveCurve}, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, -}; - -use crate::{ - CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, - CanonicalSerializeWithFlags, ConstantSerializedSize, -}; - -#[derive(Derivative)] -#[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") -)] -#[must_use] -pub struct GroupProjective { - pub x: P::BaseField, - pub y: P::BaseField, - pub z: P::BaseField, - _params: PhantomData

, -} - -specialise_affine_to_proj!(GroupProjective); - -impl Display for GroupProjective

{ - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{}", GroupAffine::from(*self)) - } -} - -impl PartialEq for GroupProjective

{ - fn eq(&self, other: &Self) -> bool { - if self.is_zero() { - return other.is_zero(); - } - - if other.is_zero() { - return false; - } - - // x1/z1 == x2/z2 <==> x1 * z2 == x2 * z1 - if (self.x * &other.z) != (other.x * &self.z) { - false - } else { - (self.y * &other.z) == (other.y * &self.z) - } - } -} - -impl Distribution> for Standard { - #[inline] - fn sample(&self, rng: &mut R) -> GroupProjective

{ - let mut res = GroupProjective::prime_subgroup_generator(); - res.mul_assign(P::ScalarField::rand(rng)); - debug_assert!(GroupAffine::from(res).is_in_correct_subgroup_assuming_on_curve()); - res - } -} - -impl ToBytes for GroupProjective

{ - #[inline] - fn write(&self, mut writer: W) -> IoResult<()> { - self.x.write(&mut writer)?; - self.y.write(&mut writer)?; - self.z.write(writer) - } -} - -impl FromBytes for GroupProjective

{ - #[inline] - fn read(mut reader: R) -> IoResult { - let x = P::BaseField::read(&mut reader)?; - let y = P::BaseField::read(&mut reader)?; - let z = P::BaseField::read(reader)?; - Ok(Self::new(x, y, z)) - } -} - -impl Default for GroupProjective

{ - #[inline] - fn default() -> Self { - Self::zero() - } -} - -impl GroupProjective

{ - pub fn new(x: P::BaseField, y: P::BaseField, z: P::BaseField) -> Self { - Self { - x, - y, - z, - _params: PhantomData, - } - } -} - -impl Zero for GroupProjective

{ - // The point at infinity is always represented by Z = 0. - #[inline] - fn zero() -> Self { - Self::new( - P::BaseField::zero(), - P::BaseField::one(), - P::BaseField::zero(), - ) - } - - // The point at infinity is always represented by - // Z = 0. - #[inline] - fn is_zero(&self) -> bool { - self.z.is_zero() - } -} - -impl ProjectiveCurve for GroupProjective

{ - const COFACTOR: &'static [u64] = P::COFACTOR; - type BaseField = P::BaseField; - type ScalarField = P::ScalarField; - type Affine = GroupAffine

; - - fn get_x(&mut self) -> &mut Self::BaseField { - &mut self.x - } - - #[inline] - fn prime_subgroup_generator() -> Self { - GroupAffine::prime_subgroup_generator().into() - } - - #[inline] - fn is_normalized(&self) -> bool { - self.is_zero() || self.z.is_one() - } - - fn batch_normalization(v: &mut [Self]) { - // Montgomery’s Trick and Fast Implementation of Masked AES - // Genelle, Prouff and Quisquater - // Section 3.2 - - // First pass: compute [a, ab, abc, ...] - let mut prod = Vec::with_capacity(v.len()); - let mut tmp = P::BaseField::one(); - for g in v.iter_mut() - // Ignore normalized elements - .filter(|g| !g.is_normalized()) - { - tmp *= &g.z; - prod.push(tmp); - } - - // Invert `tmp`. - tmp = tmp.inverse().unwrap(); // Guaranteed to be nonzero. - - // Second pass: iterate backwards to compute inverses - for (g, s) in v.iter_mut() - // Backwards - .rev() - // Ignore normalized elements - .filter(|g| !g.is_normalized()) - // Backwards, skip last element, fill in one for last term. - .zip(prod.into_iter().rev().skip(1).chain(Some(P::BaseField::one()))) - { - // tmp := tmp * g.z; g.z := tmp * s = 1/z - let newtmp = tmp * &g.z; - g.z = tmp * &s; - tmp = newtmp; - } - - // Perform affine transformations - for g in v.iter_mut().filter(|g| !g.is_normalized()) { - g.x *= &g.z; // x/z^2 - g.y *= &g.z; - g.z = P::BaseField::one(); // z = 1 - } - } - - fn double_in_place(&mut self) -> &mut Self { - if self.is_zero() { - self - } else { - // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective.html#doubling-dbl-2007-bl - - // XX = X1^2 - let xx = self.x.square(); - // ZZ = Z1^2 - let zz = self.z.square(); - // w = a*ZZ + 3*XX - let w = P::mul_by_a(&zz) + &(xx + &xx.double()); - // s = 2*Y1*Z1 - let mut s = self.y * &(self.z); - s.double_in_place(); - // sss = s^3 - let mut sss = s.square(); - sss *= &s; - // R = Y1*s - let r = self.y * &s; - // RR = R2 - let rr = r.square(); - // B = (X1+R)^2-XX-RR - let b = (self.x + &r).square() - &xx - &rr; - // h = w2-2*B - let h = w.square() - &(b + &b); - // X3 = h*s - self.x = h * &s; - // Y3 = w*(B-h)-2*RR - self.y = w * &(b - &h) - &(rr + &rr); - // Z3 = sss - self.z = sss; - - self - } - } - - fn add_assign_mixed(&mut self, other: &GroupAffine
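batch_normalization above is Montgomery's trick from the cited paper: all Z-coordinates are inverted with a single field inversion plus a linear number of multiplications. Stripped of the normalisation bookkeeping, the trick itself looks like the following standalone sketch (import paths approximate; zero elements are skipped and left untouched):

use algebra_core::fields::Field;
use num_traits::{One, Zero};

// Invert every non-zero element of `elems` in place using one inversion
// and O(n) multiplications (prefix products forward, peel-off backward).
fn batch_inverse<F: Field>(elems: &mut [F]) {
    // Forward pass: running products [a1, a1*a2, a1*a2*a3, ...].
    let mut prefix = Vec::with_capacity(elems.len());
    let mut acc = F::one();
    for e in elems.iter().filter(|e| !e.is_zero()) {
        acc *= e;
        prefix.push(acc);
    }
    // Single inversion of the full product.
    acc = acc.inverse().expect("product of non-zero elements is non-zero");
    // Backward pass: pair each a_i (in reverse) with the product a1*...*a_{i-1}.
    for (e, s) in elems
        .iter_mut()
        .rev()
        .filter(|e| !e.is_zero())
        .zip(prefix.into_iter().rev().skip(1).chain(Some(F::one())))
    {
        let next_acc = acc * &*e; // inverse of the prefix without a_i
        *e = acc * &s;            // 1/a_i
        acc = next_acc;
    }
}

The same idea underlies the single inversion_tmp accumulator in the batch-affine code later in this patch, which shares one inversion across the slope denominators of many point additions.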

) { - if other.is_zero() { - return; - } else if self.is_zero() { - self.x = other.x; - self.y = other.y; - self.z = P::BaseField::one(); - return; - } - let mut v = other.x * &self.z; - let mut u = other.y * &self.z; - if u == self.y && v == self.x { - // x1 / z1 == x2 / z2 <==> x1 * z2 == x2 * z1; - // Here, z2 = 1, so we have x1 == x2 * z1; - self.double_in_place(); - } else { - // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective.html#addition-madd-1998-cmo - // u = Y2*Z1-Y1 - u -= &self.y; - // uu = u^2 - let uu = u.square(); - // v = X2*Z1-X1 - v -= &self.x; - // vv = v2 - let vv = v.square(); - // vvv = v*vv - let vvv = v * &vv; - // r = vv*X1 - let r = vv * &self.x; - // a = uu*Z1-vvv-2*r - let a = uu * &self.z - &vvv - &r.double(); - // X3 = v*a - self.x = v * &a; - // Y3 = u*(R-A)-vvv*Y1 - self.y = u * &(r - &a) - &(vvv * &self.y); - // Z3 = vvv*Z1 - self.z = vvv * &self.z; - } - } - - fn mul::BigInt>>(mut self, other: S) -> Self { - if P::has_glv() { - let w = P::glv_window_size(); - let mut res = Self::zero(); - impl_glv_mul!(Self, P, w, self, res, other); - res - } else { - let mut res = Self::zero(); - for b in BitIteratorBE::without_leading_zeros(other.into()) { - res.double_in_place(); - if b { - res += self; - } - } - - self = res; - self - } - } -} - -impl Neg for GroupProjective

{ - type Output = Self; - fn neg(self) -> Self { - if !self.is_zero() { - Self::new(self.x, -self.y, self.z) - } else { - self - } - } -} - -crate::impl_additive_ops_from_ref!(GroupProjective, Parameters); - -impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ - type Output = Self; - fn add(self, other: &'a Self) -> Self { - let mut copy = self; - copy += other; - copy - } -} - -impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ - fn add_assign(&mut self, other: &'a Self) { - if self.is_zero() { - *self = *other; - return; - } - - if other.is_zero() { - return; - } - // https://www.hyperelliptic.org/EFD/g1p/data/shortw/projective/addition/add-1998-cmo-2 - - if self == other { - self.double_in_place(); - } else { - // Y1Z2 = Y1*Z2 - let y1z2 = self.y * &other.z; - // X1Z2 = X1*Z2 - let x1z2 = self.x * &other.z; - // Z1Z2 = Z1*Z2 - let z1z2 = self.z * &other.z; - // u = Y2*Z1-Y1Z2 - let u = (self.z * &other.y) - &y1z2; - // uu = u^2 - let uu = u.square(); - // v = X2*Z1-X1Z2 - let v = (self.z * &other.x) - &x1z2; - // vv = v^2 - let vv = v.square(); - // vvv = v*vv - let vvv = v * &vv; - // R = vv*X1Z2 - let r = vv * &x1z2; - // A = uu*Z1Z2-vvv-2*R - let a = (uu * &z1z2) - &(vvv + &r + &r); - // X3 = v*A - self.x = v * &a; - // Y3 = u*(R-A)-vvv*Y1Z2 - self.y = ((r - &a) * &u) - &(vvv * &y1z2); - // Z3 = vvv*Z1Z2 - self.z = vvv * &z1z2; - } - } -} - -impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ - type Output = Self; - fn sub(self, other: &'a Self) -> Self { - let mut copy = self; - copy -= other; - copy - } -} - -impl<'a, P: Parameters> SubAssign<&'a Self> for GroupProjective

{ - fn sub_assign(&mut self, other: &'a Self) { - *self += &(-(*other)); - } -} - -impl MulAssign for GroupProjective

{ - fn mul_assign(&mut self, other: P::ScalarField) { - *self = self.mul(other.into_repr()) - } -} - -// The affine point X, Y is represented in the jacobian -// coordinates with Z = 1. -impl From> for GroupProjective

{ - fn from(p: GroupAffine

) -> GroupProjective

{ - if p.is_zero() { - Self::zero() - } else { - Self::new(p.x, p.y, P::BaseField::one()) - } - } -} - -// The projective point X, Y, Z is represented in the affine -// coordinates as X/Z, Y/Z. -impl From> for GroupAffine

{ - fn from(p: GroupProjective

) -> GroupAffine

{ - if p.is_zero() { - GroupAffine::zero() - } else if p.z.is_one() { - // If Z is one, the point is already normalized. - GroupAffine::new(p.x, p.y, false) - } else { - // Z is nonzero, so it must have an inverse in a field. - let z_inv = p.z.inverse().unwrap(); - let x = p.x * &z_inv; - let y = p.y * &z_inv; - GroupAffine::new(x, y, false) - } - } -} diff --git a/algebra-core/src/curves/models/sw_batch_affine.rs b/algebra-core/src/curves/models/sw_batch_affine.rs index cd77ab8dd..eaa96ab88 100644 --- a/algebra-core/src/curves/models/sw_batch_affine.rs +++ b/algebra-core/src/curves/models/sw_batch_affine.rs @@ -97,8 +97,8 @@ macro_rules! impl_sw_batch_affine { }; } - impl BatchGroupArithmetic for $GroupAffine

{ - type BBaseField = P::BaseField; + impl BatchGroupArithmetic for $GroupAffine

{ + type BaseFieldForBatch = P::BaseField; /// This implementation of batch group ops takes particular /// care to make most use of points fetched from memory to prevent /// reallocations @@ -115,7 +115,7 @@ macro_rules! impl_sw_batch_affine { fn batch_double_in_place( bases: &mut [Self], index: &[u32], - scratch_space: Option<&mut Vec>, + scratch_space: Option<&mut Vec>, ) { let mut inversion_tmp = P::BaseField::one(); @@ -437,7 +437,8 @@ macro_rules! impl_sw_batch_affine { let batch_size = bases.len(); if P::has_glv() { use itertools::{EitherOrBoth::*, Itertools}; - let mut scratch_space = Vec::::with_capacity(bases.len()); + let mut scratch_space = + Vec::::with_capacity(bases.len()); let mut scratch_space_group = Vec::::with_capacity(bases.len() / w); let _now = timer!(); @@ -558,7 +559,8 @@ macro_rules! impl_sw_batch_affine { } timer_println!(_now, "batch ops"); } else { - let mut scratch_space = Vec::::with_capacity(bases.len()); + let mut scratch_space = + Vec::::with_capacity(bases.len()); let opcode_vectorised = Self::batch_wnaf_opcode_recoding::(scalars, w, None); let tables = Self::batch_wnaf_tables(bases, w); diff --git a/algebra-core/src/curves/models/twisted_edwards_extended.rs b/algebra-core/src/curves/models/twisted_edwards_extended.rs index 772c5c714..5e97bd971 100644 --- a/algebra-core/src/curves/models/twisted_edwards_extended.rs +++ b/algebra-core/src/curves/models/twisted_edwards_extended.rs @@ -1,10 +1,15 @@ +#[cfg(not(feature = "cuda"))] +use crate::accel_dummy::*; use crate::{ curves::batch_arith::decode_endo_from_u32, io::{Read, Result as IoResult, Write}, serialize::{EdwardsFlags, Flags}, - BatchGroupArithmetic, CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, + CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, CanonicalSerializeWithFlags, ConstantSerializedSize, UniformRand, Vec, }; +#[cfg(feature = "cuda")] +use {accel::*, log::debug}; + use core::{ fmt::{Display, Formatter, Result as FmtResult}, marker::PhantomData, @@ -16,43 +21,82 @@ use rand::{ Rng, }; +#[cfg(feature = "cuda")] +use { + crate::curves::BatchGroupArithmeticSlice, closure::closure, peekmore::PeekMore, + std::sync::Mutex, +}; + use crate::{ + biginteger::BigInteger, bytes::{FromBytes, ToBytes}, + cfg_chunks_mut, cfg_iter, curves::{ - models::{ - MontgomeryModelParameters as MontgomeryParameters, TEModelParameters as Parameters, - }, - AffineCurve, ProjectiveCurve, + cuda::scalar_mul::{internal::GPUScalarMulInternal, ScalarMulProfiler}, + models::MontgomeryModelParameters, + AffineCurve, BatchGroupArithmetic, ModelParameters, ProjectiveCurve, }, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, + fields::{BitIteratorBE, Field, FpParameters, PrimeField, SquareRootField}, + impl_gpu_cpu_run_kernel, impl_gpu_te_projective, impl_run_kernel, }; + #[cfg(feature = "parallel")] use rayon::prelude::*; +pub trait TEModelParameters: ModelParameters + Sized { + const COEFF_A: Self::BaseField; + const COEFF_D: Self::BaseField; + const COFACTOR: &'static [u64]; + const COFACTOR_INV: Self::ScalarField; + const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); + + type MontgomeryModelParameters: MontgomeryModelParameters; + + #[inline(always)] + fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy *= &Self::COEFF_A; + copy + } + + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const GroupProjective, + exps: *const u8, + out: *mut GroupProjective, + n: isize, + ) -> 
error::Result<()>; + + fn scalar_mul_static_profiler() -> ScalarMulProfiler; + + fn namespace() -> &'static str; +} + #[derive(Derivative)] #[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - PartialEq(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") + Copy(bound = "P: TEModelParameters"), + Clone(bound = "P: TEModelParameters"), + PartialEq(bound = "P: TEModelParameters"), + Eq(bound = "P: TEModelParameters"), + Debug(bound = "P: TEModelParameters"), + Hash(bound = "P: TEModelParameters") )] -#[must_use] -pub struct GroupAffine { +pub struct GroupAffine { pub x: P::BaseField, pub y: P::BaseField, #[derivative(Debug = "ignore")] _params: PhantomData

, } -impl Display for GroupAffine

{ +impl Display for GroupAffine

{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { write!(f, "GroupAffine(x={}, y={})", self.x, self.y) } } -impl GroupAffine

{ +impl GroupAffine

{ pub fn new(x: P::BaseField, y: P::BaseField) -> Self { Self { x, @@ -117,7 +161,7 @@ impl GroupAffine

{ } } -impl Zero for GroupAffine

{ +impl Zero for GroupAffine

{ fn zero() -> Self { Self::new(P::BaseField::zero(), P::BaseField::one()) } @@ -127,7 +171,7 @@ impl Zero for GroupAffine

{ } } -impl AffineCurve for GroupAffine

{ +impl AffineCurve for GroupAffine

{ const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; @@ -206,13 +250,13 @@ macro_rules! batch_add_loop_2 { }; } -impl BatchGroupArithmetic for GroupAffine

{ - type BBaseField = P::BaseField; +impl BatchGroupArithmetic for GroupAffine

{ + type BaseFieldForBatch = P::BaseField; fn batch_double_in_place( bases: &mut [Self], index: &[u32], - _scratch_space: Option<&mut Vec>, + _scratch_space: Option<&mut Vec>, ) { Self::batch_add_in_place( bases, @@ -367,7 +411,7 @@ impl BatchGroupArithmetic for GroupAffine

{ } } -impl Neg for GroupAffine

{ +impl Neg for GroupAffine

{ type Output = Self; fn neg(self) -> Self { @@ -375,9 +419,9 @@ impl Neg for GroupAffine

{ } } -crate::impl_additive_ops_from_ref!(GroupAffine, Parameters); +crate::impl_additive_ops_from_ref!(GroupAffine, TEModelParameters); -impl<'a, P: Parameters> Add<&'a Self> for GroupAffine

{ +impl<'a, P: TEModelParameters> Add<&'a Self> for GroupAffine

{ type Output = Self; fn add(self, other: &'a Self) -> Self { let mut copy = self; @@ -386,7 +430,7 @@ impl<'a, P: Parameters> Add<&'a Self> for GroupAffine

{ } } -impl<'a, P: Parameters> AddAssign<&'a Self> for GroupAffine

{ +impl<'a, P: TEModelParameters> AddAssign<&'a Self> for GroupAffine

{ fn add_assign(&mut self, other: &'a Self) { let y1y2 = self.y * &other.y; let x1x2 = self.x * &other.x; @@ -403,7 +447,7 @@ impl<'a, P: Parameters> AddAssign<&'a Self> for GroupAffine

{ } } -impl<'a, P: Parameters> Sub<&'a Self> for GroupAffine

{ +impl<'a, P: TEModelParameters> Sub<&'a Self> for GroupAffine

{ type Output = Self; fn sub(self, other: &'a Self) -> Self { let mut copy = self; @@ -412,19 +456,19 @@ impl<'a, P: Parameters> Sub<&'a Self> for GroupAffine

{ } } -impl<'a, P: Parameters> SubAssign<&'a Self> for GroupAffine

{ +impl<'a, P: TEModelParameters> SubAssign<&'a Self> for GroupAffine

{ fn sub_assign(&mut self, other: &'a Self) { *self += &(-(*other)); } } -impl MulAssign for GroupAffine

{ +impl MulAssign for GroupAffine

{ fn mul_assign(&mut self, other: P::ScalarField) { *self = self.mul(other.into_repr()).into() } } -impl ToBytes for GroupAffine

{ +impl ToBytes for GroupAffine

{ #[inline] fn write(&self, mut writer: W) -> IoResult<()> { self.x.write(&mut writer)?; @@ -432,7 +476,7 @@ impl ToBytes for GroupAffine

{ } } -impl FromBytes for GroupAffine

{ +impl FromBytes for GroupAffine

{ #[inline] fn read(mut reader: R) -> IoResult { let x = P::BaseField::read(&mut reader)?; @@ -441,14 +485,14 @@ impl FromBytes for GroupAffine

{ } } -impl Default for GroupAffine

{ +impl Default for GroupAffine

{ #[inline] fn default() -> Self { Self::zero() } } -impl Distribution> for Standard { +impl Distribution> for Standard { #[inline] fn sample(&self, rng: &mut R) -> GroupAffine

{ loop { @@ -466,7 +510,7 @@ mod group_impl { use super::*; use crate::groups::Group; - impl Group for GroupAffine

{ + impl Group for GroupAffine

{ type ScalarField = P::ScalarField; #[inline] @@ -491,14 +535,13 @@ mod group_impl { #[derive(Derivative)] #[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") + Copy(bound = "P: TEModelParameters"), + Clone(bound = "P: TEModelParameters"), + Eq(bound = "P: TEModelParameters"), + Debug(bound = "P: TEModelParameters"), + Hash(bound = "P: TEModelParameters") )] -#[must_use] -pub struct GroupProjective { +pub struct GroupProjective { pub x: P::BaseField, pub y: P::BaseField, pub t: P::BaseField, @@ -507,25 +550,25 @@ pub struct GroupProjective { _params: PhantomData

, } -impl PartialEq> for GroupAffine

{ +impl PartialEq> for GroupAffine

{ fn eq(&self, other: &GroupProjective

) -> bool { self.into_projective() == *other } } -impl PartialEq> for GroupProjective

{ +impl PartialEq> for GroupProjective

{ fn eq(&self, other: &GroupAffine

) -> bool { *self == other.into_projective() } } -impl Display for GroupProjective

{ +impl Display for GroupProjective

{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { write!(f, "{}", GroupAffine::from(*self)) } } -impl PartialEq for GroupProjective

{ +impl PartialEq for GroupProjective

{ fn eq(&self, other: &Self) -> bool { if self.is_zero() { return other.is_zero(); @@ -540,7 +583,7 @@ impl PartialEq for GroupProjective

{ } } -impl Distribution> for Standard { +impl Distribution> for Standard { #[inline] fn sample(&self, rng: &mut R) -> GroupProjective

{ loop { @@ -554,7 +597,7 @@ impl Distribution> for Standard { } } -impl ToBytes for GroupProjective

{ +impl ToBytes for GroupProjective

{ #[inline] fn write(&self, mut writer: W) -> IoResult<()> { self.x.write(&mut writer)?; @@ -564,7 +607,7 @@ impl ToBytes for GroupProjective

{ } } -impl FromBytes for GroupProjective

{ +impl FromBytes for GroupProjective

{ #[inline] fn read(mut reader: R) -> IoResult { let x = P::BaseField::read(&mut reader)?; @@ -575,14 +618,14 @@ impl FromBytes for GroupProjective

{ } } -impl Default for GroupProjective

{ +impl Default for GroupProjective

{ #[inline] fn default() -> Self { Self::zero() } } -impl GroupProjective

{ +impl GroupProjective

{ pub fn new(x: P::BaseField, y: P::BaseField, t: P::BaseField, z: P::BaseField) -> Self { Self { x, @@ -594,7 +637,7 @@ impl GroupProjective

{ } } -impl Zero for GroupProjective

{ +impl Zero for GroupProjective

{ fn zero() -> Self { Self::new( P::BaseField::zero(), @@ -609,7 +652,9 @@ impl Zero for GroupProjective

{ } } -impl ProjectiveCurve for GroupProjective

{ +impl_gpu_te_projective!(TEModelParameters); + +impl ProjectiveCurve for GroupProjective

{ const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; @@ -709,7 +754,7 @@ impl ProjectiveCurve for GroupProjective

{ } } -impl Neg for GroupProjective

{ +impl Neg for GroupProjective

{ type Output = Self; fn neg(mut self) -> Self { self.x = -self.x; @@ -718,9 +763,9 @@ impl Neg for GroupProjective

{ } } -crate::impl_additive_ops_from_ref!(GroupProjective, Parameters); +crate::impl_additive_ops_from_ref!(GroupProjective, TEModelParameters); -impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ +impl<'a, P: TEModelParameters> Add<&'a Self> for GroupProjective

{ type Output = Self; fn add(self, other: &'a Self) -> Self { let mut copy = self; @@ -729,7 +774,7 @@ impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ +impl<'a, P: TEModelParameters> AddAssign<&'a Self> for GroupProjective

{ fn add_assign(&mut self, other: &'a Self) { // See "Twisted Edwards Curves Revisited" // Huseyin Hisil, Kenneth Koon-Ho Wong, Gary Carter, and Ed Dawson @@ -773,7 +818,7 @@ impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ +impl<'a, P: TEModelParameters> Sub<&'a Self> for GroupProjective

{ type Output = Self; fn sub(self, other: &'a Self) -> Self { let mut copy = self; @@ -782,13 +827,13 @@ impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> SubAssign<&'a Self> for GroupProjective

{ +impl<'a, P: TEModelParameters> SubAssign<&'a Self> for GroupProjective

{ fn sub_assign(&mut self, other: &'a Self) { *self += &(-(*other)); } } -impl MulAssign for GroupProjective

{ +impl MulAssign for GroupProjective

{ fn mul_assign(&mut self, other: P::ScalarField) { *self = self.mul(other.into_repr()) } @@ -796,7 +841,7 @@ impl MulAssign for GroupProjective

{ // The affine point (X, Y) is represented in the Extended Projective coordinates // with Z = 1. -impl From> for GroupProjective

{ +impl From> for GroupProjective

{ fn from(p: GroupAffine

) -> GroupProjective

{ Self::new(p.x, p.y, p.x * &p.y, P::BaseField::one()) } @@ -804,7 +849,7 @@ impl From> for GroupProjective

{ // The projective point X, Y, T, Z is represented in the affine // coordinates as X/Z, Y/Z. -impl From> for GroupAffine

{ +impl From> for GroupAffine

{ fn from(p: GroupProjective

) -> GroupAffine

{ if p.is_zero() { GroupAffine::zero() @@ -821,7 +866,7 @@ impl From> for GroupAffine

{ } } -impl core::str::FromStr for GroupAffine

+impl core::str::FromStr for GroupAffine

where P::BaseField: core::str::FromStr, { @@ -859,27 +904,27 @@ where #[derive(Derivative)] #[derivative( - Copy(bound = "P: MontgomeryParameters"), - Clone(bound = "P: MontgomeryParameters"), - PartialEq(bound = "P: MontgomeryParameters"), - Eq(bound = "P: MontgomeryParameters"), - Debug(bound = "P: MontgomeryParameters"), - Hash(bound = "P: MontgomeryParameters") + Copy(bound = "P: MontgomeryModelParameters"), + Clone(bound = "P: MontgomeryModelParameters"), + PartialEq(bound = "P: MontgomeryModelParameters"), + Eq(bound = "P: MontgomeryModelParameters"), + Debug(bound = "P: MontgomeryModelParameters"), + Hash(bound = "P: MontgomeryModelParameters") )] -pub struct MontgomeryGroupAffine { +pub struct MontgomeryGroupAffine { pub x: P::BaseField, pub y: P::BaseField, #[derivative(Debug = "ignore")] _params: PhantomData

, } -impl Display for MontgomeryGroupAffine

{ +impl Display for MontgomeryGroupAffine

{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { write!(f, "MontgomeryGroupAffine(x={}, y={})", self.x, self.y) } } -impl MontgomeryGroupAffine

{ +impl MontgomeryGroupAffine

{ pub fn new(x: P::BaseField, y: P::BaseField) -> Self { Self { x, @@ -889,4 +934,4 @@ impl MontgomeryGroupAffine

{ } } -impl_edwards_curve_serializer!(Parameters); +impl_edwards_curve_serializer!(TEModelParameters); diff --git a/algebra-core/src/fields/arithmetic.rs b/algebra-core/src/fields/arithmetic.rs index f84e66499..5fa95cc57 100644 --- a/algebra-core/src/fields/arithmetic.rs +++ b/algebra-core/src/fields/arithmetic.rs @@ -1,18 +1,18 @@ /// All of these methods store intermediate results on the stack, and so /// they support overlap of input and output parameters. -#[cfg(feature = "bw6_asm")] +#[cfg(use_bw6_asm)] extern "C" { pub fn modmul768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); pub fn modadd768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); pub fn modsub768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); } - /// This modular multiplication algorithm uses Montgomery /// reduction for efficient implementation. It also additionally /// uses the "no-carry optimization" outlined /// [here](https://hackmd.io/@zkteam/modular_multiplication) if /// `P::MODULUS` has BOTH (a) a zero MSB, AND (b) at least one /// zero bit in the rest of the modulus. + macro_rules! impl_field_mul_assign { ($limbs:expr) => { #[inline] @@ -255,6 +255,7 @@ macro_rules! impl_field_square_in_place { return self; } } + // Checking the modulus at compile time let first_bit_set = P::MODULUS.0[$limbs - 1] >> 63 != 0; let mut all_bits_set = P::MODULUS.0[$limbs - 1] == !0 - (1 << 63); diff --git a/algebra-core/src/lib.rs b/algebra-core/src/lib.rs index 25fe4f5ae..0d95e37ea 100644 --- a/algebra-core/src/lib.rs +++ b/algebra-core/src/lib.rs @@ -75,6 +75,7 @@ pub use self::fields::*; pub mod biginteger; pub use self::biginteger::*; +#[macro_use] pub mod curves; pub use self::curves::*; diff --git a/algebra/Cargo.toml b/algebra/Cargo.toml index 4c56a90b7..91498cb02 100644 --- a/algebra/Cargo.toml +++ b/algebra/Cargo.toml @@ -23,6 +23,10 @@ edition = "2018" [dependencies] algebra-core = { path = "../algebra-core", default-features = false } +accel = { git = "https://github.com/jon-chuang/accel", package = "accel", optional = true } +# accel = { path = "/home/jonch/Desktop/Programming/Rust/accel/accel", optional = true } +lazy_static = { version = "1.4.0", optional = true } +paste = "0.1" [dev-dependencies] rand = { version = "0.7", default-features = false } @@ -73,6 +77,7 @@ mnt6_298 = [] mnt6_753 = [] curve = [] +cuda_test = [] batch_affine = [] msm = [] verify = [] @@ -91,8 +96,9 @@ parallel = [ "std", "algebra-core/parallel" ] parallel_random_gen = [] derive = [ "algebra-core/derive" ] asm = [ "algebra-core/llvm_asm" ] +bw6_asm = [ "algebra-core/bw6_asm" ] prefetch = [ "algebra-core/prefetch"] +cuda = [ "algebra-core/cuda", "accel", "std", "lazy_static" ] timing = [ "algebra-core/timing"] timing_detailed = [ "algebra-core/timing_detailed" ] timing_thread_id = [ "algebra-core/timing_thread_id" ] -bw6_asm = [ "algebra-core/bw6_asm" ] diff --git a/algebra/src/bls12_377/curves/g1.rs b/algebra/src/bls12_377/curves/g1.rs index 3c318afda..1fb3c6786 100644 --- a/algebra/src/bls12_377/curves/g1.rs +++ b/algebra/src/bls12_377/curves/g1.rs @@ -1,13 +1,18 @@ use algebra_core::{ biginteger::{BigInteger256, BigInteger384, BigInteger512}, curves::{ + bls12, models::{ModelParameters, SWModelParameters}, GLVParameters, }, - field_new, impl_glv_for_sw, PrimeField, Zero, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, PrimeField, + Zero, }; -use crate::bls12_377::{Fq, Fr}; +use crate::{bls12_377, bls12_377::*}; + +pub type G1Affine = bls12::G1Affine; +pub type G1Projective = 
bls12::G1Projective; #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; @@ -17,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bls12_377, "bls12_377", g1, G1Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -88,6 +95,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G1Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bls12_377/curves/g2.rs b/algebra/src/bls12_377/curves/g2.rs index efab698de..dd221381e 100644 --- a/algebra/src/bls12_377/curves/g2.rs +++ b/algebra/src/bls12_377/curves/g2.rs @@ -1,13 +1,19 @@ -use crate::bls12_377::{g1, Fq, Fq2, Fr}; use algebra_core::{ biginteger::{BigInteger256, BigInteger384, BigInteger512}, curves::{ + bls12, models::{ModelParameters, SWModelParameters}, GLVParameters, }, - field_new, impl_glv_for_sw, PrimeField, Zero, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, PrimeField, + Zero, }; +use crate::{bls12_377, bls12_377::*}; + +pub type G2Affine = bls12::G2Affine; +pub type G2Projective = bls12::G2Projective; + #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; @@ -16,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bls12_377, "bls12_377", g2, G2Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -114,6 +122,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G2Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bls12_377/curves/mod.rs b/algebra/src/bls12_377/curves/mod.rs index bc3c1a127..286feac59 100644 --- a/algebra/src/bls12_377/curves/mod.rs +++ b/algebra/src/bls12_377/curves/mod.rs @@ -1,11 +1,11 @@ use crate::bls12_377::*; -use algebra_core::curves::{ - bls12, - bls12::{Bls12, Bls12Parameters, TwistType}, -}; +use algebra_core::curves::bls12::{Bls12, Bls12Parameters, TwistType}; pub mod g1; +pub use self::g1::{G1Affine, G1Projective}; + pub mod g2; +pub use self::g2::{G2Affine, G2Projective}; #[cfg(test)] mod tests; @@ -26,8 +26,3 @@ impl Bls12Parameters for Parameters { } pub type Bls12_377 = Bls12; - -pub type G1Affine = bls12::G1Affine; -pub type G1Projective = bls12::G1Projective; -pub type G2Affine = bls12::G2Affine; -pub type G2Projective = bls12::G2Projective; diff --git a/algebra/src/bls12_381/curves/g1.rs b/algebra/src/bls12_381/curves/g1.rs index b7508f27f..f0fa7ba72 100644 --- a/algebra/src/bls12_381/curves/g1.rs +++ b/algebra/src/bls12_381/curves/g1.rs @@ -7,7 +7,8 @@ use crate::{ models::{ModelParameters, SWModelParameters}, GLVParameters, }, - field_new, impl_glv_for_sw, PrimeField, Zero, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, PrimeField, + Zero, }; pub type G1Affine = bls12::G1Affine; @@ -21,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bls12_381, "bls12_381", g1, G1Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -91,6 +94,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G1Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bls12_381/curves/g2.rs b/algebra/src/bls12_381/curves/g2.rs index a851d53e0..c62d759ef 100644 --- a/algebra/src/bls12_381/curves/g2.rs +++ 
b/algebra/src/bls12_381/curves/g2.rs @@ -7,7 +7,8 @@ use crate::{ models::{ModelParameters, SWModelParameters}, GLVParameters, }, - field_new, impl_glv_for_sw, PrimeField, Zero, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, PrimeField, + Zero, }; pub type G2Affine = bls12::G2Affine; @@ -21,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bls12_381, "bls12_381", g2, G2Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -100,6 +103,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G2Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bn254/curves/g1.rs b/algebra/src/bn254/curves/g1.rs index b9b59ce23..c020d00af 100644 --- a/algebra/src/bn254/curves/g1.rs +++ b/algebra/src/bn254/curves/g1.rs @@ -1,10 +1,17 @@ use algebra_core::{ biginteger::{BigInteger256, BigInteger512}, - curves::models::{ModelParameters, SWModelParameters}, - field_new, impl_glv_for_sw, GLVParameters, PrimeField, Zero, + curves::{ + bn, + models::{ModelParameters, SWModelParameters}, + }, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, GLVParameters, + PrimeField, Zero, }; -use crate::bn254::{Fq, Fr}; +use crate::{bn254, bn254::*}; + +pub type G1Affine = bn::G1Affine; +pub type G1Projective = bn::G1Projective; #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; @@ -14,6 +21,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bn254, "bn254", g1, G1Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -83,6 +92,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G1Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bn254/curves/g2.rs b/algebra/src/bn254/curves/g2.rs index d4c51e6f5..c2b7382e9 100644 --- a/algebra/src/bn254/curves/g2.rs +++ b/algebra/src/bn254/curves/g2.rs @@ -1,10 +1,17 @@ use algebra_core::{ biginteger::{BigInteger256, BigInteger512}, - curves::models::{ModelParameters, SWModelParameters}, - field_new, impl_glv_for_sw, GLVParameters, PrimeField, Zero, + curves::{ + bn, + models::{ModelParameters, SWModelParameters}, + }, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, GLVParameters, + PrimeField, Zero, }; -use crate::bn254::{g1, Fq, Fq2, Fr}; +use crate::{bn254, bn254::*}; + +pub type G2Affine = bn::G2Affine; +pub type G2Projective = bn::G2Projective; #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; @@ -14,6 +21,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bn254, "bn254", g2, G2Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -107,6 +116,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G2Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bn254/curves/mod.rs b/algebra/src/bn254/curves/mod.rs index 396b77668..53cbeac3e 100644 --- a/algebra/src/bn254/curves/mod.rs +++ b/algebra/src/bn254/curves/mod.rs @@ -1,14 +1,14 @@ use crate::bn254::*; use algebra_core::{ biginteger::BigInteger256, - curves::{ - bn, - bn::{Bn, BnParameters, TwistType}, - }, + curves::bn::{Bn, BnParameters, TwistType}, field_new, }; pub mod g1; +pub use 
self::g1::{G1Affine, G1Projective}; + pub mod g2; +pub use self::g2::{G2Affine, G2Projective}; #[cfg(test)] mod tests; @@ -78,8 +78,3 @@ impl BnParameters for Parameters { } pub type Bn254 = Bn; - -pub type G1Affine = bn::G1Affine; -pub type G1Projective = bn::G1Projective; -pub type G2Affine = bn::G2Affine; -pub type G2Projective = bn::G2Projective; diff --git a/algebra/src/bw6_761/curves/g1.rs b/algebra/src/bw6_761/curves/g1.rs index a6512199e..941bc5aa4 100644 --- a/algebra/src/bw6_761/curves/g1.rs +++ b/algebra/src/bw6_761/curves/g1.rs @@ -8,7 +8,7 @@ use crate::{ }, field_new, fields::PrimeField, - impl_glv_for_sw, + impl_glv_for_sw, impl_scalar_mul_kernel_glv, impl_scalar_mul_parameters, }; pub type G1Affine = GroupAffine; @@ -22,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel_glv!(bw6_761, "bw6_761", g1, G1Projective); + /// The parameters can be obtained from /// Optimized and secure pairing-friendly elliptic /// curves suitable for one layer proof composition @@ -161,6 +163,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G1Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bw6_761/curves/g2.rs b/algebra/src/bw6_761/curves/g2.rs index a3d363067..619f20552 100644 --- a/algebra/src/bw6_761/curves/g2.rs +++ b/algebra/src/bw6_761/curves/g2.rs @@ -8,7 +8,7 @@ use crate::{ }, field_new, fields::PrimeField, - impl_glv_for_sw, + impl_glv_for_sw, impl_scalar_mul_kernel_glv, impl_scalar_mul_parameters, }; pub type G2Affine = GroupAffine; @@ -22,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel_glv!(bw6_761, "bw6_761", g2, G2Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger768; @@ -154,6 +156,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G2Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/cp6_782/curves/g1.rs b/algebra/src/cp6_782/curves/g1.rs index c2d05df2e..ebe37e417 100644 --- a/algebra/src/cp6_782/curves/g1.rs +++ b/algebra/src/cp6_782/curves/g1.rs @@ -5,7 +5,7 @@ use crate::{ models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = GroupAffine; @@ -19,6 +19,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(cp6_782, "cp6_782", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 5 #[rustfmt::skip] @@ -84,6 +86,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } /// G1_GENERATOR_X = diff --git a/algebra/src/cp6_782/curves/g2.rs b/algebra/src/cp6_782/curves/g2.rs index 88d0ea2ce..4d30afcd1 100644 --- a/algebra/src/cp6_782/curves/g2.rs +++ b/algebra/src/cp6_782/curves/g2.rs @@ -5,7 +5,7 @@ use crate::{ models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G2Affine = GroupAffine; @@ -19,6 +19,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(cp6_782, "cp6_782", g2, G2Projective); + impl SWModelParameters for Parameters { /// COEFF_A = (0, 
0, COEFF_A * TWIST^2) = (0, 0, 5) #[rustfmt::skip] @@ -118,6 +120,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G2_GENERATOR_X, G2_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G2_GENERATOR_X, G2_GENERATOR_Y); + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq3 = diff --git a/algebra/src/ed_on_bls12_377/curves/mod.rs b/algebra/src/ed_on_bls12_377/curves/mod.rs index 5fd929481..d76440175 100644 --- a/algebra/src/ed_on_bls12_377/curves/mod.rs +++ b/algebra/src/ed_on_bls12_377/curves/mod.rs @@ -5,9 +5,11 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; +impl_scalar_mul_kernel!(ed_on_bls12_377, "ed_on_bls12_377", proj, EdwardsProjective); + #[cfg(test)] mod tests; @@ -65,6 +67,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -*elem } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_bls12_381/curves/mod.rs b/algebra/src/ed_on_bls12_381/curves/mod.rs index fe01f833a..6c4d254c6 100644 --- a/algebra/src/ed_on_bls12_381/curves/mod.rs +++ b/algebra/src/ed_on_bls12_381/curves/mod.rs @@ -5,7 +5,7 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; #[cfg(test)] @@ -58,6 +58,8 @@ impl ModelParameters for EdwardsParameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(ed_on_bls12_381, "ed_on_bls12_381", proj, EdwardsProjective); + impl TEModelParameters for EdwardsParameters { /// COEFF_A = -1 #[rustfmt::skip] @@ -100,6 +102,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -(*elem) } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_bn254/curves/mod.rs b/algebra/src/ed_on_bn254/curves/mod.rs index d4286349e..41634da40 100644 --- a/algebra/src/ed_on_bn254/curves/mod.rs +++ b/algebra/src/ed_on_bn254/curves/mod.rs @@ -5,7 +5,7 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; #[cfg(test)] @@ -44,6 +44,7 @@ impl ModelParameters for EdwardsParameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(ed_on_bn254, "ed_on_bn254", proj, EdwardsProjective); impl TEModelParameters for EdwardsParameters { /// COEFF_A = 1 #[rustfmt::skip] @@ -86,6 +87,8 @@ impl TEModelParameters for EdwardsParameters { const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (GENERATOR_X, GENERATOR_Y); type MontgomeryModelParameters = EdwardsParameters; + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_cp6_782/curves/mod.rs b/algebra/src/ed_on_cp6_782/curves/mod.rs index 0e218cc4e..face754c7 100644 --- a/algebra/src/ed_on_cp6_782/curves/mod.rs +++ b/algebra/src/ed_on_cp6_782/curves/mod.rs @@ -4,7 +4,7 @@ use crate::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, 
twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; use crate::ed_on_cp6_782::{fq::Fq, fr::Fr}; @@ -23,6 +23,7 @@ impl ModelParameters for EdwardsParameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(ed_on_cp6_782, "ed_on_cp6_782", proj, EdwardsProjective); impl TEModelParameters for EdwardsParameters { /// COEFF_A = -1 = /// 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458176 @@ -72,6 +73,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -*elem } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_mnt4_298/curves/mod.rs b/algebra/src/ed_on_mnt4_298/curves/mod.rs index 681a885e1..d5e5879f9 100644 --- a/algebra/src/ed_on_mnt4_298/curves/mod.rs +++ b/algebra/src/ed_on_mnt4_298/curves/mod.rs @@ -5,7 +5,7 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; #[cfg(test)] @@ -27,6 +27,8 @@ impl ModelParameters for EdwardsParameters { // R for Fq: 223364648326281414938801705359223029554923725549792420683051274872200260503540791531766876 // R for Fr: 104384076783966083500464392945960916666734135485183910065100558776489954102951241798239545 +impl_scalar_mul_kernel!(ed_on_mnt4_298, "ed_on_mnt4_298", proj, EdwardsProjective); + impl TEModelParameters for EdwardsParameters { /// COEFF_A = -1 /// Needs to be in the Montgomery residue form in Fq @@ -81,6 +83,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -*elem } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_mnt4_753/curves/mod.rs b/algebra/src/ed_on_mnt4_753/curves/mod.rs index 1bcf02e3e..67742eef7 100644 --- a/algebra/src/ed_on_mnt4_753/curves/mod.rs +++ b/algebra/src/ed_on_mnt4_753/curves/mod.rs @@ -5,7 +5,7 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; #[cfg(test)] @@ -27,6 +27,7 @@ impl ModelParameters for EdwardsParameters { // R for Fq: 11407975440035778516953587871987109648531742722982233186120790377529569367095961954159305159259556262528904776132787438725571821295685691762729353555475679813615501328617736020411951837995932262333059670631633855898874183380802 // R for Fr: 933352698056040166367534174176950366489065242993745918174914647273231163953185260894581718311971532174387033963715296372791285468903747270837716556902938133611910788060028435531754797383796835009316018259656953442114538695438 +impl_scalar_mul_kernel!(ed_on_mnt4_753, "ed_on_mnt4_753", proj, EdwardsProjective); impl TEModelParameters for EdwardsParameters { /// COEFF_A = -1 /// Needs to be in the Montgomery residue form in Fq @@ -102,6 +103,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -*elem } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/mnt4_298/curves/g1.rs b/algebra/src/mnt4_298/curves/g1.rs index 
e17684810..a70ac5996 100644 --- a/algebra/src/mnt4_298/curves/g1.rs +++ b/algebra/src/mnt4_298/curves/g1.rs @@ -5,7 +5,7 @@ use algebra_core::{ mnt4, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = mnt4::G1Affine; @@ -20,6 +20,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(mnt4_298, "mnt4_298", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 2 /// Reference: https://github.com/scipr-lab/libff/blob/c927821ebe02e0a24b5e0f9170cec5e211a35f08/libff/algebra/curves/mnt/mnt4/mnt4_init.cpp#L116 @@ -54,6 +56,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } // Generator of G1 diff --git a/algebra/src/mnt4_298/curves/g2.rs b/algebra/src/mnt4_298/curves/g2.rs index 9b5c89a63..84b5a4bfd 100644 --- a/algebra/src/mnt4_298/curves/g2.rs +++ b/algebra/src/mnt4_298/curves/g2.rs @@ -6,7 +6,7 @@ use algebra_core::{ mnt4::MNT4Parameters, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G2Affine = mnt4::G2Affine; @@ -29,6 +29,8 @@ pub const MUL_BY_A_C0: Fq = G1_COEFF_A_NON_RESIDUE; #[rustfmt::skip] pub const MUL_BY_A_C1: Fq = G1_COEFF_A_NON_RESIDUE; +impl_scalar_mul_kernel!(mnt4_298, "mnt4_298", g2, G2Projective); + impl SWModelParameters for Parameters { const COEFF_A: Fq2 = mnt4_298::Parameters::TWIST_COEFF_A; // B coefficient of MNT4-298 G2 = @@ -82,6 +84,8 @@ impl SWModelParameters for Parameters { fn mul_by_a(elt: &Fq2) -> Fq2 { field_new!(Fq2, MUL_BY_A_C0 * &elt.c0, MUL_BY_A_C1 * &elt.c1,) } + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq2 = field_new!(Fq2, G2_GENERATOR_X_C0, G2_GENERATOR_X_C1); diff --git a/algebra/src/mnt4_753/curves/g1.rs b/algebra/src/mnt4_753/curves/g1.rs index ce101a3b2..90a11fa0d 100644 --- a/algebra/src/mnt4_753/curves/g1.rs +++ b/algebra/src/mnt4_753/curves/g1.rs @@ -5,7 +5,7 @@ use algebra_core::{ mnt4, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = mnt4::G1Affine; @@ -20,6 +20,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(mnt4_753, "mnt4_753", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 2 #[rustfmt::skip] @@ -66,6 +68,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } // Generator of G1 diff --git a/algebra/src/mnt4_753/curves/g2.rs b/algebra/src/mnt4_753/curves/g2.rs index e5e9f8c4c..28ea85853 100644 --- a/algebra/src/mnt4_753/curves/g2.rs +++ b/algebra/src/mnt4_753/curves/g2.rs @@ -6,7 +6,7 @@ use algebra_core::{ mnt4::MNT4Parameters, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G2Affine = mnt4::G2Affine; @@ -29,6 +29,8 @@ pub const MUL_BY_A_C0: Fq = G1_COEFF_A_NON_RESIDUE; #[rustfmt::skip] pub const MUL_BY_A_C1: Fq = G1_COEFF_A_NON_RESIDUE; +impl_scalar_mul_kernel!(mnt4_753, "mnt4_753", g2, G2Projective); + impl 
SWModelParameters for Parameters { const COEFF_A: Fq2 = mnt4_753::Parameters::TWIST_COEFF_A; // B coefficient of MNT4-753 G2 = @@ -103,6 +105,8 @@ impl SWModelParameters for Parameters { fn mul_by_a(elt: &Fq2) -> Fq2 { field_new!(Fq2, MUL_BY_A_C0 * &elt.c0, MUL_BY_A_C1 * &elt.c1,) } + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq2 = field_new!(Fq2, G2_GENERATOR_X_C0, G2_GENERATOR_X_C1); diff --git a/algebra/src/mnt6_298/curves/g1.rs b/algebra/src/mnt6_298/curves/g1.rs index f10388cab..c476b91f8 100644 --- a/algebra/src/mnt6_298/curves/g1.rs +++ b/algebra/src/mnt6_298/curves/g1.rs @@ -5,7 +5,7 @@ use algebra_core::{ mnt6, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = mnt6::G1Affine; @@ -19,6 +19,9 @@ impl ModelParameters for Parameters { type BaseField = Fq; type ScalarField = Fr; } + +impl_scalar_mul_kernel!(mnt6_298, "mnt6_298", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = #[rustfmt::skip] @@ -57,6 +60,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } /// G1_GENERATOR_X = diff --git a/algebra/src/mnt6_298/curves/g2.rs b/algebra/src/mnt6_298/curves/g2.rs index a4b779f1f..f5411f24f 100644 --- a/algebra/src/mnt6_298/curves/g2.rs +++ b/algebra/src/mnt6_298/curves/g2.rs @@ -6,7 +6,7 @@ use algebra_core::{ mnt6::MNT6Parameters, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G2Affine = mnt6::G2Affine; @@ -44,6 +44,8 @@ pub const MUL_BY_A_C1: Fq = field_new!(Fq, BigInteger320([ /// MUL_BY_A_C2 = COEFF_A pub const MUL_BY_A_C2: Fq = g1::Parameters::COEFF_A; +impl_scalar_mul_kernel!(mnt6_298, "mnt6_298", g2, G2Projective); + impl SWModelParameters for Parameters { const COEFF_A: Fq3 = mnt6_298::Parameters::TWIST_COEFF_A; #[rustfmt::skip] @@ -99,6 +101,8 @@ impl SWModelParameters for Parameters { MUL_BY_A_C2 * &elt.c0, ) } + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq3 = diff --git a/algebra/src/mnt6_753/curves/g1.rs b/algebra/src/mnt6_753/curves/g1.rs index 7ba2daf0d..9765e47fd 100644 --- a/algebra/src/mnt6_753/curves/g1.rs +++ b/algebra/src/mnt6_753/curves/g1.rs @@ -5,7 +5,7 @@ use algebra_core::{ mnt6, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = mnt6::G1Affine; @@ -20,6 +20,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(mnt6_753, "mnt6_753", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 11 #[rustfmt::skip] @@ -66,6 +68,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } // Generator of G1 diff --git a/algebra/src/mnt6_753/curves/g2.rs b/algebra/src/mnt6_753/curves/g2.rs index a203b25c1..9da13d77a 100644 --- a/algebra/src/mnt6_753/curves/g2.rs +++ b/algebra/src/mnt6_753/curves/g2.rs @@ -6,7 +6,7 @@ use algebra_core::{ mnt6::MNT6Parameters, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, 
impl_scalar_mul_parameters, }; pub type G2Affine = mnt6::G2Affine; @@ -58,6 +58,8 @@ pub const MUL_BY_A_C1: Fq = field_new!(Fq, BigInteger768([ /// MUL_BY_A_C2 = COEFF_A pub const MUL_BY_A_C2: Fq = g1::Parameters::COEFF_A; +impl_scalar_mul_kernel!(mnt6_753, "mnt6_753", g2, G2Projective); + impl SWModelParameters for Parameters { const COEFF_A: Fq3 = mnt6_753::Parameters::TWIST_COEFF_A; // B coefficient of MNT6-753 G2 = @@ -152,6 +154,8 @@ impl SWModelParameters for Parameters { MUL_BY_A_C2 * &elt.c0, ) } + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq3 = diff --git a/algebra/src/tests/cuda.rs b/algebra/src/tests/cuda.rs new file mode 100644 index 000000000..e407838fe --- /dev/null +++ b/algebra/src/tests/cuda.rs @@ -0,0 +1,61 @@ +use algebra_core::{ + cuda::scalar_mul::{GPUScalarMul, GPUScalarMulSlice, MAX_GROUP_ELEM_BYTES}, + AffineCurve, BatchGroupArithmeticSlice, PrimeField, UniformRand, Zero, +}; +use rand::SeedableRng; +use rand_xorshift::XorShiftRng; + +use crate::{cfg_chunks_mut, tests::helpers::create_pseudo_uniform_random_elems}; + +const CHUNK_SIZE: usize = 1 << 12; + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +#[allow(unused)] +pub fn test_cuda_scalar_mul() { + #[cfg(not(feature = "big_n"))] + const MAX_LOGN: usize = 14; + #[cfg(feature = "big_n")] + const MAX_LOGN: usize = 20; + + let cuda_group_size = 1 << 5; + if core::mem::size_of::() >= MAX_GROUP_ELEM_BYTES { + println!("Group size too large to run on GPU, defaulting to CPU-only implementation"); + } + + const SAMPLES: usize = 1 << MAX_LOGN; + + let _lol = G::Projective::zero(); + let mut rng = XorShiftRng::seed_from_u64(234872845u64); + + let exps_h = (0..SAMPLES) + .map(|_| G::ScalarField::rand(&mut rng).into_repr()) + .collect::>(); + let mut bases_h = create_pseudo_uniform_random_elems::(&mut rng, MAX_LOGN); + + let mut bases_d = bases_h.to_vec(); + let mut exps_cpu = exps_h.to_vec(); + + let now = std::time::Instant::now(); + cfg_chunks_mut!(bases_h, CHUNK_SIZE) + .zip(cfg_chunks_mut!(exps_cpu, CHUNK_SIZE)) + .for_each(|(b, s)| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + println!("CPU mul: {}us", now.elapsed().as_micros()); + + ::Projective::clear_gpu_profiling_data(); + + let mut junk_data = bases_d.to_vec(); + for _ in 0..3 { + let now = std::time::Instant::now(); + &mut junk_data[..].cpu_gpu_scalar_mul(&exps_h[..], cuda_group_size, CHUNK_SIZE); + println!("CPU + GPU mul: {}us", now.elapsed().as_micros()); + } + let now = std::time::Instant::now(); + &mut bases_d[..].cpu_gpu_scalar_mul(&exps_h[..], cuda_group_size, CHUNK_SIZE); + println!("CPU + GPU mul: {}us", now.elapsed().as_micros()); + + for (b_h, b_d) in bases_h.into_iter().zip(bases_d.into_iter()) { + assert_eq!(b_h, b_d); + } +} diff --git a/algebra/src/tests/macros.rs b/algebra/src/tests/macros.rs index f4f0b089a..72584e57d 100644 --- a/algebra/src/tests/macros.rs +++ b/algebra/src/tests/macros.rs @@ -7,7 +7,7 @@ macro_rules! std_curve_tests { }; use rand::Rng; - use crate::tests::{curves::*, groups::*, msm::*}; + use crate::tests::{cuda::*, curves::*, groups::*, msm::*}; #[test] #[cfg(feature = "curve")] @@ -99,6 +99,18 @@ macro_rules! 
std_curve_tests { test_msm::(); } + #[test] + #[cfg(any(feature = "curve", feature = "cuda_test"))] + fn test_g1_cuda_scalar_mul() { + test_cuda_scalar_mul::(); + } + + #[test] + #[cfg(any(feature = "curve", feature = "cuda_test"))] + fn test_g2_cuda_scalar_mul() { + test_cuda_scalar_mul::(); + } + #[test] #[cfg(feature = "pairing")] fn test_bilinearity() { @@ -152,7 +164,7 @@ macro_rules! edwards_curve_tests { }; use rand::Rng; - use crate::tests::{curves::*, groups::*, msm::*}; + use crate::tests::{cuda::*, curves::*, groups::*, msm::*}; #[test] #[cfg(feature = "curve")] @@ -206,7 +218,13 @@ macro_rules! edwards_curve_tests { } #[test] - #[cfg(feature = "curve")] + #[cfg(any(feature = "curve", feature = "cuda_test"))] + fn test_edwards_cuda_scalar_mul() { + test_cuda_scalar_mul::(); + } + + #[test] + #[cfg(any(feature = "curve", feature = "cuda_test"))] fn test_generator() { let generator = EdwardsAffine::prime_subgroup_generator(); assert!(generator.is_on_curve()); diff --git a/algebra/src/tests/mod.rs b/algebra/src/tests/mod.rs index 93864eadf..fee88d8e9 100644 --- a/algebra/src/tests/mod.rs +++ b/algebra/src/tests/mod.rs @@ -1,3 +1,4 @@ +pub(crate) mod cuda; pub(crate) mod curves; pub(crate) mod fields; pub(crate) mod groups;
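The new cuda test above is the intended entry point for the feature: generate scalars, run the CPU-only batched multiplication as a reference, then run the CPU + GPU co-processing path over the same inputs. A trimmed-down usage sketch on a concrete curve follows; block and chunk sizes are taken from the test, and the GPU path only engages when the crate is built with the cuda feature and a working accel toolchain, otherwise the same call presumably runs the CPU implementation:

use algebra::bls12_377::{Fr, G1Affine};
use algebra_core::{
    cuda::scalar_mul::GPUScalarMulSlice, BatchGroupArithmeticSlice, PrimeField, UniformRand,
};
use rand::SeedableRng;
use rand_xorshift::XorShiftRng;

fn scalar_mul_demo(bases: &mut [G1Affine]) {
    let mut rng = XorShiftRng::seed_from_u64(0);
    let exps: Vec<_> = (0..bases.len())
        .map(|_| Fr::rand(&mut rng).into_repr())
        .collect();

    // CPU-only reference: batched wNAF with window size 4. It mutates its
    // exponent slice, hence the working copies.
    let mut cpu_bases = bases.to_vec();
    let mut cpu_exps = exps.clone();
    cpu_bases[..].batch_scalar_mul_in_place(&mut cpu_exps[..], 4);

    // CPU + GPU co-processing over the original bases: 1 << 5 is the CUDA
    // block size ("cuda_group_size") and 1 << 12 the per-chunk size, matching
    // the values used in algebra/src/tests/cuda.rs.
    bases.cpu_gpu_scalar_mul(&exps[..], 1 << 5, 1 << 12);
}

The #[cfg(any(feature = "curve", feature = "cuda_test"))] gates added to the test macros mean these paths can be exercised through the new cuda_test feature without enabling the full curve test suite.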