From 7c518fd419c3ebed471314473a4752952162262f Mon Sep 17 00:00:00 2001 From: jon-chuang <9093549+jon-chuang@users.noreply.github.com> Date: Wed, 11 Nov 2020 00:57:36 +0800 Subject: [PATCH] CUDA Scalar Mul (#17) * First draft affine batch ops & wnaf * changes to mutability and lifetimes * delete superfluous files * crazy direction: Passing a FnMut to generate an iterator locally * unsuccessful further attempts * compile sucess using index approach * fixes for mutable borrows * Successfully passed scalar mul test * benchmarks + prefetching * stash * generic impl of batch arith for all affinecurves * batched affine formulas for TE - too expensive * improved TE affine * cleanup batch inversion * fmt... * fix minor error * remove debugging scaffolding * fmt... * delete batch arith bench as not suitable for criterion or bench * fix bench removal errors * fmt... * added missing coeff_a * refactor BatchGroupArithmetic to be separate trait * Batch verification with radix sort * Cache-locality & parallelisation * Successfully impl batch verify * added tests and bench for batch_ver, parallel_random_gen, ^ thread util * fmt * enabled missing test * remove voracious_radix_sort * commented unneeded Instant::now() * Fixed batch_ver tests for curves of small or unit cofactor * split recursive and non-recursive, tidy up shared functionality * reduce max_logn * adjust max_logn further * Batch MSM, speedup only for bw6 due to poor cache performance * fmt... * GLV iBiginteger * stash * stash * GLV with Parameter-based specialisation * GLV lattice basis script success * Successfully passed tests and benched * Improvments to MSM with and bucketed adds using lightweight index sort * changed rng to be external parameter for non-parallel batch veri * remove bench print scaffolding * remove old batch_bucketed_add using vectors instead of fixed offsets * retain parallel batch_add_split * Comments for batch arith * remove need for hashmap for no std for batch_bucketed_add * minor changes * cleanup * cleanup * fmt + use no_std Vec * removed std:: * add scratch space * Add GLV for non-batched SW mul * fix for glv_scalar_decomposition when k == MODULUS (subgroup check) * Fixed performance BUG: unnecessary table generation * GLV -> has_glv(), bigint slice bd check, refactor batch loops, u32 index * clean remove of batch_verify * fix mistake with elems indexing, unused arg for future recursion PR * trivial errors * more minor fixes * fix issues with batch_ver (.is_zero(), TE affine->proj mul) * fix issue with batch_bucketed_add_split * misname * Success in test and bench \(*v*)/ * tmp commit to cache experimental batch_add_write_shift_.. * remove batch_add_write_shift.. * optional dep, fmt... * undo accidental deletion of dlsd sort * fmt... * cleanup batch bucket add, unify impl * no std... * fixed tests * fixed unimplemented for TE, swapped wnaf table row/col for batchaddwrite * wnaf table generation uses fewer copies, remove timing instrumentation * Minor Cleanup * Add feature-activated timing instrumentation, reduce code bloat (wnaf) * unused var, no_std * Make timing macros defined globally, instrument more code * instrument w/ tid, better num_rounds est. 
f64, timing black/whitelisting * Minor changes * refactor tests, generic MSM test * 2D test matrix :) * batchaffine * tests * additive features * big_n feature for test-benching * prefetch unroll * minor adjustments * extension(s -> "")_fields * remove artifacts, fix asm * uncomment subgroup checks, glv param sources * gpu scalar mul * fix dependency issues * Extend GPU scalar mul to all curves * refactor * CPU + GPU coprocessing * With suboptimal BW6 assembly * add static partitioning * profiling-based static partitioining * statically partition between multiple gpus * comments * BBaseField -> BaseFieldForBatch * Outline of basic traits * Remove sw_proj, add gpu support for all sw projective curves * impl gpu kernels for all curves * feature-gate with "cuda" * rename curves/gpu directory to curves/cuda * Fix merge errors * Use github rather than local jon-chuang/accel * again * again * update README * feature = "cuda" * gpu_standalone (good for non-generic), feature gate under cuda too * fix merging errors * make helpers a same-file module * remove cancerous --all-features from github yml * Use dummy accel_dummy crate for when not compiling as CUDA * feature gate accel import * fix no_std * fix gpu-standalone does not depend algebra-core/cuda * lazy static optional * kernel-specific static profile data * cuda test, cached profile data (in OS cache dir) for all curves * rectify omission of NAMESPACE, minor errors * fix no_std, group size in bits too large for 2 groups (mnt6, cp6 - Fq3) * toml fixes * update README * remove extraneous file * bake in check for oversized group elems * typo * remove boilerplate/compactify * remove standalone * fmt * fix println and comments * fix: typo * Update README.md Co-authored-by: Kobi Gurkan * Make GPUScalarMulInternal APIs, only expose two APIs exposing more APIs is future work * add ci to test cuda compilation/link and cuda scalar mul when no gpu * change kernel accel compile branch to master * fix ci * use unreachable instead of empty implementation * install required toolchain * Empty commit to get CI working * try to fix ci * fmt * fix ci * safer error handling in gpu code * fix ci * handle dirs crate not available without cuda * don't check early intermediate results * fix no_std and nightly * fix remaining errors * No for_tests * Feature gate clear profile data * install cuda library to successfully link * change the order of CI jobs * change the order of CI again * cd .. 
* Get rid of cacheing * Never all features * Put back cacheing * Remove cuda .deb to save disk space * Increase max-parallel * check examples with all features Co-authored-by: Kobi Gurkan --- .github/workflows/ci.yml | 47 +- Cargo.toml | 2 +- README.md | 7 + algebra-benches/Cargo.toml | 3 +- algebra-core/Cargo.toml | 14 +- algebra-core/algebra-core-derive/Cargo.toml | 2 +- algebra-core/mince/Cargo.toml | 2 +- algebra-core/src/bytes.rs | 2 +- algebra-core/src/curves/batch_arith.rs | 4 +- algebra-core/src/curves/cuda/accel_dummy.rs | 9 + algebra-core/src/curves/cuda/mod.rs | 6 + .../curves/cuda/scalar_mul/cpu_gpu_macros.rs | 298 +++++++++++++ .../curves/cuda/scalar_mul/kernel_macros.rs | 176 ++++++++ .../src/curves/cuda/scalar_mul/mod.rs | 357 +++++++++++++++ .../cuda/scalar_mul/run_kernel_macros.rs | 100 +++++ algebra-core/src/curves/glv.rs | 1 + algebra-core/src/curves/mod.rs | 10 +- algebra-core/src/curves/models/mod.rs | 68 +-- .../curves/models/short_weierstrass_affine.rs | 64 ++- .../models/short_weierstrass_jacobian.rs | 162 +++++-- .../models/short_weierstrass_projective.rs | 415 ------------------ .../src/curves/models/sw_batch_affine.rs | 12 +- .../curves/models/twisted_edwards_extended.rs | 187 +++++--- algebra-core/src/fields/arithmetic.rs | 5 +- algebra-core/src/lib.rs | 1 + algebra/Cargo.toml | 8 +- algebra/src/bls12_377/curves/g1.rs | 12 +- algebra/src/bls12_377/curves/g2.rs | 13 +- algebra/src/bls12_377/curves/mod.rs | 13 +- algebra/src/bls12_381/curves/g1.rs | 6 +- algebra/src/bls12_381/curves/g2.rs | 6 +- algebra/src/bn254/curves/g1.rs | 16 +- algebra/src/bn254/curves/g2.rs | 16 +- algebra/src/bn254/curves/mod.rs | 13 +- algebra/src/bw6_761/curves/g1.rs | 5 +- algebra/src/bw6_761/curves/g2.rs | 5 +- algebra/src/cp6_782/curves/g1.rs | 6 +- algebra/src/cp6_782/curves/g2.rs | 6 +- algebra/src/ed_on_bls12_377/curves/mod.rs | 6 +- algebra/src/ed_on_bls12_381/curves/mod.rs | 6 +- algebra/src/ed_on_bn254/curves/mod.rs | 5 +- algebra/src/ed_on_cp6_782/curves/mod.rs | 5 +- algebra/src/ed_on_mnt4_298/curves/mod.rs | 6 +- algebra/src/ed_on_mnt4_753/curves/mod.rs | 5 +- algebra/src/mnt4_298/curves/g1.rs | 6 +- algebra/src/mnt4_298/curves/g2.rs | 6 +- algebra/src/mnt4_753/curves/g1.rs | 6 +- algebra/src/mnt4_753/curves/g2.rs | 6 +- algebra/src/mnt6_298/curves/g1.rs | 7 +- algebra/src/mnt6_298/curves/g2.rs | 6 +- algebra/src/mnt6_753/curves/g1.rs | 6 +- algebra/src/mnt6_753/curves/g2.rs | 6 +- algebra/src/tests/cuda.rs | 61 +++ algebra/src/tests/macros.rs | 24 +- algebra/src/tests/mod.rs | 1 + 55 files changed, 1565 insertions(+), 682 deletions(-) create mode 100644 algebra-core/src/curves/cuda/accel_dummy.rs create mode 100644 algebra-core/src/curves/cuda/mod.rs create mode 100644 algebra-core/src/curves/cuda/scalar_mul/cpu_gpu_macros.rs create mode 100644 algebra-core/src/curves/cuda/scalar_mul/kernel_macros.rs create mode 100644 algebra-core/src/curves/cuda/scalar_mul/mod.rs create mode 100644 algebra-core/src/curves/cuda/scalar_mul/run_kernel_macros.rs delete mode 100644 algebra-core/src/curves/models/short_weierstrass_projective.rs create mode 100644 algebra/src/tests/cuda.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cb26016f1..d6ff89852 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: toolchain: stable override: true components: rustfmt - + default: true - name: cargo fmt --check uses: actions-rs/cargo@v1 with: @@ -35,6 +35,7 @@ jobs: env: RUSTFLAGS: -Dwarnings strategy: + max-parallel: 6 matrix: rust: - 
stable @@ -50,14 +51,38 @@ jobs: toolchain: ${{ matrix.rust }} override: true - - uses: actions/cache@v2 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target + - name: Install CUDA toolchains + run: | + wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin + sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 + wget -q https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda-repo-ubuntu1804-11-1-local_11.1.1-455.32.00-1_amd64.deb + sudo dpkg -i cuda-repo-ubuntu1804-11-1-local_11.1.1-455.32.00-1_amd64.deb + sudo apt-key add /var/cuda-repo-ubuntu1804-11-1-local/7fa2af80.pub + sudo apt-get update + sudo apt-get -y install cuda + rm cuda-repo-ubuntu* + curl -sSL https://github.com/jon-chuang/accel/raw/master/setup_nvptx_toolchain.sh | bash + + - uses: actions/cache@v2 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + - name: Test algebra with CUDA + run: | + cd algebra + cargo test --features "all_curves cuda cuda_test" + cd .. + + - name: Test algebra + run: | + cd algebra + cargo test --features full + cd .. + - name: Check examples uses: actions-rs/cargo@v1 with: @@ -68,7 +93,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: check - args: --examples --all-features --all + args: --all-features --examples --all if: matrix.rust == 'stable' - name: Check benchmarks on nightly @@ -88,12 +113,6 @@ jobs: --exclude ff-fft-benches \ -- --skip dpc --skip integration_test" - - name: Test algebra - run: | - cd algebra - cargo test --features full - cd .. - - name: Test algebra with assembly run: | cd algebra diff --git a/Cargo.toml b/Cargo.toml index b4b593c4a..525a093e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ members = [ "r1cs-core", "r1cs-std", "algebra-core/algebra-core-derive", - "scripts/glv_lattice_basis" + "scripts/glv_lattice_basis", ] [profile.release] diff --git a/README.md b/README.md index 5ce72e364..e1f1bc3c0 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,13 @@ To bench `algebra-benches` with greater accuracy, especially for functions with cargo +nightly bench --features "n_fold bls12_381" ``` +CUDA support is available for a limited set of functions. To allow compilation for CUDA on Linux, first run the script +``` +curl -sSL https://github.com/jon-chuang/accel/raw/master/setup_nvptx_toolchain.sh | bash +``` +or run the equivalent commands for your OS. Then, pass the `cuda` feature to rustc or cargo when compiling, and import the relevant traits (e.g. GPUScalarMulSlice) wherever the functions are called. + +When the `cuda` feature is not activated, Zexe will still compile. However, when either the `cuda` feature is not activated during compilation or CUDA is not detected on your system at runtime, Zexe will default to a CPU-only implementation of the same functionality. 
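As a rough sketch of the intended call pattern (not taken verbatim from this patch): the `GPUScalarMulSlice` trait and its `cpu_gpu_scalar_mul(exps, cuda_group_size, cpu_chunk_size)` method are the ones added below, while the import paths, the BW6-761 curve choice, the parameter values and the `rand` dependency are illustrative assumptions.

```rust
use algebra::bw6_761::G1Affine;
use algebra_core::{
    curves::{cuda::scalar_mul::GPUScalarMulSlice, AffineCurve, ProjectiveCurve},
    fields::PrimeField,
    UniformRand,
};

fn main() -> Result<(), algebra_core::CudaScalarMulError> {
    let mut rng = rand::thread_rng();
    // Random bases and scalars stand in for real inputs.
    let mut bases: Vec<G1Affine> = (0..1 << 10)
        .map(|_| <G1Affine as AffineCurve>::Projective::rand(&mut rng).into_affine())
        .collect();
    let exps: Vec<_> = (0..bases.len())
        .map(|_| <G1Affine as AffineCurve>::ScalarField::rand(&mut rng).into_repr())
        .collect();
    // Runs on the GPU(s) when built with `cuda` and a device initialises,
    // otherwise falls back to the batched CPU path.
    bases.cpu_gpu_scalar_mul(&exps, 1 << 5, 1 << 12)?;
    Ok(())
}
```

Here `cuda_group_size` is the CUDA block size handed to the kernel and `cpu_chunk_size` is the batch size given to each CPU worker; both values above are placeholders.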
## License diff --git a/algebra-benches/Cargo.toml b/algebra-benches/Cargo.toml index 0aeafe760..9d009beae 100644 --- a/algebra-benches/Cargo.toml +++ b/algebra-benches/Cargo.toml @@ -31,9 +31,10 @@ rand_xorshift = { version = "0.2" } paste = "1.0" [features] +bw6_asm = [ "algebra/bw6_asm"] asm = [ "algebra/asm"] prefetch = [ "algebra/prefetch"] -bw6_asm = [ "algebra/bw6_asm"] +cuda = [ "algebra/cuda" ] n_fold = [] mnt4_298 = [ "algebra/mnt4_298"] mnt6_298 = [ "algebra/mnt6_298"] diff --git a/algebra-core/Cargo.toml b/algebra-core/Cargo.toml index d17b113e6..77c3b0fc5 100644 --- a/algebra-core/Cargo.toml +++ b/algebra-core/Cargo.toml @@ -27,32 +27,40 @@ algebra-core-derive = { path = "algebra-core-derive", optional = true } derivative = { version = "2", features = ["use_core"] } num-traits = { version = "0.2", default-features = false } rand = { version = "0.7", default-features = false } -rayon = { version = "1", optional = true } +rayon = { version = "1.3.0", optional = true } unroll = { version = "=0.1.4" } itertools = { version = "0.9.0", default-features = false } either = { version = "1.6.0", default-features = false } thread-id = { version = "3.3.0", optional = true } backtrace = { version = "0.3", optional = true } +accel = { git = "https://github.com/jon-chuang/accel", package = "accel", optional = true } +peekmore = "0.5.6" +closure = { version = "0.3.0", optional = true } +lazy_static = { version = "1.4.0", optional = true } +serde_json = { version = "1.0.58", optional = true } +dirs = { version = "1.0.5", optional = true } +log = { version = "0.4.11", optional = true } paste = "0.1" [build-dependencies] field-assembly = { path = "./field-assembly", optional = true } -cc = "1.0" rustc_version = "0.2" +cc = "1.0" [dev-dependencies] rand_xorshift = "0.2" [features] +bw6_asm = [] default = [ "std", "rand/default" ] std = [] parallel = [ "std", "rayon", "rand/default" ] derive = [ "algebra-core-derive" ] prefetch = [ "std" ] +cuda = [ "std", "parallel", "accel", "lazy_static", "serde_json", "dirs", "closure", "log" ] timing = [ "std", "backtrace" ] timing_detailed = [ "std", "backtrace" ] timing_thread_id = [ "thread-id" ] llvm_asm = [ "field-assembly" ] -bw6_asm = [] diff --git a/algebra-core/algebra-core-derive/Cargo.toml b/algebra-core/algebra-core-derive/Cargo.toml index 4a0f5afc1..8075ed093 100644 --- a/algebra-core/algebra-core-derive/Cargo.toml +++ b/algebra-core/algebra-core-derive/Cargo.toml @@ -27,4 +27,4 @@ proc-macro = true [dependencies] proc-macro2 = "1.0" syn = "1.0" -quote = "1.0" +quote = "1.0.7" diff --git a/algebra-core/mince/Cargo.toml b/algebra-core/mince/Cargo.toml index 3e92abcce..b9aaa90d1 100644 --- a/algebra-core/mince/Cargo.toml +++ b/algebra-core/mince/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -quote = "1.0" +quote = "1.0.7" syn = {version = "1.0.17", features = ["full"]} [lib] diff --git a/algebra-core/src/bytes.rs b/algebra-core/src/bytes.rs index 76ff7304d..cb5469cb9 100644 --- a/algebra-core/src/bytes.rs +++ b/algebra-core/src/bytes.rs @@ -316,7 +316,7 @@ mod test { fn test_macro_empty() { let array: Vec = vec![]; let bytes: Vec = to_bytes![array].unwrap(); - assert_eq!(&bytes, &[]); + assert_eq!(bytes, Vec::::new()); assert_eq!(bytes.len(), 0); } diff --git a/algebra-core/src/curves/batch_arith.rs b/algebra-core/src/curves/batch_arith.rs index 07c4cf630..8fafc26da 100644 --- a/algebra-core/src/curves/batch_arith.rs +++ 
b/algebra-core/src/curves/batch_arith.rs @@ -25,7 +25,7 @@ pub trait BatchGroupArithmetic where Self: Sized + Clone + Copy + Zero + Neg, { - type BBaseField: Field; + type BaseFieldForBatch: Field; // We use the w-NAF method, achieving point density of approximately 1/(w + 1) // and requiring storage of only 2^(w - 1). @@ -136,7 +136,7 @@ where fn batch_double_in_place( bases: &mut [Self], index: &[u32], - scratch_space: Option<&mut Vec>, + scratch_space: Option<&mut Vec>, ); /// Mutates bases in place and stores result in the first operand. diff --git a/algebra-core/src/curves/cuda/accel_dummy.rs b/algebra-core/src/curves/cuda/accel_dummy.rs new file mode 100644 index 000000000..27d3c3d8a --- /dev/null +++ b/algebra-core/src/curves/cuda/accel_dummy.rs @@ -0,0 +1,9 @@ +#[cfg(not(feature = "std"))] +use alloc::vec::Vec; +pub mod error { + pub type Result = T; +} + +pub struct Context {} + +pub type DeviceMemory = Vec; diff --git a/algebra-core/src/curves/cuda/mod.rs b/algebra-core/src/curves/cuda/mod.rs new file mode 100644 index 000000000..f2dc0829d --- /dev/null +++ b/algebra-core/src/curves/cuda/mod.rs @@ -0,0 +1,6 @@ +#[macro_use] +pub mod scalar_mul; +pub use scalar_mul::*; + +#[cfg(not(feature = "cuda"))] +pub mod accel_dummy; diff --git a/algebra-core/src/curves/cuda/scalar_mul/cpu_gpu_macros.rs b/algebra-core/src/curves/cuda/scalar_mul/cpu_gpu_macros.rs new file mode 100644 index 000000000..6a4000683 --- /dev/null +++ b/algebra-core/src/curves/cuda/scalar_mul/cpu_gpu_macros.rs @@ -0,0 +1,298 @@ +// TODO: make this more generic +#[macro_export] +macro_rules! impl_gpu_cpu_run_kernel { + () => { + #[allow(unused_qualifications)] + fn init_gpu_cache_dir() -> Result { + #[cfg(feature = "cuda")] + { + let dir = dirs::cache_dir() + .unwrap() + .join("zexe-algebra") + .join("cuda-scalar-mul-profiler") + .join(P::namespace()); + std::fs::create_dir_all(&dir)?; + Ok(dir.to_str().unwrap().to_string()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + #[allow(unused_qualifications)] + fn read_profile_data() -> Result { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + let data = std::fs::read_to_string(&dir.join("profile_data.txt"))?; + Ok(data) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + fn clear_gpu_profiling_data() -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + std::fs::File::create(&dir.join("profile_data.txt"))?; + Ok(()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + #[allow(unused_variables)] + fn write_profile_data(profile_data: &str) -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + let mut file = std::fs::File::create(&dir.join("profile_data.txt"))?; + file.write_all(profile_data.as_bytes())?; + file.sync_all()?; + Ok(()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + /// We split up the job statically between the CPU and GPUs + /// based on continuous profiling stored both in a static location in memory + /// that is lost the moment the progam stops running. + /// and also a txt file in the OS' cache dir. + + /// Only one such procedure should be running at any time. 
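+        /// Sketch of the update rule used in the body below: with measured throughputs
+        /// `t_i = n_i / time_i` for every GPU and for the CPU, GPU i's share of the next
+        /// batch becomes `t_i / (t_cpu + sum_j t_j)`, folded into the running average held
+        /// in `profile_data` (weighted by the number of samples recorded so far) and then
+        /// persisted with `Self::write_profile_data`.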
+ #[allow(unused_variables)] + fn cpu_gpu_static_partition_run_kernel( + bases_h: &mut [::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + if !Device::init() { + panic!("Do not call this function unless the device has been checked to initialise successfully"); + } + let n_devices = Device::get_count().unwrap(); + let n = bases_h.len(); + // Create references so we can split the slices + let mut res_ref = &mut bases_h[..]; + let mut exps_h_ref = exps_h; + + let _now = timer!(); + // Get data for proportion of total throughput achieved by each device + let _ = Self::init_gpu_cache_dir()?; + + let arc_mutex = P::scalar_mul_static_profiler(); + let mut profile_data = arc_mutex.lock().unwrap(); + let mut proportions: Vec = profile_data.0.clone(); + + // If the program has just been initialised, we must check for the existence of existing + // cached profile data. If it does not exist, we create a new file + if proportions.is_empty() { + let _ = Self::read_profile_data() + .and_then(|s| { let res = serde_json::from_str(&s).map_err(|_| crate::CudaScalarMulError::ProfilingDeserializationError)?; Ok(res) }) + .and_then(|cached_data| { + *profile_data = cached_data; + proportions = profile_data.0.clone(); + Ok(()) + } + ); + } + + if proportions.is_empty() { + // By default we split the work evenly between devices and host + proportions = vec![1.0 / (n_devices as f64 + 1.0); n_devices]; + } + timer_println!(_now, "prepare profiling"); + + let _now = timer!(); + assert_eq!(proportions.len(), n_devices); + // Allocate the number of elements in the job to each device/host + let n_gpus = proportions.iter().map(|r| (r * n as f64).round() as usize).collect::>(); + let n_cpu = n - n_gpus.iter().sum::(); + + // Create storage for buffers and contexts for variable number of devices + let mut bases_split = Vec::with_capacity(n_devices); + let mut tables = Vec::with_capacity(n_devices); + let mut exps = Vec::with_capacity(n_devices); + let mut ctxs = Vec::with_capacity(n_devices); + let (mut time_cpu, mut times_gpu) = (0, vec![0; n_devices]); + + // Split data and generate tables and u8 scalar encoding in device memory + for (i, &num) in n_gpus.iter().enumerate() { + let device = Device::nth(i).unwrap(); + let ctx = device.create_context(); + + let (lower, upper) = res_ref.split_at_mut(num); + res_ref = upper; + let lower_exps = &exps_h_ref[..num]; + exps_h_ref = &exps_h_ref[num..]; + + let mut table = DeviceMemory::::zeros(&ctx, num * Self::table_size()); + let mut exp = DeviceMemory::::zeros(&ctx, num * Self::num_u8()); + + Self::generate_tables_and_recoding(lower, &mut table[..], lower_exps, &mut exp[..]); + + ctxs.push((device, ctx)); + bases_split.push(lower); + tables.push(table); + exps.push(exp); + }; + timer_println!(_now, "precomp and allocate on device"); + + let jobs_result: std::sync::Arc>> = std::sync::Arc::new(Mutex::new(Ok(()))); + + rayon::scope(|s| { + // Run jobs on GPUs + for (i, (bases_gpu, time_gpu)) in bases_split.iter_mut().zip(times_gpu.iter_mut()).enumerate() { + let n_gpu = n_gpus[i]; + let ctx = &ctxs[i].1; + let table = &tables[i]; + let exp = &exps[i]; + + let jobs_result_inner = jobs_result.clone(); + + s.spawn(move |_| { + let now = std::time::Instant::now(); + let _now = timer!(); + + let mut out = DeviceMemory::::zeros(ctx, n_gpu); + let result = P::scalar_mul_kernel( + ctx, + (n_gpu - 1) / 
cuda_group_size + 1, // grid + cuda_group_size, // block + table.as_ptr(), exp.as_ptr(), out.as_mut_ptr(), n_gpu as isize + ).map_err(|_| crate::CudaScalarMulError::KernelFailedError); + if result.is_err() { + *jobs_result_inner.lock().unwrap() = result; + return; + } + Self::batch_normalization(&mut out[..]); + bases_gpu.clone_from_slice(&out.par_iter().map(|p| p.into_affine()).collect::>()[..]); + *time_gpu = now.elapsed().as_micros(); + + timer_println!(_now, format!("gpu {} done", i)); + }); + } + + // Run on CPU + s.spawn(|_| { + let now = std::time::Instant::now(); + let _now = timer!(); + + let exps_mut = &mut exps_h_ref.to_vec()[..]; + rayon::scope(|t| { + for (b, s) in res_ref.chunks_mut(cpu_chunk_size).zip(exps_mut.chunks_mut(cpu_chunk_size)) { + t.spawn(move |_| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + } + }); + + time_cpu = now.elapsed().as_micros(); + timer_println!(_now, "cpu done"); + }); + }); + + // It's safe to do this, since after the rayon scope we only have one reference. + std::sync::Arc::try_unwrap(jobs_result).unwrap().into_inner().unwrap()?; + + // Update global microbenchmarking state + debug!("CUDA old profile_data: {:?}", profile_data); + let cpu_throughput = n_cpu as f64 / time_cpu as f64; + let gpu_throughputs = n_gpus + .iter() + .zip(times_gpu.iter()) + .map(|(n_gpu, time_gpu)| { + *n_gpu as f64 / *time_gpu as f64 + }) + .collect::>(); + let total_throughput = cpu_throughput + gpu_throughputs.iter().sum::(); + let n_data_points = profile_data.1 as f64; + profile_data.1 += 1; + let new_proportions = gpu_throughputs.iter().map(|t| t / total_throughput); + + if !profile_data.0.is_empty() { + profile_data.0 = new_proportions.zip(profile_data.0.clone()).map(|(new, old)| { + (new + n_data_points * old) / profile_data.1 as f64 + }).collect(); + } else { + profile_data.0 = new_proportions.collect(); + } + + // Update cached profiling data on disk + let _now = timer!(); + let s: String = serde_json::to_string(&(*profile_data)).map_err(|_| crate::CudaScalarMulError::ProfilingSerializationError)?; + Self::write_profile_data(&s)?; + + timer_println!(_now, "write data"); + + debug!("CUDA new profile_data: {:?}", profile_data); + } + + Ok(()) + } + + #[allow(unused_variables)] + fn cpu_gpu_load_balance_run_kernel( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of a single job in the queue e.g. 
2 << 14 + job_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Vec<::Affine> { + #[cfg(feature = "cuda")] + { + let mut bases_res = bases_h.to_vec(); + let queue = Mutex::new(bases_res.chunks_mut(job_size).zip(exps_h.chunks(job_size)).peekmore()); + + rayon::scope(|s| { + // We launch two concurrent GPU threads that block on waiting for GPU to hide latency + for i in 0..2 { + s.spawn(closure!(move i, ref queue, |_| { + std::thread::sleep(std::time::Duration::from_millis(i * 500)); + let mut iter = queue.lock().unwrap(); + while let Some((bases, exps)) = iter.next() { + iter.peek(); + if iter.peek().is_none() { break; } + let mut proj_res = Self::par_run_kernel_sync(ctx, bases, exps, cuda_group_size, iter); + Self::batch_normalization(&mut proj_res[..]); + bases.clone_from_slice(&proj_res.par_iter().map(|p| p.into_affine()).collect::>()[..]); + iter = queue.lock().unwrap(); + } + })); + } + + s.spawn(|_| { + std::thread::sleep(std::time::Duration::from_millis(20)); + let mut iter = queue.lock().unwrap(); + debug!("CUDA acquired cpu"); + while let Some((bases, exps)) = iter.next() { + let exps_mut = &mut exps.to_vec()[..]; + rayon::scope(|t| { + for (b, s) in bases.chunks_mut(cpu_chunk_size).zip(exps_mut.chunks_mut(cpu_chunk_size)) { + t.spawn(move |_| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + } + }); + // Sleep to allow other threads to unlock + drop(iter); + debug!("CUDA unlocked cpu"); + std::thread::sleep(std::time::Duration::from_millis(20)); + iter = queue.lock().unwrap(); + debug!("CUDA acquired cpu"); + } + debug!("CUDA cpu finish"); + }); + }); + drop(queue); + bases_res + } + + #[cfg(not(feature = "cuda"))] + Vec::new() + } + } +} diff --git a/algebra-core/src/curves/cuda/scalar_mul/kernel_macros.rs b/algebra-core/src/curves/cuda/scalar_mul/kernel_macros.rs new file mode 100644 index 000000000..cb04b94f0 --- /dev/null +++ b/algebra-core/src/curves/cuda/scalar_mul/kernel_macros.rs @@ -0,0 +1,176 @@ +#[macro_export] +macro_rules! impl_scalar_mul_kernel { + ($curve: ident, $curve_string:expr, $type: expr, $ProjCurve: ident) => { + paste::item! { + #[cfg(feature = "cuda")] + use {accel::*, std::sync::{Arc, Mutex}}; + + #[cfg(not(feature = "cuda"))] + use algebra_core::accel_dummy::*; + + use algebra_core::curves::cuda::scalar_mul::ScalarMulProfiler; + + #[cfg(feature = "cuda")] + lazy_static::lazy_static! 
{ + pub static ref MICROBENCH_CPU_GPU_AVG_RATIO: + Arc, usize)>> = Arc::new(Mutex::new((vec![], 0))); + } + + #[cfg(not(feature = "cuda"))] + static MICROBENCH_CPU_GPU_AVG_RATIO: () = (); + + const NAMESPACE: &'static str = stringify!([<$curve _ $type _cuda_namespace>]); + + #[cfg(feature = "cuda")] + #[kernel_mod(transparent)] + #[name([<$curve _ $type _cuda_namespace>])] + #[dependencies("accel-core" = { git = "https://github.com/jon-chuang/accel", package = "accel-core" })] + #[dependencies("algebra-core" = { git = "https://github.com/celo-org/zexe", branch = "master", package = "algebra-core", default_features = false})] + #[dependencies("algebra" = { git = "https://github.com/celo-org/zexe", branch = "master", package = "algebra", default_features = false, features = [$curve_string]})] + pub mod scalar_mul { + use algebra::{$curve::$ProjCurve}; + use algebra_core::{curves::ProjectiveCurve, fields::PrimeField, FpParameters, Zero}; + + const NUM_BITS: isize = + <<<$ProjCurve as ProjectiveCurve>::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as isize; + const LOG2_W: isize = 5; + const TABLE_SIZE: isize = 1 << LOG2_W; + const NUM_U8: isize = (NUM_BITS - 1) / LOG2_W + 1; + + #[kernel_func] + pub unsafe fn scalar_mul( + #[type_substitute(*const super::$ProjCurve)] + table: *const $ProjCurve, + exps: *const u8, + #[type_substitute(*mut super::$ProjCurve)] + out: *mut $ProjCurve, + n: isize, + ) { + let i = accel_core::index(); + if i < n { + let mut res = $ProjCurve::zero(); + res += &(*table.offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8) as isize)); + + for j in 1..NUM_U8 as isize { + for _ in 0..LOG2_W { + res.double_in_place(); + } + res += &(*table + .offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8 + j) as isize)); + } + *out.offset(i) = res; + } + } + } + } + } +} + +#[macro_export] +macro_rules! impl_scalar_mul_kernel_glv { + ($curve: ident, $curve_string:expr, $type: expr, $ProjCurve: ident) => { + paste::item! { + #[cfg(feature = "cuda")] + use {accel::*, std::sync::{Arc, Mutex}}; + + #[cfg(not(feature = "cuda"))] + use algebra_core::accel_dummy::*; + + use algebra_core::curves::cuda::scalar_mul::ScalarMulProfiler; + + #[cfg(feature = "cuda")] + lazy_static::lazy_static! 
{ + pub static ref MICROBENCH_CPU_GPU_AVG_RATIO: + Arc, usize)>> = Arc::new(Mutex::new((vec![], 0))); + } + + #[cfg(not(feature = "cuda"))] + static MICROBENCH_CPU_GPU_AVG_RATIO: () = (); + + const NAMESPACE: &'static str = stringify!([<$curve _ $type _cuda_namespace>]); + + #[cfg(feature = "cuda")] + #[kernel_mod(transparent)] + #[name([<$curve _ $type _cuda_namespace>])] + #[dependencies("accel-core" = { git = "https://github.com/jon-chuang/accel", package = "accel-core" })] + #[dependencies("algebra-core" = { git = "https://github.com/celo-org/zexe", branch = "master", package = "algebra-core", default_features = false})] + #[dependencies("algebra" = { git = "https://github.com/celo-org/zexe", branch = "master", package = "algebra", default_features = false, features = [$curve_string]})] + pub mod scalar_mul { + use algebra::{$curve::$ProjCurve}; + use algebra_core::{curves::ProjectiveCurve, fields::PrimeField, FpParameters, Zero}; + + const NUM_BITS: isize = + <<<$ProjCurve as ProjectiveCurve>::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as isize; + const LOG2_W: isize = 5; + const TABLE_SIZE: isize = 1 << LOG2_W; + const HALF_TABLE_SIZE: isize = 1 << (LOG2_W - 1); + const NUM_U8: isize = 2 * ((NUM_BITS - 1) / (2 * (LOG2_W - 1)) + 2); + + #[kernel_func] + pub unsafe fn scalar_mul( + #[type_substitute(*const super::$ProjCurve)] + table: *const $ProjCurve, + exps: *const u8, + #[type_substitute(*mut super::$ProjCurve)] + out: *mut $ProjCurve, + n: isize, + ) { + let i = accel_core::index(); + if i < n { + let mut res = $ProjCurve::zero(); + + res += &(*table.offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8) as isize)); + res += &(*table.offset( + i * TABLE_SIZE + HALF_TABLE_SIZE + *exps.offset(i * NUM_U8 + 1) as isize, + )); + + for j in 1..NUM_U8 as isize / 2 { + for _ in 0..(LOG2_W - 1) { + res.double_in_place(); + } + res += &(*table + .offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8 + 2 * j) as isize)); + res += &(*table.offset( + i * TABLE_SIZE + + HALF_TABLE_SIZE + + *exps.offset(i * NUM_U8 + 2 * j + 1) as isize, + )); + } + *out.offset(i) = res; + } + } + } + } + } +} + +#[macro_export] +macro_rules! 
impl_scalar_mul_parameters { + ($ProjCurve:ident) => { + #[allow(unused_variables)] + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const $ProjCurve, + exps: *const u8, + out: *mut $ProjCurve, + n: isize, + ) -> error::Result<()> { + #[cfg(feature = "cuda")] + scalar_mul(ctx, grid, block, (table, exps, out, n)) + } + + fn scalar_mul_static_profiler() -> ScalarMulProfiler { + #[cfg(feature = "cuda")] + return (*MICROBENCH_CPU_GPU_AVG_RATIO).clone(); + + #[cfg(not(feature = "cuda"))] + MICROBENCH_CPU_GPU_AVG_RATIO + } + + fn namespace() -> &'static str { + NAMESPACE + } + }; +} diff --git a/algebra-core/src/curves/cuda/scalar_mul/mod.rs b/algebra-core/src/curves/cuda/scalar_mul/mod.rs new file mode 100644 index 000000000..e96f4b0f9 --- /dev/null +++ b/algebra-core/src/curves/cuda/scalar_mul/mod.rs @@ -0,0 +1,357 @@ +#[macro_use] +mod kernel_macros; +pub use kernel_macros::*; + +#[macro_use] +mod cpu_gpu_macros; + +#[macro_use] +mod run_kernel_macros; + +#[cfg(feature = "cuda")] +use std::sync::{Arc, Mutex}; + +use core::fmt; + +use crate::{ + cfg_chunks_mut, + curves::{AffineCurve, BatchGroupArithmeticSlice}, + fields::PrimeField, +}; +use internal::GPUScalarMulInternal; + +#[cfg(feature = "cuda")] +pub type ScalarMulProfiler = Arc, usize)>>; +#[cfg(not(feature = "cuda"))] +pub type ScalarMulProfiler = (); + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +pub const MAX_GROUP_ELEM_BYTES: usize = 400; + +#[derive(Debug)] +pub enum CudaScalarMulError { + CudaDisabledError, + IoError, + KernelFailedError, + ProfilingSerializationError, + ProfilingDeserializationError, +} + +#[cfg(feature = "std")] +impl std::error::Error for CudaScalarMulError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + None + } +} + +#[cfg(feature = "std")] +impl From for CudaScalarMulError { + fn from(_: std::io::Error) -> Self { + CudaScalarMulError::IoError + } +} + +impl fmt::Display for CudaScalarMulError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + match self { + CudaScalarMulError::CudaDisabledError => write!(f, "CUDA is disabled"), + CudaScalarMulError::IoError => write!(f, "IO error"), + CudaScalarMulError::KernelFailedError => write!(f, "Failed running kernel"), + CudaScalarMulError::ProfilingSerializationError => { + write!(f, "Failed serlializing profiling data") + } + CudaScalarMulError::ProfilingDeserializationError => { + write!(f, "Failed deserializing profiling data") + } + } + } +} + +pub trait GPUScalarMul: GPUScalarMulInternal { + fn clear_gpu_profiling_data() { + #[cfg(feature = "cuda")] + >::clear_gpu_profiling_data() + .expect("Should have cleared GPU profiling data"); + } + + #[allow(unused_variables)] + fn cpu_gpu_scalar_mul( + elems: &mut [G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + // CUDA will return ILLEGAL_ADRESS if group elem size is too large. 
+ if accel::Device::init() && core::mem::size_of::() < MAX_GROUP_ELEM_BYTES { + ::Projective::cpu_gpu_static_partition_run_kernel( + elems, + exps_h, + cuda_group_size, + cpu_chunk_size, + )?; + } else { + let mut exps_mut = exps_h.to_vec(); + cfg_chunks_mut!(elems, cpu_chunk_size) + .zip(cfg_chunks_mut!(exps_mut, cpu_chunk_size)) + .for_each(|(b, s)| { + b[..].batch_scalar_mul_in_place(&mut s[..], 4); + }); + } + } + + #[cfg(not(feature = "cuda"))] + { + let mut exps_mut = exps_h.to_vec(); + cfg_chunks_mut!(elems, cpu_chunk_size) + .zip(cfg_chunks_mut!(exps_mut, cpu_chunk_size)) + .for_each(|(b, s)| { + b[..].batch_scalar_mul_in_place(&mut s[..], 4); + }); + } + + Ok(()) + } +} + +impl GPUScalarMul for G::Projective {} + +pub(crate) mod internal { + #[cfg(feature = "cuda")] + use accel::*; + + #[cfg(not(feature = "cuda"))] + use crate::accel_dummy::*; + + #[cfg(not(feature = "std"))] + use alloc::{string::String, vec::Vec}; + + use crate::{curves::AffineCurve, fields::PrimeField, CudaScalarMulError}; + + #[allow(unused_variables)] + pub trait GPUScalarMulInternal: Sized { + const NUM_BITS: usize; + const LOG2_W: usize; + + fn table_size() -> usize { + 1 << Self::LOG2_W + } + + fn num_u8() -> usize; + + fn init_gpu_cache_dir() -> Result; + fn read_profile_data() -> Result; + fn write_profile_data(profile_data: &str) -> Result<(), CudaScalarMulError>; + fn clear_gpu_profiling_data() -> Result<(), CudaScalarMulError>; + + fn par_run_kernel( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + ) -> DeviceMemory; + + fn par_run_kernel_sync( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + lock: T, + ) -> DeviceMemory; + + fn generate_tables_and_recoding( + bases_h: &[G], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ); + + fn cpu_gpu_load_balance_run_kernel( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of a single job in the queue e.g. 2 << 14 + job_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Vec; + + fn cpu_gpu_static_partition_run_kernel( + bases_h: &mut [G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError>; + } +} + +#[macro_export] +macro_rules! impl_gpu_sw_projective { + ($Parameters:ident) => { + impl GPUScalarMulInternal> for GroupProjective
<P>
{ + const NUM_BITS: usize = + <<::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as usize; + const LOG2_W: usize = 5; + + fn num_u8() -> usize { + if P::has_glv() { + 2 * ((Self::NUM_BITS - 1) / (2 * (Self::LOG2_W - 1)) + 2) + } else { + (Self::NUM_BITS - 1) / Self::LOG2_W + 1 + } + } + + fn generate_tables_and_recoding( + bases_h: &[::Affine], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ) { + if P::has_glv() { + let scalar_recode_glv = + |k1: &mut <::ScalarField as PrimeField>::BigInt, k2: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let table_size_glv: u64 = 1u64 << (Self::LOG2_W - 1); + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8() / 2).rev() { + out[2 * i] = (k1.as_ref()[0] % table_size_glv) as u8; + out[2 * i + 1] = (k2.as_ref()[0] % table_size_glv) as u8; + k1.divn(Self::LOG2_W as u32 - 1); + k2.divn(Self::LOG2_W as u32 - 1); + } + assert!(k1.is_zero()); + assert!(k2.is_zero()); + out + }; + + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let ((k1_neg, mut k1), (k2_neg, mut k2)) = + P::glv_scalar_decomposition(*k); + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode_glv(&mut k1, &mut k2)); + + table[0] = Self::zero(); + table[Self::table_size() / 2] = Self::zero(); + + for i in 1..Self::table_size() / 2 { + let mut res = if k1_neg { + table[i - 1] - base + } else { + table[i - 1] + base + }; + table[i] = res; + + P::glv_endomorphism_in_place(&mut res.x); + table[Self::table_size() / 2 + i] = + if k2_neg != k1_neg { res.neg() } else { res }; + } + }); + } else { + let scalar_recode = |k: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8()).rev() { + out[i] = (k.as_ref()[0] % Self::table_size() as u64) as u8; + k.divn(Self::LOG2_W as u32); + } + assert!(k.is_zero()); + out + }; + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode(&mut k.clone())[..]); + + table[0] = Self::zero(); + for i in 1..Self::table_size() { + table[i] = table[i - 1] + base; + } + }); + } + } + + impl_run_kernel!(); + impl_gpu_cpu_run_kernel!(); + } + }; +} + +#[macro_export] +macro_rules! impl_gpu_te_projective { + ($Parameters:ident) => { + impl GPUScalarMulInternal> for GroupProjective
<P>
{ + const NUM_BITS: usize = + <<::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as usize; + const LOG2_W: usize = 5; + + fn generate_tables_and_recoding( + bases_h: &[::Affine], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ) { + let scalar_recode = |k: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8()).rev() { + out[i] = (k.as_ref()[0] % Self::table_size() as u64) as u8; + k.divn(Self::LOG2_W as u32); + } + assert!(k.is_zero()); + out + }; + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode(&mut k.clone())[..]); + + table[0] = Self::zero(); + for i in 1..Self::table_size() { + table[i] = table[i - 1] + base; + } + } + ); + } + + fn num_u8() -> usize { + (Self::NUM_BITS - 1) / Self::LOG2_W + 1 + } + + impl_run_kernel!(); + impl_gpu_cpu_run_kernel!(); + } + }; +} + +pub trait GPUScalarMulSlice { + #[allow(unused_variables)] + fn cpu_gpu_scalar_mul( + &mut self, + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError>; +} + +impl GPUScalarMulSlice for [G] { + fn cpu_gpu_scalar_mul( + &mut self, + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError> { + G::Projective::cpu_gpu_scalar_mul(self, exps_h, cuda_group_size, cpu_chunk_size) + } +} diff --git a/algebra-core/src/curves/cuda/scalar_mul/run_kernel_macros.rs b/algebra-core/src/curves/cuda/scalar_mul/run_kernel_macros.rs new file mode 100644 index 000000000..031533064 --- /dev/null +++ b/algebra-core/src/curves/cuda/scalar_mul/run_kernel_macros.rs @@ -0,0 +1,100 @@ +#[macro_export] +macro_rules! 
impl_run_kernel { + () => { + // We drop a lock only after the parallel portion has been handled + #[allow(unused_variables)] + fn par_run_kernel_sync( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + lock: T, + ) -> DeviceMemory { + #[cfg(feature = "cuda")] + { + assert_eq!(bases_h.len(), exps_h.len()); + let n = bases_h.len(); + + let mut tables_h = vec![Self::zero(); n * Self::table_size()]; + let mut exps_recode_h = vec![0u8; n * Self::num_u8()]; + + let _now = timer!(); + Self::generate_tables_and_recoding( + bases_h, + &mut tables_h[..], + exps_h, + &mut exps_recode_h[..], + ); + drop(lock); + timer_println!(_now, "generated tables & recode"); + + let _now = timer!(); + let mut out = DeviceMemory::::zeros(&ctx, n); + let mut tables = DeviceMemory::::zeros(&ctx, n * Self::table_size()); + let mut exps = DeviceMemory::::zeros(&ctx, n * Self::num_u8()); + timer_println!(_now, "allocate device memory"); + + let _now = timer!(); + tables.copy_from_slice(&tables_h); + exps.copy_from_slice(&exps_recode_h); + timer_println!(_now, "copy data to device"); + + let _now = timer!(); + P::scalar_mul_kernel( + &ctx, + n / cuda_group_size, // grid + cuda_group_size, // block + tables.as_ptr(), + exps.as_ptr(), + out.as_mut_ptr(), + n as isize, + ) + .expect("Kernel call failed"); + timer_println!(_now, "run kernel"); + out + } + #[cfg(not(feature = "cuda"))] + unreachable!(); + } + + #[allow(unused_variables)] + fn par_run_kernel( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + ) -> DeviceMemory { + #[cfg(feature = "cuda")] + { + assert_eq!(bases_h.len(), exps_h.len()); + let n = bases_h.len(); + + let _now = timer!(); + let mut tables = DeviceMemory::::zeros(&ctx, n * Self::table_size()); + let mut exps = DeviceMemory::::zeros(&ctx, n * Self::num_u8()); + let mut out = DeviceMemory::::zeros(&ctx, n); + timer_println!(_now, "allocate device memory"); + + let _now = timer!(); + Self::generate_tables_and_recoding(bases_h, &mut tables[..], exps_h, &mut exps[..]); + timer_println!(_now, "generated tables & recode"); + + let _now = timer!(); + P::scalar_mul_kernel( + &ctx, + n / cuda_group_size, // grid + cuda_group_size, // block + tables.as_ptr(), + exps.as_ptr(), + out.as_mut_ptr(), + n as isize, + ) + .expect("Kernel call failed"); + timer_println!(_now, "run kernel"); + out + } + #[cfg(not(feature = "cuda"))] + unreachable!(); + } + }; +} diff --git a/algebra-core/src/curves/glv.rs b/algebra-core/src/curves/glv.rs index eb4af4a35..bf46c213b 100644 --- a/algebra-core/src/curves/glv.rs +++ b/algebra-core/src/curves/glv.rs @@ -15,6 +15,7 @@ pub trait GLVParameters: Send + Sync + 'static + ModelParameters { const B1: ::BigInt; // |b1| const B2: ::BigInt; // |b2| const B1_IS_NEG: bool; + const R_BITS: u32; #[inline] diff --git a/algebra-core/src/curves/mod.rs b/algebra-core/src/curves/mod.rs index 1ba08682d..ade771000 100644 --- a/algebra-core/src/curves/mod.rs +++ b/algebra-core/src/curves/mod.rs @@ -26,6 +26,10 @@ pub use self::glv::*; pub mod models; +#[macro_use] +pub mod cuda; +pub use cuda::*; + pub use self::models::*; pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + PartialEq { @@ -36,6 +40,7 @@ pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + Par type G1Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// The 
affine representation of an element in G1. @@ -51,6 +56,7 @@ pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + Par type G2Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// The affine representation of an element in G2. @@ -134,6 +140,7 @@ pub trait ProjectiveCurve: + core::iter::Sum + for<'a> core::iter::Sum<&'a Self> + From<::Affine> + + GPUScalarMul<::Affine> { const COFACTOR: &'static [u64]; type ScalarField: PrimeField + SquareRootField; @@ -229,7 +236,7 @@ pub trait AffineCurve: + Zero + Neg + From<::Projective> - + BatchGroupArithmetic::BaseField> + + BatchGroupArithmetic::BaseField> { const COFACTOR: &'static [u64]; type ScalarField: PrimeField + SquareRootField + Into<::BigInt>; @@ -237,6 +244,7 @@ pub trait AffineCurve: type Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// Returns a fixed generator of unknown exponent. diff --git a/algebra-core/src/curves/models/mod.rs b/algebra-core/src/curves/models/mod.rs index 5a7f51270..0c0329973 100644 --- a/algebra-core/src/curves/models/mod.rs +++ b/algebra-core/src/curves/models/mod.rs @@ -12,9 +12,11 @@ pub(crate) mod sw_batch_affine; pub mod short_weierstrass_affine; #[macro_use] pub mod short_weierstrass_jacobian; -pub mod short_weierstrass_projective; pub mod twisted_edwards_extended; +pub use short_weierstrass_jacobian::SWModelParameters; +pub use twisted_edwards_extended::TEModelParameters; + pub trait ModelParameters: Send + Sync + 'static { type BaseField: Field + SquareRootField; type ScalarField: PrimeField @@ -23,70 +25,6 @@ pub trait ModelParameters: Send + Sync + 'static { + From<::BigInt>; } -pub trait SWModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_B: Self::BaseField; - const COFACTOR: &'static [u64]; - const COFACTOR_INV: Self::ScalarField; - const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); - - #[inline(always)] - fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy *= &Self::COEFF_A; - copy - } - - #[inline(always)] - fn add_b(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy += &Self::COEFF_B; - copy - } - - #[inline(always)] - fn has_glv() -> bool { - false - } - - #[inline(always)] - fn glv_endomorphism_in_place(_elem: &mut Self::BaseField) { - unimplemented!() - } - - #[inline(always)] - fn glv_scalar_decomposition( - _k: ::BigInt, - ) -> ( - (bool, ::BigInt), - (bool, ::BigInt), - ) { - unimplemented!() - } - - #[inline(always)] - fn glv_window_size() -> usize { - 4 - } -} - -pub trait TEModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_D: Self::BaseField; - const COFACTOR: &'static [u64]; - const COFACTOR_INV: Self::ScalarField; - const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); - - type MontgomeryModelParameters: MontgomeryModelParameters; - - #[inline(always)] - fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy *= &Self::COEFF_A; - copy - } -} - pub trait MontgomeryModelParameters: ModelParameters { const COEFF_A: Self::BaseField; const COEFF_B: Self::BaseField; diff --git a/algebra-core/src/curves/models/short_weierstrass_affine.rs b/algebra-core/src/curves/models/short_weierstrass_affine.rs index 1be242d3d..995ee2977 100644 --- a/algebra-core/src/curves/models/short_weierstrass_affine.rs +++ 
b/algebra-core/src/curves/models/short_weierstrass_affine.rs @@ -10,15 +10,15 @@ macro_rules! specialise_affine_to_proj { #[derive(Derivative)] #[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - PartialEq(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") + Copy(bound = "P: SWModelParameters"), + Clone(bound = "P: SWModelParameters"), + PartialEq(bound = "P: SWModelParameters"), + Eq(bound = "P: SWModelParameters"), + Debug(bound = "P: SWModelParameters"), + Hash(bound = "P: SWModelParameters") )] #[repr(C)] - pub struct GroupAffine { + pub struct GroupAffine { pub infinity: bool, pub x: P::BaseField, pub y: P::BaseField, @@ -26,7 +26,35 @@ macro_rules! specialise_affine_to_proj { _params: PhantomData

<P>, } - impl<P: Parameters> AffineCurve for GroupAffine<P> { + impl<P: SWModelParameters> GroupAffine<P>
{ + #[inline(always)] + pub fn has_glv() -> bool { + P::has_glv() + } + + #[inline(always)] + pub fn glv_endomorphism_in_place(elem: &mut ::BaseField) { + P::glv_endomorphism_in_place(elem); + } + + #[inline] + pub fn glv_scalar_decomposition( + k: <::ScalarField as PrimeField>::BigInt, + ) -> ( + ( + bool, + <::ScalarField as PrimeField>::BigInt, + ), + ( + bool, + <::ScalarField as PrimeField>::BigInt, + ), + ) { + P::glv_scalar_decomposition(k) + } + } + + impl AffineCurve for GroupAffine
<P>
{ const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; @@ -81,7 +109,7 @@ macro_rules! specialise_affine_to_proj { } } - impl GroupAffine

<P> { + impl<P: SWModelParameters> GroupAffine<P>
{ pub fn new(x: P::BaseField, y: P::BaseField, infinity: bool) -> Self { Self { x, @@ -147,7 +175,7 @@ macro_rules! specialise_affine_to_proj { } } - impl Display for GroupAffine

<P> { + impl<P: SWModelParameters> Display for GroupAffine<P>
{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { if self.infinity { write!(f, "GroupAffine(Infinity)") @@ -157,7 +185,7 @@ macro_rules! specialise_affine_to_proj { } } - impl Zero for GroupAffine

<P> { + impl<P: SWModelParameters> Zero for GroupAffine<P>
{ fn zero() -> Self { Self::new(P::BaseField::zero(), P::BaseField::one(), true) } @@ -167,7 +195,7 @@ macro_rules! specialise_affine_to_proj { } } - impl Add for GroupAffine

<P> { + impl<P: SWModelParameters> Add for GroupAffine<P>
{ type Output = Self; fn add(self, other: Self) -> Self { let mut copy = self; @@ -176,7 +204,7 @@ macro_rules! specialise_affine_to_proj { } } - impl<'a, P: Parameters> AddAssign<&'a Self> for GroupAffine

<P> { + impl<'a, P: SWModelParameters> AddAssign<&'a Self> for GroupAffine<P>
{ fn add_assign(&mut self, other: &'a Self) { let mut s_proj = ::Projective::from(*self); s_proj.add_assign_mixed(other); @@ -184,7 +212,7 @@ macro_rules! specialise_affine_to_proj { } } - impl Neg for GroupAffine

<P> { + impl<P: SWModelParameters> Neg for GroupAffine<P>
{ type Output = Self; #[inline] @@ -199,7 +227,7 @@ macro_rules! specialise_affine_to_proj { impl_sw_batch_affine!(GroupAffine); - impl ToBytes for GroupAffine

<P> { + impl<P: SWModelParameters> ToBytes for GroupAffine<P>
{ #[inline] fn write(&self, mut writer: W) -> IoResult<()> { self.x.write(&mut writer)?; @@ -208,7 +236,7 @@ macro_rules! specialise_affine_to_proj { } } - impl FromBytes for GroupAffine

<P> { + impl<P: SWModelParameters> FromBytes for GroupAffine<P>
{ #[inline] fn read(mut reader: R) -> IoResult { let x = P::BaseField::read(&mut reader)?; @@ -218,14 +246,14 @@ macro_rules! specialise_affine_to_proj { } } - impl Default for GroupAffine

<P> { + impl<P: SWModelParameters> Default for GroupAffine<P>
{ #[inline] fn default() -> Self { Self::zero() } } - impl_sw_curve_serializer!(Parameters); + impl_sw_curve_serializer!(SWModelParameters); }; } diff --git a/algebra-core/src/curves/models/short_weierstrass_jacobian.rs b/algebra-core/src/curves/models/short_weierstrass_jacobian.rs index 3b06ff835..7ecd95982 100644 --- a/algebra-core/src/curves/models/short_weierstrass_jacobian.rs +++ b/algebra-core/src/curves/models/short_weierstrass_jacobian.rs @@ -1,5 +1,4 @@ use crate::{ - curves::models::SWModelParameters as Parameters, io::{Read, Result as IoResult, Write}, serialize::{Flags, SWFlags}, UniformRand, Vec, @@ -15,10 +14,26 @@ use rand::{ Rng, }; +#[cfg(not(feature = "cuda"))] +use crate::accel_dummy::*; +#[cfg(feature = "cuda")] +use accel::*; + +#[cfg(feature = "cuda")] +use { + crate::curves::BatchGroupArithmeticSlice, closure::closure, log::debug, peekmore::PeekMore, + std::sync::Mutex, +}; + use crate::{ bytes::{FromBytes, ToBytes}, - curves::{AffineCurve, BatchGroupArithmetic, ProjectiveCurve}, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, + cfg_chunks_mut, cfg_iter, + curves::{ + cuda::scalar_mul::{internal::GPUScalarMulInternal, ScalarMulProfiler}, + AffineCurve, BatchGroupArithmetic, ModelParameters, ProjectiveCurve, + }, + fields::{BitIteratorBE, Field, FpParameters, PrimeField, SquareRootField}, + impl_gpu_cpu_run_kernel, impl_gpu_sw_projective, impl_run_kernel, }; use crate::{ @@ -31,30 +46,119 @@ specialise_affine_to_proj!(GroupProjective); #[cfg(feature = "parallel")] use rayon::prelude::*; +pub trait SWModelParameters: ModelParameters + Sized { + const COEFF_A: Self::BaseField; + const COEFF_B: Self::BaseField; + const COFACTOR: &'static [u64]; + const COFACTOR_INV: Self::ScalarField; + const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); + + #[inline(always)] + fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy *= &Self::COEFF_A; + copy + } + + #[inline(always)] + fn glv_window_size() -> usize { + 4 + } + + #[inline(always)] + fn add_b(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy += &Self::COEFF_B; + copy + } + + #[inline(always)] + fn has_glv() -> bool { + false + } + + #[inline(always)] + fn glv_endomorphism_in_place(_elem: &mut Self::BaseField) { + unimplemented!() + } + + #[inline(always)] + fn glv_scalar_decomposition( + _k: ::BigInt, + ) -> ( + (bool, ::BigInt), + (bool, ::BigInt), + ) { + unimplemented!() + } + + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const GroupProjective, + exps: *const u8, + out: *mut GroupProjective, + n: isize, + ) -> error::Result<()>; + + fn scalar_mul_static_profiler() -> ScalarMulProfiler; + + fn namespace() -> &'static str; +} + +impl_gpu_sw_projective!(SWModelParameters); + #[derive(Derivative)] #[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") + Copy(bound = "P: SWModelParameters"), + Clone(bound = "P: SWModelParameters"), + Eq(bound = "P: SWModelParameters"), + Debug(bound = "P: SWModelParameters"), + Hash(bound = "P: SWModelParameters") )] -#[must_use] -pub struct GroupProjective { +pub struct GroupProjective { pub x: P::BaseField, pub y: P::BaseField, pub z: P::BaseField, - #[derivative(Debug = "ignore")] _params: PhantomData

<P>, } -impl<P: Parameters> Display for GroupProjective<P> { +impl<P: SWModelParameters> GroupProjective<P>
{ + #[inline(always)] + pub fn has_glv() -> bool { + P::has_glv() + } + + #[inline(always)] + pub fn glv_endomorphism_in_place(elem: &mut ::BaseField) { + P::glv_endomorphism_in_place(elem); + } + + #[inline] + pub fn glv_scalar_decomposition( + k: <::ScalarField as PrimeField>::BigInt, + ) -> ( + ( + bool, + <::ScalarField as PrimeField>::BigInt, + ), + ( + bool, + <::ScalarField as PrimeField>::BigInt, + ), + ) { + P::glv_scalar_decomposition(k) + } +} + +impl Display for GroupProjective

{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { write!(f, "{}", GroupAffine::from(*self)) } } -impl PartialEq for GroupProjective
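The GLV helpers above only forward to the curve parameters; the multiplication routine itself is generated by the impl_glv_mul! macro. As a rough, illustrative sketch of how a decomposition k = k1 + lambda * k2 is consumed (simplified u128 scalar halves instead of BigInt limbs, no wNAF windowing, and the sign convention assumed to be true = negative), an interleaved double-and-add over this file's GroupProjective<P> would look like:

fn glv_mul_sketch<P: SWModelParameters>(
    p: GroupProjective<P>,
    endo_p: GroupProjective<P>, // image of `p` under the GLV endomorphism
    k1: (bool, u128),           // (sign, magnitude) halves, standing in for the
    k2: (bool, u128),           // output of glv_scalar_decomposition
) -> GroupProjective<P> {
    let mut acc = GroupProjective::<P>::zero();
    for i in (0..128).rev() {
        acc.double_in_place();
        if (k1.1 >> i) & 1 == 1 {
            if k1.0 { acc -= p; } else { acc += p; }
        }
        if (k2.1 >> i) & 1 == 1 {
            if k2.0 { acc -= endo_p; } else { acc += endo_p; }
        }
    }
    acc
}

The crate's generated code additionally builds tables of width glv_window_size(), so both half-width scalars are consumed with far fewer additions than this plain bit-by-bit loop.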

{ +impl PartialEq for GroupProjective

{ fn eq(&self, other: &Self) -> bool { if self.is_zero() { return other.is_zero(); @@ -78,7 +182,7 @@ impl PartialEq for GroupProjective

{ } } -impl Distribution> for Standard { +impl Distribution> for Standard { #[inline] fn sample(&self, rng: &mut R) -> GroupProjective

{ let mut res = GroupProjective::prime_subgroup_generator(); @@ -88,7 +192,7 @@ impl Distribution> for Standard { } } -impl ToBytes for GroupProjective

{ +impl ToBytes for GroupProjective

{ #[inline] fn write(&self, mut writer: W) -> IoResult<()> { self.x.write(&mut writer)?; @@ -97,7 +201,7 @@ impl ToBytes for GroupProjective

{ } } -impl FromBytes for GroupProjective

{ +impl FromBytes for GroupProjective

{ #[inline] fn read(mut reader: R) -> IoResult { let x = P::BaseField::read(&mut reader)?; @@ -107,14 +211,14 @@ impl FromBytes for GroupProjective

{ } } -impl Default for GroupProjective

{ +impl Default for GroupProjective

{ #[inline] fn default() -> Self { Self::zero() } } -impl GroupProjective

{ +impl GroupProjective

{ pub fn new(x: P::BaseField, y: P::BaseField, z: P::BaseField) -> Self { Self { x, @@ -125,7 +229,7 @@ impl GroupProjective

{ } } -impl Zero for GroupProjective

{ +impl Zero for GroupProjective

{ // The point at infinity is always represented by // Z = 0. #[inline] @@ -145,7 +249,7 @@ impl Zero for GroupProjective

{ } } -impl ProjectiveCurve for GroupProjective

{ +impl ProjectiveCurve for GroupProjective

{ const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; @@ -373,7 +477,7 @@ impl ProjectiveCurve for GroupProjective

{ } } -impl Neg for GroupProjective

{ +impl Neg for GroupProjective

{ type Output = Self; #[inline] @@ -386,9 +490,9 @@ impl Neg for GroupProjective

{ } } -crate::impl_additive_ops_from_ref!(GroupProjective, Parameters); +crate::impl_additive_ops_from_ref!(GroupProjective, SWModelParameters); -impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ +impl<'a, P: SWModelParameters> Add<&'a Self> for GroupProjective

{ type Output = Self; #[inline] @@ -399,7 +503,7 @@ impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ +impl<'a, P: SWModelParameters> AddAssign<&'a Self> for GroupProjective

{ fn add_assign(&mut self, other: &'a Self) { if self.is_zero() { *self = *other; @@ -464,7 +568,7 @@ impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ +impl<'a, P: SWModelParameters> Sub<&'a Self> for GroupProjective

{ type Output = Self; #[inline] @@ -475,13 +579,13 @@ impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> SubAssign<&'a Self> for GroupProjective

{ +impl<'a, P: SWModelParameters> SubAssign<&'a Self> for GroupProjective

{ fn sub_assign(&mut self, other: &'a Self) { *self += &(-(*other)); } } -impl MulAssign for GroupProjective

{ +impl MulAssign for GroupProjective

{ fn mul_assign(&mut self, other: P::ScalarField) { *self = self.mul(other.into_repr()) } @@ -489,7 +593,7 @@ impl MulAssign for GroupProjective

{ // The affine point X, Y is represented in the Jacobian // coordinates with Z = 1. -impl From> for GroupProjective

{ +impl From> for GroupProjective

{ #[inline] fn from(p: GroupAffine

) -> GroupProjective

{ if p.is_zero() { @@ -502,7 +606,7 @@ impl From> for GroupProjective

{ // The projective point X, Y, Z is represented in the affine // coordinates as X/Z^2, Y/Z^3. -impl From> for GroupAffine
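As the comment states, a Jacobian point (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3), with Z = 0 reserved for the point at infinity. A minimal standalone version of that map (not the crate's From impl; import path approximate):

use algebra_core::fields::Field;

fn jacobian_to_affine<F: Field>(x: F, y: F, z: F) -> Option<(F, F)> {
    // `inverse` returns None exactly when Z = 0, i.e. the point at infinity.
    let z_inv = z.inverse()?;
    let z_inv2 = z_inv.square();
    Some((x * &z_inv2, y * &(z_inv2 * &z_inv)))
}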

{ +impl From> for GroupAffine

{ #[inline] fn from(p: GroupProjective

) -> GroupAffine

{ if p.is_zero() { diff --git a/algebra-core/src/curves/models/short_weierstrass_projective.rs b/algebra-core/src/curves/models/short_weierstrass_projective.rs deleted file mode 100644 index 854268ee8..000000000 --- a/algebra-core/src/curves/models/short_weierstrass_projective.rs +++ /dev/null @@ -1,415 +0,0 @@ -use crate::{ - curves::models::SWModelParameters as Parameters, - io::{Read, Result as IoResult, Write}, - serialize::{Flags, SWFlags}, - UniformRand, Vec, -}; -use core::{ - fmt::{Display, Formatter, Result as FmtResult}, - marker::PhantomData, - ops::{Add, AddAssign, MulAssign, Neg, Sub, SubAssign}, -}; -use num_traits::{One, Zero}; -use rand::{ - distributions::{Distribution, Standard}, - Rng, -}; - -use crate::{ - bytes::{FromBytes, ToBytes}, - curves::{AffineCurve, BatchGroupArithmetic, ProjectiveCurve}, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, -}; - -use crate::{ - CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, - CanonicalSerializeWithFlags, ConstantSerializedSize, -}; - -#[derive(Derivative)] -#[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") -)] -#[must_use] -pub struct GroupProjective { - pub x: P::BaseField, - pub y: P::BaseField, - pub z: P::BaseField, - _params: PhantomData

, -} - -specialise_affine_to_proj!(GroupProjective); - -impl Display for GroupProjective

{ - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{}", GroupAffine::from(*self)) - } -} - -impl PartialEq for GroupProjective

{ - fn eq(&self, other: &Self) -> bool { - if self.is_zero() { - return other.is_zero(); - } - - if other.is_zero() { - return false; - } - - // x1/z1 == x2/z2 <==> x1 * z2 == x2 * z1 - if (self.x * &other.z) != (other.x * &self.z) { - false - } else { - (self.y * &other.z) == (other.y * &self.z) - } - } -} - -impl Distribution> for Standard { - #[inline] - fn sample(&self, rng: &mut R) -> GroupProjective

{ - let mut res = GroupProjective::prime_subgroup_generator(); - res.mul_assign(P::ScalarField::rand(rng)); - debug_assert!(GroupAffine::from(res).is_in_correct_subgroup_assuming_on_curve()); - res - } -} - -impl ToBytes for GroupProjective

{ - #[inline] - fn write(&self, mut writer: W) -> IoResult<()> { - self.x.write(&mut writer)?; - self.y.write(&mut writer)?; - self.z.write(writer) - } -} - -impl FromBytes for GroupProjective

{ - #[inline] - fn read(mut reader: R) -> IoResult { - let x = P::BaseField::read(&mut reader)?; - let y = P::BaseField::read(&mut reader)?; - let z = P::BaseField::read(reader)?; - Ok(Self::new(x, y, z)) - } -} - -impl Default for GroupProjective

{ - #[inline] - fn default() -> Self { - Self::zero() - } -} - -impl GroupProjective

{ - pub fn new(x: P::BaseField, y: P::BaseField, z: P::BaseField) -> Self { - Self { - x, - y, - z, - _params: PhantomData, - } - } -} - -impl Zero for GroupProjective

{ - // The point at infinity is always represented by Z = 0. - #[inline] - fn zero() -> Self { - Self::new( - P::BaseField::zero(), - P::BaseField::one(), - P::BaseField::zero(), - ) - } - - // The point at infinity is always represented by - // Z = 0. - #[inline] - fn is_zero(&self) -> bool { - self.z.is_zero() - } -} - -impl ProjectiveCurve for GroupProjective

{ - const COFACTOR: &'static [u64] = P::COFACTOR; - type BaseField = P::BaseField; - type ScalarField = P::ScalarField; - type Affine = GroupAffine

; - - fn get_x(&mut self) -> &mut Self::BaseField { - &mut self.x - } - - #[inline] - fn prime_subgroup_generator() -> Self { - GroupAffine::prime_subgroup_generator().into() - } - - #[inline] - fn is_normalized(&self) -> bool { - self.is_zero() || self.z.is_one() - } - - fn batch_normalization(v: &mut [Self]) { - // Montgomery’s Trick and Fast Implementation of Masked AES - // Genelle, Prouff and Quisquater - // Section 3.2 - - // First pass: compute [a, ab, abc, ...] - let mut prod = Vec::with_capacity(v.len()); - let mut tmp = P::BaseField::one(); - for g in v.iter_mut() - // Ignore normalized elements - .filter(|g| !g.is_normalized()) - { - tmp *= &g.z; - prod.push(tmp); - } - - // Invert `tmp`. - tmp = tmp.inverse().unwrap(); // Guaranteed to be nonzero. - - // Second pass: iterate backwards to compute inverses - for (g, s) in v.iter_mut() - // Backwards - .rev() - // Ignore normalized elements - .filter(|g| !g.is_normalized()) - // Backwards, skip last element, fill in one for last term. - .zip(prod.into_iter().rev().skip(1).chain(Some(P::BaseField::one()))) - { - // tmp := tmp * g.z; g.z := tmp * s = 1/z - let newtmp = tmp * &g.z; - g.z = tmp * &s; - tmp = newtmp; - } - - // Perform affine transformations - for g in v.iter_mut().filter(|g| !g.is_normalized()) { - g.x *= &g.z; // x/z^2 - g.y *= &g.z; - g.z = P::BaseField::one(); // z = 1 - } - } - - fn double_in_place(&mut self) -> &mut Self { - if self.is_zero() { - self - } else { - // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective.html#doubling-dbl-2007-bl - - // XX = X1^2 - let xx = self.x.square(); - // ZZ = Z1^2 - let zz = self.z.square(); - // w = a*ZZ + 3*XX - let w = P::mul_by_a(&zz) + &(xx + &xx.double()); - // s = 2*Y1*Z1 - let mut s = self.y * &(self.z); - s.double_in_place(); - // sss = s^3 - let mut sss = s.square(); - sss *= &s; - // R = Y1*s - let r = self.y * &s; - // RR = R2 - let rr = r.square(); - // B = (X1+R)^2-XX-RR - let b = (self.x + &r).square() - &xx - &rr; - // h = w2-2*B - let h = w.square() - &(b + &b); - // X3 = h*s - self.x = h * &s; - // Y3 = w*(B-h)-2*RR - self.y = w * &(b - &h) - &(rr + &rr); - // Z3 = sss - self.z = sss; - - self - } - } - - fn add_assign_mixed(&mut self, other: &GroupAffine
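batch_normalization above is Montgomery's trick from the cited paper: all Z-coordinates are inverted with a single field inversion plus a linear number of multiplications. Stripped of the normalisation bookkeeping, the trick itself looks like the following standalone sketch (import paths approximate; zero elements are skipped and left untouched):

use algebra_core::fields::Field;
use num_traits::{One, Zero};

// Invert every non-zero element of `elems` in place using one inversion
// and O(n) multiplications (prefix products forward, peel-off backward).
fn batch_inverse<F: Field>(elems: &mut [F]) {
    // Forward pass: running products [a1, a1*a2, a1*a2*a3, ...].
    let mut prefix = Vec::with_capacity(elems.len());
    let mut acc = F::one();
    for e in elems.iter().filter(|e| !e.is_zero()) {
        acc *= e;
        prefix.push(acc);
    }
    // Single inversion of the full product.
    acc = acc.inverse().expect("product of non-zero elements is non-zero");
    // Backward pass: pair each a_i (in reverse) with the product a1*...*a_{i-1}.
    for (e, s) in elems
        .iter_mut()
        .rev()
        .filter(|e| !e.is_zero())
        .zip(prefix.into_iter().rev().skip(1).chain(Some(F::one())))
    {
        let next_acc = acc * &*e; // inverse of the prefix without a_i
        *e = acc * &s;            // 1/a_i
        acc = next_acc;
    }
}

The same idea underlies the single inversion_tmp accumulator in the batch-affine code later in this patch, which shares one inversion across the slope denominators of many point additions.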

) { - if other.is_zero() { - return; - } else if self.is_zero() { - self.x = other.x; - self.y = other.y; - self.z = P::BaseField::one(); - return; - } - let mut v = other.x * &self.z; - let mut u = other.y * &self.z; - if u == self.y && v == self.x { - // x1 / z1 == x2 / z2 <==> x1 * z2 == x2 * z1; - // Here, z2 = 1, so we have x1 == x2 * z1; - self.double_in_place(); - } else { - // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective.html#addition-madd-1998-cmo - // u = Y2*Z1-Y1 - u -= &self.y; - // uu = u^2 - let uu = u.square(); - // v = X2*Z1-X1 - v -= &self.x; - // vv = v2 - let vv = v.square(); - // vvv = v*vv - let vvv = v * &vv; - // r = vv*X1 - let r = vv * &self.x; - // a = uu*Z1-vvv-2*r - let a = uu * &self.z - &vvv - &r.double(); - // X3 = v*a - self.x = v * &a; - // Y3 = u*(R-A)-vvv*Y1 - self.y = u * &(r - &a) - &(vvv * &self.y); - // Z3 = vvv*Z1 - self.z = vvv * &self.z; - } - } - - fn mul::BigInt>>(mut self, other: S) -> Self { - if P::has_glv() { - let w = P::glv_window_size(); - let mut res = Self::zero(); - impl_glv_mul!(Self, P, w, self, res, other); - res - } else { - let mut res = Self::zero(); - for b in BitIteratorBE::without_leading_zeros(other.into()) { - res.double_in_place(); - if b { - res += self; - } - } - - self = res; - self - } - } -} - -impl Neg for GroupProjective

{ - type Output = Self; - fn neg(self) -> Self { - if !self.is_zero() { - Self::new(self.x, -self.y, self.z) - } else { - self - } - } -} - -crate::impl_additive_ops_from_ref!(GroupProjective, Parameters); - -impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ - type Output = Self; - fn add(self, other: &'a Self) -> Self { - let mut copy = self; - copy += other; - copy - } -} - -impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ - fn add_assign(&mut self, other: &'a Self) { - if self.is_zero() { - *self = *other; - return; - } - - if other.is_zero() { - return; - } - // https://www.hyperelliptic.org/EFD/g1p/data/shortw/projective/addition/add-1998-cmo-2 - - if self == other { - self.double_in_place(); - } else { - // Y1Z2 = Y1*Z2 - let y1z2 = self.y * &other.z; - // X1Z2 = X1*Z2 - let x1z2 = self.x * &other.z; - // Z1Z2 = Z1*Z2 - let z1z2 = self.z * &other.z; - // u = Y2*Z1-Y1Z2 - let u = (self.z * &other.y) - &y1z2; - // uu = u^2 - let uu = u.square(); - // v = X2*Z1-X1Z2 - let v = (self.z * &other.x) - &x1z2; - // vv = v^2 - let vv = v.square(); - // vvv = v*vv - let vvv = v * &vv; - // R = vv*X1Z2 - let r = vv * &x1z2; - // A = uu*Z1Z2-vvv-2*R - let a = (uu * &z1z2) - &(vvv + &r + &r); - // X3 = v*A - self.x = v * &a; - // Y3 = u*(R-A)-vvv*Y1Z2 - self.y = ((r - &a) * &u) - &(vvv * &y1z2); - // Z3 = vvv*Z1Z2 - self.z = vvv * &z1z2; - } - } -} - -impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ - type Output = Self; - fn sub(self, other: &'a Self) -> Self { - let mut copy = self; - copy -= other; - copy - } -} - -impl<'a, P: Parameters> SubAssign<&'a Self> for GroupProjective

{ - fn sub_assign(&mut self, other: &'a Self) { - *self += &(-(*other)); - } -} - -impl MulAssign for GroupProjective

{ - fn mul_assign(&mut self, other: P::ScalarField) { - *self = self.mul(other.into_repr()) - } -} - -// The affine point X, Y is represented in the jacobian -// coordinates with Z = 1. -impl From> for GroupProjective

{ - fn from(p: GroupAffine

) -> GroupProjective

{ - if p.is_zero() { - Self::zero() - } else { - Self::new(p.x, p.y, P::BaseField::one()) - } - } -} - -// The projective point X, Y, Z is represented in the affine -// coordinates as X/Z, Y/Z. -impl From> for GroupAffine

{ - fn from(p: GroupProjective

) -> GroupAffine

{ - if p.is_zero() { - GroupAffine::zero() - } else if p.z.is_one() { - // If Z is one, the point is already normalized. - GroupAffine::new(p.x, p.y, false) - } else { - // Z is nonzero, so it must have an inverse in a field. - let z_inv = p.z.inverse().unwrap(); - let x = p.x * &z_inv; - let y = p.y * &z_inv; - GroupAffine::new(x, y, false) - } - } -} diff --git a/algebra-core/src/curves/models/sw_batch_affine.rs b/algebra-core/src/curves/models/sw_batch_affine.rs index cd77ab8dd..eaa96ab88 100644 --- a/algebra-core/src/curves/models/sw_batch_affine.rs +++ b/algebra-core/src/curves/models/sw_batch_affine.rs @@ -97,8 +97,8 @@ macro_rules! impl_sw_batch_affine { }; } - impl BatchGroupArithmetic for $GroupAffine

{ - type BBaseField = P::BaseField; + impl BatchGroupArithmetic for $GroupAffine

{ + type BaseFieldForBatch = P::BaseField; /// This implementation of batch group ops takes particular /// care to make most use of points fetched from memory to prevent /// reallocations @@ -115,7 +115,7 @@ macro_rules! impl_sw_batch_affine { fn batch_double_in_place( bases: &mut [Self], index: &[u32], - scratch_space: Option<&mut Vec>, + scratch_space: Option<&mut Vec>, ) { let mut inversion_tmp = P::BaseField::one(); @@ -437,7 +437,8 @@ macro_rules! impl_sw_batch_affine { let batch_size = bases.len(); if P::has_glv() { use itertools::{EitherOrBoth::*, Itertools}; - let mut scratch_space = Vec::::with_capacity(bases.len()); + let mut scratch_space = + Vec::::with_capacity(bases.len()); let mut scratch_space_group = Vec::::with_capacity(bases.len() / w); let _now = timer!(); @@ -558,7 +559,8 @@ macro_rules! impl_sw_batch_affine { } timer_println!(_now, "batch ops"); } else { - let mut scratch_space = Vec::::with_capacity(bases.len()); + let mut scratch_space = + Vec::::with_capacity(bases.len()); let opcode_vectorised = Self::batch_wnaf_opcode_recoding::(scalars, w, None); let tables = Self::batch_wnaf_tables(bases, w); diff --git a/algebra-core/src/curves/models/twisted_edwards_extended.rs b/algebra-core/src/curves/models/twisted_edwards_extended.rs index 772c5c714..5e97bd971 100644 --- a/algebra-core/src/curves/models/twisted_edwards_extended.rs +++ b/algebra-core/src/curves/models/twisted_edwards_extended.rs @@ -1,10 +1,15 @@ +#[cfg(not(feature = "cuda"))] +use crate::accel_dummy::*; use crate::{ curves::batch_arith::decode_endo_from_u32, io::{Read, Result as IoResult, Write}, serialize::{EdwardsFlags, Flags}, - BatchGroupArithmetic, CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, + CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, CanonicalSerializeWithFlags, ConstantSerializedSize, UniformRand, Vec, }; +#[cfg(feature = "cuda")] +use {accel::*, log::debug}; + use core::{ fmt::{Display, Formatter, Result as FmtResult}, marker::PhantomData, @@ -16,43 +21,82 @@ use rand::{ Rng, }; +#[cfg(feature = "cuda")] +use { + crate::curves::BatchGroupArithmeticSlice, closure::closure, peekmore::PeekMore, + std::sync::Mutex, +}; + use crate::{ + biginteger::BigInteger, bytes::{FromBytes, ToBytes}, + cfg_chunks_mut, cfg_iter, curves::{ - models::{ - MontgomeryModelParameters as MontgomeryParameters, TEModelParameters as Parameters, - }, - AffineCurve, ProjectiveCurve, + cuda::scalar_mul::{internal::GPUScalarMulInternal, ScalarMulProfiler}, + models::MontgomeryModelParameters, + AffineCurve, BatchGroupArithmetic, ModelParameters, ProjectiveCurve, }, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, + fields::{BitIteratorBE, Field, FpParameters, PrimeField, SquareRootField}, + impl_gpu_cpu_run_kernel, impl_gpu_te_projective, impl_run_kernel, }; + #[cfg(feature = "parallel")] use rayon::prelude::*; +pub trait TEModelParameters: ModelParameters + Sized { + const COEFF_A: Self::BaseField; + const COEFF_D: Self::BaseField; + const COFACTOR: &'static [u64]; + const COFACTOR_INV: Self::ScalarField; + const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); + + type MontgomeryModelParameters: MontgomeryModelParameters; + + #[inline(always)] + fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy *= &Self::COEFF_A; + copy + } + + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const GroupProjective, + exps: *const u8, + out: *mut GroupProjective, + n: isize, + ) -> 
error::Result<()>; + + fn scalar_mul_static_profiler() -> ScalarMulProfiler; + + fn namespace() -> &'static str; +} + #[derive(Derivative)] #[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - PartialEq(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") + Copy(bound = "P: TEModelParameters"), + Clone(bound = "P: TEModelParameters"), + PartialEq(bound = "P: TEModelParameters"), + Eq(bound = "P: TEModelParameters"), + Debug(bound = "P: TEModelParameters"), + Hash(bound = "P: TEModelParameters") )] -#[must_use] -pub struct GroupAffine { +pub struct GroupAffine { pub x: P::BaseField, pub y: P::BaseField, #[derivative(Debug = "ignore")] _params: PhantomData

, } -impl Display for GroupAffine

{ +impl Display for GroupAffine

{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { write!(f, "GroupAffine(x={}, y={})", self.x, self.y) } } -impl GroupAffine

{ +impl GroupAffine

{ pub fn new(x: P::BaseField, y: P::BaseField) -> Self { Self { x, @@ -117,7 +161,7 @@ impl GroupAffine

{ } } -impl Zero for GroupAffine

{ +impl Zero for GroupAffine

{ fn zero() -> Self { Self::new(P::BaseField::zero(), P::BaseField::one()) } @@ -127,7 +171,7 @@ impl Zero for GroupAffine

{ } } -impl AffineCurve for GroupAffine

{ +impl AffineCurve for GroupAffine

{ const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; @@ -206,13 +250,13 @@ macro_rules! batch_add_loop_2 { }; } -impl BatchGroupArithmetic for GroupAffine

{ - type BBaseField = P::BaseField; +impl BatchGroupArithmetic for GroupAffine

{ + type BaseFieldForBatch = P::BaseField; fn batch_double_in_place( bases: &mut [Self], index: &[u32], - _scratch_space: Option<&mut Vec>, + _scratch_space: Option<&mut Vec>, ) { Self::batch_add_in_place( bases, @@ -367,7 +411,7 @@ impl BatchGroupArithmetic for GroupAffine

{ } } -impl Neg for GroupAffine

{ +impl Neg for GroupAffine

{ type Output = Self; fn neg(self) -> Self { @@ -375,9 +419,9 @@ impl Neg for GroupAffine

{ } } -crate::impl_additive_ops_from_ref!(GroupAffine, Parameters); +crate::impl_additive_ops_from_ref!(GroupAffine, TEModelParameters); -impl<'a, P: Parameters> Add<&'a Self> for GroupAffine

{ +impl<'a, P: TEModelParameters> Add<&'a Self> for GroupAffine

{ type Output = Self; fn add(self, other: &'a Self) -> Self { let mut copy = self; @@ -386,7 +430,7 @@ impl<'a, P: Parameters> Add<&'a Self> for GroupAffine

{ } } -impl<'a, P: Parameters> AddAssign<&'a Self> for GroupAffine

{ +impl<'a, P: TEModelParameters> AddAssign<&'a Self> for GroupAffine

{ fn add_assign(&mut self, other: &'a Self) { let y1y2 = self.y * &other.y; let x1x2 = self.x * &other.x; @@ -403,7 +447,7 @@ impl<'a, P: Parameters> AddAssign<&'a Self> for GroupAffine

{ } } -impl<'a, P: Parameters> Sub<&'a Self> for GroupAffine

{ +impl<'a, P: TEModelParameters> Sub<&'a Self> for GroupAffine

{ type Output = Self; fn sub(self, other: &'a Self) -> Self { let mut copy = self; @@ -412,19 +456,19 @@ impl<'a, P: Parameters> Sub<&'a Self> for GroupAffine

{ } } -impl<'a, P: Parameters> SubAssign<&'a Self> for GroupAffine

{ +impl<'a, P: TEModelParameters> SubAssign<&'a Self> for GroupAffine

{ fn sub_assign(&mut self, other: &'a Self) { *self += &(-(*other)); } } -impl MulAssign for GroupAffine

{ +impl MulAssign for GroupAffine

{ fn mul_assign(&mut self, other: P::ScalarField) { *self = self.mul(other.into_repr()).into() } } -impl ToBytes for GroupAffine

{ +impl ToBytes for GroupAffine

{ #[inline] fn write(&self, mut writer: W) -> IoResult<()> { self.x.write(&mut writer)?; @@ -432,7 +476,7 @@ impl ToBytes for GroupAffine

{ } } -impl FromBytes for GroupAffine

{ +impl FromBytes for GroupAffine

{ #[inline] fn read(mut reader: R) -> IoResult { let x = P::BaseField::read(&mut reader)?; @@ -441,14 +485,14 @@ impl FromBytes for GroupAffine

{ } } -impl Default for GroupAffine

{ +impl Default for GroupAffine

{ #[inline] fn default() -> Self { Self::zero() } } -impl Distribution> for Standard { +impl Distribution> for Standard { #[inline] fn sample(&self, rng: &mut R) -> GroupAffine

{ loop { @@ -466,7 +510,7 @@ mod group_impl { use super::*; use crate::groups::Group; - impl Group for GroupAffine

{ + impl Group for GroupAffine

{ type ScalarField = P::ScalarField; #[inline] @@ -491,14 +535,13 @@ mod group_impl { #[derive(Derivative)] #[derivative( - Copy(bound = "P: Parameters"), - Clone(bound = "P: Parameters"), - Eq(bound = "P: Parameters"), - Debug(bound = "P: Parameters"), - Hash(bound = "P: Parameters") + Copy(bound = "P: TEModelParameters"), + Clone(bound = "P: TEModelParameters"), + Eq(bound = "P: TEModelParameters"), + Debug(bound = "P: TEModelParameters"), + Hash(bound = "P: TEModelParameters") )] -#[must_use] -pub struct GroupProjective { +pub struct GroupProjective { pub x: P::BaseField, pub y: P::BaseField, pub t: P::BaseField, @@ -507,25 +550,25 @@ pub struct GroupProjective { _params: PhantomData

, } -impl PartialEq> for GroupAffine

{ +impl PartialEq> for GroupAffine

{ fn eq(&self, other: &GroupProjective

) -> bool { self.into_projective() == *other } } -impl PartialEq> for GroupProjective

{ +impl PartialEq> for GroupProjective

{ fn eq(&self, other: &GroupAffine

) -> bool { *self == other.into_projective() } } -impl Display for GroupProjective

{ +impl Display for GroupProjective

{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { write!(f, "{}", GroupAffine::from(*self)) } } -impl PartialEq for GroupProjective

{ +impl PartialEq for GroupProjective

{ fn eq(&self, other: &Self) -> bool { if self.is_zero() { return other.is_zero(); @@ -540,7 +583,7 @@ impl PartialEq for GroupProjective

{ } } -impl Distribution> for Standard { +impl Distribution> for Standard { #[inline] fn sample(&self, rng: &mut R) -> GroupProjective

{ loop { @@ -554,7 +597,7 @@ impl Distribution> for Standard { } } -impl ToBytes for GroupProjective

{ +impl ToBytes for GroupProjective

{ #[inline] fn write(&self, mut writer: W) -> IoResult<()> { self.x.write(&mut writer)?; @@ -564,7 +607,7 @@ impl ToBytes for GroupProjective

{ } } -impl FromBytes for GroupProjective

{ +impl FromBytes for GroupProjective

{ #[inline] fn read(mut reader: R) -> IoResult { let x = P::BaseField::read(&mut reader)?; @@ -575,14 +618,14 @@ impl FromBytes for GroupProjective

{ } } -impl Default for GroupProjective

{ +impl Default for GroupProjective

{ #[inline] fn default() -> Self { Self::zero() } } -impl GroupProjective

{ +impl GroupProjective

{ pub fn new(x: P::BaseField, y: P::BaseField, t: P::BaseField, z: P::BaseField) -> Self { Self { x, @@ -594,7 +637,7 @@ impl GroupProjective

{ } } -impl Zero for GroupProjective

{ +impl Zero for GroupProjective

{ fn zero() -> Self { Self::new( P::BaseField::zero(), @@ -609,7 +652,9 @@ impl Zero for GroupProjective

{ } } -impl ProjectiveCurve for GroupProjective

{ +impl_gpu_te_projective!(TEModelParameters); + +impl ProjectiveCurve for GroupProjective

{ const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; @@ -709,7 +754,7 @@ impl ProjectiveCurve for GroupProjective

{ } } -impl Neg for GroupProjective

{ +impl Neg for GroupProjective

{ type Output = Self; fn neg(mut self) -> Self { self.x = -self.x; @@ -718,9 +763,9 @@ impl Neg for GroupProjective

{ } } -crate::impl_additive_ops_from_ref!(GroupProjective, Parameters); +crate::impl_additive_ops_from_ref!(GroupProjective, TEModelParameters); -impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ +impl<'a, P: TEModelParameters> Add<&'a Self> for GroupProjective

{ type Output = Self; fn add(self, other: &'a Self) -> Self { let mut copy = self; @@ -729,7 +774,7 @@ impl<'a, P: Parameters> Add<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ +impl<'a, P: TEModelParameters> AddAssign<&'a Self> for GroupProjective

{ fn add_assign(&mut self, other: &'a Self) { // See "Twisted Edwards Curves Revisited" // Huseyin Hisil, Kenneth Koon-Ho Wong, Gary Carter, and Ed Dawson @@ -773,7 +818,7 @@ impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ +impl<'a, P: TEModelParameters> Sub<&'a Self> for GroupProjective

{ type Output = Self; fn sub(self, other: &'a Self) -> Self { let mut copy = self; @@ -782,13 +827,13 @@ impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective

{ } } -impl<'a, P: Parameters> SubAssign<&'a Self> for GroupProjective

{ +impl<'a, P: TEModelParameters> SubAssign<&'a Self> for GroupProjective

{ fn sub_assign(&mut self, other: &'a Self) { *self += &(-(*other)); } } -impl MulAssign for GroupProjective

{ +impl MulAssign for GroupProjective

{ fn mul_assign(&mut self, other: P::ScalarField) { *self = self.mul(other.into_repr()) } @@ -796,7 +841,7 @@ impl MulAssign for GroupProjective

{ // The affine point (X, Y) is represented in the Extended Projective coordinates // with Z = 1. -impl From> for GroupProjective

{ +impl From> for GroupProjective

{ fn from(p: GroupAffine

) -> GroupProjective

{ Self::new(p.x, p.y, p.x * &p.y, P::BaseField::one()) } @@ -804,7 +849,7 @@ impl From> for GroupProjective

{ // The projective point X, Y, T, Z is represented in the affine // coordinates as X/Z, Y/Z. -impl From> for GroupAffine

{ +impl From> for GroupAffine

{ fn from(p: GroupProjective

) -> GroupAffine

{ if p.is_zero() { GroupAffine::zero() @@ -821,7 +866,7 @@ impl From> for GroupAffine

{ } } -impl core::str::FromStr for GroupAffine

+impl core::str::FromStr for GroupAffine

where P::BaseField: core::str::FromStr, { @@ -859,27 +904,27 @@ where #[derive(Derivative)] #[derivative( - Copy(bound = "P: MontgomeryParameters"), - Clone(bound = "P: MontgomeryParameters"), - PartialEq(bound = "P: MontgomeryParameters"), - Eq(bound = "P: MontgomeryParameters"), - Debug(bound = "P: MontgomeryParameters"), - Hash(bound = "P: MontgomeryParameters") + Copy(bound = "P: MontgomeryModelParameters"), + Clone(bound = "P: MontgomeryModelParameters"), + PartialEq(bound = "P: MontgomeryModelParameters"), + Eq(bound = "P: MontgomeryModelParameters"), + Debug(bound = "P: MontgomeryModelParameters"), + Hash(bound = "P: MontgomeryModelParameters") )] -pub struct MontgomeryGroupAffine { +pub struct MontgomeryGroupAffine { pub x: P::BaseField, pub y: P::BaseField, #[derivative(Debug = "ignore")] _params: PhantomData

, } -impl Display for MontgomeryGroupAffine

{ +impl Display for MontgomeryGroupAffine

{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { write!(f, "MontgomeryGroupAffine(x={}, y={})", self.x, self.y) } } -impl MontgomeryGroupAffine

{ +impl MontgomeryGroupAffine

{ pub fn new(x: P::BaseField, y: P::BaseField) -> Self { Self { x, @@ -889,4 +934,4 @@ impl MontgomeryGroupAffine

{ } } -impl_edwards_curve_serializer!(Parameters); +impl_edwards_curve_serializer!(TEModelParameters); diff --git a/algebra-core/src/fields/arithmetic.rs b/algebra-core/src/fields/arithmetic.rs index f84e66499..5fa95cc57 100644 --- a/algebra-core/src/fields/arithmetic.rs +++ b/algebra-core/src/fields/arithmetic.rs @@ -1,18 +1,18 @@ /// All of these methods store intermediate results on the stack, and so /// they support overlap of input and output parameters. -#[cfg(feature = "bw6_asm")] +#[cfg(use_bw6_asm)] extern "C" { pub fn modmul768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); pub fn modadd768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); pub fn modsub768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); } - /// This modular multiplication algorithm uses Montgomery /// reduction for efficient implementation. It also additionally /// uses the "no-carry optimization" outlined /// [here](https://hackmd.io/@zkteam/modular_multiplication) if /// `P::MODULUS` has BOTH (a) a zero MSB, AND (b) at least one /// zero bit in the rest of the modulus. + macro_rules! impl_field_mul_assign { ($limbs:expr) => { #[inline] @@ -255,6 +255,7 @@ macro_rules! impl_field_square_in_place { return self; } } + // Checking the modulus at compile time let first_bit_set = P::MODULUS.0[$limbs - 1] >> 63 != 0; let mut all_bits_set = P::MODULUS.0[$limbs - 1] == !0 - (1 << 63); diff --git a/algebra-core/src/lib.rs b/algebra-core/src/lib.rs index 25fe4f5ae..0d95e37ea 100644 --- a/algebra-core/src/lib.rs +++ b/algebra-core/src/lib.rs @@ -75,6 +75,7 @@ pub use self::fields::*; pub mod biginteger; pub use self::biginteger::*; +#[macro_use] pub mod curves; pub use self::curves::*; diff --git a/algebra/Cargo.toml b/algebra/Cargo.toml index 4c56a90b7..91498cb02 100644 --- a/algebra/Cargo.toml +++ b/algebra/Cargo.toml @@ -23,6 +23,10 @@ edition = "2018" [dependencies] algebra-core = { path = "../algebra-core", default-features = false } +accel = { git = "https://github.com/jon-chuang/accel", package = "accel", optional = true } +# accel = { path = "/home/jonch/Desktop/Programming/Rust/accel/accel", optional = true } +lazy_static = { version = "1.4.0", optional = true } +paste = "0.1" [dev-dependencies] rand = { version = "0.7", default-features = false } @@ -73,6 +77,7 @@ mnt6_298 = [] mnt6_753 = [] curve = [] +cuda_test = [] batch_affine = [] msm = [] verify = [] @@ -91,8 +96,9 @@ parallel = [ "std", "algebra-core/parallel" ] parallel_random_gen = [] derive = [ "algebra-core/derive" ] asm = [ "algebra-core/llvm_asm" ] +bw6_asm = [ "algebra-core/bw6_asm" ] prefetch = [ "algebra-core/prefetch"] +cuda = [ "algebra-core/cuda", "accel", "std", "lazy_static" ] timing = [ "algebra-core/timing"] timing_detailed = [ "algebra-core/timing_detailed" ] timing_thread_id = [ "algebra-core/timing_thread_id" ] -bw6_asm = [ "algebra-core/bw6_asm" ] diff --git a/algebra/src/bls12_377/curves/g1.rs b/algebra/src/bls12_377/curves/g1.rs index 3c318afda..1fb3c6786 100644 --- a/algebra/src/bls12_377/curves/g1.rs +++ b/algebra/src/bls12_377/curves/g1.rs @@ -1,13 +1,18 @@ use algebra_core::{ biginteger::{BigInteger256, BigInteger384, BigInteger512}, curves::{ + bls12, models::{ModelParameters, SWModelParameters}, GLVParameters, }, - field_new, impl_glv_for_sw, PrimeField, Zero, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, PrimeField, + Zero, }; -use crate::bls12_377::{Fq, Fr}; +use crate::{bls12_377, bls12_377::*}; + +pub type G1Affine = bls12::G1Affine; +pub type G1Projective = 
bls12::G1Projective; #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; @@ -17,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bls12_377, "bls12_377", g1, G1Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -88,6 +95,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G1Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bls12_377/curves/g2.rs b/algebra/src/bls12_377/curves/g2.rs index efab698de..dd221381e 100644 --- a/algebra/src/bls12_377/curves/g2.rs +++ b/algebra/src/bls12_377/curves/g2.rs @@ -1,13 +1,19 @@ -use crate::bls12_377::{g1, Fq, Fq2, Fr}; use algebra_core::{ biginteger::{BigInteger256, BigInteger384, BigInteger512}, curves::{ + bls12, models::{ModelParameters, SWModelParameters}, GLVParameters, }, - field_new, impl_glv_for_sw, PrimeField, Zero, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, PrimeField, + Zero, }; +use crate::{bls12_377, bls12_377::*}; + +pub type G2Affine = bls12::G2Affine; +pub type G2Projective = bls12::G2Projective; + #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; @@ -16,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bls12_377, "bls12_377", g2, G2Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -114,6 +122,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G2Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bls12_377/curves/mod.rs b/algebra/src/bls12_377/curves/mod.rs index bc3c1a127..286feac59 100644 --- a/algebra/src/bls12_377/curves/mod.rs +++ b/algebra/src/bls12_377/curves/mod.rs @@ -1,11 +1,11 @@ use crate::bls12_377::*; -use algebra_core::curves::{ - bls12, - bls12::{Bls12, Bls12Parameters, TwistType}, -}; +use algebra_core::curves::bls12::{Bls12, Bls12Parameters, TwistType}; pub mod g1; +pub use self::g1::{G1Affine, G1Projective}; + pub mod g2; +pub use self::g2::{G2Affine, G2Projective}; #[cfg(test)] mod tests; @@ -26,8 +26,3 @@ impl Bls12Parameters for Parameters { } pub type Bls12_377 = Bls12; - -pub type G1Affine = bls12::G1Affine; -pub type G1Projective = bls12::G1Projective; -pub type G2Affine = bls12::G2Affine; -pub type G2Projective = bls12::G2Projective; diff --git a/algebra/src/bls12_381/curves/g1.rs b/algebra/src/bls12_381/curves/g1.rs index b7508f27f..f0fa7ba72 100644 --- a/algebra/src/bls12_381/curves/g1.rs +++ b/algebra/src/bls12_381/curves/g1.rs @@ -7,7 +7,8 @@ use crate::{ models::{ModelParameters, SWModelParameters}, GLVParameters, }, - field_new, impl_glv_for_sw, PrimeField, Zero, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, PrimeField, + Zero, }; pub type G1Affine = bls12::G1Affine; @@ -21,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bls12_381, "bls12_381", g1, G1Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -91,6 +94,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G1Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bls12_381/curves/g2.rs b/algebra/src/bls12_381/curves/g2.rs index a851d53e0..c62d759ef 100644 --- a/algebra/src/bls12_381/curves/g2.rs +++ 
b/algebra/src/bls12_381/curves/g2.rs @@ -7,7 +7,8 @@ use crate::{ models::{ModelParameters, SWModelParameters}, GLVParameters, }, - field_new, impl_glv_for_sw, PrimeField, Zero, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, PrimeField, + Zero, }; pub type G2Affine = bls12::G2Affine; @@ -21,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bls12_381, "bls12_381", g2, G2Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -100,6 +103,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G2Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bn254/curves/g1.rs b/algebra/src/bn254/curves/g1.rs index b9b59ce23..c020d00af 100644 --- a/algebra/src/bn254/curves/g1.rs +++ b/algebra/src/bn254/curves/g1.rs @@ -1,10 +1,17 @@ use algebra_core::{ biginteger::{BigInteger256, BigInteger512}, - curves::models::{ModelParameters, SWModelParameters}, - field_new, impl_glv_for_sw, GLVParameters, PrimeField, Zero, + curves::{ + bn, + models::{ModelParameters, SWModelParameters}, + }, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, GLVParameters, + PrimeField, Zero, }; -use crate::bn254::{Fq, Fr}; +use crate::{bn254, bn254::*}; + +pub type G1Affine = bn::G1Affine; +pub type G1Projective = bn::G1Projective; #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; @@ -14,6 +21,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bn254, "bn254", g1, G1Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -83,6 +92,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G1Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bn254/curves/g2.rs b/algebra/src/bn254/curves/g2.rs index d4c51e6f5..c2b7382e9 100644 --- a/algebra/src/bn254/curves/g2.rs +++ b/algebra/src/bn254/curves/g2.rs @@ -1,10 +1,17 @@ use algebra_core::{ biginteger::{BigInteger256, BigInteger512}, - curves::models::{ModelParameters, SWModelParameters}, - field_new, impl_glv_for_sw, GLVParameters, PrimeField, Zero, + curves::{ + bn, + models::{ModelParameters, SWModelParameters}, + }, + field_new, impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, GLVParameters, + PrimeField, Zero, }; -use crate::bn254::{g1, Fq, Fq2, Fr}; +use crate::{bn254, bn254::*}; + +pub type G2Affine = bn::G2Affine; +pub type G2Projective = bn::G2Projective; #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; @@ -14,6 +21,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(bn254, "bn254", g2, G2Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger512; const OMEGA: Self::BaseField = field_new!( @@ -107,6 +116,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G2Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bn254/curves/mod.rs b/algebra/src/bn254/curves/mod.rs index 396b77668..53cbeac3e 100644 --- a/algebra/src/bn254/curves/mod.rs +++ b/algebra/src/bn254/curves/mod.rs @@ -1,14 +1,14 @@ use crate::bn254::*; use algebra_core::{ biginteger::BigInteger256, - curves::{ - bn, - bn::{Bn, BnParameters, TwistType}, - }, + curves::bn::{Bn, BnParameters, TwistType}, field_new, }; pub mod g1; +pub use 
self::g1::{G1Affine, G1Projective}; + pub mod g2; +pub use self::g2::{G2Affine, G2Projective}; #[cfg(test)] mod tests; @@ -78,8 +78,3 @@ impl BnParameters for Parameters { } pub type Bn254 = Bn; - -pub type G1Affine = bn::G1Affine; -pub type G1Projective = bn::G1Projective; -pub type G2Affine = bn::G2Affine; -pub type G2Projective = bn::G2Projective; diff --git a/algebra/src/bw6_761/curves/g1.rs b/algebra/src/bw6_761/curves/g1.rs index a6512199e..941bc5aa4 100644 --- a/algebra/src/bw6_761/curves/g1.rs +++ b/algebra/src/bw6_761/curves/g1.rs @@ -8,7 +8,7 @@ use crate::{ }, field_new, fields::PrimeField, - impl_glv_for_sw, + impl_glv_for_sw, impl_scalar_mul_kernel_glv, impl_scalar_mul_parameters, }; pub type G1Affine = GroupAffine; @@ -22,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel_glv!(bw6_761, "bw6_761", g1, G1Projective); + /// The parameters can be obtained from /// Optimized and secure pairing-friendly elliptic /// curves suitable for one layer proof composition @@ -161,6 +163,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G1Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/bw6_761/curves/g2.rs b/algebra/src/bw6_761/curves/g2.rs index a3d363067..619f20552 100644 --- a/algebra/src/bw6_761/curves/g2.rs +++ b/algebra/src/bw6_761/curves/g2.rs @@ -8,7 +8,7 @@ use crate::{ }, field_new, fields::PrimeField, - impl_glv_for_sw, + impl_glv_for_sw, impl_scalar_mul_kernel_glv, impl_scalar_mul_parameters, }; pub type G2Affine = GroupAffine; @@ -22,6 +22,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel_glv!(bw6_761, "bw6_761", g2, G2Projective); + impl GLVParameters for Parameters { type WideBigInt = BigInteger768; @@ -154,6 +156,7 @@ impl SWModelParameters for Parameters { Self::BaseField::zero() } + impl_scalar_mul_parameters!(G2Projective); impl_glv_for_sw!(); } diff --git a/algebra/src/cp6_782/curves/g1.rs b/algebra/src/cp6_782/curves/g1.rs index c2d05df2e..ebe37e417 100644 --- a/algebra/src/cp6_782/curves/g1.rs +++ b/algebra/src/cp6_782/curves/g1.rs @@ -5,7 +5,7 @@ use crate::{ models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = GroupAffine; @@ -19,6 +19,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(cp6_782, "cp6_782", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 5 #[rustfmt::skip] @@ -84,6 +86,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } /// G1_GENERATOR_X = diff --git a/algebra/src/cp6_782/curves/g2.rs b/algebra/src/cp6_782/curves/g2.rs index 88d0ea2ce..4d30afcd1 100644 --- a/algebra/src/cp6_782/curves/g2.rs +++ b/algebra/src/cp6_782/curves/g2.rs @@ -5,7 +5,7 @@ use crate::{ models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G2Affine = GroupAffine; @@ -19,6 +19,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(cp6_782, "cp6_782", g2, G2Projective); + impl SWModelParameters for Parameters { /// COEFF_A = (0, 
0, COEFF_A * TWIST^2) = (0, 0, 5) #[rustfmt::skip] @@ -118,6 +120,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G2_GENERATOR_X, G2_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G2_GENERATOR_X, G2_GENERATOR_Y); + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq3 = diff --git a/algebra/src/ed_on_bls12_377/curves/mod.rs b/algebra/src/ed_on_bls12_377/curves/mod.rs index 5fd929481..d76440175 100644 --- a/algebra/src/ed_on_bls12_377/curves/mod.rs +++ b/algebra/src/ed_on_bls12_377/curves/mod.rs @@ -5,9 +5,11 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; +impl_scalar_mul_kernel!(ed_on_bls12_377, "ed_on_bls12_377", proj, EdwardsProjective); + #[cfg(test)] mod tests; @@ -65,6 +67,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -*elem } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_bls12_381/curves/mod.rs b/algebra/src/ed_on_bls12_381/curves/mod.rs index fe01f833a..6c4d254c6 100644 --- a/algebra/src/ed_on_bls12_381/curves/mod.rs +++ b/algebra/src/ed_on_bls12_381/curves/mod.rs @@ -5,7 +5,7 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; #[cfg(test)] @@ -58,6 +58,8 @@ impl ModelParameters for EdwardsParameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(ed_on_bls12_381, "ed_on_bls12_381", proj, EdwardsProjective); + impl TEModelParameters for EdwardsParameters { /// COEFF_A = -1 #[rustfmt::skip] @@ -100,6 +102,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -(*elem) } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_bn254/curves/mod.rs b/algebra/src/ed_on_bn254/curves/mod.rs index d4286349e..41634da40 100644 --- a/algebra/src/ed_on_bn254/curves/mod.rs +++ b/algebra/src/ed_on_bn254/curves/mod.rs @@ -5,7 +5,7 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; #[cfg(test)] @@ -44,6 +44,7 @@ impl ModelParameters for EdwardsParameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(ed_on_bn254, "ed_on_bn254", proj, EdwardsProjective); impl TEModelParameters for EdwardsParameters { /// COEFF_A = 1 #[rustfmt::skip] @@ -86,6 +87,8 @@ impl TEModelParameters for EdwardsParameters { const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (GENERATOR_X, GENERATOR_Y); type MontgomeryModelParameters = EdwardsParameters; + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_cp6_782/curves/mod.rs b/algebra/src/ed_on_cp6_782/curves/mod.rs index 0e218cc4e..face754c7 100644 --- a/algebra/src/ed_on_cp6_782/curves/mod.rs +++ b/algebra/src/ed_on_cp6_782/curves/mod.rs @@ -4,7 +4,7 @@ use crate::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, 
twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; use crate::ed_on_cp6_782::{fq::Fq, fr::Fr}; @@ -23,6 +23,7 @@ impl ModelParameters for EdwardsParameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(ed_on_cp6_782, "ed_on_cp6_782", proj, EdwardsProjective); impl TEModelParameters for EdwardsParameters { /// COEFF_A = -1 = /// 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458176 @@ -72,6 +73,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -*elem } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_mnt4_298/curves/mod.rs b/algebra/src/ed_on_mnt4_298/curves/mod.rs index 681a885e1..d5e5879f9 100644 --- a/algebra/src/ed_on_mnt4_298/curves/mod.rs +++ b/algebra/src/ed_on_mnt4_298/curves/mod.rs @@ -5,7 +5,7 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; #[cfg(test)] @@ -27,6 +27,8 @@ impl ModelParameters for EdwardsParameters { // R for Fq: 223364648326281414938801705359223029554923725549792420683051274872200260503540791531766876 // R for Fr: 104384076783966083500464392945960916666734135485183910065100558776489954102951241798239545 +impl_scalar_mul_kernel!(ed_on_mnt4_298, "ed_on_mnt4_298", proj, EdwardsProjective); + impl TEModelParameters for EdwardsParameters { /// COEFF_A = -1 /// Needs to be in the Montgomery residue form in Fq @@ -81,6 +83,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -*elem } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/ed_on_mnt4_753/curves/mod.rs b/algebra/src/ed_on_mnt4_753/curves/mod.rs index 1bcf02e3e..67742eef7 100644 --- a/algebra/src/ed_on_mnt4_753/curves/mod.rs +++ b/algebra/src/ed_on_mnt4_753/curves/mod.rs @@ -5,7 +5,7 @@ use algebra_core::{ models::{ModelParameters, MontgomeryModelParameters, TEModelParameters}, twisted_edwards_extended::{GroupAffine, GroupProjective}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; #[cfg(test)] @@ -27,6 +27,7 @@ impl ModelParameters for EdwardsParameters { // R for Fq: 11407975440035778516953587871987109648531742722982233186120790377529569367095961954159305159259556262528904776132787438725571821295685691762729353555475679813615501328617736020411951837995932262333059670631633855898874183380802 // R for Fr: 933352698056040166367534174176950366489065242993745918174914647273231163953185260894581718311971532174387033963715296372791285468903747270837716556902938133611910788060028435531754797383796835009316018259656953442114538695438 +impl_scalar_mul_kernel!(ed_on_mnt4_753, "ed_on_mnt4_753", proj, EdwardsProjective); impl TEModelParameters for EdwardsParameters { /// COEFF_A = -1 /// Needs to be in the Montgomery residue form in Fq @@ -102,6 +103,8 @@ impl TEModelParameters for EdwardsParameters { fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { -*elem } + + impl_scalar_mul_parameters!(EdwardsProjective); } impl MontgomeryModelParameters for EdwardsParameters { diff --git a/algebra/src/mnt4_298/curves/g1.rs b/algebra/src/mnt4_298/curves/g1.rs index 
e17684810..a70ac5996 100644 --- a/algebra/src/mnt4_298/curves/g1.rs +++ b/algebra/src/mnt4_298/curves/g1.rs @@ -5,7 +5,7 @@ use algebra_core::{ mnt4, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = mnt4::G1Affine; @@ -20,6 +20,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(mnt4_298, "mnt4_298", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 2 /// Reference: https://github.com/scipr-lab/libff/blob/c927821ebe02e0a24b5e0f9170cec5e211a35f08/libff/algebra/curves/mnt/mnt4/mnt4_init.cpp#L116 @@ -54,6 +56,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } // Generator of G1 diff --git a/algebra/src/mnt4_298/curves/g2.rs b/algebra/src/mnt4_298/curves/g2.rs index 9b5c89a63..84b5a4bfd 100644 --- a/algebra/src/mnt4_298/curves/g2.rs +++ b/algebra/src/mnt4_298/curves/g2.rs @@ -6,7 +6,7 @@ use algebra_core::{ mnt4::MNT4Parameters, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G2Affine = mnt4::G2Affine; @@ -29,6 +29,8 @@ pub const MUL_BY_A_C0: Fq = G1_COEFF_A_NON_RESIDUE; #[rustfmt::skip] pub const MUL_BY_A_C1: Fq = G1_COEFF_A_NON_RESIDUE; +impl_scalar_mul_kernel!(mnt4_298, "mnt4_298", g2, G2Projective); + impl SWModelParameters for Parameters { const COEFF_A: Fq2 = mnt4_298::Parameters::TWIST_COEFF_A; // B coefficient of MNT4-298 G2 = @@ -82,6 +84,8 @@ impl SWModelParameters for Parameters { fn mul_by_a(elt: &Fq2) -> Fq2 { field_new!(Fq2, MUL_BY_A_C0 * &elt.c0, MUL_BY_A_C1 * &elt.c1,) } + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq2 = field_new!(Fq2, G2_GENERATOR_X_C0, G2_GENERATOR_X_C1); diff --git a/algebra/src/mnt4_753/curves/g1.rs b/algebra/src/mnt4_753/curves/g1.rs index ce101a3b2..90a11fa0d 100644 --- a/algebra/src/mnt4_753/curves/g1.rs +++ b/algebra/src/mnt4_753/curves/g1.rs @@ -5,7 +5,7 @@ use algebra_core::{ mnt4, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = mnt4::G1Affine; @@ -20,6 +20,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(mnt4_753, "mnt4_753", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 2 #[rustfmt::skip] @@ -66,6 +68,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } // Generator of G1 diff --git a/algebra/src/mnt4_753/curves/g2.rs b/algebra/src/mnt4_753/curves/g2.rs index e5e9f8c4c..28ea85853 100644 --- a/algebra/src/mnt4_753/curves/g2.rs +++ b/algebra/src/mnt4_753/curves/g2.rs @@ -6,7 +6,7 @@ use algebra_core::{ mnt4::MNT4Parameters, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G2Affine = mnt4::G2Affine; @@ -29,6 +29,8 @@ pub const MUL_BY_A_C0: Fq = G1_COEFF_A_NON_RESIDUE; #[rustfmt::skip] pub const MUL_BY_A_C1: Fq = G1_COEFF_A_NON_RESIDUE; +impl_scalar_mul_kernel!(mnt4_753, "mnt4_753", g2, G2Projective); + impl 
SWModelParameters for Parameters { const COEFF_A: Fq2 = mnt4_753::Parameters::TWIST_COEFF_A; // B coefficient of MNT4-753 G2 = @@ -103,6 +105,8 @@ impl SWModelParameters for Parameters { fn mul_by_a(elt: &Fq2) -> Fq2 { field_new!(Fq2, MUL_BY_A_C0 * &elt.c0, MUL_BY_A_C1 * &elt.c1,) } + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq2 = field_new!(Fq2, G2_GENERATOR_X_C0, G2_GENERATOR_X_C1); diff --git a/algebra/src/mnt6_298/curves/g1.rs b/algebra/src/mnt6_298/curves/g1.rs index f10388cab..c476b91f8 100644 --- a/algebra/src/mnt6_298/curves/g1.rs +++ b/algebra/src/mnt6_298/curves/g1.rs @@ -5,7 +5,7 @@ use algebra_core::{ mnt6, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = mnt6::G1Affine; @@ -19,6 +19,9 @@ impl ModelParameters for Parameters { type BaseField = Fq; type ScalarField = Fr; } + +impl_scalar_mul_kernel!(mnt6_298, "mnt6_298", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = #[rustfmt::skip] @@ -57,6 +60,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } /// G1_GENERATOR_X = diff --git a/algebra/src/mnt6_298/curves/g2.rs b/algebra/src/mnt6_298/curves/g2.rs index a4b779f1f..f5411f24f 100644 --- a/algebra/src/mnt6_298/curves/g2.rs +++ b/algebra/src/mnt6_298/curves/g2.rs @@ -6,7 +6,7 @@ use algebra_core::{ mnt6::MNT6Parameters, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G2Affine = mnt6::G2Affine; @@ -44,6 +44,8 @@ pub const MUL_BY_A_C1: Fq = field_new!(Fq, BigInteger320([ /// MUL_BY_A_C2 = COEFF_A pub const MUL_BY_A_C2: Fq = g1::Parameters::COEFF_A; +impl_scalar_mul_kernel!(mnt6_298, "mnt6_298", g2, G2Projective); + impl SWModelParameters for Parameters { const COEFF_A: Fq3 = mnt6_298::Parameters::TWIST_COEFF_A; #[rustfmt::skip] @@ -99,6 +101,8 @@ impl SWModelParameters for Parameters { MUL_BY_A_C2 * &elt.c0, ) } + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq3 = diff --git a/algebra/src/mnt6_753/curves/g1.rs b/algebra/src/mnt6_753/curves/g1.rs index 7ba2daf0d..9765e47fd 100644 --- a/algebra/src/mnt6_753/curves/g1.rs +++ b/algebra/src/mnt6_753/curves/g1.rs @@ -5,7 +5,7 @@ use algebra_core::{ mnt6, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, impl_scalar_mul_parameters, }; pub type G1Affine = mnt6::G1Affine; @@ -20,6 +20,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(mnt6_753, "mnt6_753", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 11 #[rustfmt::skip] @@ -66,6 +68,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } // Generator of G1 diff --git a/algebra/src/mnt6_753/curves/g2.rs b/algebra/src/mnt6_753/curves/g2.rs index a203b25c1..9da13d77a 100644 --- a/algebra/src/mnt6_753/curves/g2.rs +++ b/algebra/src/mnt6_753/curves/g2.rs @@ -6,7 +6,7 @@ use algebra_core::{ mnt6::MNT6Parameters, models::{ModelParameters, SWModelParameters}, }, - field_new, + field_new, impl_scalar_mul_kernel, 
impl_scalar_mul_parameters, }; pub type G2Affine = mnt6::G2Affine; @@ -58,6 +58,8 @@ pub const MUL_BY_A_C1: Fq = field_new!(Fq, BigInteger768([ /// MUL_BY_A_C2 = COEFF_A pub const MUL_BY_A_C2: Fq = g1::Parameters::COEFF_A; +impl_scalar_mul_kernel!(mnt6_753, "mnt6_753", g2, G2Projective); + impl SWModelParameters for Parameters { const COEFF_A: Fq3 = mnt6_753::Parameters::TWIST_COEFF_A; // B coefficient of MNT6-753 G2 = @@ -152,6 +154,8 @@ impl SWModelParameters for Parameters { MUL_BY_A_C2 * &elt.c0, ) } + + impl_scalar_mul_parameters!(G2Projective); } const G2_GENERATOR_X: Fq3 = diff --git a/algebra/src/tests/cuda.rs b/algebra/src/tests/cuda.rs new file mode 100644 index 000000000..e407838fe --- /dev/null +++ b/algebra/src/tests/cuda.rs @@ -0,0 +1,61 @@ +use algebra_core::{ + cuda::scalar_mul::{GPUScalarMul, GPUScalarMulSlice, MAX_GROUP_ELEM_BYTES}, + AffineCurve, BatchGroupArithmeticSlice, PrimeField, UniformRand, Zero, +}; +use rand::SeedableRng; +use rand_xorshift::XorShiftRng; + +use crate::{cfg_chunks_mut, tests::helpers::create_pseudo_uniform_random_elems}; + +const CHUNK_SIZE: usize = 1 << 12; + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +#[allow(unused)] +pub fn test_cuda_scalar_mul() { + #[cfg(not(feature = "big_n"))] + const MAX_LOGN: usize = 14; + #[cfg(feature = "big_n")] + const MAX_LOGN: usize = 20; + + let cuda_group_size = 1 << 5; + if core::mem::size_of::() >= MAX_GROUP_ELEM_BYTES { + println!("Group size too large to run on GPU, defaulting to CPU-only implementation"); + } + + const SAMPLES: usize = 1 << MAX_LOGN; + + let _lol = G::Projective::zero(); + let mut rng = XorShiftRng::seed_from_u64(234872845u64); + + let exps_h = (0..SAMPLES) + .map(|_| G::ScalarField::rand(&mut rng).into_repr()) + .collect::>(); + let mut bases_h = create_pseudo_uniform_random_elems::(&mut rng, MAX_LOGN); + + let mut bases_d = bases_h.to_vec(); + let mut exps_cpu = exps_h.to_vec(); + + let now = std::time::Instant::now(); + cfg_chunks_mut!(bases_h, CHUNK_SIZE) + .zip(cfg_chunks_mut!(exps_cpu, CHUNK_SIZE)) + .for_each(|(b, s)| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + println!("CPU mul: {}us", now.elapsed().as_micros()); + + ::Projective::clear_gpu_profiling_data(); + + let mut junk_data = bases_d.to_vec(); + for _ in 0..3 { + let now = std::time::Instant::now(); + &mut junk_data[..].cpu_gpu_scalar_mul(&exps_h[..], cuda_group_size, CHUNK_SIZE); + println!("CPU + GPU mul: {}us", now.elapsed().as_micros()); + } + let now = std::time::Instant::now(); + &mut bases_d[..].cpu_gpu_scalar_mul(&exps_h[..], cuda_group_size, CHUNK_SIZE); + println!("CPU + GPU mul: {}us", now.elapsed().as_micros()); + + for (b_h, b_d) in bases_h.into_iter().zip(bases_d.into_iter()) { + assert_eq!(b_h, b_d); + } +} diff --git a/algebra/src/tests/macros.rs b/algebra/src/tests/macros.rs index f4f0b089a..72584e57d 100644 --- a/algebra/src/tests/macros.rs +++ b/algebra/src/tests/macros.rs @@ -7,7 +7,7 @@ macro_rules! std_curve_tests { }; use rand::Rng; - use crate::tests::{curves::*, groups::*, msm::*}; + use crate::tests::{cuda::*, curves::*, groups::*, msm::*}; #[test] #[cfg(feature = "curve")] @@ -99,6 +99,18 @@ macro_rules! 
std_curve_tests { test_msm::(); } + #[test] + #[cfg(any(feature = "curve", feature = "cuda_test"))] + fn test_g1_cuda_scalar_mul() { + test_cuda_scalar_mul::(); + } + + #[test] + #[cfg(any(feature = "curve", feature = "cuda_test"))] + fn test_g2_cuda_scalar_mul() { + test_cuda_scalar_mul::(); + } + #[test] #[cfg(feature = "pairing")] fn test_bilinearity() { @@ -152,7 +164,7 @@ macro_rules! edwards_curve_tests { }; use rand::Rng; - use crate::tests::{curves::*, groups::*, msm::*}; + use crate::tests::{cuda::*, curves::*, groups::*, msm::*}; #[test] #[cfg(feature = "curve")] @@ -206,7 +218,13 @@ macro_rules! edwards_curve_tests { } #[test] - #[cfg(feature = "curve")] + #[cfg(any(feature = "curve", feature = "cuda_test"))] + fn test_edwards_cuda_scalar_mul() { + test_cuda_scalar_mul::(); + } + + #[test] + #[cfg(any(feature = "curve", feature = "cuda_test"))] fn test_generator() { let generator = EdwardsAffine::prime_subgroup_generator(); assert!(generator.is_on_curve()); diff --git a/algebra/src/tests/mod.rs b/algebra/src/tests/mod.rs index 93864eadf..fee88d8e9 100644 --- a/algebra/src/tests/mod.rs +++ b/algebra/src/tests/mod.rs @@ -1,3 +1,4 @@ +pub(crate) mod cuda; pub(crate) mod curves; pub(crate) mod fields; pub(crate) mod groups;
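The new cuda test above is the intended entry point for the feature: generate scalars, run the CPU-only batched multiplication as a reference, then run the CPU + GPU co-processing path over the same inputs. A trimmed-down usage sketch on a concrete curve follows; block and chunk sizes are taken from the test, and the GPU path only engages when the crate is built with the cuda feature and a working accel toolchain, otherwise the same call presumably runs the CPU implementation:

use algebra::bls12_377::{Fr, G1Affine};
use algebra_core::{
    cuda::scalar_mul::GPUScalarMulSlice, BatchGroupArithmeticSlice, PrimeField, UniformRand,
};
use rand::SeedableRng;
use rand_xorshift::XorShiftRng;

fn scalar_mul_demo(bases: &mut [G1Affine]) {
    let mut rng = XorShiftRng::seed_from_u64(0);
    let exps: Vec<_> = (0..bases.len())
        .map(|_| Fr::rand(&mut rng).into_repr())
        .collect();

    // CPU-only reference: batched wNAF with window size 4. It mutates its
    // exponent slice, hence the working copies.
    let mut cpu_bases = bases.to_vec();
    let mut cpu_exps = exps.clone();
    cpu_bases[..].batch_scalar_mul_in_place(&mut cpu_exps[..], 4);

    // CPU + GPU co-processing over the original bases: 1 << 5 is the CUDA
    // block size ("cuda_group_size") and 1 << 12 the per-chunk size, matching
    // the values used in algebra/src/tests/cuda.rs.
    bases.cpu_gpu_scalar_mul(&exps[..], 1 << 5, 1 << 12);
}

The #[cfg(any(feature = "curve", feature = "cuda_test"))] gates added to the test macros mean these paths can be exercised through the new cuda_test feature without enabling the full curve test suite.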