CPU code for Sigma #222

Open · wants to merge 43 commits into base: master

Changes from all commits (43 commits)
30679ff
Adding GPU-MPC
May 17, 2024
3631ee6
Adding gitignore
May 17, 2024
87eae6b
Added weights as a submodule
May 17, 2024
02d429b
Added mnist as a submodule
May 17, 2024
7ef4472
Added cutlass as a submodule
May 17, 2024
a4f613a
Added SEAL as a submodule
May 17, 2024
28a6b8d
Fixed plots for sequential runs of Fig5a and Fig5b
May 18, 2024
2de6295
Added test for rotary embedding
May 21, 2024
22775c1
Removed printfs
May 22, 2024
a3af98f
Changed llama13b parameters
May 22, 2024
a53836e
Added a flag for correctness
May 24, 2024
22f7bf6
Added support for llama on the cpu
May 26, 2024
7e27e28
Added run_experiment.py
May 26, 2024
b744ce5
Added sigma.h
May 26, 2024
f81f8e5
Changed keybuf size
May 27, 2024
561b739
Changed keybuf size
May 27, 2024
ae64b28
Increased input bitlength of softmax
May 27, 2024
a855f05
Updated README.md
May 27, 2024
5163855
Create README.md
neha-jawalkar May 27, 2024
c06b675
Update README.md
neha-jawalkar May 27, 2024
0068691
Create README.md
neha-jawalkar May 27, 2024
076dc67
Update README.md
neha-jawalkar May 27, 2024
fc1f0d6
Update README.md
neha-jawalkar May 27, 2024
c423f4c
Update README.md
neha-jawalkar May 27, 2024
b1a46d8
Update README.md
neha-jawalkar May 27, 2024
70b94fa
Update README.md
neha-jawalkar May 27, 2024
e31a09e
Update README.md
neha-jawalkar May 27, 2024
2810cc2
Update README.md
neha-jawalkar May 27, 2024
d5538a0
Update README.md
neha-jawalkar May 27, 2024
b77979b
Update README.md
neha-jawalkar May 27, 2024
c3a5c46
Update README.md
neha-jawalkar May 27, 2024
a9c6df1
Update config.json
neha-jawalkar May 27, 2024
877453b
Merge remote-tracking branch 'origin/sigma-2'
May 28, 2024
70689c0
Legend in Fig11 and changed paths in llama7b.cpp
May 28, 2024
6e4216d
Update README.md
neha-jawalkar May 28, 2024
54c62ae
Update README.md
neha-jawalkar May 28, 2024
222db9a
cpu benchmarking scripts
kanav99 May 28, 2024
b488d67
add CPU instructions
kanav99 May 28, 2024
2297e70
Update README.md
neha-jawalkar May 29, 2024
521aaa1
Update setup.sh
neha-jawalkar May 29, 2024
6a35fb2
minor fix
kanav99 May 29, 2024
bb2506b
minor fixes
kanav99 Jun 7, 2024
8f43440
Update Readme and setup.sh
kanav99 Jun 19, 2024
6 changes: 6 additions & 0 deletions GPU-MPC/Makefile
@@ -56,6 +56,9 @@ truncate: tests/fss/truncate.cu
mha: tests/fss/mha.cu
$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/mha

rotary_embedding: tests/fss/rotary_embedding.cu
$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/rotary_embedding

secfloat_softmax: tests/fss/secfloat_softmax.cu
$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) $(SECFLOAT_LIBS) -o tests/fss/secfloat_softmax

@@ -107,6 +110,9 @@ orca_inference_u32: experiments/orca/orca_inference.cu
sigma: experiments/sigma/sigma.cu
$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/sigma/sigma

sigma_correctness: experiments/sigma/sigma.cu
$(CXX) $(FLAGS) -DCORRECTNESS=1 $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/sigma/sigma

piranha: experiments/orca/piranha.cu
$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/piranha

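The `sigma_correctness` target rebuilds the same `sigma.cu` with `-DCORRECTNESS=1`, matching the "Added a flag for correctness" commit. The diff does not show how the flag is consumed inside `sigma.cu`; below is a minimal sketch of the usual preprocessor-guard pattern such a flag selects (everything in it is hypothetical):

```cpp
#include <cstdio>

int main() {
#ifdef CORRECTNESS
    // Correctness build: run on known inputs and compare the secure
    // output against a cleartext reference before reporting timings.
    std::printf("correctness checks enabled\n");
#else
    // Benchmark build: measure performance only.
    std::printf("benchmark mode\n");
#endif
    return 0;
}
```

Compiling with `-DCORRECTNESS=1`, as the `sigma_correctness` rule does, selects the first branch.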
47 changes: 9 additions & 38 deletions GPU-MPC/README.md
@@ -1,7 +1,7 @@

# Orca: FSS-based Secure Training and Inference with GPUs
# GPU-MPC

Implementation of protocols from the paper [Orca](https://eprint.iacr.org/2023/206).
Implementation of protocols from the papers [Orca](https://eprint.iacr.org/2023/206) and [SIGMA]().

**Warning**: This is an academic proof-of-concept prototype and has not received careful code review. This implementation is NOT ready for production use.

@@ -33,48 +33,19 @@ sh setup.sh
```
make orca
```
4. Make sigma (this does not require making Orca)

## Run

1. Each party runs two processes: a dealer and an evaluator. The configuration needs to define the GPU on which the dealer will run, and the directory in which it will store FSS keys. This is done in `config.json` as:

```javascript
"dealer" :
{ "gpu": <The ID of the GPU to use>,
"key_dir": <The directory in which the dealer will store keys>
}
```

FSS keys tend to be quite large, so please make sure that the key directory has at least 500GB of free space. Please also ensure that it is writable.
Orca's runtime can be sensitive to disk latency, so we recommend placing this directory on a fast NVMe drive. Using SATA can slow down Orca on some tasks.

Similarly, the configuration also needs to define the GPU on which the evaluator will run, and the IP address of its peer, i.e., the address of the remote party the evaluator will communicate with for secure training or inference. This is done in `config.json` as:

```javascript
"evaluator" :
{ "gpu": <The ID of the GPU to use>,
"peer": <The address of the remote peer>
}
```

```
make sigma
```

You can run Orca to generate Figures 5a and 5b, as well as Tables 3, 4, 6, 7, 8 and 9. Table 5 can be generated by throttling the network bandwidth (with `tc`, for example) and regenerating Table 4. The script reports numbers for Tables 4, 6, 7 and 9 as the average of 10 iterations.
## Run Orca

Figure 5b and Table 3 run end-to-end training and so can take a couple of days to finish.

Evaluation runs through `experiments/orca/run_experiment.py`. Here are the relevant options:

```
usage: run_experiment.py [-h] [--figure FIGURE] [--table TABLE] --party 0/1

optional arguments:
--figure FIGURE Figure # to run.
--table TABLE Table # to run.
--all true Run all the experiments.
```
Please see the [Orca README](experiments/orca/README.md).

Results are stored in the `output/P<party-number>/Table<table-number>` or `output/P<party-number>/Fig<figure-number>` folders.
## Run SIGMA

Log files (which might help with debugging) are stored in the corresponding experiment folders, i.e., in `output/P<party-number>/Table<table-number>/logs` and `output/P<party-number>/Fig<figure-number>/logs`.
Please see the [SIGMA README](experiments/sigma/README.md).

## Docker Build

@@ -107,7 +78,7 @@ docker pull trajore/gpu_mpc
sudo docker run --gpus all --network host -v /home/$USER/path_to_GPU-MPC/:/home -it container_name /bin/bash

```
Then Run setup.sh to configure according to GPU_arch and make orca as mentioned above.
Then run `setup.sh` to configure for your GPU architecture (`GPU_ARCH`) and make Orca/SIGMA as described above.

## Citation

2 changes: 0 additions & 2 deletions GPU-MPC/backend/orca.h
@@ -96,8 +96,6 @@ class Orca : public OrcaBase<T>
{
assert(0);
}
// auto h_data = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL);
// printf("Truncate output=%lu, %lu, %lu\n", h_data[0], h_data[1], h_data[in.size() - 1]);

auto end = std::chrono::high_resolution_clock::now();
auto elapsed = end - start;
26 changes: 6 additions & 20 deletions GPU-MPC/backend/sigma.h
@@ -128,12 +128,9 @@ class SIGMA : public Backend<T>
p.N = b.d2;
p.batchSz = 1;
stdInit(p, bw, 0);

auto k = readGPUMatmulKey<T>(p, TruncateType::None, &keyBuf);
c.d_data = gpuMatmul(peer, party, p, k, a.d_data, b.data, useBias ? d.data : (T *)NULL, TruncateType::None, &g, &s, false);
// printf("Matmul weights=%ld, %ld, %ld\n", b.data[0], b.data[1], b.data[b.size() - 1]);

// auto h_out = (T*) moveToCPU((u8*) c.d_data, p.size_C * sizeof(T), NULL);
// printf("Matmul output=%ld, %ld\n", h_out[0], h_out[1]);

auto end = std::chrono::high_resolution_clock::now();
auto elapsed = end - start;
@@ -218,16 +215,12 @@

void truncateForward(Tensor<T> &in, u64 shift, u8 mode = 0)
{
// printf("Truncate=%lu, %lu, %lu\n", mode, shift, size);
auto start = std::chrono::high_resolution_clock::now();

TruncateType t = TruncateType::TrFloor;
auto k = readGPUTruncateKey<T>(t, &keyBuf);
in.d_data = gpuTruncate<T, T>(k.bin, k.bout, t, k, k.shift, peer, party, k.N, in.d_data, &g, &s);

// auto h_data = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL);
// printf("Truncate output=%lu, %lu, %lu\n", h_data[0], h_data[1], h_data[in.size() - 1]);

auto end = std::chrono::high_resolution_clock::now();
auto elapsed = end - start;
s.truncate_time += std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
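`TrFloor` is floor truncation: divide a fixed-point value by 2^shift, rounding toward negative infinity. A cleartext sketch of the function the truncate key lets the parties evaluate on shares (the secure protocol itself lives in `gpuTruncate`):

```cpp
#include <cstdint>

// Floor-truncate a signed bw-bit value by `shift` bits and reduce
// the result back into the bw-bit ring.
uint64_t trFloor(uint64_t x, int bw, int shift) {
    // Sign-extend the bw-bit value to 64 bits.
    int64_t sx = (int64_t)(x << (64 - bw)) >> (64 - bw);
    // Arithmetic right shift implements floor division by 2^shift.
    int64_t y = sx >> shift;
    uint64_t mask = (bw == 64) ? ~0ULL : ((1ULL << bw) - 1);
    return (uint64_t)y & mask;
}
```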
@@ -242,25 +235,20 @@

void output(Tensor<T> &a)
{
// printf("Inside output=%lx\n", a.d_data);
// int tmpBw = bw - scale;
int N = a.size();
// printf("keyBuf=%lx, %lu\n", keyBuf, keyBuf - startPtr);
unmaskValues(bw, N, a.d_data, (T *)keyBuf, &s);
// printf("boo\n");
moveIntoCPUMem((u8 *)a.data, (u8 *)a.d_data, N * sizeof(T), &s);
}

void add(const std::vector<Tensor<T> *> &in, Tensor<T> &out)
{
int tmpBw = bw - scale;
int N = in[0]->size();
std::vector<T *> gpuInp;
for (int i = 0; i < in.size(); i++)
{
gpuInp.push_back(in[i]->d_data);
}
out.d_data = gpuAdd(tmpBw, N, gpuInp);
out.d_data = gpuAdd(bw, N, gpuInp);
}
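The change above passes the full bitwidth `bw` to `gpuAdd` instead of `tmpBw = bw - scale`. Share addition is linear and local, so it should happen in the same `bw`-bit ring the shares live in; summing at a reduced width would wrap shares that are still valid at full width. A standalone sketch of the semantics (a hypothetical CPU analogue of what `gpuAdd` computes, not the repository's implementation):

```cpp
#include <cstdint>
#include <vector>

// Elementwise sum of k share vectors in the bw-bit ring Z_{2^bw}.
// Addition of secret shares is local: no keys or communication needed.
std::vector<uint64_t> addShares(const std::vector<std::vector<uint64_t>> &in, int bw) {
    uint64_t mask = (bw == 64) ? ~0ULL : ((1ULL << bw) - 1);
    std::vector<uint64_t> out(in[0].size(), 0);
    for (const auto &v : in)
        for (size_t i = 0; i < out.size(); i++)
            out[i] = (out[i] + v[i]) & mask; // reduce mod 2^bw
    return out;
}
```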

void optimize(LayerGraphNode<T> *root)
@@ -281,6 +269,7 @@ class SIGMAKeygen : public Backend<T>
size_t keyBufSize = 0;
int party = -1;
std::string keyFile;
size_t keySize = 0;
int scale;
int bw;
AESGlobalContext g;
@@ -312,7 +301,7 @@

void close()
{
size_t keySize = keyBuf - startPtr;
/*size_t*/ keySize = keyBuf - startPtr;
size_t padding = 4096 - (keySize % 4096);
char *zeros = new char[padding];
memset(zeros, 0, padding);
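`close()` now records the final key size in the `keySize` member and pads the buffer out to a 4096-byte boundary, as block-aligned (e.g. `O_DIRECT`-style) writes require. One detail worth noting: `4096 - (keySize % 4096)` yields a full 4096-byte pad when `keySize` is already aligned; if a zero pad is intended in that case, the standard fix is an extra `% 4096`, as in this sketch:

```cpp
#include <cstddef>

// Bytes needed to round `size` up to the next multiple of `align`.
// The trailing % align maps "already aligned" to 0 rather than `align`.
size_t paddingTo(size_t size, size_t align) {
    return (align - (size % align)) % align;
}
// paddingTo(8192, 4096) == 0; paddingTo(8200, 4096) == 4088
```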
@@ -345,6 +334,7 @@
void silu(const Tensor<T> &in, Tensor<T> &out, u64 scale, u64 mode = 0)
{
out.d_data = gpuKeyGenGelu<T, u16, 10>(&keyBuf, party, bw, bw - scale, (int)scale, in.size(), in.d_data, &g);

}

void SIGMALayernormKeygen(const Tensor1D<T> &A, const Tensor1D<T> &B, const Tensor<T> &x, Tensor<T> &y, u64 scale, bool computeMu)
@@ -376,7 +366,6 @@
{
MHAParams pMHA = {X.d1, n_embed, n_heads, dim_W, selfAttn, doNormQKt, doRotEmb};
MHAMulParams pMHAMul = initMHAMulParams(pMHA, bw, scale);
printf("scale=%d\n", pMHAMul.pQKV.shift);
Y.d_data = gpuKeygenMHA(&keyBuf, party, bw, scale, pMHA, pMHAMul, wQKV.data, bQKV.data, wProj.data, bProj.data, X.d_data, &g);
}
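`gpuKeygenMHA` takes a `doRotEmb` flag, reflecting the rotary-embedding support this PR adds (see the `rotary_embedding` test target in the Makefile). For orientation, a cleartext sketch of rotary position embedding under one common pairing convention (a floating-point reference only; the secure version operates on fixed-point shares):

```cpp
#include <cmath>
#include <vector>

// Rotate consecutive pairs (x[2i], x[2i+1]) of a d-dimensional query or
// key vector by an angle that depends on the sequence position `pos`
// and decays geometrically with the pair index i.
void rotaryEmbed(std::vector<float> &x, int pos) {
    int d = (int)x.size(); // assumed even
    for (int i = 0; i < d / 2; i++) {
        float theta = pos * std::pow(10000.0f, -2.0f * i / d);
        float c = std::cos(theta), s = std::sin(theta);
        float x0 = x[2 * i], x1 = x[2 * i + 1];
        x[2 * i]     = x0 * c - x1 * s;
        x[2 * i + 1] = x0 * s + x1 * c;
    }
}
```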

@@ -393,16 +382,13 @@

void add(const std::vector<Tensor<T> *> &in, Tensor<T> &out)
{
int tmpBw = bw - scale;
int N = in[0]->size();
// printf("Add input=%d, %lx, %lx\n", N, in[0]->d_data, in[1]->d_data);
std::vector<T *> gpuInp;
for (int i = 0; i < in.size(); i++)
{
gpuInp.push_back(in[i]->d_data);
// printf("Add inp=%lx\n", in[i]->d_data);
}
out.d_data = gpuAdd(tmpBw, N, gpuInp);
out.d_data = gpuAdd(bw, N, gpuInp);
}

void addbias(Tensor<T> &x, const Tensor1D<T> &bias)
91 changes: 91 additions & 0 deletions GPU-MPC/experiments/orca/README.md
@@ -0,0 +1,91 @@


# Orca: FSS-based Secure Training and Inference with GPUs

Implementation of protocols from the paper [Orca](https://eprint.iacr.org/2023/206).

**Warning**: This is an academic proof-of-concept prototype and has not received careful code review. This implementation is NOT ready for production use.

## Build

This project requires NVIDIA GPUs, and assumes that GPU drivers and the [NVIDIA CUDA Toolkit](https://docs.nvidia.com/cuda/) are already installed. The following has been tested on Ubuntu 20.04 with CUDA 11.7, CMake 3.27.2 and g++-9.

Please note that Sytorch requires CMake version >= 3.17, and the build will fail if this dependency is not met.

The code uses CUTLASS version 2.11 by default, so if you change the CUDA version, please make sure that the CUTLASS version being built is compatible with the new CUDA version. To change the version of CUTLASS being built, add `git checkout <branch>;` after line 31 (`cd ext/cutlass;`) of setup.sh.

The last line of `setup.sh` tries to install `matplotlib`, which is needed for generating Figures 5a and 5b. In our experience, the installation fails if the versions of Python and `pip` do not match. In case the installation fails, please install `matplotlib` manually before running `run_experiment.py`.

1. Export environment variables

```
export CUDA_VERSION=11.7
export GPU_ARCH=86
```

2. Set up the environment

```
sh setup.sh
```

3. Make Orca

```
make orca
```

## Run Orca

1. Each party runs two processes: a dealer and an evaluator. The configuration needs to define the GPU on which the dealer will run, and the directory in which it will store FSS keys. This is done in `config.json` as:

```javascript
"dealer" :
{ "gpu": <The ID of the GPU to use>,
"key_dir": <The directory in which the dealer will store keys>
}
```

FSS keys tend to be quite large, so please make sure that the key directory has at least 500GB of free space. Please also ensure that it is writable.

Similarly, the configuration also needs to define the GPU on which the evaluator will run, and the IP address of its peer, i.e., the address of the remote party the evaluator will communicate with for secure training or inference. This is done in `config.json` as:

```javascript
"evaluator" :
{ "gpu": <The ID of the GPU to use>,
"peer": <The address of the remote peer>
}
```
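Put together, a complete `config.json` carries both blocks. As a hedged illustration of how a party might read these fields (assuming the header-only nlohmann/json library; the repository's actual parsing code may differ, and all values shown are hypothetical):

```cpp
#include <fstream>
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

int main() {
    // Expected shape (all values hypothetical):
    // { "dealer":    { "gpu": 0, "key_dir": "/mnt/nvme/keys/" },
    //   "evaluator": { "gpu": 1, "peer": "10.0.0.2" } }
    std::ifstream f("config.json");
    nlohmann::json cfg = nlohmann::json::parse(f);

    int dealerGpu      = cfg["dealer"]["gpu"];
    std::string keyDir = cfg["dealer"]["key_dir"];
    int evalGpu        = cfg["evaluator"]["gpu"];
    std::string peerIp = cfg["evaluator"]["peer"];

    std::cout << "dealer: GPU " << dealerGpu << ", keys in " << keyDir
              << "; evaluator: GPU " << evalGpu << ", peer " << peerIp << "\n";
}
```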

You can run Orca to generate Figures 5a and 5b, as well as Tables 3, 4, 6, 7, 8 and 9. Table 5 can be generated by throttling the network bandwidth (with `tc`, for example) and regenerating Table 4. The script reports numbers for Tables 4, 6, 7 and 9 as the average of 10 iterations.

Figure 5b and Table 3 run end-to-end training and so can take a couple of days to finish.

Evaluation runs through `experiments/orca/run_experiment.py`. Here are the relevant options:

```
usage: run_experiment.py [-h] [--figure FIGURE] [--table TABLE] --party 0/1

optional arguments:
--figure FIGURE Figure # to run.
--table TABLE Table # to run.
--all true Run all the experiments.
```
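For example, reproducing a single table as party 0 might look like this (a hypothetical invocation assembled from the flags above):

```
python experiments/orca/run_experiment.py --table 4 --party 0
```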

Results are stored in the `output/P<party-number>/Table<table-number>` or `output/P<party-number>/Fig<figure-number>` folders.

Log files (which might help with debugging) are stored in the corresponding experiment folders, i.e., in `output/P<party-number>/Table<table-number>/logs` and `output/P<party-number>/Fig<figure-number>/logs`.

## Citation

You can cite the paper using the following BibTeX entry:

```
@INPROCEEDINGS {,
author = {N. Jawalkar and K. Gupta and A. Basu and N. Chandran and D. Gupta and R. Sharma},
booktitle = {2024 IEEE Symposium on Security and Privacy (SP)},
title = {Orca: FSS-based Secure Training and Inference with GPUs},
year = {2024}
}
```

2 changes: 1 addition & 1 deletion GPU-MPC/experiments/orca/run_experiment.py
@@ -53,7 +53,7 @@ def run_fig_helper(party, dealer_gpu, eval_gpu, dealer_key_dir, peer_ip, exp_name
plt.xlabel("Iterations")
plt.ylabel("Cross-entropy loss")
plt.savefig("output/P{}/{}/{}.png".format(party, fig_name, fig_name), dpi=300, bbox_inches='tight')

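# clf() clears the current figure so back-to-back experiment runs do not draw over the previous plot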
plt.clf()
with open('output/P{}/{}/loss.csv'.format(party, fig_name),'w') as out_file:
writer = csv.writer(out_file)
writer.writerow(['Iteration','Cross-Entropy Loss'])