diff --git a/GPU-MPC/Makefile b/GPU-MPC/Makefile
index 022da47b..5c815265 100644
--- a/GPU-MPC/Makefile
+++ b/GPU-MPC/Makefile
@@ -56,6 +56,9 @@ truncate: tests/fss/truncate.cu
 mha: tests/fss/mha.cu
	$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/mha

+rotary_embedding: tests/fss/rotary_embedding.cu
+	$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/rotary_embedding
+
 secfloat_softmax: tests/fss/secfloat_softmax.cu
	$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) $(SECFLOAT_LIBS) -o tests/fss/secfloat_softmax

@@ -107,6 +110,9 @@ orca_inference_u32: experiments/orca/orca_inference.cu
 sigma: experiments/sigma/sigma.cu
	$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/sigma/sigma

+sigma_correctness: experiments/sigma/sigma.cu
+	$(CXX) $(FLAGS) -DCORRECTNESS=1 $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/sigma/sigma
+
 piranha: experiments/orca/piranha.cu
	$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/piranha

diff --git a/GPU-MPC/README.md b/GPU-MPC/README.md
index d6b28d99..2a50943b 100644
--- a/GPU-MPC/README.md
+++ b/GPU-MPC/README.md
@@ -1,7 +1,7 @@
-# Orca: FSS-based Secure Training and Inference with GPUs
+# GPU-MPC

-Implementation of protocols from the paper [Orca](https://eprint.iacr.org/2023/206).
+Implementation of protocols from the papers [Orca](https://eprint.iacr.org/2023/206) and [SIGMA](https://eprint.iacr.org/2023/1269).

 **Warning**: This is an academic proof-of-concept prototype and has not received careful code review. This implementation is NOT ready for production use.

@@ -33,48 +33,19 @@ sh setup.sh
 ```
 make orca
 ```
+4. Make sigma (this does not require making Orca)

-## Run
-
-1. Each party runs two processes: a dealer and an evaluator. The configuration needs to define the GPU on which the dealer will run, and the directory in which it will store FSS keys. This is done in `config.json` as:
-
-```javascript
-"dealer" :
-    { "gpu": <gpu id>,
-      "key_dir": <key directory>
-    }
 ```
-
-FSS keys tend to be quite large so please make sure that the key directory has at least 500GB of free space. Please also ensure that it is writeable.
-The runtime of Orca can be sensitive to the disk latency. Hence we recommend placing this directory in a fast nvme drive. Using SATA can slow down Orca on some tasks.
-
-Similarly, the configuration also needs to define the GPU on which the evaluator will run, and the IP address of its peer, i.e., the address of the remote party the evaluator will communicate with for secure training or inference. This is done in `config.json` as:
-
-```javascript
-"dealer" :
-    { "gpu": <gpu id>,
-      "peer": <peer IP>
-    }
+make sigma
 ```
-
-You can run Orca to generate Figures 5a and 5b, as well as Tables 3, 4, 6, 7, 8 and 9. Table 5 can be generated by throttling the network bandwidth (with `tc`, for example) and regenerating Table 4. The script reports numbers for Tables 4, 6, 7 and 9 as the average of 10 iterations.
+## Run Orca

-Figure 5b and Table 3 run end-to-end training and so can take a couple of days to finish.
-
-Evaluation runs through `experiments/orca/run_experiment.py`. Here are the relevant options:
-
-```
-usage: run_experiment.py [-h] [--figure FIGURE] [--table TABLE] --party 0/1
-
-optional arguments:
-  --figure FIGURE  Figure # to run.
-  --table TABLE    Table # to run.
-  --all true       Run all the experiments.
-```
+Please see the [Orca README](experiments/orca/README.md).

-Results are stored in the `output/P<party>/Table<table>` or `output/P<party>/Fig<figure>` folders.
+## Run SIGMA

-Log files (which might help with debugging) are stored in the corresponding experiment folders, i.e., in `output/P<party>/Table<table>/logs` and `output/P<party>/Fig<figure>/logs`.
+Please see the [SIGMA README](experiments/sigma/README.md).

 ## Docker Build

@@ -107,7 +78,7 @@ docker pull trajore/gpu_mpc
 sudo docker run --gpus all --network host -v /home/$USER/path_to_GPU-MPC/:/home -it container_name /bin/bash
 ```

-Then Run setup.sh to configure according to GPU_arch and make orca as mentioned above.
+Then run `setup.sh` to configure according to GPU_arch and make Orca/SIGMA as mentioned above.

 ## Citation

diff --git a/GPU-MPC/backend/orca.h b/GPU-MPC/backend/orca.h
index 4850da52..671a2b8c 100644
--- a/GPU-MPC/backend/orca.h
+++ b/GPU-MPC/backend/orca.h
@@ -96,8 +96,6 @@ class Orca : public OrcaBase<T>
         {
             assert(0);
         }
-        // auto h_data = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL);
-        // printf("Truncate output=%lu, %lu, %lu\n", h_data[0], h_data[1], h_data[in.size() - 1]);

         auto end = std::chrono::high_resolution_clock::now();
         auto elapsed = end - start;
diff --git a/GPU-MPC/backend/sigma.h b/GPU-MPC/backend/sigma.h
index d5667478..8f62f5f6 100644
--- a/GPU-MPC/backend/sigma.h
+++ b/GPU-MPC/backend/sigma.h
@@ -128,12 +128,9 @@ class SIGMA : public Backend<T>
         p.N = b.d2;
         p.batchSz = 1;
         stdInit(p, bw, 0);
+        auto k = readGPUMatmulKey<T>(p, TruncateType::None, &keyBuf);
         c.d_data = gpuMatmul(peer, party, p, k, a.d_data, b.data, useBias ? d.data : (T *)NULL, TruncateType::None, &g, &s, false);
-        // printf("Matmul weights=%ld, %ld, %ld\n", b.data[0], b.data[1], b.data[b.size() - 1]);
-
-        // auto h_out = (T*) moveToCPU((u8*) c.d_data, p.size_C * sizeof(T), NULL);
-        // printf("Matmul output=%ld, %ld\n", h_out[0], h_out[1]);

         auto end = std::chrono::high_resolution_clock::now();
         auto elapsed = end - start;
@@ -218,16 +215,12 @@ class SIGMA : public Backend<T>
     void truncateForward(Tensor<T> &in, u64 shift, u8 mode = 0)
     {
-        // printf("Truncate=%lu, %lu, %lu\n", mode, shift, size);
         auto start = std::chrono::high_resolution_clock::now();
         TruncateType t = TruncateType::TrFloor;
         auto k = readGPUTruncateKey<T>(t, &keyBuf);
         in.d_data = gpuTruncate(k.bin, k.bout, t, k, k.shift, peer, party, k.N, in.d_data, &g, &s);
-        // auto h_data = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL);
-        // printf("Truncate output=%lu, %lu, %lu\n", h_data[0], h_data[1], h_data[in.size() - 1]);
-
         auto end = std::chrono::high_resolution_clock::now();
         auto elapsed = end - start;
         s.truncate_time += std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
@@ -242,25 +235,20 @@ class SIGMA : public Backend<T>
     void output(Tensor<T> &a)
     {
-        // printf("Inside output=%lx\n", a.d_data);
-        // int tmpBw = bw - scale;
         int N = a.size();
-        // printf("keyBuf=%lx, %lu\n", keyBuf, keyBuf - startPtr);
         unmaskValues(bw, N, a.d_data, (T *)keyBuf, &s);
-        // printf("boo\n");
         moveIntoCPUMem((u8 *)a.data, (u8 *)a.d_data, N * sizeof(T), &s);
     }

     void add(const std::vector<Tensor<T> *> &in, Tensor<T> &out)
     {
-        int tmpBw = bw - scale;
         int N = in[0]->size();
         std::vector<T *> gpuInp;
         for (int i = 0; i < in.size(); i++)
         {
             gpuInp.push_back(in[i]->d_data);
         }
-        out.d_data = gpuAdd(tmpBw, N, gpuInp);
+        out.d_data = gpuAdd(bw, N, gpuInp);
     }

     void optimize(LayerGraphNode<T> *root)
@@ -281,6 +269,7 @@ class SIGMAKeygen : public Backend<T>
     size_t keyBufSize = 0;
     int party = -1;
     std::string keyFile;
+    size_t keySize = 0;
     int scale;
     int bw;
     AESGlobalContext g;
@@ -312,7 +301,7 @@ class SIGMAKeygen : public Backend<T>
     void close()
     {
-        size_t keySize = keyBuf - startPtr;
+        keySize = keyBuf - startPtr;
         size_t padding = 4096 - (keySize % 4096);
         char *zeros = new char[padding];
         memset(zeros, 0, padding);
@@ -345,6 +334,7 @@ class SIGMAKeygen : public Backend<T>
     void silu(const Tensor<T> &in, Tensor<T> &out, u64 scale, u64 mode = 0)
     {
         out.d_data = gpuKeyGenGelu(&keyBuf, party, bw, bw - scale, (int)scale, in.size(), in.d_data, &g);
+
     }

     void SIGMALayernormKeygen(const Tensor1D<T> &A, const Tensor1D<T> &B, const Tensor<T> &x, Tensor<T> &y, u64 scale, bool computeMu)
@@ -376,7 +366,6 @@ class SIGMAKeygen : public Backend<T>
     {
         MHAParams pMHA = {X.d1, n_embed, n_heads, dim_W, selfAttn, doNormQKt, doRotEmb};
         MHAMulParams pMHAMul = initMHAMulParams(pMHA, bw, scale);
-        printf("scale=%d\n", pMHAMul.pQKV.shift);
         Y.d_data = gpuKeygenMHA(&keyBuf, party, bw, scale, pMHA, pMHAMul, wQKV.data, bQKV.data, wProj.data, bProj.data, X.d_data, &g);
     }
@@ -393,16 +382,13 @@ class SIGMAKeygen : public Backend<T>
     void add(const std::vector<Tensor<T> *> &in, Tensor<T> &out)
     {
-        int tmpBw = bw - scale;
         int N = in[0]->size();
-        // printf("Add input=%d, %lx, %lx\n", N, in[0]->d_data, in[1]->d_data);
         std::vector<T *> gpuInp;
         for (int i = 0; i < in.size(); i++)
         {
             gpuInp.push_back(in[i]->d_data);
-            // printf("Add inp=%lx\n", in[i]->d_data);
         }
-        out.d_data = gpuAdd(tmpBw, N, gpuInp);
+        out.d_data = gpuAdd(bw, N, gpuInp);
     }

     void addbias(Tensor<T> &x, const Tensor1D<T> &bias)
diff --git a/GPU-MPC/experiments/orca/README.md b/GPU-MPC/experiments/orca/README.md
new file mode 100644
index 00000000..3a201bab
--- /dev/null
+++ b/GPU-MPC/experiments/orca/README.md
@@ -0,0 +1,91 @@
+
+# Orca: FSS-based Secure Training and Inference with GPUs
+
+Implementation of protocols from the paper [Orca](https://eprint.iacr.org/2023/206).
+
+**Warning**: This is an academic proof-of-concept prototype and has not received careful code review. This implementation is NOT ready for production use.
+
+## Build
+
+This project requires NVIDIA GPUs, and assumes that GPU drivers and the [NVIDIA CUDA Toolkit](https://docs.nvidia.com/cuda/) are already installed. The following has been tested on Ubuntu 20.04 with CUDA 11.7, CMake 3.27.2 and g++-9.
+
+Please note that Sytorch requires CMake version >= 3.17 and the build will fail if this dependency is not met.
+
+The code uses CUTLASS version 2.11 by default, so if you change the CUDA version, please make sure that the CUTLASS version being built is compatible with the new CUDA version. To change the version of CUTLASS being built, add `git checkout <tag>;` after line 31 (`cd ext/cutlass;`) of setup.sh.
+
+The last line of `setup.sh` tries to install `matplotlib`, which is needed for generating Figures 5a and 5b. In our experience, the installation fails if the versions of Python and `pip` do not match. In case the installation fails, please install `matplotlib` manually before running `run_experiment.py`.
+
+1. Export environment variables
+
+```
+export CUDA_VERSION=11.7
+export GPU_ARCH=86
+```
+
+2. Set up the environment
+
+```
+sh setup.sh
+```
+
+3. Make Orca
+
+```
+make orca
+```
+
+## Run Orca
+
+1. Each party runs two processes: a dealer and an evaluator. The configuration needs to define the GPU on which the dealer will run, and the directory in which it will store FSS keys. This is done in `config.json` as:
+
+```javascript
+"dealer" :
+    { "gpu": <gpu id>,
+      "key_dir": <key directory>
+    }
+```
+
+FSS keys tend to be quite large so please make sure that the key directory has at least 500GB of free space. Please also ensure that it is writeable.
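+
+For example, a filled-in dealer entry might look like this (the GPU index and key directory below are illustrative, not defaults):
+
+```javascript
+"dealer" :
+    { "gpu": 0,
+      "key_dir": "/mnt/nvme/fss-keys/"
+    }
+```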
+
+Similarly, the configuration also needs to define the GPU on which the evaluator will run, and the IP address of its peer, i.e., the address of the remote party the evaluator will communicate with for secure training or inference. This is done in `config.json` as:
+
+```javascript
+"evaluator" :
+    { "gpu": <gpu id>,
+      "peer": <peer IP>
+    }
+```
+
+You can run Orca to generate Figures 5a and 5b, as well as Tables 3, 4, 6, 7, 8 and 9. Table 5 can be generated by throttling the network bandwidth (with `tc`, for example) and regenerating Table 4. The script reports numbers for Tables 4, 6, 7 and 9 as the average of 10 iterations.
+
+Figure 5b and Table 3 run end-to-end training and so can take a couple of days to finish.
+
+Evaluation runs through `experiments/orca/run_experiment.py`. Here are the relevant options:
+
+```
+usage: run_experiment.py [-h] [--figure FIGURE] [--table TABLE] --party 0/1
+
+optional arguments:
+  --figure FIGURE  Figure # to run.
+  --table TABLE    Table # to run.
+  --all true       Run all the experiments.
+```
+
+Results are stored in the `output/P<party>/Table<table>` or `output/P<party>/Fig<figure>` folders.
+
+Log files (which might help with debugging) are stored in the corresponding experiment folders, i.e., in `output/P<party>/Table<table>/logs` and `output/P<party>/Fig<figure>/logs`.
+
+## Citation
+
+You can cite the paper using the following BibTeX entry:
+
+```
+@INPROCEEDINGS {,
+author = {N. Jawalkar and K. Gupta and A. Basu and N. Chandran and D. Gupta and R. Sharma},
+booktitle = {2024 IEEE Symposium on Security and Privacy (SP)},
+title = {Orca: FSS-based Secure Training and Inference with GPUs},
+year = {2024}
+}
+```
+
diff --git a/GPU-MPC/experiments/orca/run_experiment.py b/GPU-MPC/experiments/orca/run_experiment.py
index 8e91a236..b8c239ec 100644
--- a/GPU-MPC/experiments/orca/run_experiment.py
+++ b/GPU-MPC/experiments/orca/run_experiment.py
@@ -53,7 +53,7 @@ def run_fig_helper(party, dealer_gpu, eval_gpu, dealer_key_dir, peer_ip, exp_nam
         plt.xlabel("Iterations")
         plt.ylabel("Cross-entropy loss")
         plt.savefig("output/P{}/{}/{}.png".format(party, fig_name, fig_name), dpi=300, bbox_inches='tight')
-
+        plt.clf()
         with open('output/P{}/{}/loss.csv'.format(party, fig_name),'w') as out_file:
             writer = csv.writer(out_file)
             writer.writerow(['Iteration','Cross-Entropy Loss'])
diff --git a/GPU-MPC/experiments/sigma/README.md b/GPU-MPC/experiments/sigma/README.md
new file mode 100644
index 00000000..e84dc85d
--- /dev/null
+++ b/GPU-MPC/experiments/sigma/README.md
@@ -0,0 +1,173 @@
+
+# SIGMA: Secure GPT Inference with Function Secret Sharing
+
+Implementation of protocols from the paper [SIGMA](https://eprint.iacr.org/2023/1269).
+
+**Warning**: This is an academic proof-of-concept prototype and has not received careful code review. This implementation is NOT ready for production use.
+
+## Build
+
+This project requires NVIDIA GPUs and assumes that GPU drivers and the [NVIDIA CUDA Toolkit](https://docs.nvidia.com/cuda/) are already installed. The following has been tested on Ubuntu 20.04 with CUDA 11.7, CMake 3.27.2 and g++-9.
+
+Please note that Sytorch requires CMake version >= 3.17 and the build will fail if this dependency is not met.
+
+The code uses CUTLASS version 2.11 by default, so if you change the CUDA version, please make sure that the CUTLASS version being built is compatible with the new CUDA version. To change the version of CUTLASS being built, add `git checkout <tag>;` after line 31 (`cd ext/cutlass;`) of setup.sh.
+
+The last line of `setup.sh` tries to install `matplotlib`, which is needed for generating Figure 10. In our experience, the installation fails if the versions of Python and `pip` do not match. In case the installation fails, please install `matplotlib` manually before running `run_experiment.py`.
+
+This artifact requires a large amount of resources to run. To produce the numbers reported in the paper, we used two machines connected via LAN with 9.4 Gbps bandwidth and 0.05 ms ping time. Each machine has 1 TB RAM, an A6000 GPU with 46GB GPU memory, and an AMD Epyc 7742 processor.
+
+1. Export environment variables
+
+```
+export CUDA_VERSION=11.7
+export GPU_ARCH=86
+```
+
+2. Set up the environment
+
+```
+sh setup.sh
+```
+_Note:_ The above script compiles CUTLASS with the maximum possible number of threads. As this step requires a large amount of memory, you can change line 34 from `make -j` to `make -j<n>` to compile on `n` threads and avoid out-of-memory errors.
+
+3. Make SIGMA
+
+```
+make sigma
+```
+
+4. Switch to the experiments directory
+
+```
+cd experiments/sigma
+```
+
+## Run SIGMA
+
+### Prerequisites and caveats
+
+1. Since FSS generates large keys, please ensure that you have a writeable disk with at least 500GB of free space. This is only required by our largest model (Llama2-13B). Other models require less space, and an idea of how much free space is needed per model can be estimated from the key size reported in Table 9 of the paper.
+
+2. In the online phase, SIGMA loads the entire key from the disk into CPU memory. Thus, the CPU must have (free) memory that is at least as large as the key that will be read from the disk.
+
+3. Currently, we only support sequence lengths that are powers of 2.
+
+
+### Run standalone
+
+Make produces the `sigma` executable, which is in `experiments/sigma`.
+
+Each party (the server and the client) needs to run two processes in sequence: the dealer and the evaluator. In addition to other arguments, the dealer requires the user to specify the directory in which it will store keys (see prerequisites and caveats). The evaluator requires the user to specify the directory to read keys from, the IP address of its peer, and the number of CPU threads to use for computation.
+
+The syntax for running the dealer is
+```javascript
+./sigma <model> <sequence length> 0 <party> <key dir>
+```
+
+The syntax for running the evaluator is
+```javascript
+./sigma <model> <sequence length> 1 <party> <key dir> <peer IP> <CPU threads>
+```
+
+We currently support the following models: `bert-tiny, bert-base, bert-large, gpt2, gpt-neo, gpt-neo-large, llama7b, llama13b`.
+
+**Example:** To run GPT2, the server will run (in sequence):
+```javascript
+./sigma gpt2 128 0 0 /tmp/
+./sigma gpt2 128 1 0 /tmp/ <peer IP> 64
+```
+
+The client will run (_on a different machine_):
+```javascript
+./sigma gpt2 128 0 1 /tmp/
+./sigma gpt2 128 1 1 /tmp/ <peer IP> 64
+```
+
+Results are stored in the `output/P<party>/models/<model>-<sequence length>/` folder.
+
+### Running the artifact
+
+Before the artifact can be run, we need to specify the dealer and evaluator configurations in `config.json`.
+
+For the server(=P0), `config.json` looks like:
+```javascript
+{
+    "P0": {
+        "dealer": {
+            "gpu": <gpu id>,
+            "key_dir": <key directory>
+        },
+        "evaluator": {
+            "gpu": <gpu id>,
+            "peer": <peer IP>,
+            "cpu_threads": <CPU threads>
+        }
+    }
+}
+```
+
+For the client(=P1), `config.json` looks exactly the same, only the arguments are specified under the key "P1".
+
+A sample `config.json` file can be found in the `experiments/sigma` folder.
+
+Once `config.json` has been filled, the script `run_experiment.py` can be used to reproduce the tables and figures in the paper. Here are the relevant options:
+
+```
+usage: python run_experiment.py [-h] [--perf true] [--n_seq true] [--all true] --party 0/1
+
+optional arguments:
+  --perf true    Generate Tables 3, 5, 9, and Figure 10.
+  --n_seq true   Generate Table 8.
+  --all true     Run all the experiments.
+```
+
+Table 7 can be reproduced by throttling the network bandwidth (with `tc`, for example) and re-running `python run_experiment.py --perf true` to generate Table 5.
+
+Results are stored in `output/P<party>/Table<table>.json` or `output/P<party>/Fig<figure>.json`.
+
+Log files (which might help with debugging) can be found in the `output/P<party>/models/<model>-<sequence length>/logs/` folder.
+
+### Generating CPU numbers
+
+To generate the CPU performance numbers in Table 3 and Figures 9 and 10, follow these steps:
+
+1. On both machines, run the `setup.sh` script, as described in previous sections.
+
+2. On the first machine, change to the build directory and run the Python script with this machine's IP address:
+
+```
+cd ext/sytorch/build/
+python ../scripts/all-cpu-benchmarks-remote.py 0 <IP of machine 1>
+```
+
+3. On the second machine, use the IP address of the first machine:
+
+```
+cd ext/sytorch/build/
+python ../scripts/all-cpu-benchmarks-remote.py 1 <IP of machine 1>
+```
+
+If you'd like to run on a single machine, use the local script without any arguments. Note that since it runs 2 processes in parallel, it requires double the hardware resources.
+
+```
+python ../scripts/all-cpu-benchmarks-local.py
+```
+
+At the end of this script, which could take several hours, you get a `results.csv` file containing all the time and communication numbers required to generate Table 3 and Figures 9 and 10.
+
+## Citation
+
+You can cite the paper using the following BibTeX entry:
+
+```
+@misc{cryptoeprint:2023/1269,
+      author = {Kanav Gupta and Neha Jawalkar and Ananta Mukherjee and Nishanth Chandran and Divya Gupta and Ashish Panwar and Rahul Sharma},
+      title = {SIGMA: Secure GPT Inference with Function Secret Sharing},
+      howpublished = {Cryptology ePrint Archive, Paper 2023/1269},
+      year = {2023},
+      note = {\url{https://eprint.iacr.org/2023/1269}},
+      url = {https://eprint.iacr.org/2023/1269}
+}
+```
+
diff --git a/GPU-MPC/experiments/sigma/bert.h b/GPU-MPC/experiments/sigma/bert.h
index 976be4eb..25dc91c1 100644
--- a/GPU-MPC/experiments/sigma/bert.h
+++ b/GPU-MPC/experiments/sigma/bert.h
@@ -1,8 +1,8 @@
 // Author: Neha Jawalkar
 // Copyright:
-//
+//
 // Copyright (c) 2024 Microsoft Research
-//
+//
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
@@ -21,6 +21,10 @@

 #pragma once

+#ifndef CORRECTNESS
+#define CORRECTNESS 0
+#endif
+
 #include

 template <typename T>
@@ -70,8 +74,10 @@ class GPUBERT : public SytorchModule<T>
     using SytorchModule<T>::add;
     using SytorchModule<T>::unsqueeze;
     std::vector<GPUBertTransformerBlock<T> *> blocks;
+#if CORRECTNESS
     LayerNorm<T> *ln_f;
     FC<T> *pool;
+#endif
     u64 n_layer, n_heads, n_embd;
     std::string attnMask, qkvFormat;

@@ -82,20 +88,22 @@ class GPUBERT : public SytorchModule<T>
         {
             blocks.push_back(new GPUBertTransformerBlock<T>(n_heads, n_embd, attnMask, qkvFormat));
         }
+#if CORRECTNESS
         ln_f = new LayerNorm<T>(n_embd);
         pool = new FC<T>(n_embd, n_embd, true);
+#endif
     }

     Tensor<T> &_forward(Tensor<T> &input)
     {
-        // auto &y = ln_f->forward(input);
-        // Tensor<T> *x = &y;
         Tensor<T> *x = &input;
+#if CORRECTNESS
+        auto &ln_out = ln_f->forward(input);
+        x = &ln_out;
+#endif
         for (u64 i = 0; i < n_layer; ++i)
         {
-            auto &block = blocks[i];
-            auto &x_out = block->forward(*x);
+            auto &x_out =
blocks[i]->forward(*x); x = &x_out; } return *x; diff --git a/GPU-MPC/experiments/sigma/config.json b/GPU-MPC/experiments/sigma/config.json new file mode 100644 index 00000000..190d5a61 --- /dev/null +++ b/GPU-MPC/experiments/sigma/config.json @@ -0,0 +1,24 @@ +{ + "P0": { + "dealer": { + "gpu": 0, + "key_dir": "/tmp/" + }, + "evaluator": { + "gpu": 1, + "peer": "0.0.0.0", + "cpu_threads": 64 + } + }, + "P1": { + "dealer": { + "gpu": 2, + "key_dir": "/tmp/" + }, + "evaluator": { + "gpu": 3, + "peer": "0.0.0.0", + "cpu_threads": 64 + } + } +} diff --git a/GPU-MPC/experiments/sigma/run_experiment.py b/GPU-MPC/experiments/sigma/run_experiment.py new file mode 100644 index 00000000..bcc93e90 --- /dev/null +++ b/GPU-MPC/experiments/sigma/run_experiment.py @@ -0,0 +1,183 @@ +# +# Copyright: +# +# Copyright (c) 2024 Microsoft Research +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
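+
+# Usage (mirrors the options documented in the SIGMA README):
+#   python run_experiment.py --party <0/1> --perf true     # Tables 3, 5, 9 and figure data
+#   python run_experiment.py --party <0/1> --n_seq true    # Table 8
+#   python run_experiment.py --party <0/1> --all true      # all of the above
+# Dealer/evaluator settings for this party are read from config.json.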
+
+import argparse
+import json
+import os
+import csv
+
+# -- matplotlib stuff --
+
+import matplotlib.pyplot as plt
+
+import sys
+sys.path.insert(0, '../..')
+from experiments.utils import run_seq, remove_key
+
+# Parses a "<label>=<microseconds> ..." line from a stats file and returns seconds.
+def get_time(line):
+    return round(float(line.split('=')[-1].split(' ')[0]) / 10**6, 3)
+
+# Parses a "<label>=<bytes> B (<gigabytes> GB)" line from a stats file and returns GB.
+def get_comm(line):
+    return round(float(line.split('(')[-1].split(' ')[0]), 3)
+
+def run_perf(party, dealer_gpu, eval_gpu, dealer_key_dir, peer_ip, cpu_threads):
+    for model in ['bert-tiny', 'bert-base', 'bert-large', 'gpt2', 'gpt-neo', 'gpt-neo-large', 'llama7b', 'llama13b']:
+        dealer_cmd = "CUDA_VISIBLE_DEVICES={} ./sigma {} 128 0 {} {}".format(dealer_gpu, model, party, dealer_key_dir)
+        eval_cmd = "CUDA_VISIBLE_DEVICES={} ./sigma {} 128 1 {} {} {} {}".format(eval_gpu, model, party, dealer_key_dir, peer_ip, cpu_threads)
+        log_dir = "output/P{}/models/{}-128/logs/".format(party, model)
+        run_seq(dealer_cmd, eval_cmd, log_dir)
+        key_file = '{}_inference_key_{}.dat'.format(model, party)
+        remove_key(dealer_key_dir, key_file)
+
+    stats = dict({'dealer': dict(), 'evaluator': dict()})
+    for model in ['bert-tiny', 'bert-base', 'bert-large', 'gpt2', 'gpt-neo', 'gpt-neo-large', 'llama7b', 'llama13b']:
+        stats['dealer'][model] = dict()
+        stats['evaluator'][model] = dict()
+
+        dealer_lines = open('output/P{}/models/{}-128/dealer.txt'.format(party, model)).readlines()
+        stats['dealer'][model]['time'] = get_time(dealer_lines[0])
+        stats['dealer'][model]['key_size'] = get_comm(dealer_lines[1])
+
+        # Line indices follow the order in which sigma.cu writes evaluator.txt:
+        # 0=total time, 6=gelu time, 7=softmax time, 8=layernorm time,
+        # 10=total comm, 11=gelu comm, 12=softmax comm, 13=layernorm comm.
+        eval_lines = open('output/P{}/models/{}-128/evaluator.txt'.format(party, model)).readlines()
+        stats['evaluator'][model]['gelu'] = dict()
+        stats['evaluator'][model]['gelu']['time'] = get_time(eval_lines[6])
+        stats['evaluator'][model]['gelu']['comm'] = get_comm(eval_lines[11])
+        stats['evaluator'][model]['softmax'] = dict()
+        stats['evaluator'][model]['softmax']['time'] = get_time(eval_lines[7])
+        stats['evaluator'][model]['softmax']['comm'] = get_comm(eval_lines[12])
+        stats['evaluator'][model]['layernorm'] = dict()
+        stats['evaluator'][model]['layernorm']['time'] = get_time(eval_lines[8])
+        stats['evaluator'][model]['layernorm']['comm'] = get_comm(eval_lines[13])
+        stats['evaluator'][model]['total'] = dict()
+        stats['evaluator'][model]['total']['time'] = get_time(eval_lines[0])
+        stats['evaluator'][model]['total']['comm'] = get_comm(eval_lines[10])
+
+    with open('output/P{}/Table3.json'.format(party), 'w') as outfile:
+        table3 = dict()
+        for tup in [('BERT-tiny', 'bert-tiny'), ('BERT-base', 'bert-base'), ('BERT-large', 'bert-large'), ('GPT2', 'gpt2'), ('GPT-Neo', 'gpt-neo'), ('Llama2-7B', 'llama7b'), ('Llama2-13B', 'llama13b')]:
+            pretty_name, model = tup
+            table3[pretty_name] = {
+                'Activation':
+                {
+                    'Time (s)': stats['evaluator'][model]['gelu']['time'],
+                    'Comm (GB)': stats['evaluator'][model]['gelu']['comm']
+                },
+                'Softmax':
+                {
+                    'Time (s)': stats['evaluator'][model]['softmax']['time'],
+                    'Comm (GB)': stats['evaluator'][model]['softmax']['comm']
+                },
+                'Norm':
+                {
+                    'Time (s)': stats['evaluator'][model]['layernorm']['time'],
+                    'Comm (GB)': stats['evaluator'][model]['layernorm']['comm']
+                }
+            }
+        json.dump(table3, outfile, indent=4)
+
+    with open('output/P{}/Table5.json'.format(party), 'w') as outfile:
+        table5 = dict()
+        for tup in [('BERT-tiny', 'bert-tiny'), ('BERT-base', 'bert-base'), ('BERT-large', 'bert-large'), ('GPT2', 'gpt2'), ('GPT-Neo', 'gpt-neo'), ('Llama2-7B', 'llama7b'), ('Llama2-13B', 'llama13b')]:
+            pretty_name, model = tup
+            table5[pretty_name] = {
+                'Time (s)': stats['evaluator'][model]['total']['time'],
+                'Comm (GB)': stats['evaluator'][model]['total']['comm']
+            }
+        json.dump(table5, outfile, indent=4)
+
+
+    with open('output/P{}/Table9.json'.format(party), 'w') as outfile:
+        table9 = dict()
+        for tup in [('BERT-tiny', 'bert-tiny'), ('BERT-base', 'bert-base'), ('BERT-large', 'bert-large'), ('GPT2', 'gpt2'), ('GPT-Neo', 'gpt-neo'), ('Llama2-7B', 'llama7b'), ('Llama2-13B', 'llama13b')]:
+            pretty_name, model = tup
+            table9[pretty_name] = {
+                'Key size (GB)': stats['dealer'][model]['key_size'],
+                'Generation time (s)': stats['dealer'][model]['time'],
+                'Online time (s)': stats['evaluator'][model]['total']['time']
+            }
+        json.dump(table9, outfile, indent=4)
+
+    with open('output/P{}/Fig11_data.csv'.format(party),'w') as out_file:
+        online_time = list(map(lambda model: stats['evaluator'][model]['total']['time'], ['gpt-neo', 'gpt-neo-large', 'llama7b', 'llama13b']))
+        X = ('1.3', '2.7', '7', '13')
+        plt.plot(X, online_time, marker='s', label='SIGMA-GPU')
+        plt.legend(loc='upper left')
+        plt.xlabel('Number of parameters (in billions)')
+        plt.ylabel('Time (s)')
+        plt.savefig("output/P{}/Fig11.png".format(party), dpi=300, bbox_inches='tight')
+        plt.clf()
+
+        writer = csv.writer(out_file)
+        writer.writerow(['Number of parameters (in billions)','Time (s)'])
+        for i in range(len(X)):
+            writer.writerow((X[i], online_time[i]))
+
+
+def run_table8(party, dealer_gpu, eval_gpu, dealer_key_dir, peer_ip, cpu_threads):
+
+    for n_seq in [64, 128, 256, 512, 1024]:
+        dealer_cmd = "CUDA_VISIBLE_DEVICES={} ./sigma gpt2 {} 0 {} {}".format(dealer_gpu, n_seq, party, dealer_key_dir)
+        eval_cmd = "CUDA_VISIBLE_DEVICES={} ./sigma gpt2 {} 1 {} {} {} {}".format(eval_gpu, n_seq, party, dealer_key_dir, peer_ip, cpu_threads)
+        log_dir = 'output/P{}/models/gpt2-{}/logs/'.format(party, n_seq)
+        run_seq(dealer_cmd, eval_cmd, log_dir)
+        key_file = 'gpt2_inference_key_{}.dat'.format(party)
+        remove_key(dealer_key_dir, key_file)
+
+    with open('output/P{}/Table8.json'.format(party), 'w') as outfile:
+        table8 = dict()
+        for n_seq in [64, 128, 256, 512, 1024]:
+            eval_lines = open('output/P{}/models/gpt2-{}/evaluator.txt'.format(party, n_seq)).readlines()
+            table8[n_seq] = {
+                'Time (s)': get_time(eval_lines[0]),
+                'Comm (GB)': get_comm(eval_lines[10])
+            }
+        json.dump(table8, outfile, indent=4)
+
+def main():
+    parser = argparse.ArgumentParser(description='Run artifact evaluation!')
+    parser.add_argument('--n_seq', default=False, type=bool, help='Run Table 8.')
+    parser.add_argument('--perf', default=False, type=bool, help='Run all performance experiments.')
+    parser.add_argument('--all', default=False, type=bool, help='Run all experiments.')
+    parser.add_argument('--party', default=None, type=int, help='Party to run (0/1).')
+
+    args = parser.parse_args()
+    global_config = None
+    with open('config.json', 'r') as f:
+        global_config = json.load(f)
+    config = None
+
+    if args.party is None:
+        raise Exception("Must specify party")
+    if args.party == 0:
+        config = global_config['P0']
+    else:
+        config = global_config['P1']
+    dealer_config = config['dealer']
+    eval_config = config['evaluator']
+    if args.all:
+        run_perf(args.party, dealer_config['gpu'], eval_config['gpu'], dealer_config['key_dir'], eval_config['peer'], eval_config['cpu_threads'])
+        run_table8(args.party, dealer_config['gpu'], eval_config['gpu'], dealer_config['key_dir'], eval_config['peer'], eval_config['cpu_threads'])
+    elif args.perf:
+        run_perf(args.party, dealer_config['gpu'], eval_config['gpu'], dealer_config['key_dir'],
eval_config['peer'], eval_config['cpu_threads'])
+    elif args.n_seq:
+        run_table8(args.party, dealer_config['gpu'], eval_config['gpu'], dealer_config['key_dir'], eval_config['peer'], eval_config['cpu_threads'])
+
+if __name__ == '__main__':
+    main()
diff --git a/GPU-MPC/experiments/sigma/sigma.cu b/GPU-MPC/experiments/sigma/sigma.cu
index 047daba0..795715a5 100644
--- a/GPU-MPC/experiments/sigma/sigma.cu
+++ b/GPU-MPC/experiments/sigma/sigma.cu
@@ -1,8 +1,8 @@
 // Author: Neha Jawalkar
 // Copyright:
-//
+//
 // Copyright (c) 2024 Microsoft Research
-//
+//
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
@@ -20,13 +20,15 @@
 // SOFTWARE.

 #include
+#include <cmath> // assumption: the added include target was unreadable in this copy; <cmath> is needed for std::pow below

 #include "gpt2.h"
 #include "bert.h"
 #include "llama2.h"

 #include "backend/sigma.h"

-inline std::string toGB(u64 bytes) {
-    return std::to_string(bytes) + " B (" + std::to_string((float) bytes / (1024.0f * 1024.0f * 1024.0f)) + " GB)";
+inline std::string toGB(u64 bytes)
+{
+    return std::to_string(bytes) + " B (" + std::to_string((float)bytes / (1024.0f * 1024.0f * 1024.0f)) + " GB)";
 }

 int main(int __argc, char **__argv)
@@ -50,6 +52,7 @@
     auto keyFile = keyDir + model + "_inference_key";
     u64 keyBufSz = 0;
     SytorchModule<u64> *net;
+    Tensor<u64> input({n_seq, n_embd});
     if (model == "gpt2")
     {
@@ -58,8 +61,13 @@
         n_embd = 768;
         attnMask = "self";
         bw = 50;
-        keyBufSz = 20 * OneGB;
+        u64 mul = (u64) std::pow(2.3, std::log2(n_seq / 64)); // key buffer grows ~2.3x each time n_seq doubles, starting from 10 GB at n_seq=64
+        keyBufSz = 10 * mul * OneGB;
         net = new GPUGPT2<u64>(n_layer, n_head, n_embd, attnMask, qkvFormat);
+        input.resize({n_seq, n_embd});
+        input.zero();
+        net->init(scale, input);
+        net->zero();
     }
     else if (model == "bert-tiny")
     {
@@ -69,6 +77,10 @@
         bw = 37;
         keyBufSz = OneGB;
         net = new GPUBERT<u64>(n_layer, n_head, n_embd, attnMask, qkvFormat);
+        input.resize({n_seq, n_embd});
+        input.zero();
+        net->init(scale, input);
+        net->zero();
     }
     else if (model == "bert-base")
     {
@@ -78,6 +90,10 @@
         bw = 50;
         keyBufSz = 20 * OneGB;
         net = new GPUBERT<u64>(n_layer, n_head, n_embd, attnMask, qkvFormat);
+        input.resize({n_seq, n_embd});
+        input.zero();
+        net->init(scale, input);
+        net->zero();
     }
     else if (model == "bert-large")
     {
@@ -87,6 +103,10 @@
         bw = 50;
         keyBufSz = 50 * OneGB;
         net = new GPUBERT<u64>(n_layer, n_head, n_embd, attnMask, qkvFormat);
+        input.resize({n_seq, n_embd});
+        input.zero();
+        net->init(scale, input);
+        net->zero();
     }
     else if (model == "gpt-neo")
     {
@@ -98,6 +118,10 @@
         bw = 51;
         keyBufSz = 80 * OneGB;
         net = new GPUGPT2<u64>(n_layer, n_head, n_embd, attnMask, qkvFormat, false);
+        input.resize({n_seq, n_embd});
+        input.zero();
+        net->init(scale, input);
+        net->zero();
     }
     else if (model == "gpt-neo-large")
     {
@@ -109,6 +133,10 @@
         bw = 51; // 52;
         keyBufSz = 200 * OneGB;
         net = new GPUGPT2<u64>(n_layer, n_head, n_embd, attnMask, qkvFormat, false);
+        input.resize({n_seq, n_embd});
+        input.zero();
+        net->init(scale, input);
+        net->zero();
     }
     else if (model == "llama7b")
     {
@@ -121,6 +149,10 @@
         u64 intermediate_size = 11008;
         keyBufSz = 300 * OneGB;
         net = new GPULlama<u64>(n_layer, n_head, n_embd, intermediate_size);
+        input.resize({n_seq, n_embd});
+        input.zero();
+        net->init(scale, input);
+        net->zero();
     }
     else if (model == "llama13b")
     {
@@ -133,21 +165,36 @@
         u64 intermediate_size = 13824;
         keyBufSz = 450 * OneGB;
         net = new GPULlama<u64>(n_layer, n_head, n_embd, intermediate_size);
+        input.resize({n_seq, n_embd});
+        input.zero();
+        net->init(scale, input);
+        net->zero();
     }
-
-    Tensor<u64> input({n_seq, n_embd});
-    net->init(scale, input);
     srand(time(NULL));
-
+    std::string outDir = "output/P" + std::to_string(party) + "/models/";
+    makeDir(outDir);
+    auto inferenceDir = outDir + model + "-" + std::to_string(n_seq) + "/";
+    makeDir(inferenceDir);
     if (role == 0)
     {
         auto sigma = new SIGMAKeygen(party, bw, scale, keyFile, keyBufSz);
         net->setBackend(sigma);
         net->optimize();
+        auto start = std::chrono::high_resolution_clock::now();
         input.d_data = (u64 *)moveToGPU((u8 *)input.data, input.size() * sizeof(u64), (Stats *)NULL);
         auto &activation = net->forward(input);
         sigma->output(activation);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
         sigma->close();
+        std::stringstream ss;
+        ss << "Total time=" + std::to_string(elapsed.count()) + " us";
+        ss << std::endl;
+        ss << "Key size=" + toGB(sigma->keySize);
+        ss << std::endl;
+        std::ofstream statsFile(inferenceDir + "dealer.txt");
+        statsFile << ss.rdbuf();
+        statsFile.close();
     }
     else
     {
@@ -163,27 +210,30 @@
         auto end = std::chrono::high_resolution_clock::now();
         auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
         sigma->close();
+        auto signedAct = Tensor<i64>((i64 *)activation.data, activation.shape).as_2d();
+        // print(signedAct.as_nd(), scale, (u64) bw);
+        auto maxIdx = signedAct.argmax(0);
+        printf("%lu, %ld\n", (u64)maxIdx, activation.data[maxIdx]);
         std::stringstream ss;
-        ss << "Time in ms" << std::endl;
-        ss << "Total time=" + std::to_string(elapsed.count());
+        ss << "Total time=" + std::to_string(elapsed.count()) + " us";
         ss << std::endl;
-        ss << "Comm time=" + std::to_string(sigma->s.comm_time);
+        ss << "Comm time=" + std::to_string(sigma->s.comm_time) + " us";
         ss << std::endl;
-        ss << "Transfer time=" + std::to_string(sigma->s.transfer_time);
+        ss << "Transfer time=" + std::to_string(sigma->s.transfer_time) + " us";
         ss << std::endl;
-        ss << "MHA time=" + std::to_string(sigma->s.mha_time);
+        ss << "MHA time=" + std::to_string(sigma->s.mha_time) + " us";
         ss << std::endl;
-        ss << "Matmul time=" + std::to_string(sigma->s.matmul_time);
+        ss << "Matmul time=" + std::to_string(sigma->s.matmul_time) + " us";
         ss << std::endl;
-        ss << "Truncate time=" + std::to_string(sigma->s.truncate_time);
+        ss << "Truncate time=" + std::to_string(sigma->s.truncate_time) + " us";
         ss << std::endl;
-        ss << "Gelu time=" + std::to_string(sigma->s.gelu_time);
+        ss << "Gelu time=" + std::to_string(sigma->s.gelu_time) + " us";
         ss << std::endl;
-        ss << "Softmax time=" + std::to_string(sigma->s.softmax_time);
+        ss << "Softmax time=" + std::to_string(sigma->s.softmax_time) + " us";
         ss << std::endl;
-        ss << "Layernorm time=" + std::to_string(sigma->s.layernorm_time);
+        ss << "Layernorm time=" + std::to_string(sigma->s.layernorm_time) + " us";
         ss << std::endl;
         ss << std::endl;
         ss << "Total Comm=" + toGB(sigma->peer->bytesSent() + sigma->peer->bytesReceived());
@@ -195,8 +245,7 @@
         ss << "Layernorm Comm=" + toGB(sigma->s.layernorm_comm_bytes);
         ss << std::endl;

-        auto inferenceDir = "output/P" + std::to_string(party) + "/";
-        std::ofstream statsFile(inferenceDir + model + ".txt");
+        std::ofstream
statsFile(inferenceDir + "evaluator.txt"); statsFile << ss.rdbuf(); statsFile.close(); } diff --git a/GPU-MPC/ext/sytorch/CMakeLists.txt b/GPU-MPC/ext/sytorch/CMakeLists.txt index 544e70a8..9d871fad 100755 --- a/GPU-MPC/ext/sytorch/CMakeLists.txt +++ b/GPU-MPC/ext/sytorch/CMakeLists.txt @@ -370,3 +370,52 @@ add_executable( ) target_link_libraries(gptneobenchmark ${PROJECT_NAME}) + +add_executable( + benchmark-bert-tiny + benchmarks/bert-tiny.cpp +) + +target_link_libraries(benchmark-bert-tiny ${PROJECT_NAME}) + +add_executable( + benchmark-bert-base + benchmarks/bert-base.cpp +) + +target_link_libraries(benchmark-bert-base ${PROJECT_NAME}) + +add_executable( + benchmark-bert-large + benchmarks/bert-large.cpp +) + +target_link_libraries(benchmark-bert-large ${PROJECT_NAME}) + +add_executable( + benchmark-gpt2 + benchmarks/gpt2.cpp +) + +target_link_libraries(benchmark-gpt2 ${PROJECT_NAME}) + +add_executable( + benchmark-gptneo + benchmarks/gptneo.cpp +) + +target_link_libraries(benchmark-gptneo ${PROJECT_NAME}) + +add_executable( + benchmark-llama-7b + benchmarks/llama-7b.cpp +) + +target_link_libraries(benchmark-llama-7b ${PROJECT_NAME}) + +add_executable( + benchmark-llama-13b + benchmarks/llama-13b.cpp +) + +target_link_libraries(benchmark-llama-13b ${PROJECT_NAME}) diff --git a/GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp b/GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp new file mode 100644 index 00000000..7f22f60b --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp @@ -0,0 +1,256 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
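+
+// Standalone sytorch benchmark of BERT-base (n_layer=12, n_head=12, n_embd=768,
+// n_seq=128, bitlength 50) on the LlamaTransformer backend. Takes the party id
+// as argv[1] and an optional peer IP as argv[2] (defaults to 127.0.0.1).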
+
+// Assumption: the include targets below were unreadable in this copy of the diff;
+// these are the sytorch/LLAMA headers that the code in this file relies on.
+#include <sytorch/backend/llama_transformer.h>
+#include <sytorch/layers/layers.h>
+#include <sytorch/module.h>
+#include <llama/utils.h>
+#include <llama/api.h>
+
+template <typename T>
+class FFN : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::gelu;
+
+    u64 in;
+    u64 hidden;
+
+public:
+    FC<T> *up;
+    FC<T> *down;
+
+    FFN(u64 in, u64 hidden) : in(in), hidden(hidden)
+    {
+        up = new FC<T>(in, hidden, true);
+        down = new FC<T>(hidden, in, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        return down->forward(gelu(up->forward(input)));
+    }
+};
+
+template <typename T>
+class MultiHeadAttention : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::split;
+    using SytorchModule<T>::view;
+    using SytorchModule<T>::add;
+    using SytorchModule<T>::transpose;
+    using SytorchModule<T>::matmul;
+    using SytorchModule<T>::scalarmul;
+    using SytorchModule<T>::invsqrt;
+    using SytorchModule<T>::softmax;
+    using SytorchModule<T>::concat;
+    using SytorchModule<T>::attention_mask;
+
+public:
+    FC<T> *c_attn;
+    FC<T> *c_proj;
+
+    u64 n_heads;
+    u64 n_embd;
+
+    MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd)
+    {
+        always_assert(n_embd % n_heads == 0);
+        c_attn = new FC<T>(n_embd, 3 * n_embd, true);
+        c_proj = new FC<T>(n_embd, n_embd, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        auto &x = c_attn->forward(input);
+        auto &qkv_heads = split(x, 3);
+        auto &q_heads = view(qkv_heads, 0);
+        auto &k_heads = view(qkv_heads, 1);
+        auto &v_heads = view(qkv_heads, 2);
+        auto &qs = split(q_heads, n_heads);
+        auto &ks = split(k_heads, n_heads);
+        auto &vs = split(v_heads, n_heads);
+
+        double divisor = 1 / sqrt(double(n_embd) / double(n_heads));
+
+        std::vector<Tensor<T> *> qks_sm_vs;
+        for (u64 i = 0; i < n_heads; ++i)
+        {
+            auto &q = view(qs, i);
+            auto &k = view(ks, i);
+            auto &v = view(vs, i);
+            auto &kt = transpose(k);
+            auto &qk = matmul(q, kt);
+            auto &qks = scalarmul(qk, divisor);
+
+            auto &qks_sm = softmax(qks);
+
+            auto &qks_sm_v = matmul(qks_sm, v);
+            qks_sm_vs.push_back(&qks_sm_v);
+        }
+
+        auto &qks_sm_vs_cat = concat(qks_sm_vs);
+        auto &res = c_proj->forward(qks_sm_vs_cat);
+        return res;
+    }
+};
+
+template <typename T>
+class TransformerBlock : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::add;
+
+    MultiHeadAttention<T> *attn;
+    FFN<T> *ffn;
+    LayerNorm<T> *ln0;
+    LayerNorm<T> *ln1;
+
+    u64 n_heads, n_embd;
+
+public:
+    TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd)
+    {
+        attn = new MultiHeadAttention<T>(n_heads, n_embd);
+        ffn = new FFN<T>(n_embd, 4 * n_embd);
+        ln0 = new LayerNorm<T>(n_embd);
+        ln1 = new LayerNorm<T>(n_embd);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        auto &attn_out = attn->forward(input);
+        auto &add0_out = add(attn_out, input);
+        auto &ln0_out = ln0->forward(add0_out);
+
+        auto &ffn_out = ffn->forward(ln0_out);
+        auto &add1_out = add(ffn_out, ln0_out);
+        auto &ln1_out = ln1->forward(add1_out);
+        return ln1_out;
+    }
+};
+
+template <typename T>
+class BERT : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::tanh;
+    using SytorchModule<T>::view;
+    using SytorchModule<T>::add;
+    using SytorchModule<T>::unsqueeze;
+    std::vector<TransformerBlock<T> *> blocks;
+    LayerNorm<T> *ln_f;
+    FC<T> *pool;
+    u64 n_layer, n_heads, n_embd;
+
+public:
+    BERT(u64 n_layer, u64 n_heads, u64 n_embd) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd)
+    {
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            blocks.push_back(new TransformerBlock<T>(n_heads, n_embd));
+        }
+        ln_f = new LayerNorm<T>(n_embd);
+        pool = new FC<T>(n_embd, n_embd, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        auto &y = ln_f->forward(input);
+        Tensor<T> *x = &y;
+        // Tensor<T> *x = &input;
+
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            auto &block = blocks[i];
+            auto &x_out = block->forward(*x);
+            x = &x_out;
+        }
+
+        return *x;
+    }
+};
+
+int main(int __argc, char **__argv)
+{
+    sytorch_init();
+
+    // bert base
+    const u64 n_embd = 768;
+    const u64 n_head = 12;
+    const u64 n_layer = 12;
+    const u64 scale = 12;
+    const u64 bw = 50;
+    const u64 n_seq = 128;
+
+    int party = atoi(__argv[1]);
+    std::string ip = "127.0.0.1";
+    if (__argc > 2)
+        ip = __argv[2];
+
+    using LlamaVersion = LlamaTransformer<u64>;
+    LlamaVersion *llama = new LlamaVersion();
+    srand(time(NULL));
+
+    LlamaConfig::bitlength = bw;
+    LlamaConfig::party = party;
+
+    llama->init(ip, true);
+
+    BERT<u64> net(n_layer, n_head, n_embd);
+    Tensor<u64> input({n_seq, n_embd});
+    net.init(scale, input);
+    net.setBackend(llama);
+    net.optimize();
+    if (party != DEALER)
+    {
+        // net.load("bert-tiny-weights.dat");
+        // input.load("15469.dat", scale);
+        input.fill(1LL << (scale - 2));
+    }
+    else
+    {
+        net.zero();
+    }
+
+    llama->initializeInferencePartyA(net.root);
+    llama->initializeInferencePartyB(input);
+
+    llama::start();
+    net.forward(input);
+    llama::end();
+
+    auto &output = net.activation;
+    llama->outputA(output);
+    llama->finalize();
+
+    if (party == CLIENT)
+    {
+        auto signedAct = Tensor<i64>((i64 *)net.activation.data, net.activation.shape);
+        print(signedAct, scale, bw);
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp b/GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp
new file mode 100644
index 00000000..47819674
--- /dev/null
+++ b/GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp
@@ -0,0 +1,256 @@
+// Authors: Kanav Gupta, Neha Jawalkar
+// Copyright:
+//
+// Copyright (c) 2024 Microsoft Research
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
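+
+// Standalone sytorch benchmark of BERT-large (n_layer=24, n_head=16, n_embd=1024,
+// n_seq=128, bitlength 50) on the LlamaTransformer backend. Same CLI as the
+// BERT-base benchmark: party id as argv[1], optional peer IP as argv[2].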
+
+// Assumption: the include targets below were unreadable in this copy of the diff;
+// these are the sytorch/LLAMA headers that the code in this file relies on.
+#include <sytorch/backend/llama_transformer.h>
+#include <sytorch/layers/layers.h>
+#include <sytorch/module.h>
+#include <llama/utils.h>
+#include <llama/api.h>
+
+template <typename T>
+class FFN : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::gelu;
+
+    u64 in;
+    u64 hidden;
+
+public:
+    FC<T> *up;
+    FC<T> *down;
+
+    FFN(u64 in, u64 hidden) : in(in), hidden(hidden)
+    {
+        up = new FC<T>(in, hidden, true);
+        down = new FC<T>(hidden, in, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        return down->forward(gelu(up->forward(input)));
+    }
+};
+
+template <typename T>
+class MultiHeadAttention : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::split;
+    using SytorchModule<T>::view;
+    using SytorchModule<T>::add;
+    using SytorchModule<T>::transpose;
+    using SytorchModule<T>::matmul;
+    using SytorchModule<T>::scalarmul;
+    using SytorchModule<T>::invsqrt;
+    using SytorchModule<T>::softmax;
+    using SytorchModule<T>::concat;
+    using SytorchModule<T>::attention_mask;
+
+public:
+    FC<T> *c_attn;
+    FC<T> *c_proj;
+
+    u64 n_heads;
+    u64 n_embd;
+
+    MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd)
+    {
+        always_assert(n_embd % n_heads == 0);
+        c_attn = new FC<T>(n_embd, 3 * n_embd, true);
+        c_proj = new FC<T>(n_embd, n_embd, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        auto &x = c_attn->forward(input);
+        auto &qkv_heads = split(x, 3);
+        auto &q_heads = view(qkv_heads, 0);
+        auto &k_heads = view(qkv_heads, 1);
+        auto &v_heads = view(qkv_heads, 2);
+        auto &qs = split(q_heads, n_heads);
+        auto &ks = split(k_heads, n_heads);
+        auto &vs = split(v_heads, n_heads);
+
+        double divisor = 1 / sqrt(double(n_embd) / double(n_heads));
+
+        std::vector<Tensor<T> *> qks_sm_vs;
+        for (u64 i = 0; i < n_heads; ++i)
+        {
+            auto &q = view(qs, i);
+            auto &k = view(ks, i);
+            auto &v = view(vs, i);
+            auto &kt = transpose(k);
+            auto &qk = matmul(q, kt);
+            auto &qks = scalarmul(qk, divisor);
+
+            auto &qks_sm = softmax(qks);
+
+            auto &qks_sm_v = matmul(qks_sm, v);
+            qks_sm_vs.push_back(&qks_sm_v);
+        }
+
+        auto &qks_sm_vs_cat = concat(qks_sm_vs);
+        auto &res = c_proj->forward(qks_sm_vs_cat);
+        return res;
+    }
+};
+
+template <typename T>
+class TransformerBlock : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::add;
+
+    MultiHeadAttention<T> *attn;
+    FFN<T> *ffn;
+    LayerNorm<T> *ln0;
+    LayerNorm<T> *ln1;
+
+    u64 n_heads, n_embd;
+
+public:
+    TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd)
+    {
+        attn = new MultiHeadAttention<T>(n_heads, n_embd);
+        ffn = new FFN<T>(n_embd, 4 * n_embd);
+        ln0 = new LayerNorm<T>(n_embd);
+        ln1 = new LayerNorm<T>(n_embd);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        auto &attn_out = attn->forward(input);
+        auto &add0_out = add(attn_out, input);
+        auto &ln0_out = ln0->forward(add0_out);
+
+        auto &ffn_out = ffn->forward(ln0_out);
+        auto &add1_out = add(ffn_out, ln0_out);
+        auto &ln1_out = ln1->forward(add1_out);
+        return ln1_out;
+    }
+};
+
+template <typename T>
+class BERT : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::tanh;
+    using SytorchModule<T>::view;
+    using SytorchModule<T>::add;
+    using SytorchModule<T>::unsqueeze;
+    std::vector<TransformerBlock<T> *> blocks;
+    LayerNorm<T> *ln_f;
+    FC<T> *pool;
+    u64 n_layer, n_heads, n_embd;
+
+public:
+    BERT(u64 n_layer, u64 n_heads, u64 n_embd) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd)
+    {
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            blocks.push_back(new TransformerBlock<T>(n_heads, n_embd));
+        }
+        ln_f = new LayerNorm<T>(n_embd);
+        pool = new FC<T>(n_embd, n_embd, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        auto &y = ln_f->forward(input);
+        Tensor<T> *x = &y;
+        // Tensor<T> *x = &input;
+
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            auto &block = blocks[i];
+            auto &x_out = block->forward(*x);
+            x = &x_out;
+        }
+
+        return *x;
+    }
+};
+
+int main(int __argc, char **__argv)
+{
+    sytorch_init();
+
+    // bert large
+    const u64 n_embd = 1024;
+    const u64 n_head = n_embd / 64;
+    const u64 n_layer = 24;
+    const u64 scale = 12;
+    const u64 bw = 50;
+    const u64 n_seq = 128;
+
+    int party = atoi(__argv[1]);
+    std::string ip = "127.0.0.1";
+    if (__argc > 2)
+        ip = __argv[2];
+
+    using LlamaVersion = LlamaTransformer<u64>;
+    LlamaVersion *llama = new LlamaVersion();
+    srand(time(NULL));
+
+    LlamaConfig::bitlength = bw;
+    LlamaConfig::party = party;
+
+    llama->init(ip, true);
+
+    BERT<u64> net(n_layer, n_head, n_embd);
+    Tensor<u64> input({n_seq, n_embd});
+    net.init(scale, input);
+    net.setBackend(llama);
+    net.optimize();
+    if (party != DEALER)
+    {
+        // net.load("bert-tiny-weights.dat");
+        // input.load("15469.dat", scale);
+        input.fill(1LL << (scale - 2));
+    }
+    else
+    {
+        net.zero();
+    }
+
+    llama->initializeInferencePartyA(net.root);
+    llama->initializeInferencePartyB(input);
+
+    llama::start();
+    net.forward(input);
+    llama::end();
+
+    auto &output = net.activation;
+    llama->outputA(output);
+    llama->finalize();
+
+    if (party == CLIENT)
+    {
+        auto signedAct = Tensor<i64>((i64 *)net.activation.data, net.activation.shape);
+        print(signedAct, scale, bw);
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp b/GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp
new file mode 100644
index 00000000..4f428c8e
--- /dev/null
+++ b/GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp
@@ -0,0 +1,250 @@
+// Authors: Kanav Gupta, Neha Jawalkar
+// Copyright:
+//
+// Copyright (c) 2024 Microsoft Research
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
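+
+// Standalone sytorch benchmark of BERT-tiny (n_layer=2, n_head=2, n_embd=128,
+// n_seq=128, bitlength 37) on the LlamaTransformer backend. Same CLI as the other
+// BERT benchmarks; unlike them, it does not print the output activations.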
+
+// Assumption: the include targets below were unreadable in this copy of the diff;
+// these are the sytorch/LLAMA headers that the code in this file relies on.
+#include <sytorch/backend/llama_transformer.h>
+#include <sytorch/layers/layers.h>
+#include <sytorch/module.h>
+#include <llama/utils.h>
+#include <llama/api.h>
+
+template <typename T>
+class FFN : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::gelu;
+
+    u64 in;
+    u64 hidden;
+
+public:
+    FC<T> *up;
+    FC<T> *down;
+
+    FFN(u64 in, u64 hidden) : in(in), hidden(hidden)
+    {
+        up = new FC<T>(in, hidden, true);
+        down = new FC<T>(hidden, in, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        return down->forward(gelu(up->forward(input)));
+    }
+};
+
+template <typename T>
+class MultiHeadAttention : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::split;
+    using SytorchModule<T>::view;
+    using SytorchModule<T>::add;
+    using SytorchModule<T>::transpose;
+    using SytorchModule<T>::matmul;
+    using SytorchModule<T>::scalarmul;
+    using SytorchModule<T>::invsqrt;
+    using SytorchModule<T>::softmax;
+    using SytorchModule<T>::concat;
+    using SytorchModule<T>::attention_mask;
+
+public:
+    FC<T> *c_attn;
+    FC<T> *c_proj;
+
+    u64 n_heads;
+    u64 n_embd;
+
+    MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd)
+    {
+        always_assert(n_embd % n_heads == 0);
+        c_attn = new FC<T>(n_embd, 3 * n_embd, true);
+        c_proj = new FC<T>(n_embd, n_embd, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        auto &x = c_attn->forward(input);
+        auto &qkv_heads = split(x, 3);
+        auto &q_heads = view(qkv_heads, 0);
+        auto &k_heads = view(qkv_heads, 1);
+        auto &v_heads = view(qkv_heads, 2);
+        auto &qs = split(q_heads, n_heads);
+        auto &ks = split(k_heads, n_heads);
+        auto &vs = split(v_heads, n_heads);
+
+        double divisor = 1 / sqrt(double(n_embd) / double(n_heads));
+
+        std::vector<Tensor<T> *> qks_sm_vs;
+        for (u64 i = 0; i < n_heads; ++i)
+        {
+            auto &q = view(qs, i);
+            auto &k = view(ks, i);
+            auto &v = view(vs, i);
+            auto &kt = transpose(k);
+            auto &qk = matmul(q, kt);
+            auto &qks = scalarmul(qk, divisor);
+
+            auto &qks_sm = softmax(qks);
+
+            auto &qks_sm_v = matmul(qks_sm, v);
+            qks_sm_vs.push_back(&qks_sm_v);
+        }
+
+        auto &qks_sm_vs_cat = concat(qks_sm_vs);
+        auto &res = c_proj->forward(qks_sm_vs_cat);
+        return res;
+    }
+};
+
+template <typename T>
+class TransformerBlock : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::add;
+
+    MultiHeadAttention<T> *attn;
+    FFN<T> *ffn;
+    LayerNorm<T> *ln0;
+    LayerNorm<T> *ln1;
+
+    u64 n_heads, n_embd;
+
+public:
+    TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd)
+    {
+        attn = new MultiHeadAttention<T>(n_heads, n_embd);
+        ffn = new FFN<T>(n_embd, 4 * n_embd);
+        ln0 = new LayerNorm<T>(n_embd);
+        ln1 = new LayerNorm<T>(n_embd);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        auto &attn_out = attn->forward(input);
+        auto &add0_out = add(attn_out, input);
+        auto &ln0_out = ln0->forward(add0_out);
+
+        auto &ffn_out = ffn->forward(ln0_out);
+        auto &add1_out = add(ffn_out, ln0_out);
+        auto &ln1_out = ln1->forward(add1_out);
+        return ln1_out;
+    }
+};
+
+template <typename T>
+class BERT : public SytorchModule<T>
+{
+public:
+    using SytorchModule<T>::tanh;
+    using SytorchModule<T>::view;
+    using SytorchModule<T>::add;
+    using SytorchModule<T>::unsqueeze;
+    std::vector<TransformerBlock<T> *> blocks;
+    LayerNorm<T> *ln_f;
+    FC<T> *pool;
+    u64 n_layer, n_heads, n_embd;
+
+public:
+    BERT(u64 n_layer, u64 n_heads, u64 n_embd) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd)
+    {
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            blocks.push_back(new TransformerBlock<T>(n_heads, n_embd));
+        }
+        ln_f = new LayerNorm<T>(n_embd);
+        pool = new FC<T>(n_embd, n_embd, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        auto &y = ln_f->forward(input);
+        Tensor<T> *x = &y;
+        // Tensor<T> *x = &input;
+
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            auto &block = blocks[i];
+            auto &x_out = block->forward(*x);
+            x = &x_out;
+        }
+
+        return *x;
+    }
+};
+
+int main(int __argc, char **__argv)
+{
+    sytorch_init();
+
+    // bert tiny
+    const u64 n_embd = 128;
+    const u64 n_head = n_embd / 64;
+    const u64 n_layer = 2;
+    const u64 scale = 12;
+    const u64 bw = 37;
+    const u64 n_seq = 128;
+
+    int party = atoi(__argv[1]);
+    std::string ip = "127.0.0.1";
+    if (__argc > 2)
+        ip = __argv[2];
+
+    using LlamaVersion = LlamaTransformer<u64>;
+    LlamaVersion *llama = new LlamaVersion();
+    srand(time(NULL));
+
+    LlamaConfig::bitlength = bw;
+    LlamaConfig::party = party;
+
+    llama->init(ip, true);
+
+    BERT<u64> net(n_layer, n_head, n_embd);
+    Tensor<u64> input({n_seq, n_embd});
+    net.init(scale, input);
+    net.setBackend(llama);
+    net.optimize();
+    if (party != DEALER)
+    {
+        // net.load("bert-tiny-weights.dat");
+        // input.load("15469.dat", scale);
+        input.fill(1LL << (scale - 2));
+    }
+    else
+    {
+        net.zero();
+    }
+
+    llama->initializeInferencePartyA(net.root);
+    llama->initializeInferencePartyB(input);
+
+    llama::start();
+    net.forward(input);
+    llama::end();
+
+    auto &output = net.activation;
+    llama->outputA(output);
+    llama->finalize();
+    return 0;
+}
\ No newline at end of file
diff --git a/GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp b/GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp
new file mode 100644
index 00000000..5b990079
--- /dev/null
+++ b/GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp
@@ -0,0 +1,226 @@
+// Authors: Kanav Gupta, Neha Jawalkar
+// Copyright:
+//
+// Copyright (c) 2024 Microsoft Research
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
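+
+// Standalone sytorch benchmark of GPT2 (n_layer=12, n_head=12, n_embd=768,
+// n_seq=128, bitlength 50) on the LlamaTransformer backend. Uses causal attention
+// (matmul_triangular + softmax_triangular); same CLI as the BERT benchmarks.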
+ +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ + using SytorchModule::gelu; + + u64 in; + u64 hidden; +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::matmul_triangular; + using SytorchModule::scalarmul; + using SytorchModule::softmax_triangular; + using SytorchModule::concat; + +public: + FC *c_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd): n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + c_attn = new FC(n_embd, 3*n_embd, true); + c_proj = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &x = c_attn->forward(input); + auto &qkv_heads = split(x, 3); + auto &q_heads = view(qkv_heads, 0); + auto &k_heads = view(qkv_heads, 1); + auto &v_heads = view(qkv_heads, 2); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector*> qks_sm_vs; + for(u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + auto &qk = matmul_triangular(q, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax_triangular(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; +public: + + TransformerBlock(u64 n_heads, u64 n_embd): n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, 4*n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class GPT2 : public SytorchModule +{ + std::vector *> blocks; + LayerNorm *ln_f; + u64 n_layer, n_heads, n_embd; + +public: + + GPT2(u64 n_layer, u64 n_heads, u64 n_embd): n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for(u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd)); + } + ln_f = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + + for(u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + // return ln_f->forward(*x); + return *x; + } +}; + +int main(int __argc, char**__argv) +{ + sytorch_init(); + + // gpt2 + const u64 n_embd = 768; + const u64 n_head = 12; + const u64 n_layer = 12; + const u64 bw = 50; + + const u64 scale = 12; + const u64 n_seq = 128; + + int party = 
atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party; + + llama->init(ip, true); + + GPT2 net(n_layer, n_head, n_embd); + net.init(scale); + net.setBackend(llama); + net.optimize(); + llama->initializeInferencePartyA(net.root); + + Tensor input({n_seq, n_embd}); + if(party == CLIENT){ + input.fill(1LL << (scale-2)); + } + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp b/GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp new file mode 100644 index 00000000..065c8181 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp @@ -0,0 +1,304 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
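+
+// Secure GPT-Neo (1.3B-scale) inference benchmark. Unlike the GPT-2
+// benchmark above, q/k/v come from three separate bias-free FC projections
+// instead of one fused c_attn, and scores are not scaled by
+// 1/sqrt(head_dim): the divisor is commented out below, as in GPT-Neo's
+// reference implementation. attention_type alternates per layer between
+// global and local attention; in this benchmark both parities currently
+// fall through to softmax_triangular, since the windowed local-attention
+// mask path (window_size = 256 in lt_main) is commented out.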
+ +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ + using SytorchModule::gelu; + + u64 in; + u64 hidden; +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::scalarmul; + using SytorchModule::invsqrt; + using SytorchModule::softmax; + using SytorchModule::concat; + using SytorchModule::attention_mask; + // using SytorchModule::local_attention_mask; + /////////////////////////// + using SytorchModule::matmul_triangular; + using SytorchModule::softmax_triangular; + +public: + // FC *c_attn; + FC *k_attn; + FC *v_attn; + FC *q_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + u64 attention_type; + u64 window_size; + + MultiHeadAttention(u64 n_heads, u64 n_embd, u64 attention_type, u64 window_size): n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + // c_attn = new FC(n_embd, 3*n_embd, true); + k_attn = new FC(n_embd, n_embd, false); + v_attn = new FC(n_embd, n_embd, false); + q_attn = new FC(n_embd, n_embd, false); + c_proj = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + // auto &x = c_attn->forward(input); + // auto &qkv_heads = split(x, 3); + // auto &q_heads = view(qkv_heads, 0); + // auto &k_heads = view(qkv_heads, 1); + // auto &v_heads = view(qkv_heads, 2); + auto &k_heads = k_attn->forward(input); + auto &v_heads = v_attn->forward(input); + auto &q_heads = q_attn->forward(input); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + // double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + // double divisor = 1; + + std::vector*> qks_sm_vs; + for(u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + // auto &qks = matmul(q, kt); + auto &qks = matmul_triangular(q, kt); + // auto &qk = matmul(q, kt); + // auto &qks = scalarmul(qk, divisor); + + /* + Tensor *x = &input; + if(attention_type % 2 == 0) + { + // printf("global\n"); + auto &qks_masked = attention_mask(qks, 10000.0); + x = &qks_masked; + } + else + { + auto &qks_masked = local_attention_mask(qks, 10000.0); + x = &qks_masked; + } + auto &qks_sm = softmax(*x); + auto &qks_sm_v = matmul(qks_sm, v); + */ + + Tensor *x = &input; + if(attention_type % 2 == 0) + { + auto &qks_sm = softmax_triangular(qks); + x = &qks_sm; + } + else + { + // auto &qks_masked = local_attention_mask(qks, 10000.0); + // auto &qks_sm = softmax_triangular(qks_masked); + + auto &qks_sm = softmax_triangular(qks); + x = &qks_sm; + } + auto &qks_sm_v = matmul(*x, v); + + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; + u64 attention_type; + u64 window_size; +public: + + TransformerBlock(u64 n_heads, u64 n_embd, u64 attention_type, u64 window_size): n_heads(n_heads), 
n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd, attention_type, window_size); + ffn = new FFN(n_embd, 4*n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class GPT2 : public SytorchModule +{ + std::vector *> blocks; + // LayerNorm *ln_f; + u64 n_layer, n_heads, n_embd; + u64 window_size; + +public: + + GPT2(u64 n_layer, u64 n_heads, u64 n_embd, u64 window_size): n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for(u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd, i, window_size)); + } + // ln_f = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + + for(u64 i = 0; i < n_layer - 1; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + + auto &block = blocks[n_layer - 1]; + return block->forward(*x); + + // for(u64 i = 0; i < n_layer; ++i) + // { + // auto &block = blocks[i]; + // auto &x_out = block->forward(*x); + // x = &x_out; + // } + // return ln_f->forward(*x); + } +}; + + +int lt_main(int __argc, char**__argv){ + + sytorch_init(); + + + const u64 n_embd = 2048; + const u64 n_head = 16; + const u64 n_layer = 24; + const u64 window_size = 256; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + const u64 scale = 12; + + LlamaConfig::bitlength = 51; + LlamaConfig::party = party; + LlamaConfig::stochasticT = false; + LlamaConfig::stochasticRT = false; + LlamaConfig::num_threads = 4; + + llama->init(ip, true); + + GPT2 net(n_layer, n_head, n_embd, window_size); + net.init(scale); + net.setBackend(llama); + net.optimize(); + if(party == SERVER){ + // net.load("gpt-neo-1pt3B-weights.dat"); + net.zero(); + } + else if(party == DEALER){ + net.zero(); + } + llama->initializeInferencePartyA(net.root); + + u64 n_seq = 128; + Tensor input({n_seq, n_embd}); + if(party == CLIENT){ + input.fill(1LL << (scale-2)); + } + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + return 0; +} + +int main(int __argc, char**__argv) +{ + lt_main(__argc,__argv); +} diff --git a/GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp b/GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp new file mode 100644 index 00000000..e208cad5 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp @@ -0,0 +1,266 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be 
included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ + using SytorchModule::silu; + using SytorchModule::mul; + + u64 in; + u64 intermediate_size; + +public: + FC *up1; + FC *up2; + FC *down; + + FFN(u64 in, u64 intermediate_size) : in(in), intermediate_size(intermediate_size) + { + up1 = new FC(in, intermediate_size, false); + up2 = new FC(in, intermediate_size, false); + down = new FC(intermediate_size, in, false); + } + + Tensor &_forward(Tensor &input) + { + auto &a = up1->forward(input); + auto &b = up2->forward(input); + return down->forward(mul(silu(a), b)); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::matmul_triangular; + using SytorchModule::scalarmul; + using SytorchModule::softmax_triangular; + using SytorchModule::concat; + + using SytorchModule::mul; + using SytorchModule::add; + using SytorchModule::silu; + using SytorchModule::rotary_embedding; + +public: + FC *q_attn; + FC *k_attn; + FC *v_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + q_attn = new FC(n_embd, n_embd, false); + k_attn = new FC(n_embd, n_embd, false); + v_attn = new FC(n_embd, n_embd, false); + c_proj = new FC(n_embd, n_embd, false); + } + + Tensor &_forward(Tensor &input) + { + auto &q_heads = q_attn->forward(input); + auto &k_heads = k_attn->forward(input); + auto &v_heads = v_attn->forward(input); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + + auto &q1 = rotary_embedding(q); + auto &k1 = rotary_embedding(k); + + auto &kt = transpose(k1); + auto &qk = matmul_triangular(q1, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax_triangular(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + RMSNorm *ln0; + RMSNorm *ln1; + + u64 n_heads, n_embd, intermediate_size; + +public: + TransformerBlock(u64 n_heads, u64 n_embd, u64 intermediate_size) : n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, intermediate_size); + ln0 = new RMSNorm(n_embd, false); + ln1 = new RMSNorm(n_embd, false); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = 
ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class LLAMA_MODEL : public SytorchModule +{ + std::vector *> blocks; + RMSNorm *ln_f; + u64 n_layer, n_heads, n_embd, intermediate_size; + +public: + LLAMA_MODEL(u64 n_layer, u64 n_heads, u64 n_embd, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd, intermediate_size)); + } + ln_f = new RMSNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + return *x; + // return ln_f->forward(*x); + } +}; + +template +class LlamaNextWordLogits : public SytorchModule +{ + using SytorchModule::view; + using SytorchModule::transpose; + LLAMA_MODEL *llama_model; + FC *fc; + u64 n_layer, n_heads, n_embd, n_vocab, intermediate_size; + +public: + LlamaNextWordLogits(u64 n_layer, u64 n_heads, u64 n_embd, u64 n_vocab, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), n_vocab(n_vocab), intermediate_size(intermediate_size) + { + llama_model = new LLAMA_MODEL(n_layer, n_heads, n_embd, intermediate_size); + fc = new FC(n_embd, n_vocab, false); + } + + Tensor &_forward(Tensor &input) + { + auto &fc_in = llama_model->forward(input); + return fc_in; + // auto &fc_out = fc->forward(fc_in); + // return view(fc_out, -1); + } +}; + +void lt_main(int party, std::string ip) +{ + sytorch_init(); + + const u64 n_vocab = 32000; + const u64 n_embd = 5120; + const u64 n_head = 40; // 40; + const u64 n_layer = 40; // 40; + const u64 intermediate_size = 13824; + const u64 scale = 12; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + LlamaConfig::bitlength = 48; + LlamaConfig::party = party; + llama->init(ip, true); + + LlamaNextWordLogits llama_model(n_layer, n_head, n_embd, n_vocab, intermediate_size); + u64 n_seq = 128; // get_n_seq(fname, n_embd); + Tensor input({n_seq, n_embd}); + input.zero(); + llama_model.init(scale, input); + llama_model.setBackend(llama); + llama_model.optimize(); + llama_model.zero(); + + // std::string fname = std::string("lambada-meta-llama2-7b/") + /*std::to_string(i)*/ +"999.dat"; + llama->initializeInferencePartyA(llama_model.root); + llama->initializeInferencePartyB(input); + + llama::start(); + auto &res = llama_model.forward(input); + llama::end(); + + auto &output = llama_model.activation; + llama->outputA(output); + llama->finalize(); +} + +int main(int __argc, char **__argv) +{ + int party = atoi(__argv[1]); + std::string ip = "0.0.0.0"; + if (__argc > 2) + ip = __argv[2]; + lt_main(party, ip); + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp b/GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp new file mode 100644 index 00000000..595dd5b6 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp @@ -0,0 +1,266 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the 
Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ + using SytorchModule::silu; + using SytorchModule::mul; + + u64 in; + u64 intermediate_size; + +public: + FC *up1; + FC *up2; + FC *down; + + FFN(u64 in, u64 intermediate_size) : in(in), intermediate_size(intermediate_size) + { + up1 = new FC(in, intermediate_size, false); + up2 = new FC(in, intermediate_size, false); + down = new FC(intermediate_size, in, false); + } + + Tensor &_forward(Tensor &input) + { + auto &a = up1->forward(input); + auto &b = up2->forward(input); + return down->forward(mul(silu(a), b)); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::matmul_triangular; + using SytorchModule::scalarmul; + using SytorchModule::softmax_triangular; + using SytorchModule::concat; + + using SytorchModule::mul; + using SytorchModule::add; + using SytorchModule::silu; + using SytorchModule::rotary_embedding; + +public: + FC *q_attn; + FC *k_attn; + FC *v_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + q_attn = new FC(n_embd, n_embd, false); + k_attn = new FC(n_embd, n_embd, false); + v_attn = new FC(n_embd, n_embd, false); + c_proj = new FC(n_embd, n_embd, false); + } + + Tensor &_forward(Tensor &input) + { + auto &q_heads = q_attn->forward(input); + auto &k_heads = k_attn->forward(input); + auto &v_heads = v_attn->forward(input); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + + auto &q1 = rotary_embedding(q); + auto &k1 = rotary_embedding(k); + + auto &kt = transpose(k1); + auto &qk = matmul_triangular(q1, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax_triangular(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + RMSNorm *ln0; + RMSNorm *ln1; + + u64 n_heads, n_embd, intermediate_size; + +public: + TransformerBlock(u64 
n_heads, u64 n_embd, u64 intermediate_size) : n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, intermediate_size); + ln0 = new RMSNorm(n_embd, false); + ln1 = new RMSNorm(n_embd, false); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class LLAMA_MODEL : public SytorchModule +{ + std::vector *> blocks; + RMSNorm *ln_f; + u64 n_layer, n_heads, n_embd, intermediate_size; + +public: + LLAMA_MODEL(u64 n_layer, u64 n_heads, u64 n_embd, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd, intermediate_size)); + } + ln_f = new RMSNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + return *x; + // return ln_f->forward(*x); + } +}; + +template +class LlamaNextWordLogits : public SytorchModule +{ + using SytorchModule::view; + using SytorchModule::transpose; + LLAMA_MODEL *llama_model; + FC *fc; + u64 n_layer, n_heads, n_embd, n_vocab, intermediate_size; + +public: + LlamaNextWordLogits(u64 n_layer, u64 n_heads, u64 n_embd, u64 n_vocab, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), n_vocab(n_vocab), intermediate_size(intermediate_size) + { + llama_model = new LLAMA_MODEL(n_layer, n_heads, n_embd, intermediate_size); + fc = new FC(n_embd, n_vocab, false); + } + + Tensor &_forward(Tensor &input) + { + auto &fc_in = llama_model->forward(input); + return fc_in; + // auto &fc_out = fc->forward(fc_in); + // return view(fc_out, -1); + } +}; + +void lt_main(int party, std::string ip) +{ + sytorch_init(); + + const u64 n_vocab = 32000; + const u64 n_embd = 4096; + const u64 n_head = 32; + const u64 n_layer = 32;//32; + const u64 intermediate_size = 11008; + const u64 scale = 12; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + LlamaConfig::bitlength = 48; + LlamaConfig::party = party; + llama->init(ip, true); + + LlamaNextWordLogits llama_model(n_layer, n_head, n_embd, n_vocab, intermediate_size); + u64 n_seq = 128; // get_n_seq(fname, n_embd); + Tensor input({n_seq, n_embd}); + input.zero(); + llama_model.init(scale, input); + llama_model.setBackend(llama); + llama_model.optimize(); + llama_model.zero(); + + // std::string fname = std::string("lambada-meta-llama2-7b/") + /*std::to_string(i)*/ +"999.dat"; + llama->initializeInferencePartyA(llama_model.root); + llama->initializeInferencePartyB(input); + + llama::start(); + auto &res = llama_model.forward(input); + llama::end(); + + auto &output = llama_model.activation; + llama->outputA(output); + llama->finalize(); +} + +int main(int __argc, char **__argv) +{ + int party = atoi(__argv[1]); + std::string ip = "0.0.0.0"; + if (__argc > 2) + ip = __argv[2]; + lt_main(party, ip); + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/examples/bert.cpp b/GPU-MPC/ext/sytorch/examples/bert.cpp index c77e6d23..1e7f04b6 100644 --- a/GPU-MPC/ext/sytorch/examples/bert.cpp +++ 
b/GPU-MPC/ext/sytorch/examples/bert.cpp @@ -199,13 +199,14 @@ class BERT : public SytorchModule auto &x_out = block->forward(*x); x = &x_out; } - - auto &x0 = view(*x, 0); - auto &x0_unsqueeze = unsqueeze(x0); - auto &pool_out = pool->forward(x0_unsqueeze); - auto &tanh_out = tanh(pool_out); - // return view(tanh_out, 0); - return tanh_out; + return *x; + + // auto &x0 = view(*x, 0); + // auto &x0_unsqueeze = unsqueeze(x0); + // auto &pool_out = pool->forward(x0_unsqueeze); + // auto &tanh_out = tanh(pool_out); + // // return view(tanh_out, 0); + // return tanh_out; } }; @@ -229,8 +230,9 @@ class BERTSequenceClassification : public SytorchModule Tensor &_forward(Tensor &input) { auto &fc_in = gpt2->forward(input); - auto &fc_out = fc->forward(fc_in); - return view(fc_out, 0); + return fc_in; + // auto &fc_out = fc->forward(fc_in); + // return view(fc_out, 0); } }; @@ -381,22 +383,40 @@ int fixed_mrpc_validation(int __argc, char**__argv) { int ct_main(int __argc, char**__argv) { sytorch_init(); - - const u64 n_vocab = 50257; - const u64 n_ctx = 1024; - const u64 n_embd = 768; - const u64 n_head = 12; - const u64 n_layer = 12; + // Bert base + // const u64 n_embd = 768; + // const u64 n_head = 12; + // const u64 n_layer = 12; + // const u64 bw = 50; + // const u64 scale = 12; + + // Bert tiny + // const u64 n_embd = 128; + // const u64 n_head = 2; + // const u64 n_layer = 2; + // const u64 bw = 37; + // const u64 scale = 12; + + // // Bert large + const u64 n_embd = 1024; + const u64 n_head = 16; + const u64 n_layer = 24; + const u64 bw = 50; const u64 scale = 12; + BERTSequenceClassification bert(n_layer, n_head, n_embd, 2); - bert.init(scale); + u64 n_seq = 128;//get_n_seq(fname, n_embd); + Tensor input({n_seq, n_embd}); + bert.init(scale, input); hasInit = true; - bert.load("bertclass.dat"); + auto ct = new ClearText(); + ct->bw = bw; + bert.setBackend(ct); + + bert.load("bert-large-weights.dat"); std::string fname = __argv[1]; - u64 n_seq = get_n_seq(fname, n_embd); - Tensor input({n_seq, n_embd}); input.load(fname, scale); auto t1 = std::chrono::high_resolution_clock::now(); @@ -404,7 +424,7 @@ int ct_main(int __argc, char**__argv) { auto t2 = std::chrono::high_resolution_clock::now(); auto compute_time = std::chrono::duration_cast(t2 - t1).count(); std::cout << "Total time = " << compute_time / (1000.0) << " ms" << std::endl; - print(bert.activation, scale); + print(bert.activation, scale, bw); return 0; } @@ -545,8 +565,9 @@ int float_sst2_single(int __argc, char**__argv) { int main(int __argc, char**__argv) { // float_sst2_validation(__argc, __argv); - fixed_sst2_validation(__argc, __argv); + // fixed_sst2_validation(__argc, __argv); // lt_main(__argc, __argv); + ct_main(__argc, __argv); // float_mrpc_validation(__argc, __argv); // fixed_mrpc_validation(__argc, __argv); // float_sst2_single(__argc, __argv); diff --git a/GPU-MPC/ext/sytorch/examples/bertbenchmark.cpp b/GPU-MPC/ext/sytorch/examples/bertbenchmark.cpp index 6239b8cd..bd2e01eb 100644 --- a/GPU-MPC/ext/sytorch/examples/bertbenchmark.cpp +++ b/GPU-MPC/ext/sytorch/examples/bertbenchmark.cpp @@ -1,8 +1,8 @@ // Authors: Kanav Gupta, Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -32,16 +32,17 @@ template class FFN : public SytorchModule { - public: 
+public: using SytorchModule::gelu; u64 in; u64 hidden; + public: FC *up; FC *down; - FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) { up = new FC(in, hidden, true); down = new FC(hidden, in, true); @@ -56,7 +57,7 @@ class FFN : public SytorchModule template class MultiHeadAttention : public SytorchModule { - public: +public: using SytorchModule::split; using SytorchModule::view; using SytorchModule::add; @@ -75,10 +76,10 @@ class MultiHeadAttention : public SytorchModule u64 n_heads; u64 n_embd; - MultiHeadAttention(u64 n_heads, u64 n_embd): n_heads(n_heads), n_embd(n_embd) + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) { always_assert(n_embd % n_heads == 0); - c_attn = new FC(n_embd, 3*n_embd, true); + c_attn = new FC(n_embd, 3 * n_embd, true); c_proj = new FC(n_embd, n_embd, true); } @@ -95,8 +96,8 @@ class MultiHeadAttention : public SytorchModule double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); - std::vector*> qks_sm_vs; - for(u64 i = 0; i < n_heads; ++i) + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) { auto &q = view(qs, i); auto &k = view(ks, i); @@ -120,21 +121,21 @@ class MultiHeadAttention : public SytorchModule template class TransformerBlock : public SytorchModule { - public: +public: using SytorchModule::add; MultiHeadAttention *attn; FFN *ffn; LayerNorm *ln0; LayerNorm *ln1; - + u64 n_heads, n_embd; -public: - TransformerBlock(u64 n_heads, u64 n_embd): n_heads(n_heads), n_embd(n_embd) +public: + TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) { attn = new MultiHeadAttention(n_heads, n_embd); - ffn = new FFN(n_embd, 4*n_embd); + ffn = new FFN(n_embd, 4 * n_embd); ln0 = new LayerNorm(n_embd); ln1 = new LayerNorm(n_embd); } @@ -155,7 +156,7 @@ class TransformerBlock : public SytorchModule template class BERT : public SytorchModule { - public: +public: using SytorchModule::tanh; using SytorchModule::view; using SytorchModule::add; @@ -166,10 +167,9 @@ class BERT : public SytorchModule u64 n_layer, n_heads, n_embd; public: - - BERT(u64 n_layer, u64 n_heads, u64 n_embd): n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + BERT(u64 n_layer, u64 n_heads, u64 n_embd) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) { - for(u64 i = 0; i < n_layer; ++i) + for (u64 i = 0; i < n_layer; ++i) { blocks.push_back(new TransformerBlock(n_heads, n_embd)); } @@ -179,11 +179,11 @@ class BERT : public SytorchModule Tensor &_forward(Tensor &input) { - // auto &y = ln_f->forward(input); - // Tensor *x = &y; - Tensor *x = &input; - - for(u64 i = 0; i < n_layer; ++i) + auto &y = ln_f->forward(input); + Tensor *x = &y; + // Tensor *x = &input; + + for (u64 i = 0; i < n_layer; ++i) { auto &block = blocks[i]; auto &x_out = block->forward(*x); @@ -194,18 +194,18 @@ class BERT : public SytorchModule } }; -int main(int __argc, char**__argv) +int main(int __argc, char **__argv) { sytorch_init(); // bert tiny - // const u64 n_embd = 128; - // const u64 n_head = n_embd / 64; - // const u64 n_layer = 2; - // const u64 scale = 12; - // const u64 bw = 38; - // const u64 n_seq = 128; - + const u64 n_embd = 128; + const u64 n_head = n_embd / 64; + const u64 n_layer = 2; + const u64 scale = 12; + const u64 bw = 37; + const u64 n_seq = 128; + // bert base // const u64 n_embd = 768; // const u64 n_head = 12; @@ -215,12 +215,12 @@ int main(int __argc, char**__argv) // const u64 n_seq = 128; // bert large - const u64 n_embd = 1024; - const u64 n_head = n_embd / 64; - const 
u64 n_layer = 24; - const u64 scale = 12; - const u64 bw = 51; - const u64 n_seq = 128; + // const u64 n_embd = 1024; + // const u64 n_head = n_embd / 64; + // const u64 n_layer = 24; + // const u64 scale = 12; + // const u64 bw = 51; + // const u64 n_seq = 128; int party = atoi(__argv[1]); std::string ip = "127.0.0.1"; @@ -237,15 +237,17 @@ int main(int __argc, char**__argv) llama->init(ip, true); BERT net(n_layer, n_head, n_embd); - net.init(scale); + Tensor input({n_seq, n_embd}); + net.init(scale, input); net.setBackend(llama); net.optimize(); - llama->initializeInferencePartyA(net.root); - - Tensor input({n_seq, n_embd}); - if(party == CLIENT){ - input.fill(1LL << (scale-2)); + if (party != DEALER) + { + net.load("bert-tiny-weights.dat"); + input.load("15469.dat", scale); } + + llama->initializeInferencePartyA(net.root); llama->initializeInferencePartyB(input); llama::start(); @@ -256,5 +258,10 @@ int main(int __argc, char**__argv) llama->outputA(output); llama->finalize(); + if (party == CLIENT) + { + auto signedAct = Tensor((i64*) net.activation.data, net.activation.shape); + print(signedAct, scale, bw); + } return 0; } \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/examples/gpt2.cpp b/GPU-MPC/ext/sytorch/examples/gpt2.cpp index a4f09c79..351eb3d2 100644 --- a/GPU-MPC/ext/sytorch/examples/gpt2.cpp +++ b/GPU-MPC/ext/sytorch/examples/gpt2.cpp @@ -337,10 +337,13 @@ int lt_main(int __argc, char**__argv){ const u64 n_layer = 12; const u64 scale = 12; u64 bitlength = 50; - GPT2 net(n_layer, n_head, n_embd); - net.init(scale); - net.load("gpt2lmr.dat"); - Tensor input({128, n_embd}); + GPT2 net(n_layer, n_head, n_embd); + Tensor input({128, n_embd}); + net.init(scale, input); + auto ct = new ClearText(); + ct->bw = 50; + net.setBackend(ct); + net.load("gpt2-weights.dat"); input.load("15469.dat", scale); printf("Starting\n"); net.forward(input); diff --git a/GPU-MPC/ext/sytorch/examples/gptneo.cpp b/GPU-MPC/ext/sytorch/examples/gptneo.cpp index 87584ff6..a1f321a9 100644 --- a/GPU-MPC/ext/sytorch/examples/gptneo.cpp +++ b/GPU-MPC/ext/sytorch/examples/gptneo.cpp @@ -257,7 +257,8 @@ class GPT2 : public SytorchModule auto &x_out = block->forward(*x); x = &x_out; } - return ln_f->forward(*x); + return *x; + // return ln_f->forward(*x); } }; @@ -281,9 +282,10 @@ class GPT2NextWordLogits : public SytorchModule Tensor &_forward(Tensor &input) { auto &fc_in = gpt2->forward(input); + return fc_in; // printshape(fc_in.shape); - auto &fc_out = fc->forward(fc_in); - return view(fc_out, -1); + // auto &fc_out = fc->forward(fc_in); + // return view(fc_out, -1); } }; @@ -304,14 +306,19 @@ void ct_main(std::string fname) { const u64 n_layer = 24; const u64 scale = 12; const u64 window_size = 256; + const u64 bw = 51; GPT2NextWordLogits net(n_layer, n_head, n_embd, n_vocab, window_size); - net.init(scale); - hasInit = true; - net.load("gpt-neo-1pt3B-weights.dat"); - - u64 n_seq = get_n_seq(fname, n_embd); + u64 n_seq = 128;//get_n_seq(fname, n_embd); Tensor input({n_seq, n_embd}); + net.init(scale, input); + + auto ct = new ClearText(); + ct->bw = bw; + net.setBackend(ct); + + hasInit = true; + net.load("gpt-neo-weights.dat"); input.load(fname, scale); auto t1 = std::chrono::high_resolution_clock::now(); @@ -319,7 +326,8 @@ void ct_main(std::string fname) { auto t2 = std::chrono::high_resolution_clock::now(); auto compute_time = std::chrono::duration_cast(t2 - t1).count(); std::cout << "Total time = " << compute_time / (1000.0) << " ms" << std::endl; - printfe(net.activation, 5); + // 
printfe(net.activation, 5); + print(net.activation, scale, bw); } int lt_main(std::string fname, int __argc, char**__argv){ diff --git a/GPU-MPC/ext/sytorch/examples/llama7b.cpp b/GPU-MPC/ext/sytorch/examples/llama7b.cpp index 4c239287..1c3efba3 100644 --- a/GPU-MPC/ext/sytorch/examples/llama7b.cpp +++ b/GPU-MPC/ext/sytorch/examples/llama7b.cpp @@ -1,8 +1,8 @@ // Authors: Kanav Gupta, Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -186,7 +186,8 @@ class LLAMA_MODEL : public SytorchModule auto &x_out = block->forward(*x); x = &x_out; } - return ln_f->forward(*x); + return *x; + // return ln_f->forward(*x); } }; @@ -209,8 +210,9 @@ class LlamaNextWordLogits : public SytorchModule Tensor &_forward(Tensor &input) { auto &fc_in = llama_model->forward(input); - auto &fc_out = fc->forward(fc_in); - return view(fc_out, -1); + return fc_in; + // auto &fc_out = fc->forward(fc_in); + // return view(fc_out, -1); } }; @@ -221,43 +223,122 @@ u64 get_n_seq(std::string filename, u64 n_embd) return n_elements / (4 * n_embd); } -void ct_main() +void ct_main(std::string inpName) { sytorch_init(); + // const u64 n_vocab = 32000; + // const u64 n_embd = 4096; + // const u64 n_head = 32; + // const u64 n_layer = 32; + // const u64 intermediate_size = 11008; + // const u64 scale = 12; + const u64 n_vocab = 32000; - const u64 n_embd = 4096; - const u64 n_head = 32; - const u64 n_layer = 32; - const u64 intermediate_size = 11008; + const u64 n_ctx = 4096; + const u64 n_embd = 5120; + const u64 n_head = 40; // 40; + const u64 n_layer = 1; // 40; + const u64 intermediate_size = 13824; const u64 scale = 12; LlamaNextWordLogits llama_model(n_layer, n_head, n_embd, n_vocab, intermediate_size); - llama_model.init(scale); - llama_model.load("/home/t-nejawalkar/ananta/meta_llama2_7b.dat"); - std::string fname = std::string("/home/t-nejawalkar/ananta/lambada-meta-llama2-7b/") + /*std::to_string(i)*/ +"993.dat"; - u64 n_seq = get_n_seq(fname, n_embd); + u64 n_seq = 128; // get_n_seq(fname, n_embd); Tensor input({n_seq, n_embd}); + llama_model.init(scale, input); + + auto ct = new ClearText(); + ct->bw = 48; + llama_model.setBackend(ct); + + // llama_model.load("meta_llama2_7b.dat"); + llama_model.load("meta_llama2_13b.dat"); + + // std::string fname = std::string("lambada-meta-llama2-7b/") + /*std::to_string(i)*/ +"999.dat"; + std::string fname = std::string("lambada-meta-llama2-13b/") + /*std::to_string(i)*/ inpName; input.load(fname, scale); auto &res = llama_model.forward(input); - i64 max = INT_MIN; - int argmax = 0; - for (int i = 0; i < n_vocab; i++) + auto signedAct = Tensor((i64 *)res.data, res.shape); + print(signedAct, scale, ct->bw); + auto maxIdx = signedAct.as_2d().argmax(0); + std::cout << "Output:" << std::endl; + std::cout << maxIdx << std::endl; + std::cout << res.data[maxIdx] << std::endl; + printf("%ld\n", signedAct.data[res.size() - 1]); +} + +void lt_main(std::string inpName, int party) +{ + sytorch_init(); + + // const u64 n_vocab = 32000; + // const u64 n_embd = 4096; + // const u64 n_head = 32; + // const u64 n_layer = 32;//32; + // const u64 intermediate_size = 11008; + // const u64 scale = 12; + + const u64 n_vocab = 32000; + const u64 n_ctx = 4096; + const u64 n_embd = 5120; + const u64 n_head = 40; // 40; + const u64 
n_layer = 40; // 40; + const u64 intermediate_size = 13824; + const u64 scale = 12; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + LlamaConfig::bitlength = 48; + LlamaConfig::party = party; + llama->init("0.0.0.0", true); + + LlamaNextWordLogits llama_model(n_layer, n_head, n_embd, n_vocab, intermediate_size); + u64 n_seq = 128; // get_n_seq(fname, n_embd); + Tensor input({n_seq, n_embd}); + input.zero(); + llama_model.init(scale, input); + llama_model.setBackend(llama); + llama_model.optimize(); + llama_model.zero(); + + if (party != DEALER) + { + // llama_model.load("meta_llama2_7b.dat"); + llama_model.load("meta_llama2_13b.dat"); + std::string fname = std::string("lambada-meta-llama2-13b/") + /*std::to_string(i)*/ inpName; + input.load(fname, scale); + } + + // std::string fname = std::string("lambada-meta-llama2-7b/") + /*std::to_string(i)*/ +"999.dat"; + llama->initializeInferencePartyA(llama_model.root); + llama->initializeInferencePartyB(input); + + llama::start(); + auto &res = llama_model.forward(input); + llama::end(); + + auto &output = llama_model.activation; + llama->outputA(output); + llama->finalize(); + + if (party == CLIENT) { - if (i == 0) - printf("res=%ld\n", res.data[i]); - if (res.data[i] > max) - { - max = res.data[i]; - argmax = i; - } + auto signedAct = Tensor((i64 *)llama_model.activation.data, llama_model.activation.shape); + print(signedAct, scale, LlamaConfig::bitlength); + auto maxIdx = signedAct.as_2d().argmax(0); + std::cout << "Output:" << std::endl; + std::cout << maxIdx << std::endl; + std::cout << output.data[maxIdx] << std::endl; + printf("%ld\n", signedAct.data[output.size() - 1]); } - std::cout << argmax << std::endl; - std::cout << max << std::endl; } -int main() +int main(int __argc, char **__argv) { - ct_main(); + int party = atoi(__argv[1]); + if (party == 0) + ct_main("999.dat"); + else + lt_main("999.dat", party); return 0; } \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/ext/llama/api.cpp b/GPU-MPC/ext/sytorch/ext/llama/api.cpp index abc5c6c0..052a2f65 100644 --- a/GPU-MPC/ext/sytorch/ext/llama/api.cpp +++ b/GPU-MPC/ext/sytorch/ext/llama/api.cpp @@ -1,8 +1,8 @@ // Authors: Kanav Gupta, Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -2648,12 +2648,12 @@ void Rsqrt(int size, GroupElement *x, GroupElement *y, GroupElement extradiv, in (*lut)[i] = GroupElement(double(1LL<<(2*scale)) / sqrt(val / extradiv)); } }); } - if (party != DEALER) - { - // Llama::stat_t stat = { "Rsqrt::LutGen", 0, t, 0, 0, 0 }; - // stat.print(); - // Llama::push_stats(stat); - } + // if (party != DEALER) + // { + // // Llama::stat_t stat = { "Rsqrt::LutGen", 0, t, 0, 0, 0 }; + // // stat.print(); + // // Llama::push_stats(stat); + // } LUT_dpf(size, 13, bitlength, *lut, y, y, prefix + "Rsqrt::"); // printf("Op=%lu\n", y[0]); @@ -2770,6 +2770,83 @@ void SlothGelu(int size, int bin, GroupElement *x, GroupElement *out, int scale) } } +inline double relu_sub_silu(double x) +{ + double g = x / (1 + exp(-x)); + return std::max(0.0, x) - g; +} + +inline GroupElement relu_sub_silu(GroupElement x, int scale_in, int scale_out) +{ + return (GroupElement)(relu_sub_silu((double)x / (1LL << scale_in)) * (1LL << scale_out)); +} + +void SlothSilu(int size, 
int bin, GroupElement *x, GroupElement *out, int scale) +{ + always_assert(scale == 12); + + GroupElement *y = new GroupElement[size]; + GroupElement *d = new GroupElement[size]; + GroupElement *rp = new GroupElement[size]; + GroupElement *abs = new GroupElement[size]; + GroupElement *r = new GroupElement[size]; + + SlothTR(size, bin, x, y, 6, "SiLU::"); + SlothDrelu(size, bin - 6, y, d, "SiLU::"); + + Select(size, bin - 6, d, y, rp, "SiLU::"); + + auto t1 = time_this_block([&]() + { +#pragma omp parallel for + for (int i = 0; i < size; ++i) + { + abs[i] = 2 * rp[i] - y[i]; + mod(abs[i], bin - 6); + } }); + + SlothClip(size, bin - 6, 10, 10, abs, abs, "SiLU::"); + + std::vector lut(1LL << 10); + auto t2 = time_this_block([&]() + { + for(int i = 0; i < (1LL<<10); ++i) + { + lut[i] = relu_sub_silu(i, 6, scale); + } }); + + LUT_dpf(size, 10, bitlength, lut, abs, abs, "SiLU::", false); + + Select(size, bitlength, d, x, r, "SiLU::", false); + + auto t3 = time_this_block([&]() + { +#pragma omp parallel for + for (int i = 0; i < size; ++i) + { + out[i] = r[i] - abs[i]; + } }); + + if (party != DEALER) + { + Llama::push_stats({"SiLU::Misc", 0, t1 + t3, 0, 0, 0}); + // Llama::push_stats({ "GeLU::LutGen", 0, t2, 0, 0, 0 }); + auto reconstruction_stats = time_comm_this_block([&]() + { reconstruct(size, out, bitlength); }); + + Llama::stat_t stat = { + "SiLU::Reconstruct", + 0, + 0, + reconstruction_stats.first, + reconstruction_stats.second, + 0, + }; + stat.print(); + Llama::push_stats(stat); + } +} + void TruncateReduce(int size, int bin, GroupElement *x, GroupElement *y, int scale, std::string prefix) { if (party == DEALER) @@ -3398,6 +3475,75 @@ void SlothLayerNorm(int s1, int s2, GroupElement *x, GroupElement *A, GroupEleme delete[] var; } +/////////////////////////////////////////////////////////////////////////////////// + +void SlothRMSNorm(int s1, int s2, GroupElement *x, GroupElement *A, GroupElement *B, GroupElement *y, int scale) +// void SlothRMSNorm(int s1, int s2, GroupElement *x, GroupElement *A, GroupElement *y, int scale) +{ + GroupElement *tmp = new GroupElement[s1 * s2]; + + auto t1 = time_this_block([&]() { + #pragma omp parallel for collapse(2) + for (int i = 0; i < s1; ++i) { + for (int j = 0; j < s2; j++) { + tmp[i * s2 + j] = x[i * s2 + j]; + } + } + }); + + GroupElement *var = new GroupElement[s1]; + SumOfSquare(s1, s2, tmp, var, "LayerNorm::"); + + Rsqrt(s1, var, var, s2, scale, "LayerNorm::"); + + auto t2 = time_this_block([&]() { + #pragma omp parallel for collapse(2) + for (int i = 0; i < s1; ++i) { + for (int j = 0; j < s2; j++) { + y[i * s2 + j] = var[i]; + } + } + }); + + ElemWiseMul(s1 * s2, tmp, y, y, "LayerNorm::"); + SlothARS(s1 * s2, y, y, scale, "LayerNorm::"); + + GroupElement *Aexpand = tmp; + auto t3 = time_this_block([&]() { + #pragma omp parallel for collapse(2) + for (int i = 0; i < s1; ++i) { + for (int j = 0; j < s2; j++) { + Aexpand[i * s2 + j] = A[j]; + } + } + }); + + ElemWiseMul(s1 * s2, Aexpand, y, y, "LayerNorm::"); + + // auto t5 = time_this_block([&]() { + // #pragma omp parallel for collapse(2) + // for (int i = 0; i < s1; ++i) { + // for (int j = 0; j < s2; j++) { + // y[i * s2 + j] += B[j]; + // } + // } + // }); + + // SlothARS(s1 * s2, y, y, scale, "LayerNorm::"); + + Llama::stat_t stat = {"LayerNorm::Misc", 0, t1 + t2 + t3, 0, 0, 0}; + stat.print(); + Llama::push_stats(stat); + + delete[] tmp; + delete[] var; + + +} + + +/////////////////////////////////////////////////////////////////////////////////// + // unused void SlothGemm(int s1, 
int s2, int s3, GroupElement *x, GroupElement *A, GroupElement *y, int scale) { @@ -4681,7 +4827,8 @@ void mult_threads_helper(int thread_idx, int32_t size, GroupElement *inArr, Grou { auto thread_start = std::chrono::high_resolution_clock::now(); auto p = get_start_end(size, thread_idx); - for(int i = p.first; i < p.second; i += 1){ + for (int i = p.first; i < p.second; i += 1) + { outArr[i] = MultEval(party - SERVER, keys[i], inArr[i], multArrVec[i]); } auto thread_end = std::chrono::high_resolution_clock::now(); @@ -4691,12 +4838,14 @@ void ElemWiseSecretSharedVectorMult(int32_t size, MASK_PAIR(GroupElement *inArr) MASK_PAIR(GroupElement *multArrVec), MASK_PAIR(GroupElement *outputArr)) { std::cerr << ">> ElemWise Mult - start" << std::endl; - if (party == DEALER) { + if (party == DEALER) + { uint64_t dealer_toal_time = 0; pair *keys = new pair[size]; - #pragma omp parallel for - for(int i = 0; i < size; ++i) { +#pragma omp parallel for + for (int i = 0; i < size; ++i) + { auto dealer_start = std::chrono::high_resolution_clock::now(); auto rout = random_ge(bitlength); keys[i] = MultGen(inArr_mask[i], multArrVec_mask[i], rout); @@ -4705,31 +4854,37 @@ void ElemWiseSecretSharedVectorMult(int32_t size, MASK_PAIR(GroupElement *inArr) dealer_toal_time += std::chrono::duration_cast(dealer_end - dealer_start).count(); } - for(int i = 0; i < size; ++i) { + for (int i = 0; i < size; ++i) + { server->send_mult_key(keys[i].first); client->send_mult_key(keys[i].second); } dealerMicroseconds = dealerMicroseconds + dealer_toal_time; delete[] keys; } - else { + else + { MultKey *keys = new MultKey[size]; auto keyread_start = std::chrono::high_resolution_clock::now(); - for(int i = 0; i < size; ++i) { + for (int i = 0; i < size; ++i) + { keys[i] = dealer->recv_mult_key(); } auto keyread_end = std::chrono::high_resolution_clock::now(); auto keyread_time_taken = std::chrono::duration_cast(keyread_end - - keyread_start).count(); + keyread_start) + .count(); peer->sync(); auto start = std::chrono::high_resolution_clock::now(); std::thread thread_pool[num_threads]; - for(int i = 0; i < num_threads; ++i) { + for (int i = 0; i < num_threads; ++i) + { thread_pool[i] = std::thread(mult_threads_helper, i, size, inArr, multArrVec, outputArr, keys); } - for(int i = 0; i < num_threads; ++i) { + for (int i = 0; i < num_threads; ++i) + { thread_pool[i].join(); } auto mid = std::chrono::high_resolution_clock::now(); @@ -4744,68 +4899,81 @@ void ElemWiseSecretSharedVectorMult(int32_t size, MASK_PAIR(GroupElement *inArr) evalMicroseconds += (reconstruct_time + compute_time); multEvalMicroseconds += (reconstruct_time + compute_time); delete[] keys; - } std::cerr << ">> ElemWise Mult - end" << std::endl; } - -void PiranhaSoftmax(int32_t s1, int32_t s2, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr), int32_t sf) +void PiranhaSoftmax(int32_t s1, int32_t s2, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr), int32_t sf) { // s1 = batch size // s2 = number of classes std::cerr << ">> Softmax - start" << std::endl; - GroupElement *max = make_array(s1); + GroupElement *max = make_array(s1); // step 1 - calculate max for each image in batch GroupElement *oneHot = make_array(s1 * (s2 - 1)); MaxPool(s1, 1, 1, 1, s2, 1, 0, 0, 0, 0, 1, 1, s1, s2, 1, 1, MASK_PAIR(inArr), max, max, oneHot); delete[] oneHot; // TODO: support passing oneHot as nullptr // step 2 - subtract max from each element in each image in batch and add 2 - if (party == DEALER) { - for(int i = 0; i < s1; ++i) { - for(int j = 0; j < s2; 
++j) { + if (party == DEALER) + { + for (int i = 0; i < s1; ++i) + { + for (int j = 0; j < s2; ++j) + { Arr2DIdx(outArr_mask, s1, s2, i, j) = Arr2DIdx(inArr_mask, s1, s2, i, j) - max[i]; } } } - else { - for(int i = 0; i < s1; ++i) { - for(int j = 0; j < s2; ++j) { - Arr2DIdx(outArr, s1, s2, i, j) = Arr2DIdx(inArr, s1, s2, i, j) - max[i] + (1<<(sf + 1)); + else + { + for (int i = 0; i < s1; ++i) + { + for (int j = 0; j < s2; ++j) + { + Arr2DIdx(outArr, s1, s2, i, j) = Arr2DIdx(inArr, s1, s2, i, j) - max[i] + (1 << (sf + 1)); } } } - // step 3 - exponentiate each element in each image in batch + // step 3 - exponentiate each element in each image in batch // e^x = RT((x+2), 1) for negative x // ReluTruncate(s1 * s2, MASK_PAIR(outArr), MASK_PAIR(outArr), 1, nullptr); // Q: can we do this in place? can be a source of bug in future Relu2Round(s1 * s2, MASK_PAIR(outArr), MASK_PAIR(outArr), nullptr, 64); - for(int i = 0; i < s1 * s2; ++i) { - if (party == DEALER) { + for (int i = 0; i < s1 * s2; ++i) + { + if (party == DEALER) + { outArr_mask[i] = outArr_mask[i] / 2; } - else { + else + { outArr[i] = outArr[i] / 2; } } GroupElement *denominators = max; // reuse the array // // step 4 - calculate sum of exponentiated elements for each image in batch - if (party == DEALER) { - for(int i = 0; i < s1; ++i) { + if (party == DEALER) + { + for (int i = 0; i < s1; ++i) + { denominators[i] = 0; - for(int j = 0; j < s2; ++j) { + for (int j = 0; j < s2; ++j) + { denominators[i] = denominators[i] + Arr2DIdx(outArr_mask, s1, s2, i, j); } // denominators[i] = denominators[i] * s1; } } - else { - for(int i = 0; i < s1; ++i) { + else + { + for (int i = 0; i < s1; ++i) + { denominators[i] = 0; - for(int j = 0; j < s2; ++j) { + for (int j = 0; j < s2; ++j) + { denominators[i] = denominators[i] + Arr2DIdx(outArr, s1, s2, i, j); } // denominators[i] = denominators[i] * s1; @@ -4816,8 +4984,10 @@ void PiranhaSoftmax(int32_t s1, int32_t s2, MASK_PAIR(GroupElement *inArr), MASK // step 6 - multiply each element in each image in batch by the inverse of the denominator GroupElement *expandedDenominator = make_array(s1 * s2); - for(int i = 0; i < s1; ++i) { - for(int j = 0; j < s2; ++j) { + for (int i = 0; i < s1; ++i) + { + for (int j = 0; j < s2; ++j) + { Arr2DIdx(expandedDenominator, s1, s2, i, j) = denominators[i]; } } @@ -4826,13 +4996,16 @@ void PiranhaSoftmax(int32_t s1, int32_t s2, MASK_PAIR(GroupElement *inArr), MASK ElemWiseSecretSharedVectorMult(s1 * s2, expandedDenominator, expandedDenominator, MASK_PAIR(outArr), MASK_PAIR(outArr)); // ScaleDown(s1 * s2, MASK_PAIR(outArr), sf); - always_assert((s1 & (s1-1)) == 0); + always_assert((s1 & (s1 - 1)) == 0); auto logs1 = osuCrypto::log2ceil(s1); - for(int i = 0; i < s1 * s2; ++i) { - if (party == DEALER) { + for (int i = 0; i < s1 * s2; ++i) + { + if (party == DEALER) + { outArr_mask[i] = outArr_mask[i] >> (sf + logs1); } - else { + else + { outArr[i] = outArr[i] >> (sf + logs1); } } diff --git a/GPU-MPC/ext/sytorch/ext/llama/include/llama/api.h b/GPU-MPC/ext/sytorch/ext/llama/include/llama/api.h index 5f16e0fb..3e146c40 100644 --- a/GPU-MPC/ext/sytorch/ext/llama/include/llama/api.h +++ b/GPU-MPC/ext/sytorch/ext/llama/include/llama/api.h @@ -182,6 +182,7 @@ void SlothMaxpool(int s1, int s2, int bin, GroupElement *x, GroupElement *y, std void SlothMaxpoolTriangular(int s1, int s2, int bin, GroupElement *x, GroupElement *y, std::string prefix = ""); void SumOfSquare(int s1, int s2, GroupElement *x, GroupElement *y, std::string prefix = ""); void SlothLayerNorm(int 
s1, int s2, GroupElement *x, GroupElement *A, GroupElement *B, GroupElement *y, int scale); +void SlothRMSNorm(int s1, int s2, GroupElement *x, GroupElement *A, GroupElement *B, GroupElement *y, int scale); void SlothGemm(int s1, int s2, int s3, GroupElement *x, GroupElement *A, GroupElement *y, int scale); void SoftmaxTriangular(int32_t s1, int32_t s2, int bin, GroupElement *x, GroupElement *y, int32_t scale); void MatMul2DTriangular(int32_t s1, int32_t s2, int32_t s3, MASK_PAIR(GroupElement *A), @@ -191,6 +192,7 @@ void SlothLRS(int size, GroupElement *x, GroupElement *y, int scale, std::string void SlothARS(int size, GroupElement *x, GroupElement *y, int scale, std::string prefix = ""); void SlothTR(int size, int bin, GroupElement *x, GroupElement *y, int scale, std::string prefix = ""); void SlothGelu(int size, int bin, GroupElement *x, GroupElement *out, int scale); +void SlothSilu(int size, int bin, GroupElement *x, GroupElement *out, int scale); void SlothFaithfulARS(int size, int bin, GroupElement *x, GroupElement *y, int scale, std::string prefix = ""); void reconstruct(int32_t size, GroupElement *arr, int bw); diff --git a/GPU-MPC/ext/sytorch/ext/llama/lut.cpp b/GPU-MPC/ext/sytorch/ext/llama/lut.cpp index 92d62046..8dc5dd08 100644 --- a/GPU-MPC/ext/sytorch/ext/llama/lut.cpp +++ b/GPU-MPC/ext/sytorch/ext/llama/lut.cpp @@ -171,7 +171,7 @@ GroupElement evalLUTSS_2(int party, GroupElement res, GroupElement corr, const L std::pair keyGenLUTDPFET(int bin, int bout, GroupElement rin, GroupElement routRes, GroupElement routCorr) { - assert(bin == 8); + assert(bin >= 8); LUTDPFETKeyPack key0, key1; key0.bin = bin; key1.bin = bin; @@ -196,7 +196,7 @@ std::pair keyGenLUTDPFET(int bin, int bout, Gr std::pair evalLUTDPFET_1(int party, GroupElement x, const std::vector &tab, LUTDPFETKeyPack &kp) { int bin = kp.bin; - assert(bin == 8); + assert(bin >= 8); mod(x, bin); GroupElement res = 0, corr = 0; diff --git a/GPU-MPC/ext/sytorch/include/sytorch/backend/backend.h b/GPU-MPC/ext/sytorch/include/sytorch/backend/backend.h index c4aba60a..264543c5 100644 --- a/GPU-MPC/ext/sytorch/include/sytorch/backend/backend.h +++ b/GPU-MPC/ext/sytorch/include/sytorch/backend/backend.h @@ -1,8 +1,8 @@ // Authors: Kanav Gupta, Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -182,6 +182,36 @@ class Backend assert(0 && "not implemented"); } + virtual void rotary_embedding(Tensor &x, Tensor &y, u64 scale, u64 max_position_embeddings = 2048, u64 base = 10000) + { + u64 n_seq = x.shape[0]; + u64 dim = x.shape[1]; + // printf("dims=%d, %lu, %ld, %ld\n", x.shape.size(), x.size(), x.data[0], y.data[0]); + auto y_2d = y.as_2d(); + auto x_2d = x.as_2d(); + + for (u64 i = 0; i < n_seq; ++i) + { + for (u64 j = 0; j < dim; j++) + { + double scalar = 1.0 / (std::pow(base, (double)((2 * j) % dim) / dim)); + T scalarInt = (i * scalar) * std::pow(2, scale); + T sinx = std::sin(scalarInt / (float)std::pow(2, scale)) * std::pow(2, scale - 3); + T cosx = std::cos(scalarInt / (float)std::pow(2, scale)) * std::pow(2, scale - 3); + + if (sinx == (1ULL << (scale - 3))) + sinx -= 1; + if (cosx == (1ULL << (scale - 3))) + cosx -= 1; + u64 k = (j + dim / 2) % dim; + T mul = 2 * (j >= dim / 2) - 1; + T z = cosx * x_2d(i, j) + sinx * mul * x_2d(i, k); + y_2d(i, j) 
= z; + } + } + this->truncate(y_2d, scale - 3, 1); + } + virtual void optimize(LayerGraphNode *root) { } diff --git a/GPU-MPC/ext/sytorch/include/sytorch/backend/llama_base.h b/GPU-MPC/ext/sytorch/include/sytorch/backend/llama_base.h index 34e70c92..ffb632b8 100644 --- a/GPU-MPC/ext/sytorch/include/sytorch/backend/llama_base.h +++ b/GPU-MPC/ext/sytorch/include/sytorch/backend/llama_base.h @@ -1,8 +1,8 @@ // Authors: Kanav Gupta, Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -285,6 +285,8 @@ class LlamaBase : public Backend { auto mask = LlamaConfig::dealer->recv_mask(); a[i] = a[i] - mask; + mod(a[i], LlamaConfig::bitlength); + a[i] -= ((a[i] >> (LlamaConfig::bitlength - 1) << LlamaConfig::bitlength)); } } } @@ -305,7 +307,6 @@ class LlamaBase : public Backend for (int i = 0; i < sz; i++) { auto mask = LlamaConfig::dealer->recv_mask(); - if(i == 0) printf("Mask=%lu\n", mask); a[i] = a[i] - mask; } } @@ -533,26 +534,26 @@ class LlamaBase : public Backend } } - // void add(const std::vector *> &in, Tensor &out) - // { - // always_assert(in.size() > 0); - // always_assert(out.size() == in[0]->size()); - // for (int i = 0; i < in.size(); i++) - // { - // always_assert(out.size() == in[i]->size()); - // } - - // #pragma omp parallel for - // for (u64 i = 0; i < out.size(); ++i) - // { - // T sum = 0; - // for (int j = 0; j < in.size(); j++) - // { - // sum += in[j]->data[i]; - // } - // out.data[i] = sum; - // } - // } + void add(const std::vector *> &in, Tensor &out) + { + always_assert(in.size() > 0); + always_assert(out.size() == in[0]->size()); + for (int i = 0; i < in.size(); i++) + { + always_assert(out.size() == in[i]->size()); + } + +#pragma omp parallel for + for (u64 i = 0; i < out.size(); ++i) + { + T sum = 0; + for (int j = 0; j < in.size(); j++) + { + sum += in[j]->data[i]; + } + out.data[i] = sum; + } + } void addbias(Tensor &x, const Tensor1D &bias) { diff --git a/GPU-MPC/ext/sytorch/include/sytorch/backend/llama_transformer.h b/GPU-MPC/ext/sytorch/include/sytorch/backend/llama_transformer.h index f287bf51..c753e148 100644 --- a/GPU-MPC/ext/sytorch/include/sytorch/backend/llama_transformer.h +++ b/GPU-MPC/ext/sytorch/include/sytorch/backend/llama_transformer.h @@ -1,8 +1,8 @@ // Authors: Kanav Gupta, Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -23,16 +23,28 @@ #include template -class LlamaTransformer : public LlamaBase { +class LlamaTransformer : public LlamaBase +{ public: - - void truncate(T *in, T *out, u64 shift, u64 size, u8 mode) { + void truncate(T *in, T *out, u64 shift, u64 size, u8 mode) + { // ARS(size, in, in, out, out, shift); // SlothARS(size, in, out, shift); - SlothFaithfulARS(size, LlamaConfig::bitlength, in, out, shift, "Linear::"); + if (mode == 0) + { + SlothFaithfulARS(size, LlamaConfig::bitlength, in, out, shift, "Linear::"); + } + else if (mode == 1) + { + SlothARS(size, in, out, shift); + } + else + { + assert(0 && "Unknown truncate type"); + } } - void gelu(const Tensor &in, const Tensor &out, u64 
scale, u64 mode = 0) + void gelu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) { u64 sz = in.size(); always_assert(sz == out.size()); @@ -46,13 +58,27 @@ class LlamaTransformer : public LlamaBase { } } + void silu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) + { + u64 sz = in.size(); + always_assert(sz == out.size()); + if (mode == 0) + { + SlothSilu(sz, LlamaConfig::bitlength, in.data, out.data, scale); + } + else if (mode == 1) + { + SlothSilu(sz, LlamaConfig::bitlength - scale, in.data, out.data, scale); + } + } + void softmax(Tensor &in, Tensor &out, u64 scale, u64 mode) { in.is_same_shape(out); if (mode == 0) Softmax(in.shape[0], in.shape[1], LlamaConfig::bitlength, in.data, out.data, scale); else if (mode == 1) - Softmax(in.shape[0], in.shape[1], LlamaConfig::bitlength - scale, in.data, out.data, scale); + Softmax(in.shape[0], in.shape[1], LlamaConfig::bitlength - scale + 1, in.data, out.data, scale); } void layernorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) @@ -65,6 +91,16 @@ class LlamaTransformer : public LlamaBase { SlothLayerNorm(s1, s2, x.data, A.data, B.data, y.data, scale); } + void rmsnorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) + { + always_assert(A.d1 == B.d1); + always_assert(A.d1 == x.shape.back()); + always_assert(x.is_same_shape(y)); + u64 s2 = x.shape.back(); + u64 s1 = x.size() / s2; + SlothRMSNorm(s1, s2, x.data, A.data, B.data, y.data, scale); + } + void attention_mask(Tensor &x, T scalar, Tensor &y) { always_assert(x.is_same_shape(y)); @@ -81,11 +117,14 @@ class LlamaTransformer : public LlamaBase { auto y_2d = y.as_2d(); auto x_2d = x.as_2d(); - for (u64 j = 0; j < n_seq; ++j) { - for (u64 k = 0; k < j + 1; ++k) { + for (u64 j = 0; j < n_seq; ++j) + { + for (u64 k = 0; k < j + 1; ++k) + { y_2d(j, k) = x_2d(j, k); } - for (u64 k = j + 1; k < n_seq; ++k) { + for (u64 k = j + 1; k < n_seq; ++k) + { y_2d(j, k) = x_2d(j, k) - scalar; } } @@ -116,12 +155,12 @@ class LlamaTransformer : public LlamaBase { void doOptimizeGelu(LayerGraphNode *node, LayerGraphNode *root) { - if (node->layer->doTruncationForward) + if (node->layer->doTruncationForward) { - if (node->children.size() == 1) + if (node->children.size() == 1) { LayerGraphNode *child = node->children[0]; - if (child->layer->name == "GeLU") + if (child->layer->name == "GeLU" || child->layer->name == "SiLU") { child->layer->mode = 1; } @@ -131,17 +170,17 @@ class LlamaTransformer : public LlamaBase { void doOptimizeDiv(LayerGraphNode *node, LayerGraphNode *root) { - if (node->layer->doTruncationForward) + if (node->layer->doTruncationForward) { - if (node->children.size() == 1) + if (node->children.size() == 1) { LayerGraphNode *child = node->children[0]; - if (child->layer->name == "_ScalarDiv") + if (child->layer->name == "_ScalarDiv") { auto layer_sd = (_ScalarDiv *)child->layer; T d = T(double(1LL << (layer_sd->scale)) / layer_sd->scalar); // if d is power of two - if ((d & (d - 1)) == 0) + if ((d & (d - 1)) == 0) { // seems very hacky node->layer->scale += (layer_sd->scale - log2(d)); @@ -161,12 +200,12 @@ class LlamaTransformer : public LlamaBase { void doOptimizeSoftmax(LayerGraphNode *node, LayerGraphNode *root) { - if (node->layer->doTruncationForward || node->layer->name == "_ScalarDiv") + if (node->layer->doTruncationForward || node->layer->name == "_ScalarDiv") { - if (node->children.size() == 1) + if (node->children.size() == 1) { LayerGraphNode *child = node->children[0]; - if (child->layer->name == "SoftMax" 
|| child->layer->name == "SoftMaxTriangular") + if (child->layer->name == "SoftMax" || child->layer->name == "SoftMaxTriangular") { child->layer->mode = 1; } @@ -176,26 +215,24 @@ class LlamaTransformer : public LlamaBase { void optimize(LayerGraphNode *root) { - topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) { - doOptimizeGelu(n, r); - }); - topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) { - doOptimizeSoftmax(n, r); - }); - topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) { - doOptimizeDiv(n, r); - }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { doOptimizeGelu(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { doOptimizeSoftmax(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { doOptimizeDiv(n, r); }); } void scalardiv(Tensor &x, double scalar, Tensor &y, u64 scale, u64 mode) { - if (mode == 1) { + if (mode == 1) + { y.copy(x, false); } else { T d = T(double(1LL << (scale)) / scalar); - if ((d & (d - 1)) == 0) + if ((d & (d - 1)) == 0) { SlothFaithfulARS(x.size(), LlamaConfig::bitlength, x.data, y.data, scale - log2(d), "Linear::"); } diff --git a/GPU-MPC/ext/sytorch/include/sytorch/layers/layers.h b/GPU-MPC/ext/sytorch/include/sytorch/layers/layers.h index 9fb03770..f870de8e 100644 --- a/GPU-MPC/ext/sytorch/include/sytorch/layers/layers.h +++ b/GPU-MPC/ext/sytorch/include/sytorch/layers/layers.h @@ -1,8 +1,8 @@ // Authors: Kanav Gupta, Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -167,7 +167,6 @@ class Layer if (doTruncationForward) { this->backend->truncateForward(activation, scale, forwardTruncationMode); - // printf("After truncate=%ld\n", activation.data[0]); } if (doPostSignExtension) { @@ -489,11 +488,8 @@ class Flatten : public Layer void _forward(Tensor &a) { - printf("############### Flatten=%d\n", transpose); if (transpose && (a.shape.size() == 4 || a.shape.size() == 5)) { - printf("@@@@@@@@@@@@@@@@@@@@@@@ In here!!!!!!!!!!!\n"); - // printf("Flatten: %d\n", a.shape.size()); if (a.shape.size() == 4) { auto a_4d = a.as_4d(); @@ -512,9 +508,7 @@ class Flatten : public Layer { for (u64 l = 0; l < d4; l++) { - // this->activation(i, j * d3 * d4 + k * d4 + l, 0, 0) = a(i, j, k, l); act_2d(i, l * d2 * d3 + j * d3 + k) = a_4d(i, j, k, l); - // printf("Flatten: %ld, %ld\n", act_2d(i, l * d2 * d3 + j * d3 + k), a_4d(i, j, k, l)); } } } @@ -551,7 +545,6 @@ class Flatten : public Layer } else { - printf("################# In here!!!!!!!!!!!\n"); u64 sz = a.size(); #pragma omp parallel for for (u64 i = 0; i < sz; i++) @@ -1708,7 +1701,7 @@ template class RotaryEmbedding : public Layer { public: - u64 base=10000; + u64 base = 10000; RotaryEmbedding() : Layer("RotaryEmbedding") {} @@ -1719,34 +1712,7 @@ class RotaryEmbedding : public Layer void _forward(Tensor &a) { - u64 n_seq = a.shape[0]; - u64 dim = a.shape[1]; - auto x_2d = a.as_2d(); - auto y_2d = this->activation.as_2d(); - - - for (u64 i = 0; i < n_seq; ++i) - { - for (u64 j = 0; j < dim; j++) - { - double scalar = 1.0 / (std::pow(base, (double)((2 * j) % dim) / dim)); - T scalarInt = (i * this->scalar) * std::pow(2, this->scale); - T sinx = std::sin(scalarInt / (float) std::pow(2, this->scale)) * std::pow(2, 
this->scale - 3); - T cosx = std::cos(scalarInt / (float) std::pow(2, this->scale)) * std::pow(2, this->scale - 3); - // T sinx = std::sin(i * scalar) * std::pow(2, this->scale - 3); - // T cosx = std::cos(i * scalar) * std::pow(2, this->scale - 3); - - if (sinx == (1ULL << (this->scale - 3))) - sinx -= 1; - if (cosx == (1ULL << (this->scale - 3))) - cosx -= 1; - u64 k = (j + dim / 2) % dim; - T mul = 2 * (j >= dim / 2) - 1; - T z = cosx * x_2d(i, j) + sinx * mul * x_2d(i, k); - y_2d(i, j) = z; - } - } - this->backend->truncate(this->activation, this->scale - 3); + this->backend->rotary_embedding(a, this->activation, this->scale); } std::vector get_output_dims(const std::vector> &inShapes) diff --git a/GPU-MPC/ext/sytorch/include/sytorch/module.h b/GPU-MPC/ext/sytorch/include/sytorch/module.h index 3b1b1d5c..39a69b4c 100644 --- a/GPU-MPC/ext/sytorch/include/sytorch/module.h +++ b/GPU-MPC/ext/sytorch/include/sytorch/module.h @@ -1,8 +1,8 @@ // Authors: Kanav Gupta, Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -227,7 +227,7 @@ class SytorchModule for (auto &node : allNodesInExecutionOrder) { auto layer = node->layer; - printf("Loading weights layer %s\n", layer->name.data()); + // printf("Loading weights layer %s\n", layer->name.data()); if (layer->name == "_MHADummy") { auto mha = (_MHADummy *)layer; @@ -253,7 +253,6 @@ class SytorchModule Tensor2D wQ(mha->wQKV.d1, mha->wQKV.d2 / 3); if (mha->qkvLayout == "kvqsep") { - printf("#################### %d, %d, %d\n", wK.d1, wK.d2, mha->wQKV.d1); for (u64 j = 0; j < wK.size(); j++) { wK.data[j] = T(floatWeights[wIdx + j] * (1LL << scale)); @@ -296,8 +295,6 @@ class SytorchModule { assert(0); } - - printf("Loading model weights=%ld, %ld, %ld\n", wK.data[0], wV.data[0], wQ.data[0]); for (u64 j = 0; j < mha->wQKV.d1; j++) { for (u64 k = 0; k < mha->wQKV.d2 / 3; k++) @@ -307,8 +304,6 @@ class SytorchModule mha->wQKV(j, 2 * mha->wQKV.d2 / 3 + k) = wV(j, k); } } - printf("Loaded model weights=%d, %d, %ld, %ld, %ld\n", mha->wQKV.d1, mha->wQKV.d2, mha->wQKV(mha->wQKV.d1 - 1, 0), mha->wQKV(mha->wQKV.d1 - 1, mha->wQKV.d2 / 3), mha->wQKV(mha->wQKV.d1 - 1, 2 * mha->wQKV.d2 / 3)); - // memcpy(mha->wQKV.data, wQKV.data, mha->wQKV.size()); mha->bQKV.as_nd().zero(); } for (u64 j = 0; j < mha->wProj.size(); j++) @@ -316,11 +311,18 @@ class SytorchModule mha->wProj.data[j] = T(floatWeights[wIdx + j] * (1LL << scale)); } wIdx += mha->wProj.size(); - for (u64 j = 0; j < mha->bProj.size(); ++j) + if (mha->qkvLayout == "qkvsep") + { + mha->bProj.as_nd().zero(); + } + else { - mha->bProj.data[j] = T(floatWeights[wIdx + j] * (1LL << (2 * scale))); + for (u64 j = 0; j < mha->bProj.size(); ++j) + { + mha->bProj.data[j] = T(floatWeights[wIdx + j] * (1LL << (2 * scale))); + } + wIdx += mha->bProj.size(); } - wIdx += mha->bProj.size(); } else if (layer->name == "BatchNormInference") { diff --git a/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py new file mode 100644 index 00000000..659d38bc --- /dev/null +++ b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py @@ -0,0 +1,85 @@ +import subprocess +import csv + +mute = False + +benchmarks = [ + 'bert-tiny', + 'bert-base', + 'bert-large', # very large key + 'gpt2', + 'gptneo', # 
very large key + "llama-7b", # very large key + "llama-13b", # very large key +] + +logfile1 = open("log1.log", 'a') +logfile2 = open("log2.log", 'a') +outcsv = open("results.csv", 'a') +outcsv.write("model,act_time,act_comm,softmax_time,softmax_comm,norm_time,norm_comm,total_time,total_comm\n") +outcsv.flush() + +def run_seq(cmd): + p = subprocess.Popen(cmd, shell=True, stdout=logfile1, stderr=logfile1) + p.wait() + + +def run_par(cmd1, cmd2): + p1 = subprocess.Popen(cmd1, shell=True, stdout=logfile1, stderr=logfile1) + p2 = subprocess.Popen(cmd2, shell=True, stdout=logfile2, stderr=logfile2) + p1.wait() + p2.wait() + +for b in benchmarks: + print("[+] benchmarking " + b) + print("[+] --- compiling...") + run_seq('make benchmark-' + b) + print("[+] --- running dealer...") + run_seq(f'OMP_NUM_THREADS=4 ./benchmark-{b} 1') + print("[+] --- running online phase...") + run_par(f'OMP_NUM_THREADS=4 ./benchmark-{b} 2', f'OMP_NUM_THREADS=4 ./benchmark-{b} 3') + + total_time = 0.0 + total_comm = 0.0 + act_time = 0.0 + act_comm = 0.0 + softmax_time = 0.0 + softmax_comm = 0.0 + norm_time = 0.0 + norm_comm = 0.0 + with open('llama3.csv') as f: + csvFile = csv.reader(f) + header_skipped = False + for lines in csvFile: + if not header_skipped: + header_skipped = True + continue + if lines[0].startswith('GeLU::'): + act_time += float(lines[1]) + act_comm += float(lines[2]) + elif lines[0].startswith('LayerNorm::'): + norm_time += float(lines[1]) + norm_comm += float(lines[2]) + elif lines[0].startswith('nExp::'): + softmax_time += float(lines[1]) + softmax_comm += float(lines[2]) + elif lines[0].startswith('Softmax::'): + softmax_time += float(lines[1]) + softmax_comm += float(lines[2]) + total_time += float(lines[1]) + total_comm += float(lines[2]) + run_seq(f"cp llama3.csv {b}.csv") + print("[+] --- act time = " + str(act_time/1000.0) + " s") + print("[+] --- act comm = " + str(act_comm/1024.0) + " GB") + print("[+] --- softmax time = " + str(softmax_time/1000.0) + " s") + print("[+] --- softmax comm = " + str(softmax_comm/1024.0) + " GB") + print("[+] --- norm time = " + str(norm_time/1000.0) + " s") + print("[+] --- norm comm = " + str(norm_comm/1024.0) + " GB") + print("[+] --- online time = " + str(total_time/1000.0) + " s") + print("[+] --- online comm = " + str(total_comm/1024.0) + " GB") + outcsv.write(f"{b},{act_time/1000.0},{act_comm/1024.0},{softmax_time/1000.0},{softmax_comm/1024.0},{norm_time/1000.0},{norm_comm/1024.0},{total_time/1000.0},{total_comm/1024.0}\n") + outcsv.flush() + +logfile1.close() +logfile2.close() +outcsv.close() diff --git a/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py new file mode 100644 index 00000000..f28d3589 --- /dev/null +++ b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py @@ -0,0 +1,87 @@ +import subprocess +import csv +import sys + +mute = False + +if len(sys.argv) < 3: + print("missing arguments") + print(f"usage: python {sys.argv[0]} ") + exit() + +ip = sys.argv[1] +party = int(sys.argv[2]) + +benchmarks = [ + 'bert-tiny', + 'bert-base', + 'bert-large', # very large key + 'gpt2', + 'gptneo', # very large key + "llama-7b", # very large key + "llama-13b", # very large key +] + +logfile1 = open("log1.log", 'a') +outcsv = open("results.csv", 'a') +outcsv.write("model,act_time,act_comm,softmax_time,softmax_comm,norm_time,norm_comm,total_time,total_comm\n") +outcsv.flush() + +def run_seq(cmd): + p = subprocess.Popen(cmd, shell=True, stdout=logfile1, stderr=logfile1) + p.wait() + + +for 
b in benchmarks: + print("[+] benchmarking " + b) + print("[+] --- compiling...") + run_seq('make benchmark-' + b) + print("[+] --- running dealer...") + run_seq(f'OMP_NUM_THREADS=4 ./benchmark-{b} 1') + print("[+] --- running online phase...") + # run_par(f'OMP_NUM_THREADS=4 ./benchmark-{b} 2', f'OMP_NUM_THREADS=4 ./benchmark-{b} 3') + run_seq(f"OMP_NUM_THREADS=4 ./benchmark-{b} {party+2} {ip}") + + total_time = 0.0 + total_comm = 0.0 + act_time = 0.0 + act_comm = 0.0 + softmax_time = 0.0 + softmax_comm = 0.0 + norm_time = 0.0 + norm_comm = 0.0 + with open(f'llama{party+2}.csv') as f: + csvFile = csv.reader(f) + header_skipped = False + for lines in csvFile: + if not header_skipped: + header_skipped = True + continue + if lines[0].startswith('GeLU::'): + act_time += float(lines[1]) + act_comm += float(lines[2]) + elif lines[0].startswith('LayerNorm::'): + norm_time += float(lines[1]) + norm_comm += float(lines[2]) + elif lines[0].startswith('nExp::'): + softmax_time += float(lines[1]) + softmax_comm += float(lines[2]) + elif lines[0].startswith('Softmax::'): + softmax_time += float(lines[1]) + softmax_comm += float(lines[2]) + total_time += float(lines[1]) + total_comm += float(lines[2]) + run_seq(f"cp llama{party+2}.csv remote-{b}.csv") + print("[+] --- act time = " + str(act_time/1000.0) + " s") + print("[+] --- act comm = " + str(act_comm/1024.0) + " GB") + print("[+] --- softmax time = " + str(softmax_time/1000.0) + " s") + print("[+] --- softmax comm = " + str(softmax_comm/1024.0) + " GB") + print("[+] --- norm time = " + str(norm_time/1000.0) + " s") + print("[+] --- norm comm = " + str(norm_comm/1024.0) + " GB") + print("[+] --- online time = " + str(total_time/1000.0) + " s") + print("[+] --- online comm = " + str(total_comm/1024.0) + " GB") + outcsv.write(f"remote-{b},{act_time/1000.0},{act_comm/1024.0},{softmax_time/1000.0},{softmax_comm/1024.0},{norm_time/1000.0},{norm_comm/1024.0},{total_time/1000.0},{total_comm/1024.0}\n") + outcsv.flush() + +logfile1.close() +outcsv.close() diff --git a/GPU-MPC/fss/dcf/gpu_dcf_templates.h b/GPU-MPC/fss/dcf/gpu_dcf_templates.h index d569e058..d4a73bac 100644 --- a/GPU-MPC/fss/dcf/gpu_dcf_templates.h +++ b/GPU-MPC/fss/dcf/gpu_dcf_templates.h @@ -90,9 +90,7 @@ namespace dcf { auto x2 = (x + (1ULL << (bin - 1))); gpuMod(x2, bin); - // printf("x=%lu, x2=%lu, %lu\n", x, x2, (1ULL << (bin - 1))); o += (x2 >= (1ULL << (bin - 1))); - // printf("o=%ld, %d, %d, %d\n", o, (x2 >= (1ULL << (bin - 1))), bin, bout); } gpuMod(o, bout); writePackedOp(out_g, o, bout, N); diff --git a/GPU-MPC/fss/gpu_dpf.h b/GPU-MPC/fss/gpu_dpf.h index 307a5d70..c963e020 100644 --- a/GPU-MPC/fss/gpu_dpf.h +++ b/GPU-MPC/fss/gpu_dpf.h @@ -89,8 +89,6 @@ GPUDPFKey readGPUDPFKey(u8 **key_as_bytes) memcpy(&k, *key_as_bytes, 3 * sizeof(int)); *key_as_bytes += (3 * sizeof(int)); - printf("M=%d, B=%d\n", k.M, k.B); - k.dpfTreeKey = new GPUDPFTreeKey[k.B]; k.memSzOut = 0; for (int b = 0; b < k.B; b++) diff --git a/GPU-MPC/fss/gpu_gelu.cu b/GPU-MPC/fss/gpu_gelu.cu index 26cb0e91..c4e4643d 100644 --- a/GPU-MPC/fss/gpu_gelu.cu +++ b/GPU-MPC/fss/gpu_gelu.cu @@ -134,7 +134,6 @@ T *gpuGelu(SigmaPeer *peer, int party, GPUGeluKey &k, int bw, int bin, assert(8 * sizeof(TClip) >= clipBw); assert(bin > scale - 6); int bwXt = bin - scale + 6 + 1; - printf("BwXt=%d, Bout=%d, N=%d\n", bwXt, bw, N); // do a truncate reduce auto d_Xt = gpuTruncate(bw, bwXt, TruncateType::TrWithSlack, k.trKey, scale - 6, peer, party, N, d_X, gaes, s); // the -1 doesn't matter because anything larger is anyway set to (1 
<< clipBw) - 1 @@ -149,11 +148,8 @@ T *gpuGelu(SigmaPeer *peer, int party, GPUGeluKey &k, int bw, int bin, auto d_clippedX = geluMux(peer, party, k.muxKey, bwXt, clipBw, N, d_dRelu, d_ic, d_Xt, s); gpuFree(d_Xt); auto d_reluSubGelu = gpuDpfLUT(k.lutKey, peer, party, d_clippedX, d_geluSubRelu, gaes, s, false); - // printf("Finished LUT\n"); gpuFree(d_clippedX); - // printf("Starting relu\n"); T *d_relu = gpuSelect(peer, party, bw, k.reluSelectKey, d_dRelu, d_X, s, false); - // printf("Finished relu\n"); gpuFree(d_res); gpuLinearComb(bw, N, d_relu, T(1), d_relu, -T(1), d_reluSubGelu); gpuFree(d_reluSubGelu); diff --git a/GPU-MPC/fss/gpu_lut.cu b/GPU-MPC/fss/gpu_lut.cu index b13587b4..01ef611c 100644 --- a/GPU-MPC/fss/gpu_lut.cu +++ b/GPU-MPC/fss/gpu_lut.cu @@ -65,12 +65,6 @@ __global__ void dpfLUT(int party, int bin, int N, TIn *X, TOut *tab, AESBlock *s if (threadId < N) { storeAESBlock(stack_g, 0, scw_g[threadId], N, threadId); - // stack[threadIdx.x / 32][0][threadIdx.x & 31] = scw[threadId]; - // for (int i = 0; i < bin - LOG_AES_BLOCK_LEN; i++) - // storeAESBlock(scw, 0, scw_g[(i + 1) * N + threadId], N, threadId); - // scw[threadIdx.x / 32][i][threadIdx.x & 31] = scw_g[(i+1) * N + threadId]; - // store these in registers for now and see what happens - // hopefully there is no spill auto x = (u64)X[threadId]; gpuMod(x, bin); auto l0_cw = l0_g[threadId]; @@ -81,22 +75,10 @@ __global__ void dpfLUT(int party, int bin, int N, TIn *X, TOut *tab, AESBlock *s TOut u = 0, v = 0; while (depth > 0) { - // if(threadId == 2) printf("Stack: %u, Depth: %d\n", pathStack, depth); - // peek at the top of the stack auto seed = loadAESBlock(stack_g, depth - 1, N, threadId); - // if (threadId == 2) { - // printAESBlock(&seed); - // } - // auto seed = stack[threadIdx.x / 32][depth - 1][threadIdx.x & 31]; - // extract the stack bit auto bit = pathStack & 1ULL; - // should this be +1? 
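                // iterative DFS over the DPF tree with an explicit path bit-stack: once depth
                // reaches bin - LOG_AES_BLOCK_LEN, one expansion yields an entire leaf block,
                // and the loop below consumes its bits in a batch, adding tab[x - (lb ^ i)]
                // into the secret-shared sum for every set output bit w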
if (depth == bin - LOG_AES_BLOCK_LEN) { - // if(stack == 0) { - // l0[threadIdx.x / 32][depth - 1][threadIdx.x & 31] = l0_g[threadId]; - // l1[threadIdx.x / 32][depth - 1][threadIdx.x & 31] = l1_g[threadId]; - // } auto lastBlock = expandDPFTreeNode(bin, party, seed, 0, @@ -115,11 +97,9 @@ __global__ void dpfLUT(int party, int bin, int N, TIn *X, TOut *tab, AESBlock *s u += w; auto lookup = x - (lb ^ i); gpuMod(lookup, bin); - // printf("current: %ld, %ld, %ld, %ld\n", x, lb ^ i, lookup, tab[lookup]); - v += /*reluSubGelu(lookup, 6, 12)*/ tab[lookup] * w; + v += tab[lookup] * w; lastBlock >>= 1; } - // sum &= 1; // pop all the 1s from the stack while (pathStack & 1ULL /*&& depth > 0*/) { @@ -130,11 +110,7 @@ __global__ void dpfLUT(int party, int bin, int N, TIn *X, TOut *tab, AESBlock *s pathStack ^= 1; } else - { // load the scws into shared memory along the all 0 path - // if (stack == 0) - // { - // scw[][][] = scw_g[]; - // } + { // manipulate the seed depending on the bit // aren't storing the first cw because it sees no reuse auto tR_l = (tR >> (depth - 1)) & 1; @@ -168,9 +144,7 @@ __global__ void dpfLUT(int party, int bin, int N, TIn *X, TOut *tab, AESBlock *s gpuMod(u, 1); auto maskU = getVCW(1, U, N, 0); writeVCW(1, U, u64(u ^ maskU), 0, N); - // U[threadId] += u; V[threadId] += v; - // printf("%d: %ld, %ld\n", threadId, u, v); } } @@ -179,7 +153,6 @@ TOut *gpuDpfLUT(GPULUTKey k0, SigmaPeer *peer, int party, TIn *d_X, TOut * { auto k = *(k0.k.dpfTreeKey); assert(k0.k.bin >= 8 && k0.k.B == 1); - printf("############### %d, %d, %d\n", k.bin, k.evalAll, k.N, k0.k.B); // Neha: need to change the key reading and writing code // do not change tb size it is needed to load the sbox const int tbSz = 256; @@ -197,15 +170,8 @@ TOut *gpuDpfLUT(GPULUTKey k0, SigmaPeer *peer, int party, TIn *d_X, TOut * d_tR = (u32 *)moveToGPU((uint8_t *)k.tR, k.memSzT, s); auto d_U = (u32 *)moveToGPU((u8 *)k0.maskU, k.memSzOut, s); // a lot of bits packed together auto d_V = (TOut *)moveToGPU((u8 *)k0.s.b, k.N * sizeof(TOut), s); - // d_out = (uint32_t *)gpuMalloc(k.memSzOut); - // int shmSize = 32768; - // checkCudaErrors(cudaFuncSetAttribute(dpfEvalAll, cudaFuncAttributeMaxDynamicSharedMemorySize, shmSize)); - // auto start = std::chrono::high_resolution_clock::now(); dpfLUT<<>>(party, k.bin, k.N, d_X, d_tab, d_scw, d_stack, d_l0, d_l1, d_tR, d_U, d_V, *g); checkCudaErrors(cudaDeviceSynchronize()); - // auto end = std::chrono::high_resolution_clock::now(); - // auto elapsed = end - start; - // printf("Time taken by dpfLUT kernel=%lu micros\n", std::chrono::duration_cast(elapsed).count()); gpuFree(d_scw); gpuFree(d_stack); diff --git a/GPU-MPC/fss/gpu_matmul.cu b/GPU-MPC/fss/gpu_matmul.cu index f47ce1d8..08e40f73 100644 --- a/GPU-MPC/fss/gpu_matmul.cu +++ b/GPU-MPC/fss/gpu_matmul.cu @@ -357,7 +357,6 @@ T *gpuMatmul(SigmaPeer *peer, int party, MatmulParams p, GPUMatmulKey &k, T * u64 b1 = peer->bytesSent() + peer->bytesReceived(); s->linear_comm_bytes += (b1 - b0); - printf("Matmul Comm=%ld\n", b1 - b0); return d_truncatedZ; } diff --git a/GPU-MPC/fss/gpu_maxpool.cu b/GPU-MPC/fss/gpu_maxpool.cu index c3924f85..003901b6 100644 --- a/GPU-MPC/fss/gpu_maxpool.cu +++ b/GPU-MPC/fss/gpu_maxpool.cu @@ -358,7 +358,6 @@ T *maxpoolLogHelper(SigmaPeer *peer, int party, MaxpoolParams p, int i, GPUReluK template T *gpuMaxpoolLog(SigmaPeer *peer, int party, MaxpoolParams p, GPUMaxpoolKey k, T *d_I, AESGlobalContext *gaes, Stats *s) { - // printf("##################### Using fixed maxpool ##########################\n"); 
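    // log-round maxpool: each round takes a ReLU-based pairwise max of the
    // surviving candidates, so a window of FH*FW entries needs roughly
    // log2(FH*FW) rounds instead of FH*FW - 1 sequential comparisons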
assert(/*p.N == 1 &&*/ p.C == 1 && p.strideH == 1 && p.strideW == p.FW && p.strideH == p.FH); // T *d_I = d_in; T *d_O; @@ -400,7 +399,6 @@ template T *gpuMaxpool(SigmaPeer *peer, int party, MaxpoolParams p, GPUMaxpoolKey k, T *d_I, AESGlobalContext *gaes, Stats *s) { T *d_O; - printf("Gpu maxpool rounds=%d, %d, %d\n", k.rounds, p.FH, p.FW); if (k.rounds < p.FH * p.FW - 1) { assert(p.zPadHLeft == 0 && p.zPadHRight == 0 && p.zPadWLeft == 0 && p.zPadWRight == 0); diff --git a/GPU-MPC/fss/gpu_maxpool.h b/GPU-MPC/fss/gpu_maxpool.h index ba393f20..a8edfc8a 100644 --- a/GPU-MPC/fss/gpu_maxpool.h +++ b/GPU-MPC/fss/gpu_maxpool.h @@ -56,20 +56,11 @@ GPUMaxpoolKey readGPUMaxpoolKey(MaxpoolParams p, u8 **key_as_bytes) { GPUMaxpoolKey k; k.rounds = *((int *)*key_as_bytes); - printf("Rounds=%d\n", k.rounds); *key_as_bytes += sizeof(int); k.reluKey = new GPUReluKey[/*p.FH * p.FW*/ k.rounds]; for (int i = 0; i < /*p.FH*/ k.rounds; i++) { - // for (int j = 0; j < p.FW; j++) - // { - // if (i == 0 && j == 0) - // continue; - // printf("Reading Relu key=%d, %d\n", i, j); k.reluKey[i] = readReluKey(key_as_bytes); - printf("Round %d=%d relus\n", i, k.reluKey[i].numRelus); - // if(this->train) maxpoolKey.andKey[i * p.FW + j] = readGPUAndKey(key_as_bytes); - // } } return k; } diff --git a/GPU-MPC/fss/gpu_mha.cu b/GPU-MPC/fss/gpu_mha.cu index 5f1417d8..e9e8e3d2 100644 --- a/GPU-MPC/fss/gpu_mha.cu +++ b/GPU-MPC/fss/gpu_mha.cu @@ -1,8 +1,8 @@ // Author: Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -34,9 +34,8 @@ #include "fss/gpu_scalarmul.h" #include "fss/gpu_truncate.h" - template -__global__ void rotEmbKernel(MHAParams pMHA, int scale, u64 N, T *X, T *Y) +__global__ void rotEmbKernel(MHAParams pMHA, int bw, int scale, u64 N, T *X, T *Y) { // the vectors are N x dim_W assert(pMHA.dim_W % 2 == 0); @@ -51,12 +50,13 @@ __global__ void rotEmbKernel(MHAParams pMHA, int scale, u64 N, T *X, T *Y) temp = temp % (pMHA.n_seq * pMHA.dim_W); int i = temp / pMHA.dim_W; int j = temp % pMHA.dim_W; - float sinx, cosx; auto k = j - (j >= dim_W_half) * dim_W_half; - __sincosf(i / __powf(10000, (2 * k / (float)pMHA.dim_W)), &sinx, &cosx); + + double scalar = 1.0 / std::pow(10000.0, (2 * k / (double)pMHA.dim_W)); + float scalarInt = T((i * scalar) * (1ULL << scale)) / (float) (1ULL << scale); const auto uLim = T(1ULL << (scale - 3)); - T sinxi = T(sinx * uLim); - T cosxi = T(cosx * uLim); + T sinxi = (T)(i64)(std::sin(scalarInt) * uLim); + T cosxi = (T)(i64)(std::cos(scalarInt) * uLim); if (sinxi == uLim) sinxi -= 1; if (cosxi == uLim) @@ -65,16 +65,16 @@ __global__ void rotEmbKernel(MHAParams pMHA, int scale, u64 N, T *X, T *Y) auto l = (j + dim_W_half) % pMHA.dim_W; T m1 = 2 * (j >= dim_W_half) - 1; Y[tid] = cosxi * X[tid] + m1 * sinxi * X[head * pMHA.n_seq * pMHA.dim_W + i * pMHA.dim_W + l]; + gpuMod(Y[tid], bw); } } template T *gpuKeygenRotEmb(u8 **key_as_bytes, int party, int bw, int scale, MHAParams pMHA, T *d_mask_X, AESGlobalContext *g) { - printf("*********** Generating rotary embedding key! 
***************\n"); size_t size_X = pMHA.n_heads * (u64)pMHA.n_seq * pMHA.dim_W; auto d_mask_X1 = (T *)gpuMalloc(size_X * sizeof(T)); - rotEmbKernel<<<(size_X - 1) / 128 + 1, 128>>>(pMHA, scale, size_X, d_mask_X, d_mask_X1); + rotEmbKernel<<<(size_X - 1) / 128 + 1, 128>>>(pMHA, bw, scale, size_X, d_mask_X, d_mask_X1); // gpuFree(d_mask_X); auto d_mask_truncated_X = genGPUTruncateKey(key_as_bytes, party, TruncateType::TrWithSlack, bw, bw, scale - 3, size_X, d_mask_X1, g); gpuFree(d_mask_X1); @@ -88,7 +88,7 @@ T *gpuRotEmb(SigmaPeer *peer, int party, int bw, int scale, MHAParams pMHA, GPUT size_t size_X = pMHA.n_heads * (u64)pMHA.n_seq * pMHA.dim_W; auto d_X1 = (T *)gpuMalloc(size_X * sizeof(T)); - rotEmbKernel<<<(size_X - 1) / 128 + 1, 128>>>(pMHA, scale, size_X, d_X, d_X1); + rotEmbKernel<<<(size_X - 1) / 128 + 1, 128>>>(pMHA, bw, scale, size_X, d_X, d_X1); // don't free this because QKV is one long array // gpuFree(d_X); auto d_truncated_X = gpuTruncate(bw, bw, TruncateType::TrWithSlack, trKey, scale - 3, peer, party, size_X, d_X1, g, s); //, true); @@ -96,7 +96,6 @@ T *gpuRotEmb(SigmaPeer *peer, int party, int bw, int scale, MHAParams pMHA, GPUT u64 b1 = peer->bytesSent() + peer->bytesReceived(); s->linear_comm_bytes += (b1 - b0); - printf("Comm in rotary embedding=%lu, N=%d\n", b1 - b0, size_X); return d_truncated_X; } @@ -107,7 +106,6 @@ T *gpuKeygenMHA(u8 **key_as_bytes, int party, int bw, int scale, MHAParams pMHA, auto d_mask_QKV = gpuKeygenMatmul(key_as_bytes, party, pMHAMul.pQKV, d_mask_X, WQKV, YQKV, TruncateType::TrFloor, g); // this->activation.d_data = d_mask_QKV; - // printf("Size of Q=%d\n", pQKV.size_C / 3); int QKSz = pMHAMul.pQKV.size_C / 3; auto d_mask_Q = d_mask_QKV; auto d_mask_K = d_mask_QKV + QKSz; @@ -130,7 +128,6 @@ T *gpuKeygenMHA(u8 **key_as_bytes, int party, int bw, int scale, MHAParams pMHA, if (pMHA.doNormQKt && int(log2(pMHA.dim_W)) % 2 == 1) { T invSqrtDimW = T((1.0f / sqrt(double(pMHA.dim_W))) * (1LL << scale)); - printf("####### Doing a scalar multiplication\n"); d_mask_normQKt = gpuKeygenScalarMul(key_as_bytes, party, bw, pMHAMul.pQKt.size_C, invSqrtDimW, d_mask_QKt, TruncateType::TrFloor, scale, g); gpuFree(d_mask_QKt); } @@ -182,7 +179,6 @@ T *gpuMHA(SigmaPeer *peer, int party, int bw, int scale, MHAParams pMHA, MHAMulP if (pMHA.doNormQKt && int(log2(pMHA.dim_W)) % 2 == 1) { T invSqrtDimW = T((1.0f / sqrt(double(pMHA.dim_W))) * (1LL << scale)); - printf("####### Doing a scalar multiplication\n"); d_normQKt = gpuScalarMul(peer, party, bw, pMHAMul.pQKt.size_C, k.normQKtTrKey, invSqrtDimW, d_QKt, TruncateType::TrFloor, scale, g, s); gpuFree(d_QKt); } @@ -202,6 +198,5 @@ T *gpuMHA(SigmaPeer *peer, int party, int bw, int scale, MHAParams pMHA, MHAMulP auto d_proj = gpuMatmul(peer, party, pMHAMul.pProj, k.mmKeyProj, d_smQKtV, WProj, YProj, TruncateType::TrFloor, g, s); gpuFree(d_smQKtV); auto b1 = peer->bytesSent() + peer->bytesReceived(); - printf("MHA Comm=%ld\n", b1 - b0); return d_proj; } diff --git a/GPU-MPC/fss/gpu_mha.h b/GPU-MPC/fss/gpu_mha.h index 09e29d9d..4b733e52 100644 --- a/GPU-MPC/fss/gpu_mha.h +++ b/GPU-MPC/fss/gpu_mha.h @@ -92,14 +92,9 @@ inline MatmulParams initPQKt(MHAParams pMHA, int bw, int scale) if (pMHA.doNormQKt && int(log2(pMHA.dim_W)) % 2 == 0) { // assert(int(log2(dim_W)) % 2 == 0); - printf("Shift=%d\n", int(log2(pMHA.dim_W) / 2)); + // printf("Shift=%d\n", int(log2(pMHA.dim_W) / 2)); pQKt.shift += int(log2(pMHA.dim_W) / 2); } - else - { - printf("Not merging the two truncations, shift=%d\n", pQKt.shift); - // assert(0); - } // K 
is stored in column-major form pQKt.rowMaj_B = false; pQKt.ld_B = pQKt.K; @@ -147,7 +142,6 @@ inline MaxpoolParams initPMaxpool(MHAParams pMHA, int bw, int scale) pMPool.zPadWRight = 0; pMPool.bw = bw; pMPool.bin = bw - scale; - printf("Bin=%d\n", pMPool.bin); pMPool.scale = scale; pMPool.scaleDiv = 0; initPoolParams(pMPool); @@ -181,7 +175,6 @@ GPUMHAKey readGPUMHAKey(MHAParams pMHA, MHAMulParams pMHAMul, u8 **key_as_byt if (pMHA.doNormQKt && int(log2(pMHA.dim_W)) % 2 == 1) k.normQKtTrKey = readGPUTruncateKey(TruncateType::TrFloor, key_as_bytes); k.softmaxKey = readGPUSoftMaxKey(pMHAMul.pMPool, key_as_bytes); - printf("Maxpool rounds=%d", k.softmaxKey.maxPoolKey.rounds); k.mmKeySmQKtV = readGPUMatmulKey(pMHAMul.pSmQKtV, TruncateType::TrFloor, key_as_bytes); k.mmKeyProj = readGPUMatmulKey(pMHAMul.pProj, TruncateType::TrFloor, key_as_bytes); return k; diff --git a/GPU-MPC/fss/gpu_mul.cu b/GPU-MPC/fss/gpu_mul.cu index b476c35b..506da33b 100644 --- a/GPU-MPC/fss/gpu_mul.cu +++ b/GPU-MPC/fss/gpu_mul.cu @@ -1,8 +1,8 @@ // Author: Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -40,7 +40,6 @@ __global__ void doBeaverMul(int party, int bw, int N, T *X, T *Y, T *a, T *b, T { Z[i] = (party == SERVER1) * (X[i] * Y[i]) - X[i] * b[i] - a[i] * Y[i] + c[i]; gpuMod(Z[i], bw); - // printf("%ld, %ld, %ld\n", X[i], Y[i], Z[i]); } } @@ -55,9 +54,9 @@ T *gpuKeygenMul(u8 **key_as_bytes, int party, int bw, int scale, int N, T *d_mas writeShares(key_as_bytes, party, N, d_mask_B, bw); writeShares(key_as_bytes, party, N, d_mask_C1, bw); gpuFree(d_mask_C1); - printf("##Num truncations: %d\n", N); - auto d_mask_truncated_C = genGPUTruncateKey(key_as_bytes, party, /*TruncateType::TrWithSlack*/t, bw, bw, scale, N, d_mask_C, gaes); - gpuFree(d_mask_C); + auto d_mask_truncated_C = genGPUTruncateKey(key_as_bytes, party, /*TruncateType::TrWithSlack*/ t, bw, bw, scale, N, d_mask_C, gaes); + if (d_mask_truncated_C != d_mask_C) + gpuFree(d_mask_C); return d_mask_truncated_C; } @@ -72,11 +71,11 @@ T *gpuMul(SigmaPeer *peer, int party, int bw, int scale, int N, GPUMulKey k, doBeaverMul<<<(N - 1) / 128 + 1, 128>>>(party, bw, N, d_X, d_Y, d_a, d_b, d_c, d_Z); gpuFree(d_a); peer->reconstructInPlace(d_Z, bw, N, s); - auto d_truncated_Z = gpuTruncate(bw, bw, /*TruncateType::TrWithSlack*/t, k.trKey, scale, peer, party, N, d_Z, gaes, s); //, true); - gpuFree(d_Z); + auto d_truncated_Z = gpuTruncate(bw, bw, t, k.trKey, scale, peer, party, N, d_Z, gaes, s); //, true); + if (d_truncated_Z != d_Z) + gpuFree(d_Z); u64 b1 = peer->bytesSent() + peer->bytesReceived(); if (s) s->linear_comm_bytes += (b1 - b0); - printf("Comm inside Mul=%ld\n", b1 - b0); return d_truncated_Z; } diff --git a/GPU-MPC/fss/gpu_mul.h b/GPU-MPC/fss/gpu_mul.h index 6bbdc377..d168d84d 100644 --- a/GPU-MPC/fss/gpu_mul.h +++ b/GPU-MPC/fss/gpu_mul.h @@ -39,16 +39,12 @@ GPUMulKey readGPUMulKey(u8** key_as_bytes, u64 szA, u64 szB, u64 szC, Truncat k.szB = szB; k.szC = szC; k.a = (T*) *key_as_bytes; - // printf("a=%ld\n", *k.a); *key_as_bytes += (szA * sizeof(T)); k.b = (T*) *key_as_bytes; - // printf("b=%ld\n", *k.b); *key_as_bytes += (szB * sizeof(T)); k.c = (T*) *key_as_bytes; - // printf("c=%ld\n", *k.c); *key_as_bytes += (szC * sizeof(T)); - printf("Reading truncate key######\n"); - k.trKey = 
readGPUTruncateKey(/*TruncateType::TrWithSlack*/t, key_as_bytes); + k.trKey = readGPUTruncateKey(t, key_as_bytes); return k; } diff --git a/GPU-MPC/fss/gpu_nexp.cu b/GPU-MPC/fss/gpu_nexp.cu index 27a4b237..eb5cf5f6 100644 --- a/GPU-MPC/fss/gpu_nexp.cu +++ b/GPU-MPC/fss/gpu_nexp.cu @@ -33,7 +33,6 @@ T *gpuKeygenNExp(u8 **key_as_bytes, int party, int bw, int bin, int scale, int N // flip(relu(x - p)) + p // this is wrong, can't arbitrarily do bin + 1 whenever you please // the input is a 39 bit input - printf("Input to nExp=%d\n", N); auto d_clipMask = gpuGenReluKey(key_as_bytes, party, bin, 16, N, d_mask_X, gaes); // generate the output in the full bw and scale auto d_lsbLutMask = gpuKeyGenLUT(key_as_bytes, party, 8, bw, N, d_clipMask, gaes); @@ -42,7 +41,6 @@ T *gpuKeygenNExp(u8 **key_as_bytes, int party, int bw, int bin, int scale, int N auto d_msbLutMask = gpuKeyGenLUT(key_as_bytes, party, 8, bw, N, d_msbMask, gaes); gpuFree(d_msbMask); // clipMask is lsb mask - printf("Calling mul with bw=%d, scale=%d, N=%d\n", bw, scale, N); auto d_nExpMask = gpuKeygenMul(key_as_bytes, party, bw, scale, N, d_msbLutMask, d_lsbLutMask, TruncateType::TrWithSlack, gaes); gpuFree(d_msbLutMask); gpuFree(d_lsbLutMask); diff --git a/GPU-MPC/fss/gpu_nexp.h b/GPU-MPC/fss/gpu_nexp.h index 435dd070..f7a65010 100644 --- a/GPU-MPC/fss/gpu_nexp.h +++ b/GPU-MPC/fss/gpu_nexp.h @@ -43,14 +43,11 @@ GPUNExpKey readGPUNExpKey(u8 **key_as_bytes) { GPUNExpKey k; k.reluKey = readReluKey(key_as_bytes); - printf("##Reading Relu key=%d\n", k.reluKey.bout); k.N = k.reluKey.numRelus; k.lsbLutKey = readGPULUTKey(key_as_bytes); k.trKey = readGPUTruncateKey(TruncateType::TrWithSlack, key_as_bytes); k.msbLutKey = readGPULUTKey(key_as_bytes); k.mulKey = readGPUMulKey(key_as_bytes, (u64)k.N, (u64)k.N, (u64)k.N, TruncateType::TrWithSlack); - // printf("Done reading nexp key\n"); - // k.mulTrKey = readGPUTruncateKey(TruncateType::TrWithSlack, key_as_bytes); return k; } diff --git a/GPU-MPC/fss/gpu_scalarmul.h b/GPU-MPC/fss/gpu_scalarmul.h index be7654ea..b941f871 100644 --- a/GPU-MPC/fss/gpu_scalarmul.h +++ b/GPU-MPC/fss/gpu_scalarmul.h @@ -1,8 +1,8 @@ // Author: Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -30,7 +30,6 @@ T *gpuKeygenScalarMul(u8 **key_as_bytes, int party, int bw, int N, T a, T *d_mas { auto d_mask_Z = (T *)gpuMalloc(N * sizeof(T)); gpuLinearComb(bw, N, d_mask_Z, a, d_mask_X); - printf("Truncate type=%d\n", t); auto d_mask_truncated_Z = genGPUTruncateKey(key_as_bytes, party, t, bw, bw, shift, N, d_mask_X, gaes); if (d_mask_truncated_Z != d_mask_Z) gpuFree(d_mask_Z); @@ -43,9 +42,9 @@ T *gpuScalarMul(SigmaPeer *peer, int party, int bw, int N, GPUTruncateKey k, u64 b0 = peer->bytesSent() + peer->bytesReceived(); auto d_Z = (T *)gpuMalloc(N * sizeof(T)); gpuLinearComb(bw, N, d_Z, a, d_X); - printf("Truncate type=%d\n", t); - auto d_truncated_Z = gpuTruncate(bw, bw, t, k, shift, peer, party, N, d_Z, gaes, s); //, true); - gpuFree(d_Z); + auto d_truncated_Z = gpuTruncate(bw, bw, t, k, shift, peer, party, N, d_Z, gaes, s); + if (d_truncated_Z != d_Z) + gpuFree(d_Z); u64 b1 = peer->bytesSent() + peer->bytesReceived(); s->linear_comm_bytes += (b1 - b0); return d_truncated_Z; diff --git a/GPU-MPC/fss/gpu_softmax.cu b/GPU-MPC/fss/gpu_softmax.cu index 
b331fc4e..ad27094c 100644 --- a/GPU-MPC/fss/gpu_softmax.cu +++ b/GPU-MPC/fss/gpu_softmax.cu @@ -63,7 +63,6 @@ T *gpuKeygenSoftmax(u8 **key_as_bytes, int party, MaxpoolParams p, T *d_mask_X, { int inSz = getInSz(p); int mSz = getMSz(p); - printf("################# InSz=%d\n", inSz); int ogBw = p.bw; int reducedBw = p.bin + 2; // int bin = p.bin; @@ -74,7 +73,7 @@ T *gpuKeygenSoftmax(u8 **key_as_bytes, int party, MaxpoolParams p, T *d_mask_X, p.bin = p.bin + 1; // get the max in 39 bits (implicit reduce) // in this case the input bw and the output bw are the same - auto d_maxMask = gpuKeygenMaxpool(key_as_bytes, party, p, d_mask_X, gaes, (inSz & (inSz - 1)) == 0); + auto d_maxMask = gpuKeygenMaxpool(key_as_bytes, party, p, d_mask_X, gaes, true); assert(p.strideH == p.FH && p.strideW == p.FW); auto d_X1Mask = windowFunc>(party, p, d_mask_X, d_maxMask); gpuFree(d_maxMask); diff --git a/GPU-MPC/fss/gpu_truncate.cu b/GPU-MPC/fss/gpu_truncate.cu index 1e31ef85..aaff4606 100644 --- a/GPU-MPC/fss/gpu_truncate.cu +++ b/GPU-MPC/fss/gpu_truncate.cu @@ -43,7 +43,6 @@ using keygenTrFunc = void (*)(int party, int bin, int shift, int bout, int N, in template __device__ TOut trReduce(int party, int bin, int shift, int i, TIn x, u8 *bytes) { - // if(i == 0) printf("%lu, %lu\n", x, x >> shift); return (party == SERVER1) * TOut(x >> shift); } @@ -60,8 +59,6 @@ __device__ TOut trWithSlack(int party, int bin, int shift, int i, TIn x, u8 *byt auto x1 = (x + (1ULL << (bin - 2))); gpuMod(x1, bin); auto msb_x1 = gpuMsb(x1, bin); - // if (i == 0) - // printf("bin=%d, x=%lu, x1=%lu, msb corr=%lu\n", bin, x, x1, ((TOut *)bytes)[i]); return (party == SERVER1) * TOut((x1 >> shift) - (1ULL << (bin - shift - 2))) + ((TOut *)bytes)[i] * (!msb_x1); } @@ -74,8 +71,6 @@ __global__ void trCorrKernel(int party, int bin, int shift, int bout, int N, TIn u32 z = (z_g[i / 32] >> (threadIdx.x & 0x1f)) & 1; auto y_l = (TOut)tf(party, bin, shift, i, x[i], bytes) + corr[2 * i + z]; gpuMod(y_l, bout); - // if (i == 0) - // printf("corr=%lu, %lu, %lu, %u, %lu\n", corr[0], corr[1], y_l, z, x[i]); y[i] = y_l; } } @@ -110,7 +105,6 @@ __device__ void keygenTrWithSlack(int party, int bin, int shift, int bout, int N { keygenTrReduce(party, bin, shift, bout, N, i, x, y, z, trKey, bytes); trKey[2 * N + i] = TOut(gpuMsb(x, bin) * (1ULL << (bin - shift))); - // if(i == 0) printf("trSlack key=%lu\n", ((TOut *)bytes)[N + i]); } template tf> diff --git a/GPU-MPC/fss/gpu_window.cu b/GPU-MPC/fss/gpu_window.cu index b5349fd7..09910f9c 100644 --- a/GPU-MPC/fss/gpu_window.cu +++ b/GPU-MPC/fss/gpu_window.cu @@ -1,8 +1,8 @@ // Author: Neha Jawalkar // Copyright: -// +// // Copyright (c) 2024 Microsoft Research -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -87,11 +87,8 @@ __global__ void windowFuncKernel(int party, MaxpoolParams p, T *d_X, T *d_max, T j = n * p.H * p.W * p.C + (h / p.strideH) * p.W * p.C + (w / p.strideW) * p.C + c; } WindowArgs wa{party, i, j, N, M}; - // printf("i=%d, j=%d, max[%d]=%ld\n", i, j, j, d_max[j]); auto o = T(f(wa, u64(d_X[i]), u64(d_max[j]), bytes)); gpuMod(o, p.bw); - // if (i <= 4) - // printf("o[%d]=%ld, %ld, %ld, %d\n", i, u64(d_X[i]), u64(d_max[j]), o, j); d_out[i] = o; } } @@ -104,7 +101,7 @@ T *windowFunc(int party, MaxpoolParams p, T *d_X, T *d_M, u8 *d_bytes = NULL, bo T *d_out = d_X; if (!inPlace) d_out = (T 
*)gpuMalloc(inSz * sizeof(T)); - printf("%d, %d, %d, %d\n", p.strideH, p.FH, p.strideW, p.FW); + // printf("%d, %d, %d, %d\n", p.strideH, p.FH, p.strideW, p.FW); assert(p.strideH == p.FH && p.strideW == p.FW); assert(p.zPadHLeft == 0 && p.zPadHRight == 0 && p.zPadWLeft == 0 && p.zPadWRight == 0); windowFuncKernel<<<(inSz - 1) / 128 + 1, 128>>>(party, p, d_X, d_M, d_out, inSz, mSz, d_bytes); @@ -120,7 +117,6 @@ T *keygenWindowMul(u8 **key_as_bytes, int party, MaxpoolParams p, T *d_mask_X, T auto d_mulMask = randomGEOnGpu(inSz, p.bw); // checkCudaErrors(cudaMemset(d_mulMask, 0, inSz * sizeof(T))); auto d_mulMask1 = windowFunc>(party, p, d_mask_X, d_mask_M, (u8 *)d_mulMask); - // printf("Writing mul key, N=%lx\n", *key_as_bytes); writeShares(key_as_bytes, party, inSz, d_mask_X, p.bw); writeShares(key_as_bytes, party, mSz, d_mask_M, p.bw); writeShares(key_as_bytes, party, inSz, d_mulMask1, p.bw); @@ -131,19 +127,18 @@ T *keygenWindowMul(u8 **key_as_bytes, int party, MaxpoolParams p, T *d_mask_X, T assert(d_tempMask == d_mulMask); } // truncate X*M + B as is correct - auto d_truncateMask = genGPUTruncateKey(key_as_bytes, party, /*TruncateType::TrWithSlack*/t, p.bw, p.bw, p.scale, inSz, d_mulMask, gaes); - gpuFree(d_mulMask); + auto d_truncateMask = genGPUTruncateKey(key_as_bytes, party, t, p.bw, p.bw, p.scale, inSz, d_mulMask, gaes); + if (d_truncateMask != d_mulMask) + gpuFree(d_mulMask); return d_truncateMask; } template T *windowMul(SigmaPeer *peer, int party, MaxpoolParams p, GPUMulKey &k, T *d_X, T *d_M, TruncateType t, AESGlobalContext *gaes, Stats *s, T *d_B = NULL) { - // printf("Start################\n"); auto inSz = getInSz(p); auto mSz = getMSz(p); auto d_mulKey = (u8 *)moveToGPU((u8 *)k.a, (2 * inSz + mSz) * sizeof(T), s); - // printf("%ld, %ld, %ld\n", k.mulKey.a[0], k.mulKey.b[0], k.mulKey.c[0]); auto d_mulOut = windowFunc>(party, p, d_X, d_M, (u8 *)d_mulKey); gpuFree(d_mulKey); peer->reconstructInPlace(d_mulOut, p.bw, inSz, s); @@ -152,8 +147,8 @@ T *windowMul(SigmaPeer *peer, int party, MaxpoolParams p, GPUMulKey &k, T *d_ auto d_temp = windowFunc>(party, p, d_mulOut, d_B, NULL, true); assert(d_mulOut == d_temp); } - auto d_truncated_O = gpuTruncate(p.bw, p.bw, /*TruncateType::TrWithSlack*/t, k.trKey, p.scale, peer, party, inSz, d_mulOut, gaes, s); - gpuFree(d_mulOut); - // printf("End################\n"); + auto d_truncated_O = gpuTruncate(p.bw, p.bw, t, k.trKey, p.scale, peer, party, inSz, d_mulOut, gaes, s); + if (d_truncated_O != d_mulOut) + gpuFree(d_mulOut); return d_truncated_O; } \ No newline at end of file diff --git a/GPU-MPC/fss/gpu_window.h b/GPU-MPC/fss/gpu_window.h index c6d9ed2b..f6a55ac8 100644 --- a/GPU-MPC/fss/gpu_window.h +++ b/GPU-MPC/fss/gpu_window.h @@ -29,7 +29,6 @@ GPUMulKey readGPUWindowMulKey(MaxpoolParams p, TruncateType t, u8 **key_as_by GPUMulKey k; u64 inSz = getInSz(p); u64 mSz = getMSz(p); - printf("%d, %d\n", inSz, mSz); k = readGPUMulKey(key_as_bytes, inSz, mSz, inSz, t); return k; } diff --git a/GPU-MPC/setup.sh b/GPU-MPC/setup.sh index 6574e99b..2b94522e 100644 --- a/GPU-MPC/setup.sh +++ b/GPU-MPC/setup.sh @@ -29,7 +29,7 @@ sudo apt install cmake make libeigen3-dev; echo "Building CUTLASS" # Build CUTLASS cd ext/cutlass; -mkdir build && cd build; +mkdir -p build && cd build; cmake .. 
-DCUTLASS_NVCC_ARCHS=$GPU_ARCH -DCMAKE_CUDA_COMPILER_WORKS=1 -DCMAKE_CUDA_COMPILER=$NVCC_PATH;
 make -j;
 cd ../../..;
@@ -37,8 +37,8 @@ cd ../../..;
 # Build sytorch
 echo "Building Sytorch"
 cd ext/sytorch;
-mkdir build && cd build;
-cmake -DCMAKE_INSTALL_PREFIX=./install -DCMAKE_BUILD_TYPE=Release ../;
+mkdir -p build && cd build;
+cmake -DCMAKE_INSTALL_PREFIX=./install -DCMAKE_BUILD_TYPE=Release ../ -DCUDAToolkit_ROOT="/usr/local/cuda-$CUDA_VERSION/bin/";
 make sytorch -j;
 cd ../../..;
@@ -59,18 +59,18 @@ cd ../..;
 # Make output directories
 # Orca
-mkdir experiments/orca/output;
-mkdir experiments/orca/output/P0;
-mkdir experiments/orca/output/P1;
-mkdir experiments/orca/output/P0/training;
-mkdir experiments/orca/output/P1/training;
-mkdir experiments/orca/output/P0/inference;
-mkdir experiments/orca/output/P1/inference;
+mkdir -p experiments/orca/output;
+mkdir -p experiments/orca/output/P0;
+mkdir -p experiments/orca/output/P1;
+mkdir -p experiments/orca/output/P0/training;
+mkdir -p experiments/orca/output/P1/training;
+mkdir -p experiments/orca/output/P0/inference;
+mkdir -p experiments/orca/output/P1/inference;
 # Sigma
-mkdir experiments/sigma/output;
-mkdir experiments/sigma/output/P0;
-mkdir experiments/sigma/output/P1;
+mkdir -p experiments/sigma/output;
+mkdir -p experiments/sigma/output/P0;
+mkdir -p experiments/sigma/output/P1;
 # install matplotlib
 pip3 install matplotlib
diff --git a/GPU-MPC/tests/fss/dpf.cu b/GPU-MPC/tests/fss/dpf.cu
index 72caac79..9197d430 100644
--- a/GPU-MPC/tests/fss/dpf.cu
+++ b/GPU-MPC/tests/fss/dpf.cu
@@ -34,6 +34,111 @@ using T = u64;
+// DCF correctness driver; given its own name so that it can coexist with the
+// DPF test driver (main) that already lives further down in this file
+int mainDCF(int argc, char *argv[])
+{
+    AESGlobalContext g;
+    initAESContext(&g);
+    initGPUMemPool();
+    int bin = atoi(argv[1]);
+    int N = atoi(argv[2]);
+
+    u8 *ptr1, *ptr2;
+    getKeyBuf(&ptr1, &ptr2, 50 * OneGB);
+    auto keyBuf1 = ptr1;
+    auto keyBuf2 = ptr2;
+    // auto d_x = (T*) gpuMalloc(N * sizeof(T));
+    // checkCudaErrors(cudaMemset(d_x, 0, N * sizeof(T)));
+    printf("N=%d, memSzN=%lu\n", N, N * sizeof(T));
+    initGPURandomness();
+    auto d_rin = randomGEOnGpu(N, bin);
+    auto h_rin = (T *)moveToCPU((u8 *)d_rin, N * sizeof(T), NULL);
+    auto d_X = randomGEOnGpu(N, bin);
+    auto h_X = (T *)moveToCPU((u8 *)d_X, N * sizeof(T), NULL);
+    // printf("%ld\n", h_X[3]);
+    destroyGPURandomness();
+
+    initGPURandomness();
+    gpuKeyGenDCF(&keyBuf1, 0, bin, N, d_rin, &g);
+    printf("Key size=%lu\n", keyBuf1 - ptr1);
+    auto k1 = readGPUDPFKey(&ptr1);
+
+    Stats s;
+    auto start = std::chrono::high_resolution_clock::now();
+    auto d_O1 = gpuDpf(k1, 0, d_X, &g, &s);
+    // gpuDcf(k1, 0, d_X, &g, (Stats *)&s);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto elapsed = end - start;
+
+    printf("Time taken for P0=%lu micros, Transfer time=%lu\n", std::chrono::duration_cast(elapsed).count(), s.transfer_time);
+
+    auto h_O1 = (u32 *)moveToCPU((u8 *)d_O1, k1.memSzOut, (Stats *)NULL);
+    gpuFree(d_O1);
+    destroyGPURandomness();
+
+    initGPURandomness();
+    gpuKeyGenDCF(&keyBuf2, 1, bin, N, d_rin, &g);
+    auto k2 = readGPUDPFKey(&ptr2);
+
+    start = std::chrono::high_resolution_clock::now();
+    auto d_O2 = gpuDcf(k2, 1, d_X, &g, (Stats *)NULL);
+    end = std::chrono::high_resolution_clock::now();
+    elapsed = end - start;
+
+    printf("Time taken for P1=%lu micros\n", std::chrono::duration_cast(elapsed).count());
+
+    auto h_O2 = (u32 *)moveToCPU((u8 *)d_O2, k2.memSzOut, NULL);
+    gpuFree(d_O2);
+    destroyGPURandomness();
+
+    for (int i = 0; i < N; i++)
+    {
+        auto o1 = (h_O1[i / 32] >> (i & 31)) & T(1);
+        auto o2 = (h_O2[i / 32] >> (i & 31)) & T(1);
+        auto
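        // h_O1 and h_O2 are bit-packed additive shares (mod 2) of the DCF output
+        // 1{x < rin}: unpack bit i from each, add, and reduce mod 2 to recover it
+        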
o = (o1 + o2) & u32(1); + if (i < 10 || (o != (h_X[i] < h_rin[i]))) + printf("%d: %u, %u, %u, %lu, %lu\n", i, o1, o2, o, h_X[i], h_rin[i]); + // assert((h_O1[i] ^ h_O2[i]) == u32(0)); + // assert(o == (h_X[i] < h_rin[i])); + assert(o == (h_X[i] < h_rin[i])); + } + return 0; +} +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "utils/gpu_data_types.h" +#include "utils/gpu_file_utils.h" +#include "utils/misc_utils.h" +#include "utils/gpu_mem.h" + +#include +#include + +#include "utils/gpu_random.h" +#include "fss/gpu_dpf.h" + +#include + +using T = u64; + int main(int argc, char *argv[]) { AESGlobalContext g; diff --git a/GPU-MPC/tests/fss/layernorm.cu b/GPU-MPC/tests/fss/layernorm.cu index 617ae774..391ff4d1 100644 --- a/GPU-MPC/tests/fss/layernorm.cu +++ b/GPU-MPC/tests/fss/layernorm.cu @@ -88,7 +88,7 @@ int main(int argc, char *argv[]) auto d_masked_A = getMaskedInputOnGpu(p.imgW, p.bw, d_mask_A, &h_A, true, 15); auto d_masked_B = getMaskedInputOnGpu(p.imgW, p.bw, d_mask_B, &h_B, true, 15); - printf("A=%ld, B=%ld, I=%ld, %ld, %ld, %ld\n", h_A[0], h_B[0], h_I[0], h_I[1], h_I[2], h_I[3]); + // printf("A=%ld, B=%ld, I=%ld, %ld, %ld, %ld\n", h_A[0], h_B[0], h_I[0], h_I[1], h_I[2], h_I[3]); u8 *startPtr, *curPtr; getKeyBuf(&startPtr, &curPtr, 8 * OneGB); llama::start(); diff --git a/GPU-MPC/tests/fss/rmsnorm.cu b/GPU-MPC/tests/fss/rmsnorm.cu index 537ab820..c6955b1b 100644 --- a/GPU-MPC/tests/fss/rmsnorm.cu +++ b/GPU-MPC/tests/fss/rmsnorm.cu @@ -88,7 +88,7 @@ int main(int argc, char *argv[]) auto d_masked_A = getMaskedInputOnGpu(p.imgW, p.bw, d_mask_A, &h_A, true, 15); auto d_masked_B = getMaskedInputOnGpu(p.imgW, p.bw, d_mask_B, &h_B, true, 15); - printf("A=%ld, B=%ld, I=%ld, %ld, %ld, %ld\n", h_A[0], h_B[0], h_I[0], h_I[1], h_I[2], h_I[3]); + // printf("A=%ld, B=%ld, I=%ld, %ld, %ld, %ld\n", h_A[0], h_B[0], h_I[0], h_I[1], h_I[2], h_I[3]); u8 *startPtr, *curPtr; getKeyBuf(&startPtr, &curPtr, 8 * OneGB); llama::start(); diff --git a/GPU-MPC/tests/fss/rotary_embedding.cu b/GPU-MPC/tests/fss/rotary_embedding.cu new file mode 100644 index 00000000..e4b120fc --- /dev/null +++ b/GPU-MPC/tests/fss/rotary_embedding.cu @@ -0,0 +1,113 @@ +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal 
+// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "utils/gpu_data_types.h" +#include "utils/gpu_file_utils.h" +#include "utils/misc_utils.h" +#include "utils/gpu_mem.h" +#include "utils/gpu_random.h" +#include "utils/gpu_comms.h" + +#include + +#include +#include + +#include "fss/gpu_mha.h" + +using T = u64; + +int main(int argc, char *argv[]) +{ + initGPUMemPool(); + AESGlobalContext g; + initAESContext(&g); + + int bw = 48; + int scale = 12; + auto ct = new ClearText(); + ct->bw = bw; + + int party = atoi(argv[1]); + int n_seq = 128; + int n_heads = 32; + int n_embed = 4096; + int dim_W = 128; + MHAParams pMHA = {n_seq, n_embed, n_heads, dim_W, true, true, true}; + int N = pMHA.n_heads * pMHA.n_seq * pMHA.dim_W; + + auto peer = new GpuPeer(true); + peer->connect(party, argv[2]); + + uint8_t *startPtr, *curPtr; + getKeyBuf(&startPtr, &curPtr, 10 * OneGB); + + initGPURandomness(); + auto d_mask_X = randomGEOnGpu(N, bw); + auto h_mask_X = (T *)moveToCPU((u8 *)d_mask_X, N * sizeof(T), NULL); + T *h_X; + auto d_masked_X = getMaskedInputOnGpu(N, bw, d_mask_X, &h_X, true, bw - scale); + auto d_mask_O = gpuKeygenRotEmb(&curPtr, party, bw, scale, pMHA, d_mask_X, &g); + auto h_mask_O = (T *)moveToCPU((u8 *)d_mask_O, N * sizeof(T), NULL); + auto k = readGPUTruncateKey(TruncateType::TrWithSlack, &startPtr); + T *d_O; + Stats s; + for (int i = 0; i < 1; i++) + { + s.comm_time = 0; + s.transfer_time = 0; + peer->sync(); + auto start = std::chrono::high_resolution_clock::now(); + d_O = gpuRotEmb(peer, party, bw, scale, pMHA, k, d_masked_X, &g, &s); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + printf("Comm time=%lu micros\n", s.comm_time); + printf("Transfer time=%lu micros\n", s.transfer_time); + printf("Rotary embedding time=%lu micros\n", std::chrono::duration_cast(elapsed).count()); + } + unmaskValues(bw, N, d_O, h_mask_O, NULL); + auto h_O = (T *)moveToCPU((uint8_t *)d_O, N * sizeof(T), (Stats *)NULL); + printf("%ld, %ld\n", h_O[0], h_O[1]); + gpuFree(d_O); + destroyGPURandomness(); + for (int i = 0; i < n_heads; i++) + { + auto h_X_temp = (i64 *)(h_X + i * n_seq * dim_W); + auto h_O_temp = (i64 *)(h_O + i * n_seq * dim_W); + Tensor x((i64 *)h_X_temp, {(u64)n_seq, (u64)dim_W}); + Tensor y({(u64)n_seq, (u64)dim_W}); + ct->rotary_embedding(x, y, (u64)scale); + for (int j = 0; j < n_seq * dim_W; j++) + { + if (i * n_seq * dim_W + j < 10) + { + printf("%d=%ld, %ld\n", i * n_seq * dim_W + j, y.data[j], h_O_temp[j]); + } + auto diff = std::abs((i64)((i64)y.data[j] - (i64)h_O_temp[j])); + if (diff > 0) + { + printf("%d=%ld, %ld, %ld\n", i * n_seq * dim_W + j, y.data[j], h_O_temp[j], diff); + 
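                // the kernel quantizes the rotation angle exactly like the ClearText
+                // reference, so any nonzero gap after unmasking is treated as a bug
+                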
diff --git a/GPU-MPC/utils/cpu_comms.h b/GPU-MPC/utils/cpu_comms.h
index dfb39d44..dec8d659 100644
--- a/GPU-MPC/utils/cpu_comms.h
+++ b/GPU-MPC/utils/cpu_comms.h
@@ -74,13 +74,11 @@ class CpuPeer : public SigmaPeer
 
     void cpuXor(u64 N, u32 *x, u32 *y)
     {
-        // printf("x=%u, y=%u\n", x[0], y[0]);
 #pragma omp parallel for
         for (u64 i = 0; i < N; i++)
         {
             x[i] ^= y[i];
         }
-        // printf("%u\n", x[0]);
     }
 
     template <typename T>
diff --git a/GPU-MPC/utils/sigma_comms.cpp b/GPU-MPC/utils/sigma_comms.cpp
index 43e968db..01f8d8bc 100644
--- a/GPU-MPC/utils/sigma_comms.cpp
+++ b/GPU-MPC/utils/sigma_comms.cpp
@@ -63,7 +63,6 @@ void SigmaPeer::wait()
 
 void SigmaPeer::initCommBufs(bool pinMem)
 {
-    // printf("################## Increase the size of comm bufs! #####################\n");
     printf("Allocating %lu bytes of memory for comm bufs\n", commBufSize);
     h_bufA0 = cpuMalloc(commBufSize, pinMem);
     h_bufA1 = cpuMalloc(commBufSize, pinMem);
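Note the role of `pinMem` in `initCommBufs`: page-locked host buffers let CUDA copy over DMA, which substantially raises host-device bandwidth for the large share exchanges these protocols perform. A minimal CUDA sketch of the idea (illustrative only; the size here merely stands in for `commBufSize`):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    size_t bytes = 1ULL << 28; // 256 MiB, stand-in for commBufSize
    void *h_pinned, *d_buf;
    cudaHostAlloc(&h_pinned, bytes, cudaHostAllocDefault); // page-locked host buffer
    cudaMalloc(&d_buf, bytes);
    // transfers from pinned memory can run asynchronously and use DMA directly
    cudaMemcpyAsync(d_buf, h_pinned, bytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    printf("copied %zu bytes from pinned host memory\n", bytes);
    cudaFreeHost(h_pinned);
    cudaFree(d_buf);
    return 0;
}
```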
diff --git a/sytorch/include/sytorch/tensor.h b/sytorch/include/sytorch/tensor.h
index c07c2baf..c381291f 100644
--- a/sytorch/include/sytorch/tensor.h
+++ b/sytorch/include/sytorch/tensor.h
@@ -40,13 +40,16 @@ inline u64 type_cast(float val)
 }
 
 template <typename T>
-class TensorRef {
+class TensorRef
+{
 public:
-    T* data;
+    T *data;
     u64 size;
 
     TensorRef(T *data, u64 size) : data(data), size(size) {}
 
-    void zero() {
-        for(u64 i = 0; i < size; i++) {
+    void zero()
+    {
+        for (u64 i = 0; i < size; i++)
+        {
             data[i] = 0;
         }
     }
@@ -65,7 +68,8 @@ template <typename T>
 class Tensor1D;
 
 template <typename T>
-class Tensor {
+class Tensor
+{
     bool isFreed = false;
 
 public:
@@ -93,24 +97,31 @@ class Tensor
         return this->data[offset];
     }
 
-    void allocate(const std::vector<u64> &s) {
+    void allocate(const std::vector<u64> &s)
+    {
         always_assert(isOwner);
         this->shape = s;
-        if (this->size() > 0) {
+        if (this->size() > 0)
+        {
             this->data = new T[this->size()];
             isFreed = false;
-        } else {
+        }
+        else
+        {
             this->data = nullptr;
             isFreed = true;
         }
     }
 
-    void free() {
+    void free()
+    {
         always_assert(isOwner);
-        if (isFreed) {
+        if (isFreed)
+        {
             return;
         }
-        if (this->size() == 0) {
+        if (this->size() == 0)
+        {
             return;
         }
         delete[] data;
@@ -118,17 +129,22 @@ class Tensor
         isFreed = true;
     }
 
-    void resize(const std::vector<u64> &s) {
+    void resize(const std::vector<u64> &s)
+    {
         always_assert(isOwner);
-        if (s.size() == this->shape.size()){
+        if (s.size() == this->shape.size())
+        {
             bool allSameDims = true;
-            for (u64 i = 0; i < s.size(); i++) {
-                if (s[i] != this->shape[i]) {
+            for (u64 i = 0; i < s.size(); i++)
+            {
+                if (s[i] != this->shape[i])
+                {
                     allSameDims = false;
                     break;
                 }
             }
-            if (allSameDims) {
+            if (allSameDims)
+            {
                 return;
             }
         }
@@ -136,60 +152,74 @@ class Tensor
         allocate(s);
     }
 
-    Tensor(const std::vector<u64> &s) {
+    Tensor(const std::vector<u64> &s)
+    {
         allocate(s);
     }
 
-    Tensor(std::initializer_list<u64> s) {
+    Tensor(std::initializer_list<u64> s)
+    {
         allocate(s);
     }
 
-    Tensor(T* data, const std::vector<u64> &s) {
+    Tensor(T *data, const std::vector<u64> &s)
+    {
         this->data = data;
         this->shape = s;
         this->isOwner = false;
     }
 
-    ~Tensor() {
+    ~Tensor()
+    {
         if (isOwner)
             free();
-    } 
+    }
 
-    u64 size() const {
-        if (this->shape.size() == 0) {
+    u64 size() const
+    {
+        if (this->shape.size() == 0)
+        {
            return 0;
         }
         u64 s = 1;
-        for (auto d : this->shape) {
+        for (auto d : this->shape)
+        {
            s *= d;
         }
         return s;
     }
 
-    bool is_same_shape(const Tensor<T> &other) const {
-        if (!(this->shape.size() == other.shape.size())) {
+    bool is_same_shape(const Tensor<T> &other) const
+    {
+        if (!(this->shape.size() == other.shape.size()))
+        {
             return false;
         }
-        for (u64 i = 0; i < this->shape.size(); i++) {
-            if (!(this->shape[i] == other.shape[i])) {
+        for (u64 i = 0; i < this->shape.size(); i++)
+        {
+            if (!(this->shape[i] == other.shape[i]))
+            {
                 return false;
             }
         }
         return true;
     }
-
-    void assert_same_shape(const Tensor<T> &other) {
+
+    void assert_same_shape(const Tensor<T> &other)
+    {
         always_assert(this->shape.size() == other.shape.size());
-        for (u64 i = 0; i < this->shape.size(); i++) {
+        for (u64 i = 0; i < this->shape.size(); i++)
+        {
             always_assert(this->shape[i] == other.shape[i]);
         }
     }
 
-    void copy(const Tensor<T> &other, bool copyGraph = true) {
+    void copy(const Tensor<T> &other, bool copyGraph = true)
+    {
         assert_same_shape(other);
         // memcpy(data, other.data, size() * sizeof(T));
-        //#pragma omp parallel for
-        for(u64 i = 0; i < size(); ++i)
+        // #pragma omp parallel for
+        for (u64 i = 0; i < size(); ++i)
         {
             data[i] = other.data[i];
         }
@@ -197,13 +227,16 @@ class Tensor
         this->graphNode = other.graphNode;
     }
 
-    void fill(T x) {
-        for (u64 i = 0; i < size(); i++) {
+    void fill(T x)
+    {
+        for (u64 i = 0; i < size(); i++)
+        {
             data[i] = x;
         }
     }
 
-    void zero() {
+    void zero()
+    {
         fill(0);
     }
@@ -214,18 +247,17 @@ class Tensor
             double d;
             std::cin >> d;
             data[i] = type_cast<T>(d * (1LL << scale));
-
         }
     }
 
     void input_nchw(int scale)
     {
         always_assert(this->shape.size() >= 2); // atleast batch and channel axis
-        
+
         u64 batch_size = shape[0];
         u64 num_channel = shape.back();
         u64 rest_size = size() / (batch_size * num_channel);
-        
+
         for (u64 i = 0; i < size(); i++)
         {
             double d;
@@ -238,7 +270,7 @@ class Tensor
             data[new_idx] = type_cast<T>(d);
 #else
             data[new_idx] = type_cast<T>(d * (1LL << scale));
-#endif 
+#endif
         }
     }
@@ -257,9 +289,11 @@ class Tensor
         std::cout << "\n";
     }
 
-    void printshape() {
+    void printshape()
+    {
         std::cout << "(";
-        for(int i = 0; i < this->shape.size(); i++) {
+        for (int i = 0; i < this->shape.size(); i++)
+        {
             std::cout << this->shape[i] << ", ";
         }
         std::cout << ")" << "\n";
@@ -330,14 +364,15 @@ class Tensor
         struct stat sb;
         fstat(fd2, &sb);
         buffersize = sb.st_size;
-        int advise=posix_fadvise(fd2, 0, sb.st_size, POSIX_FADV_WILLNEED);
-        floatInput= (float*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd2, 0);
-        for(u64 i = 0; i < size(); ++i)
+        int advise = posix_fadvise(fd2, 0, sb.st_size, POSIX_FADV_WILLNEED);
+        floatInput = (float *)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd2, 0);
+        for (u64 i = 0; i < size(); ++i)
         {
             data[i] = type_cast<T>(floatInput[i] * (1LL << scale));
         }
         ::close(fd2);
-        //delete[] floatInput;
+        printf("Input=%lu\n", data[0]);
+        // delete[] floatInput;
         munmap(floatInput, buffersize);
     }
@@ -346,7 +381,7 @@ class Tensor
     Tensor5D<T> as_5d()
     {
         assert(this->shape.size() == 5);
         return Tensor5D<T>(this->data, this->shape[0], this->shape[1], this->shape[2], this->shape[3], this->shape[4]);
     }
-    
+
     Tensor4D<T> as_4d()
     {
         assert(this->shape.size() == 4);
@@ -370,46 +405,56 @@ class Tensor
 };
 
 template <typename T>
-class Tensor1D {
+class Tensor1D
+{
 public:
     T *data;
     u64 d1;
 
     Tensor1D(u64 s) : d1(s), data(new T[s]) {}
 
-    void randomize(double range) {
-        for(u64 i = 0; i < this->d1; i++) {
+    void randomize(double range)
+    {
+        for (u64 i = 0; i < this->d1; i++)
+        {
             auto r = (double)prngWeights.get<T>();
             this->data[i] = (T)((r / (1LL << 31)) * range);
         }
     }
 
-    ~Tensor1D() {
+    ~Tensor1D()
+    {
         delete[] this->data;
     }
 
-    u64 size() const {
+    u64 size() const
+    {
         return d1;
     }
 
-    TensorRef<T> ref() {
+    TensorRef<T> ref()
+    {
         return TensorRef<T>(data, size());
     }
 
-    T &operator()(u64 i) const {
+    T &operator()(u64 i) const
+    {
         assert(i < this->d1);
         return this->data[i];
     }
 
-    void fill(T val) {
-        for (u64 i = 0; i < this->d1; i++) {
+    void fill(T val)
+    {
+        for (u64 i = 0; i < this->d1; i++)
+        {
            this->data[i] = val;
        }
    }
 };
 
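One convention worth spelling out in the reformatted classes above: constructors that take a raw pointer create non-owning views (`isOwner = false`), so reshape helpers like `as_4d()`/`as_5d()` alias the same buffer rather than copy it, and only the owning object may free it. A toy illustration of that pattern (not sytorch code):

```cpp
#include <cassert>
#include <cstdint>

// Minimal owning/viewing pair mirroring the Tensor/Tensor4D convention.
struct View { int64_t *data; bool isOwner; };

int main()
{
    int64_t *buf = new int64_t[6]{0, 1, 2, 3, 4, 5};
    View owner{buf, true};
    View view{buf, false};       // a "reshape": same storage, no copy
    view.data[2] = 42;
    assert(owner.data[2] == 42); // writes through the view are visible
    if (owner.isOwner)
        delete[] owner.data;     // only the owner frees; the view must not
    return 0;
}
```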
 template <typename T>
-class Tensor2D {
+class Tensor2D
+{
 public:
     u64 d1, d2;
     T *data;
     bool isOwner = true;
 
     Tensor2D(u64 d1, u64 d2) : d1(d1), d2(d2), data(new T[d1 * d2]) {}
 
     Tensor2D(T *data, u64 d1, u64 d2) : d1(d1), d2(d2), data(data), isOwner(false) {}
 
@@ -419,26 +464,33 @@ class Tensor2D
-    void randomize(double range) {
-        for(u64 i = 0; i < this->d1; i++) {
-            for(u64 j = 0; j < this->d2; j++) {
+    void randomize(double range)
+    {
+        for (u64 i = 0; i < this->d1; i++)
+        {
+            for (u64 j = 0; j < this->d2; j++)
+            {
                 auto r = (double)prngWeights.get<T>();
                 this->data[i * this->d2 + j] = (T)((r / (1LL << 31)) * range);
             }
         }
     }
 
-    u64 size() const {
+    u64 size() const
+    {
         return d1 * d2;
     }
 
-    TensorRef<T> ref() {
+    TensorRef<T> ref()
+    {
         return TensorRef<T>(data, size());
     }
 
-    void resize(u64 d1, u64 d2) {
+    void resize(u64 d1, u64 d2)
+    {
         always_assert(this->isOwner);
-        if (this->d1 == d1 && this->d2 == d2) {
+        if (this->d1 == d1 && this->d2 == d2)
+        {
             return;
         }
         delete[] data;
@@ -447,29 +499,51 @@ class Tensor2D
         data = new T[d1 * d2];
     }
 
-    ~Tensor2D() {
+    ~Tensor2D()
+    {
         if (this->isOwner)
             delete[] this->data;
     }
 
-    T& operator()(u64 i, u64 j) const {
+    T &operator()(u64 i, u64 j) const
+    {
         assert(i < this->d1);
         assert(j < this->d2);
         return this->data[i * this->d2 + j];
     }
 
-    void zero() {
+    void zero()
+    {
         fill(0);
     }
 
-    void fill(T val) {
-        for(u64 i = 0; i < this->d1; i++) {
-            for(u64 j = 0; j < this->d2; j++) {
+    void fill(T val)
+    {
+        for (u64 i = 0; i < this->d1; i++)
+        {
+            for (u64 j = 0; j < this->d2; j++)
+            {
                 this->data[i * this->d2 + j] = val;
             }
         }
     }
 
+    u64 argmax(u64 i)
+    {
+        assert(i < d1);
+        u64 maxIndex = 0;
+        T maxValue = data[i * d2];
+        for (u64 j = 1; j < d2; j++)
+        {
+            if (data[i * d2 + j] > maxValue)
+            {
+                maxValue = data[i * d2 + j];
+                maxIndex = j;
+            }
+        }
+        return maxIndex;
+    }
+
     Tensor<T> as_nd()
     {
         return Tensor<T>(data, {d1, d2});
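`Tensor2D::argmax` is new in this diff and mirrors the row-wise argmax that `Tensor4D` already had; presumably it supports picking the highest-scoring entry per row, e.g. a logit, on 2D outputs. A standalone check of the same semantics (hypothetical data):

```cpp
#include <cassert>
#include <cstdint>

// Row-wise argmax over a d1 x d2 buffer, matching the semantics added above.
static uint64_t argmax_row(const int64_t *data, uint64_t d2, uint64_t i)
{
    uint64_t maxIndex = 0;
    int64_t maxValue = data[i * d2];
    for (uint64_t j = 1; j < d2; j++)
        if (data[i * d2 + j] > maxValue)
        {
            maxValue = data[i * d2 + j];
            maxIndex = j;
        }
    return maxIndex;
}

int main()
{
    // one row of three "logits"; index 2 holds the largest value
    int64_t logits[3] = {-5, 7, 19};
    assert(argmax_row(logits, 3, 0) == 2);
    return 0;
}
```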
@@ -477,37 +551,46 @@ class Tensor2D
 };
 
 template <typename T>
-class Tensor4D {
+class Tensor4D
+{
 public:
     u64 d1, d2, d3, d4;
-    T* data;
+    T *data;
     bool isOwner = true;
 
-    Tensor4D(u64 d1, u64 d2, u64 d3, u64 d4) : d1(d1), d2(d2), d3(d3), d4(d4) {
+    Tensor4D(u64 d1, u64 d2, u64 d3, u64 d4) : d1(d1), d2(d2), d3(d3), d4(d4)
+    {
         data = new T[d1 * d2 * d3 * d4];
     }
 
-    Tensor4D(T* data, u64 d1, u64 d2, u64 d3, u64 d4) : data(data), d1(d1), d2(d2), d3(d3), d4(d4) {
+    Tensor4D(T *data, u64 d1, u64 d2, u64 d3, u64 d4) : data(data), d1(d1), d2(d2), d3(d3), d4(d4)
+    {
         isOwner = false;
     }
 
-    ~Tensor4D() {
-        if (isOwner) {
+    ~Tensor4D()
+    {
+        if (isOwner)
+        {
             delete[] data;
         }
     }
 
-    u64 size() const {
+    u64 size() const
+    {
         return d1 * d2 * d3 * d4;
     }
 
-    TensorRef<T> ref() {
+    TensorRef<T> ref()
+    {
         return TensorRef<T>(data, size());
     }
 
-    void resize(u64 d1, u64 d2, u64 d3, u64 d4) {
+    void resize(u64 d1, u64 d2, u64 d3, u64 d4)
+    {
         always_assert(isOwner);
-        if (this->d1 == d1 && this->d2 == d2 && this->d3 == d3 && this->d4 == d4) {
+        if (this->d1 == d1 && this->d2 == d2 && this->d3 == d3 && this->d4 == d4)
+        {
             return;
         }
         delete[] data;
@@ -518,13 +601,15 @@ class Tensor4D
         data = new T[d1 * d2 * d3 * d4];
     }
 
-    void resize(const std::vector<u64> &shape) {
+    void resize(const std::vector<u64> &shape)
+    {
         always_assert(isOwner);
         always_assert(shape.size() == 4);
         resize(shape[0], shape[1], shape[2], shape[3]);
     }
 
-    T& operator()(u64 i, u64 j, u64 k, u64 l) const {
+    T &operator()(u64 i, u64 j, u64 k, u64 l) const
+    {
         assert(i < d1);
         assert(j < d2);
         assert(k < d3);
@@ -532,14 +617,17 @@ class Tensor4D
         return data[i * d2 * d3 * d4 + j * d3 * d4 + k * d4 + l];
     }
 
-    u64 argmax(u64 i) {
+    u64 argmax(u64 i)
+    {
         assert(d3 == 1);
         assert(d4 == 1);
         assert(i < d1);
         u64 maxIndex = 0;
         T maxValue = data[i * d2];
-        for (u64 j = 1; j < d2; j++) {
-            if (data[i * d2 + j] > maxValue) {
+        for (u64 j = 1; j < d2; j++)
+        {
+            if (data[i * d2 + j] > maxValue)
+            {
                 maxValue = data[i * d2 + j];
                 maxIndex = j;
             }
@@ -552,47 +640,56 @@ class Tensor4D
         return Tensor<T>(data, {d1, d2, d3, d4});
     }
 
-    void fill(T val) {
-        for (u64 i = 0; i < size(); i++) {
+    void fill(T val)
+    {
+        for (u64 i = 0; i < size(); i++)
+        {
             this->data[i] = val;
         }
     }
-
 };
 
-
 template <typename T>
-class Tensor5D {
+class Tensor5D
+{
 public:
     u64 d1, d2, d3, d4, d5;
-    T* data;
+    T *data;
     bool isOwner = true;
 
-    Tensor5D(u64 d1, u64 d2, u64 d3, u64 d4, u64 d5) : d1(d1), d2(d2), d3(d3), d4(d4), d5(d5) {
+    Tensor5D(u64 d1, u64 d2, u64 d3, u64 d4, u64 d5) : d1(d1), d2(d2), d3(d3), d4(d4), d5(d5)
+    {
         data = new T[d1 * d2 * d3 * d4 * d5];
     }
 
-    Tensor5D(T* data, u64 d1, u64 d2, u64 d3, u64 d4, u64 d5) : data(data), d1(d1), d2(d2), d3(d3), d4(d4), d5(d5) {
+    Tensor5D(T *data, u64 d1, u64 d2, u64 d3, u64 d4, u64 d5) : data(data), d1(d1), d2(d2), d3(d3), d4(d4), d5(d5)
+    {
         isOwner = false;
     }
 
-    ~Tensor5D() {
-        if (isOwner) {
+    ~Tensor5D()
+    {
+        if (isOwner)
+        {
             delete[] data;
         }
     }
 
-    u64 size() const {
+    u64 size() const
+    {
         return d1 * d2 * d3 * d4 * d5;
     }
 
-    TensorRef<T> ref() {
+    TensorRef<T> ref()
+    {
         return TensorRef<T>(data, size());
     }
 
-    void resize(u64 d1, u64 d2, u64 d3, u64 d4, u64 d5) {
+    void resize(u64 d1, u64 d2, u64 d3, u64 d4, u64 d5)
+    {
         always_assert(isOwner);
-        if (this->d1 == d1 && this->d2 == d2 && this->d3 == d3 && this->d4 == d4 && this->d5 == d5) {
+        if (this->d1 == d1 && this->d2 == d2 && this->d3 == d3 && this->d4 == d4 && this->d5 == d5)
+        {
             return;
         }
         delete[] data;
@@ -604,13 +701,15 @@ class Tensor5D
         data = new T[d1 * d2 * d3 * d4 * d5];
     }
 
-    void resize(const std::vector<u64> &shape) {
+    void resize(const std::vector<u64> &shape)
+    {
         always_assert(isOwner);
         always_assert(shape.size() == 5);
         resize(shape[0], shape[1], shape[2], shape[3], shape[4]);
     }
 
-    T& operator()(u64 i, u64 j, u64 k, u64 l, u64 m) const {
+    T &operator()(u64 i, u64 j, u64 k, u64 l, u64 m) const
+    {
         assert(i < d1);
         assert(j < d2);
         assert(k < d3);
@@ -623,5 +722,4 @@ class Tensor5D
     {
         return Tensor<T>(data, {d1, d2, d3, d4, d5});
     }
-
 };
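For readers tracing the index arithmetic in `operator()`: `Tensor4D` flattens (i, j, k, l) row-major as i*d2*d3*d4 + j*d3*d4 + k*d4 + l, which is the Horner expansion of (((i*d2 + j)*d3 + k)*d4 + l), and `Tensor5D` extends the same scheme by one more stride. A quick self-check:

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    // strides for a d1 x d2 x d3 x d4 row-major layout (d1 itself never appears)
    uint64_t d2 = 3, d3 = 4, d4 = 5;
    uint64_t i = 1, j = 2, k = 3, l = 4;
    uint64_t expanded = i * d2 * d3 * d4 + j * d3 * d4 + k * d4 + l; // as in the code
    uint64_t nested = ((i * d2 + j) * d3 + k) * d4 + l;              // Horner form
    assert(expanded == nested);
    return 0;
}
```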