diff --git a/NONCOMMERCIAL.txt b/NONCOMMERCIAL.txt new file mode 100644 index 0000000..86498a0 --- /dev/null +++ b/NONCOMMERCIAL.txt @@ -0,0 +1,146 @@ +NON-COMMERCIAL SOFTWARE LICENSE FOR THE GELIB SOFTWARE LIBRARY + +Copyright (c) 2021- Imre (Risi) Kondor. All rights reserved. + + +DEFINITIONS + +"Program" means a copy of the GELIB software library or parts of the GELIB software library explicitly +marked in the source code as distributed under this Noncommercial Software License. + +"Copyright holder" means the author of GELIB, Imre Kondor, who retains the copyright to Program. + +"Work based on the Program" means either the Program or any derivative work under copyright law: that is +to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or +translated into another language. (Hereinafter, translation is included without limitation in the term +"modification".) + +"Using the Program" means any act of creating executables that contain or directly use libraries that +are part of the Program, running any part of the Program or any tools that are part of the Program, or +creating works based on the Program. + +Each licensee is addressed as "you". + + +TERMS AND CONDITIONS FOR USE, COPYING, DISTRIBUTION AND MODIFICATION + +1. This License grants you permission to use the Program free of charge for any noncommercial purpose, +including teaching and research at universities, colleges and other educational institutions, research +at non-profit research institutions, and personal non-profit purposes. + +2. 
This License does NOT grant permission to use the Program for commercial purposes, including but not +restricted to (a) bundling or integrating the Program with any hardware product or any other software for +transfer, sale or license to a third party (even if distributing the Program on separate media and not +charging for the Program); (b) providing customers with a link to the Program or a copy of the Program +for use with hardware or another program purchased by that customer; or (c) use in connection with the +performance of services for which you are compensated (d) use in connection with research and development +activities in the service of developing commercial products or obtaining patents for derived products +such as pharmaceuticals; (e) other forms of indirect commercial use, such as on a website that accepts +advertising money for content. + +3. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any +medium, provided that you retain the copyright notice on each file of the source code and conspicuously +and appropriately include a copy of this License and Disclaimer of Warranty with the Program in a file +named LICENSE.TXT. + +4. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on +the Program, and copy and distribute such modifications or work under the terms of Section 2 above, +provided that: + +a) You cause the modified files to carry prominent notices stating that you changed the files and the +date of any change. + +b) You cause any work that you distribute or publish, that in whole or in part contains or is derived +from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under +the terms of this License. + +c) You retain the original copyright notice on each file of this Program's source code and conspicuously +include a copy of this License and Disclaimer of Warranty under the terms described in Section 3. 
+ +These requirements apply to the modified work as a whole. If identifiable sections of that work are not +derived from the Program, and can be reasonably considered independent and separate works in themselves, +then this License, and its terms, do not apply to those sections when you distribute them as separate +works. But when you distribute the same sections as part of a whole which is a work based on the Program, +the distribution of the whole must be on the terms of this License, whose regulations for other licensees +extend to the entire whole, and thus to each and every part regardless of who wrote it. (If the same, +independent sections are distributed as part of a package that is otherwise reliant on, or is based on +the Program, then the distribution of the whole package, including but not restricted to the independent +section, must be on the unmodified terms of this License, regardless of who the author of the included +sections was.) + +Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely +by you; rather, the intent is to exercise the right to control the distribution of derivative or collective +works based or reliant on the Program. + +In addition, mere aggregation of another work not based on the Program with the Program (or with a work +based on the Program) on a volume of storage or distribution medium does not bring the other work under +the scope of this License. + +5. You may copy and distribute the Program (or a work based on it, under Section 3) in object code or +executable form under the terms of Sections 3 and 4 above provided that you also accompany it with the +complete corresponding machine-readable source code under the terms of Sections 3 and 4, as well as the +License and Disclaimer of Warranty, under the terms of Section 3. 
+ +If distribution of executable or object code is made by offering access to copy from a designated place, +then offering equivalent access to copy the source code from the same place counts as distribution of the +source code, even though third parties are not compelled to copy the source along with the object code. + +6. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this +License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will +automatically terminate your rights under this License. However, parties who have received copies, or rights, +from you under this License will not have their licenses terminated so long as such parties remain in full +compliance. + +7. You are not required to accept this License, since you have not signed it. Nothing else grants you +permission to modify or distribute the Program or its derivative works; law prohibits these actions if you +do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License and all its terms and conditions for copying, +distributing or modifying the Program or works based on it, to do so. + +8. Each time you redistribute the Program (or any work based on the Program), the recipient automatically +receives a license from the original licensor to copy, distribute or modify the Program subject to these +terms and conditions. You may not impose any further restrictions on the recipients to exercise the +rights granted herein. You are not responsible for enforcing compliance by third parties to this License. + +9. 
If, as a consequence of a court judgment or allegation of patent infringement or for any other reason +(not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) +that contradict the conditions of this License, they do not excuse you from the conditions of this License. +If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other +pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a +patent license would not permit royalty-free redistribution of the Program by all those who receive copies +directly or indirectly through you, then the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. If any portion of this section is held invalid or +unenforceable under any particular circumstance, the balance of the section is intended to apply and the +section as a whole is intended to apply in other circumstances. + +10. If the distribution and/or use of the Program are restricted in certain countries either by patents or +by copyrighted interfaces, the original copyright holder who places the Program under this License may add an +explicit geographical distribution limitation excluding those countries, so that distribution is permitted +only in or among countries not thus excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + +11. Copyright holder retains the right to grant broader rights to the Program to individuals or to commercial +entities on a case by case basis, possibly for a fee. + + +DISCLAIMER OF WARRANTY + +12. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +13. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED ON IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY +OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO +USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +OR PROFITS; BUSINESS INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + + + + + + + diff --git a/common.txt b/common.txt index 8d003bb..22313c3 100644 --- a/common.txt +++ b/common.txt @@ -29,7 +29,7 @@ SO2DIR=$(ROOTDIR)/objects/SO2 SO3DIR=$(ROOTDIR)/objects/SO3 SO3NDIR=$(ROOTDIR)/objects/SO3n SO3CDIR=$(ROOTDIR)/objects/SO3c -GELIB_CUDADIR=$(ROOTDIR)/../GElib-cuda/cuda +GELIB_CUDADIR=$(ROOTDIR)/cuda # COMBINATORIALDIR=$(ROOTDIR)/objects/combinatorial # GROUPSDIR=$(ROOTDIR)/objects/groups diff --git a/cuda/GElib_base.cu b/cuda/GElib_base.cu new file mode 100644 index 0000000..206f998 --- /dev/null +++ b/cuda/GElib_base.cu @@ -0,0 +1,19 @@ +/* + * This file is part of GElib, a C++/CUDA library for group equivariant + * tensor operations. + * + * Copyright (c) 2023, Imre Risi Kondor + * + * This source code file is subject to the terms of the noncommercial + * license distributed with GElib in the file NONCOMMERICAL.TXT. Commercial + * use is prohibited. All redistributed versions of this file (in orginal + * or modified form) must retain this copyright notice and must be + * accompanied by a verbatim copy of the license. 
+ * + */ + +#include +#include +#include "GElib_base.hpp" + +__device__ __constant__ unsigned char cg_cmem[CNINE_CONST_MEM_SIZE]; diff --git a/cuda/Generate_SO3part_addCGproduct_kernel_calls.cpp b/cuda/Generate_SO3part_addCGproduct_kernel_calls.cpp new file mode 100644 index 0000000..b60f157 --- /dev/null +++ b/cuda/Generate_SO3part_addCGproduct_kernel_calls.cpp @@ -0,0 +1,54 @@ +/* + * This file is part of GElib, a C++/CUDA library for group + * equivariant tensor operations. + * + * Copyright (c) 2023, Imre Risi Kondor + * + * This source code file is subject to the terms of the noncommercial + * license distributed with GElib in the file NONCOMMERICAL.TXT. Commercial + * use is prohibited. All redistributed versions of this file (in + * original or modified form) must retain this copyright notice and + * must be accompanied by a verbatim copy of the license. + * + */ + +#include "GElib_base.cpp" +#include "GElibSession.hpp" +#include + +using namespace cnine; +using namespace GElib; + +const int maxl1=2; +const int maxl=4; + + +int main(int argc, char** arg){ + + ofstream ofs("SO3part_addCGproduct_explicit_calls.inc"); + + ofs<<" switch(l1){\n"; + for(int l1=0; l1<=maxl1; l1++){ + ofs<<" case "<" + <<"<<>>(r,x,y); break;"< + +extern GElib::SO3_CGbank SO3_cgbank; + +using namespace cnine; +using namespace GElib; + +const int maxl1=2; +const int maxl=4; + + +int main(int argc, char** arg){ + + ofstream ofs("SO3part_addCGproduct_subkernels.inc"); + + for(int l1=0; l1<=maxl1; l1++){ + + for(int l2=0; l2<=maxl1; l2++){ + + for(int l=std::abs(l1-l2); l<=l1+l2 && l<=maxl; l++){ + auto& C=SO3_cgbank.getf(CGindex(l1,l2,l)); + + ofs<<"__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_"< +#include +#include "GElib_base.hpp" + +__device__ __constant__ unsigned char cg_cmem[CNINE_CONST_MEM_SIZE]; +#define _SO3CG_CUDA_CONCAT + +//#include "SO3partA_CGproduct.cu" +//#include "SO3partA_DiagCGproduct.cu" + +#include "SO3partB_addCGproduct.cu" +#include 
"SO3partB_addCGproduct_back0.cu" +#include "SO3partB_addCGproduct_back1.cu" + +#include "SO3partB_addDiagCGproduct.cu" +#include "SO3partB_addDiagCGproduct_back0.cu" +#include "SO3partB_addDiagCGproduct_back1.cu" + +#include "SO3Fpart_addFproduct.cu" +#include "SO3Fpart_addFproduct_back0.cu" +#include "SO3Fpart_addFproduct_back1.cu" + +#include "SO3part_addCGtransform.cu" + diff --git a/cuda/SO3Fpart_addFproduct.cu b/cuda/SO3Fpart_addFproduct.cu new file mode 100644 index 0000000..66c24fe --- /dev/null +++ b/cuda/SO3Fpart_addFproduct.cu @@ -0,0 +1,316 @@ +/* + * This file is part of GElib, a C++/CUDA library for group equivariant + * tensor operations. + * + * Copyright (c) 2023, Imre Risi Kondor + * + * This source code file is subject to the terms of the noncommercial + * license distributed with GElib in the file NONCOMMERICAL.TXT. Commercial + * use is prohibited. All redistributed versions of this file (in orginal + * or modified form) must retain this copyright notice and must be + * accompanied by a verbatim copy of the license. 
+ * + */ + +#ifndef _SO3Fpart_addFproduct_cu +#define _SO3Fpart_addFproduct_cu + +#include +#include +//#include +//#include + +#include "SO3_CGbank.hpp" +#include "Ctensor2_view.hpp" +#include "Ctensor3_view.hpp" + +//__device__ __constant__ unsigned char cg_cmem[32276]; + +extern GElib::SO3_CGbank SO3_cgbank; + + + + +__device__ int loadg3(const cnine::Ctensor3_view& x, float* dest, const int b, const int t){ + int I=x.n1; + int J=x.n2; + int s1=x.s1; + int s2=x.s2; + int offs=I*J; //((I*J-1)/32+1)*32; + float* destc=dest+offs; + float* source=x.arr+x.s0*b; + float* sourcec=x.arrc+x.s0*b; + if(t(cg_cmem)+Cptr; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int xn=x.n2; + int yn=y.n2; + int rn=r.n2; + + float* cptr; + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + + float* xpr=reinterpret_cast(_shared); + float* xpi=xpr+x.n1*x.n2; + loadg3(x,xpr,b,t); + + float* ypr=xpr+((2*xn*xn-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n2; + if(conj==0) loadg3(y,ypr,b,t); + else loadg3c(y,ypr,b,t); + + float* rpr=ypr+((2*yn*yn-1)/32+1)*32; + float* rpi=rpr+r.n1*r.n2; + loadg3(r,rpr,b,t); + + __syncthreads(); + + if(t=0 && il2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=cptr[(m1+l1)*yn+m2+l2]; + const float y_r=ypr[yn*(m2+l2)]; + const float y_i=ypi[yn*(m2+l2)]; + //_rpr[rn*(m1+m2+l)]+=c0*c*(x_r*y_r-x_i*y_i); + //_rpi[rn*(m1+m2+l)]+=c0*c*(x_r*y_i+x_i*y_r); + atomicAdd(_rpr+rn*(m1+m2+l),c0*c*(x_r*y_r-x_i*y_i)); + atomicAdd(_rpi+rn*(m1+m2+l),c0*c*(x_r*y_i+x_i*y_r)); + } + + } + } + } + + __syncthreads(); + + saveg3(r,rpr,b,t); + +} + + +__global__ void SO3Fpart_addFproduct_large_kernel(const cnine::Ctensor3_view r, const cnine::Ctensor3_view x, + const cnine::Ctensor3_view y, const int Cptr, float* cptr_global, const int conj){ + + extern __shared__ unsigned char _shared[]; + //const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int b=blockIdx.x; + const int 
t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int xn=x.n2; + int yn=y.n2; + int rn=r.n2; + + float* cptr; + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + + float* xpr=reinterpret_cast(_shared); + float* xpi=xpr+x.n1*x.n2; + loadg3(x,xpr,b,t); + + float* ypr=xpr+((2*xn*xn-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n2; + if(conj==0) loadg3(y,ypr,b,t); + else loadg3c(y,ypr,b,t); + + float* rpr=ypr+((2*yn*yn-1)/32+1)*32; + float* rpi=rpr+r.n1*r.n2; + loadg3(r,rpr,b,t); + + //int tn=xn*yn; + //float* tpr=rpr+((2*rn*rn-1)/32+1)*32; + //float* tpi=tpr+tn*rn; + + __syncthreads(); + + if(tl2) upper=l2; + + for(int m2=lower; m2<=upper; m2++){ + float c=cptr[(m1+l1)*yn+m2+l2]; + const float y_r=_ypr[yn*(m2)]; + const float y_i=_ypi[yn*(m2)]; + _rpr[rn*(m1+m2)]+=c0*c*(x_r*y_r-x_i*y_i); + _rpi[rn*(m1+m2)]+=c0*c*(x_r*y_i+x_i*y_r); + //atomicAdd(_rpr+rn*(m1+m2+l),c0*c*(x_r*y_r-x_i*y_i)); + //atomicAdd(_rpi+rn*(m1+m2+l),c0*c*(x_r*y_i+x_i*y_r)); + } + + } + + } + } + + __syncthreads(); + + saveg3(r,rpr,b,t); + +} + + +namespace GElib{ + + + void SO3Fpart_addFproduct_cu(const cnine::Ctensor3_view& r, const cnine::Ctensor3_view& x, const cnine::Ctensor3_view& y, + const int conj, const int method, const cudaStream_t& stream){ + + const int xl=(x.n1-1)/2; + const int yl=(y.n1-1)/2; + const int l=(r.n1-1)/2; + const int b=r.n0; + + float* cptr=nullptr; + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + if(Cptr<0) cptr=SO3_cgbank.getf(CGindex(xl,yl,l),r.dev).arrg; + int clines=cnine::roundup(x.n1*y.n1,32)/32; + + int nlines=cnine::roundup(x.n1*x.n2*2,32)/32+ + cnine::roundup(y.n1*y.n2*2,32)/32+ + cnine::roundup(r.n1*r.n2*2,32)/32; + + if(nlines<=384){ + + if(method==0){ + + SO3Fpart_addFproduct_kernel<<>> + (r,x,y,Cptr,cptr,conj); + return; + + }else{ + + SO3Fpart_addFproduct_large_kernel<<>> + (r,x,y,Cptr,cptr,conj); + return; + + } + } + + cout<<"error"< +#include +//#include +//#include + +#include "SO3_CGbank.hpp" +#include 
"Ctensor3_view.hpp" + + +extern GElib::SO3_CGbank SO3_cgbank; + + +__device__ int loadg4(const cnine::Ctensor3_view& x, float* dest, const int b, const int t){ + int I=x.n1; + int J=x.n2; + int s1=x.s1; + int s2=x.s2; + int offs=I*J; //((I*J-1)/32+1)*32; + float* destc=dest+offs; + float* source=x.arr+x.s0*b; + float* sourcec=x.arrc+x.s0*b; + if(t=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + + float* xpr=reinterpret_cast(_shared); + float* xpi=xpr+loadg4(x,xpr,b,t); + + float* ypr=xpr+((2*xn*xn-1)/32+1)*32; + float* ypi; + if(conj==0) ypi=ypr+loadg4(y,ypr,b,t); + else ypi=ypr+loadg4c(y,ypr,b,t); + + float* rpr=ypr+((2*yn*yn-1)/32+1)*32; + float* rpi=rpr+loadg4(r,rpr,b,t); + + __syncthreads(); + + if(t=0 && il2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=cptr[(m1+l1)*yn+m2+l2]; + const float y_r=ypr[yn*(m2+l2)]; + const float y_i=ypi[yn*(m2+l2)]; + const float g_r=_rpr[rn*(m1+m2+l)]; + const float g_i=_rpi[rn*(m1+m2+l)]; + //_xpr[xn*(m1+l1)]+=c0*c*(g_r*y_r+g_i*y_i); + //_xpi[xn*(m1+l1)]+=c0*c*(-g_r*y_i+g_i*y_r); + atomicAdd(_xpr+xn*(m1+l1),c0*c*(g_r*y_r+g_i*y_i)); + atomicAdd(_xpi+xn*(m1+l1),c0*c*(-g_r*y_i+g_i*y_r)); + } + + } + } + } + + __syncthreads(); + + saveg4(x,xpr,b,t); + +} + + +__global__ void SO3Fpart_addFproduct_back0_large_kernel(const cnine::Ctensor3_view r, const cnine::Ctensor3_view x, + const cnine::Ctensor3_view y, const int Cptr, float* cptr_global, const int conj){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int xn=x.n2; + int yn=y.n2; + int rn=r.n2; + + float* cptr; + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + + float* xpr=reinterpret_cast(_shared); + float* xpi=xpr+loadg4(x,xpr,b,t); + + float* ypr=xpr+((2*xn*xn-1)/32+1)*32; + float* ypi; + if(conj==0) ypi=ypr+loadg4(y,ypr,b,t); + else ypi=ypr+loadg4c(y,ypr,b,t); + + float* rpr=ypr+((2*yn*yn-1)/32+1)*32; + 
float* rpi=rpr+loadg4(r,rpr,b,t); + + __syncthreads(); + + if(t=0 && il2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=cptr[(m1+l1)*yn+m2+l2]; + const float y_r=ypr[yn*(m2+l2)]; + const float y_i=ypi[yn*(m2+l2)]; + const float g_r=_rpr[rn*(m1+m2+l)]; + const float g_i=_rpi[rn*(m1+m2+l)]; + _xpr[xn*(m1+l1)]+=c0*c*(g_r*y_r+g_i*y_i); + _xpi[xn*(m1+l1)]+=c0*c*(-g_r*y_i+g_i*y_r); + //atomicAdd(_xpr+xn*(m1+l1),c0*c*(g_r*y_r+g_i*y_i)); + //atomicAdd(_xpi+xn*(m1+l1),c0*c*(-g_r*y_i+g_i*y_r)); + } + } + + } + } + } + + __syncthreads(); + + saveg4(x,xpr,b,t); + +} + + +namespace GElib{ + + + void SO3Fpart_addFproduct_back0_cu(const cnine::Ctensor3_view& x, const cnine::Ctensor3_view& r, const cnine::Ctensor3_view& y, + const int conj, const int method, const cudaStream_t& stream){ + + const int xl=(x.n1-1)/2; + const int yl=(y.n1-1)/2; + const int l=(r.n1-1)/2; + + const int b=r.n0; + assert(x.n0==b); + assert(y.n0==b); + + float* cptr=nullptr; + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + if(Cptr<0) cptr=SO3_cgbank.getf(CGindex(xl,yl,l),r.dev).arrg; + int clines=cnine::roundup(x.n1*y.n1,32)/32; + + int nlines=cnine::roundup(x.n1*x.n2*2,32)/32+ + cnine::roundup(y.n1*y.n2*2,32)/32+ + cnine::roundup(r.n1*r.n2*2,32)/32; + + + if(nlines<=384){ + + if(method==0){ + + SO3Fpart_addFproduct_back0_kernel<<>> + (r,x,y,Cptr,cptr,conj); + + }else{ + + SO3Fpart_addFproduct_back0_large_kernel<<>> + (r,x,y,Cptr,cptr,conj); + + } + + }else{ + cout<<"error"< +#include +//#include +//#include + +#include "SO3_CGbank.hpp" +#include "Ctensor3_view.hpp" + + +extern GElib::SO3_CGbank SO3_cgbank; + + +__device__ int loadg5(const cnine::Ctensor3_view& x, float* dest, const int b, const int t){ + int I=x.n1; + int J=x.n2; + int s1=x.s1; + int s2=x.s2; + int offs=I*J; //((I*J-1)/32+1)*32; + float* destc=dest+offs; + float* source=x.arr+x.s0*b; + float* sourcec=x.arrc+x.s0*b; + if(t=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + + float* xpr=reinterpret_cast(_shared); + 
float* xpi=xpr+loadg5(x,xpr,b,t); + + float* ypr=xpr+((2*xn*xn-1)/32+1)*32; + float* ypi; + if(conj==0) ypi=ypr+loadg5(y,ypr,b,t); + else ypi=ypr+loadg5c(y,ypr,b,t); + + float* rpr=ypr+((2*yn*yn-1)/32+1)*32; + float* rpi=rpr+loadg5(r,rpr,b,t); + + __syncthreads(); + + if(t=0 && il2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=cptr[(m1+l1)*yn+m2+l2]; + const float g_r=_rpr[rn*(m1+m2+l)]; + const float g_i=_rpi[rn*(m1+m2+l)]; + //_ypr[yn*(m2+l2)]+=c*(g_r*x_r+g_i*x_i); + //_ypi[yn*(m2+l2)]+=c*(-g_r*x_i+g_i*x_r); + atomicAdd(_ypr+yn*(m2+l2),c0*c*(g_r*x_r+g_i*x_i)); + atomicAdd(_ypi+yn*(m2+l2),c0*c*(-g_r*x_i+g_i*x_r)); + } + + } + } + } + + __syncthreads(); + + if(conj==0) saveg5(y,ypr,b,t); + else saveg5c(y,ypr,b,t); + +} + + +__global__ void SO3Fpart_addFproduct_back1_large_kernel(const cnine::Ctensor3_view r, const cnine::Ctensor3_view x, + const cnine::Ctensor3_view y, const int Cptr, float* cptr_global, const int conj){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int xn=x.n2; + int yn=y.n2; + int rn=r.n2; + + float* cptr; + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + + float* xpr=reinterpret_cast(_shared); + float* xpi=xpr+loadg5(x,xpr,b,t); + + float* ypr=xpr+((2*xn*xn-1)/32+1)*32; + float* ypi; + if(conj==0) ypi=ypr+loadg5(y,ypr,b,t); + else ypi=ypr+loadg5c(y,ypr,b,t); + + float* rpr=ypr+((2*yn*yn-1)/32+1)*32; + float* rpi=rpr+loadg5(r,rpr,b,t); + + __syncthreads(); + + if(t=0 && il2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=cptr[(m1+l1)*yn+m2+l2]; + const float g_r=_rpr[rn*(m1+m2+l)]; + const float g_i=_rpi[rn*(m1+m2+l)]; + _ypr[yn*(m2+l2)]+=c*(g_r*x_r+g_i*x_i); + _ypi[yn*(m2+l2)]+=c*(-g_r*x_i+g_i*x_r); + //atomicAdd(_ypr+yn*(m2+l2),c0*c*(g_r*x_r+g_i*x_i)); + //atomicAdd(_ypi+yn*(m2+l2),c0*c*(-g_r*x_i+g_i*x_r)); + } + } + } + + } + } + + __syncthreads(); + + if(conj==0) 
saveg5(y,ypr,b,t); + else saveg5c(y,ypr,b,t); + +} + + + +namespace GElib{ + + + void SO3Fpart_addFproduct_back1_cu(const cnine::Ctensor3_view& y, const cnine::Ctensor3_view& r, const cnine::Ctensor3_view& x, + const int conj, const int method, const cudaStream_t& stream){ + + const int xl=(x.n1-1)/2; + const int yl=(y.n1-1)/2; + const int l=(r.n1-1)/2; + + const int b=r.n0; + assert(x.n0==b); + assert(y.n0==b); + + float* cptr=nullptr; + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + if(Cptr<0) cptr=SO3_cgbank.getf(CGindex(xl,yl,l),r.dev).arrg; + int clines=cnine::roundup(x.n1*y.n1,32)/32; + + int nlines=cnine::roundup(x.n1*x.n2*2,32)/32+ + cnine::roundup(y.n1*y.n2*2,32)/32+ + cnine::roundup(r.n1*r.n2*2,32)/32; + + + if(nlines<=384){ + + if(method==0){ + + SO3Fpart_addFproduct_back1_kernel<<>> + (r,x,y,Cptr,cptr,conj); + + }else{ + + SO3Fpart_addFproduct_back1_large_kernel<<>> + (r,x,y,Cptr,cptr,conj); + + } + }else{ + cout<<"error"< +#include +#include +#include + +//__device__ __constant__ unsigned char cg_cmem[32276]; + + +#include "SO3partA.hpp" +#include "SO3partArrayA.hpp" +#include "SO3_CGbank.hpp" + +#include "CellwiseBinaryCmap.hpp" +#include "BroadcastBinaryCmap.hpp" +#include "InnerCmap.hpp" +#include "OuterCmap.hpp" +#include "MVprodCmap.hpp" +#include "VMprodCmap.hpp" +//#include "convolve1_cmap.hpp" +#include "Convolve2Cmap.hpp" + +extern GElib::SO3_CGbank SO3_cgbank; + + +__device__ void SO3part_load_lines(float* dest, const float* source, const int nlines, const int t){ + if(t<32){ + for(int i=0; i +__global__ void SO3partA_CGproduct_kernel(float* rarr, float* rarrc, float* xarr, float* xarrc, + float* yarr, float* yarrc, const int rstride, const int xstride, const int ystride, const IMAP cmap, + const int xn, const int yn, const int rn, const int l1, const int l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* 
C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + const int r=2*l+1; + const int r1=2*l1+1; + const int r2=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + const int rwidth=xn*yn*nch; + const int global_rwidth=rn*nch; + + const int rlines=((r*rwidth-1)/32+1); + const int xlines=((r1*xwidth-1)/32+1); + const int ylines=((r2*ywidth-1)/32+1); + + const int rptr=0; + const int xptr=rptr+rlines*64; + const int yptr=xptr+xlines*64; + + int rix,xix,yix; + int nsum; + int lst; + + + if(mode<2){ + auto T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + rix=thrust::get<0>(T); + xix=thrust::get<1>(T); + yix=thrust::get<2>(T); + nsum=1; + //if(t==0) printf("foop1\n"); + }else{ + rix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); + } + + if(mode==1){ + if(t<32){ + for(int i=0; i<2*rlines; i++) + shared[rptr+i*32+t]=0; + } + }else{ + if(t(T); + yix=thrust::get<1>(T); + } + + SO3part_load_lines(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_load_lines(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + SO3part_load_lines(shared+yptr,yarr+yix*ystride,ylines,t); + SO3part_load_lines(shared+yptr+ylines*32,yarrc+yix*ystride,ylines,t); + + //if(t==0) printf("foop3\n"); + + __syncthreads(); + + const int rpr=rptr+t; + const int rpi=rpr+rlines*32; + + const int xcol=t/yn; + const int xpr=xptr+xcol; + const int xpi=xpr+xlines*32; + + const int ycol=t%ywidth; + const int ypr=yptr+ycol; + const int ypi=ypr+ylines*32; + + + if(tl2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*r2+m2+l2]; + const float y_r=shared[ypr+ywidth*(m2+l2)]; + const float y_i=shared[ypi+ywidth*(m2+l2)]; + shared[rpr+rwidth*(m1+m2+l)]+=c*(x_r*y_r-x_i*y_i); + shared[rpi+rwidth*(m1+m2+l)]+=c*(x_r*y_i+x_i*y_r); + } + } + } + + //if(t==0) printf("foop4\n"); + + __syncthreads(); + } + + //if(t==0) printf("fooq\n"); + + if(t +__global__ void SO3partA_CGproduct_kernel_L(float* rarr, float* rarrc, float* xarr, float* xarrc, 
+ float* yarr, float* yarrc, const int rstride, const int xstride, const int ystride, const IMAP cmap, + const int xn, const int yn, const int rn, const int l1, const int l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + const int r=2*l+1; + const int r1=2*l1+1; + const int r2=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + const int rwidth=xn*nch; + const int global_rwidth=rn*nch; + + const int rlines=((r*rwidth-1)/32+1); + const int xlines=((r1*xwidth-1)/32+1); + const int ylines=((r2*1-1)/32+1); + + const int rptr=0; + const int xptr=rptr+rlines*64; + const int yptr=xptr+xlines*64; + + int rix,xix,yix; + int nsum; + int lst; + + if(mode<2){ + auto T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + rix=thrust::get<0>(T); + xix=thrust::get<1>(T); + yix=thrust::get<2>(T); + nsum=1; + }else{ + rix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); + } + + + for(int s=0; s(T); + yix=thrust::get<1>(T); + } + + SO3part_load_lines(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_load_lines(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + + + for(int ycol=0; ycoll2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*r2+m2+l2]; + const float y_r=shared[ypr+1*(m2+l2)]; + const float y_i=shared[ypi+1*(m2+l2)]; + shared[rpr+rwidth*(m1+m2+l)]+=c*(x_r*y_r-x_i*y_i); + shared[rpi+rwidth*(m1+m2+l)]+=c*(x_r*y_i+x_i*y_r); + } + } + } + + //if(t==0) printf("foop4\n"); + + __syncthreads(); + + //if(t==0) printf("fooq\n"); + + if(t +__global__ void SO3partA_CGproduct_back0_kernel(float* xarr, float* xarrc, float* garr, float* garrc, + float* yarr, float* yarrc, const int xstride, const int ystride, const int gstride, const IMAP cmap, + const int xn, const int yn, const int gn, const int l1, const int 
l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + const int rg=2*l+1; + const int rx=2*l1+1; + const int ry=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + const int gwidth=xn*yn*nch; + const int global_gwidth=gn*nch; + + const int glines=((rg*gwidth-1)/32+1); + const int xlines=((rx*xwidth-1)/32+1); + const int ylines=((ry*ywidth-1)/32+1); + + const int xptr=0; + const int gptr=xptr+xlines*64; + const int yptr=gptr+glines*64; + + int gix,xix,yix; + int nsum; + int lst; + + if(mode<2){ + auto T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + xix=thrust::get<0>(T); + gix=thrust::get<1>(T); + yix=thrust::get<2>(T); + nsum=1; + }else{ + xix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); + } + + if(mode==1){ + if(t<32){ + for(int i=0; i<2*xlines; i++){ + shared[xptr+i*32+t]=0; + } + } + }else{ + SO3part_load_lines(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_load_lines(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + } + + for(int s=0; s(T); + yix=thrust::get<1>(T); + } + + // hack: gwidth assumed to be <=32 + for(int i=0; il2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*ry+m2+l2]; + const float y_r=shared[ypr+ywidth*(m2+l2)]; + const float y_i=shared[ypi+ywidth*(m2+l2)]; + const float g_r=shared[gpr+gwidth*(m1+m2+l)]; + const float g_i=shared[gpi+gwidth*(m1+m2+l)]; + shared[xpr+xwidth*(m1+l1)]+=c*(g_r*y_r+g_i*y_i); + shared[xpi+xwidth*(m1+l1)]+=c*(-g_r*y_i+g_i*y_r); + } + } + } + __syncthreads(); + + } + + } + + SO3part_save_lines(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_save_lines(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + + __syncthreads(); + +} + + +template +__global__ void SO3partA_CGproduct_back0_kernel_big(float* xarr, float* xarrc, float* 
garr, float* garrc, + float* yarr, float* yarrc, const int xstride, const int ystride, const int gstride, const IMAP cmap, + const int xn, const int yn, const int gn, const int l1, const int l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + //const int rg=2*l+1; + const int rx=2*l1+1; + const int ry=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + //const int gwidth=xn*yn*nch; + const int global_gwidth=gn*nch; + + //const int glines=((rg*gwidth-1)/32+1); + const int xlines=((rx*xwidth-1)/32+1); + const int ylines=((ry*ywidth-1)/32+1); + + const int xptr=0; + const int yptr=xptr+xlines*64; + //const int yptr=gptr+glines*64; + + int gix,xix,yix; + int nsum; + int lst; + + if(mode<2){ + auto T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + xix=thrust::get<0>(T); + gix=thrust::get<1>(T); + yix=thrust::get<2>(T); + nsum=1; + }else{ + xix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); + } + + if(mode==1){ + if(t<32){ + for(int i=0; i<2*xlines; i++){ + shared[xptr+i*32+t]=0; + } + } + }else{ + SO3part_load_lines(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_load_lines(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + } + + for(int s=0; s(T); + yix=thrust::get<1>(T); + } + + // hack: gwidth assumed to be <=32 + //for(int i=0; il2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*ry+m2+l2]; + const float y_r=shared[ypr+ywidth*(m2+l2)]; + const float y_i=shared[ypi+ywidth*(m2+l2)]; + //const float g_r=shared[gpr+gwidth*(m1+m2+l)]; + //const float g_i=shared[gpi+gwidth*(m1+m2+l)]; + const float g_r=garr[gix*gstride+_offs+ywidth*t+ycol+(m1+m2+l)*global_gwidth]; + const float g_i=garrc[gix*gstride+_offs+ywidth*t+ycol+(m1+m2+l)*global_gwidth]; + 
shared[xpr+xwidth*(m1+l1)]+=c*(g_r*y_r+g_i*y_i); + shared[xpi+xwidth*(m1+l1)]+=c*(-g_r*y_i+g_i*y_r); + } + } + } + __syncthreads(); + + } + + } + + SO3part_save_lines(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_save_lines(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + + __syncthreads(); + +} + + +// ---- back1 ------------------------------------------------------------------------------------------------ + + +template +__global__ void SO3partA_CGproduct_back1_kernel(float* yarr, float* yarrc, float* garr, float* garrc, + float* xarr, float* xarrc, const int xstride, const int ystride, const int gstride, const IMAP cmap, + const int xn, const int yn, const int gn, const int l1, const int l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + const int rg=2*l+1; + const int rx=2*l1+1; + const int ry=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + const int gwidth=xn*yn*nch; + const int global_gwidth=gn*nch; + + const int glines=((rg*gwidth-1)/32+1); + const int xlines=((rx*xwidth-1)/32+1); + const int ylines=((ry*ywidth-1)/32+1); + + const int yptr=0; + const int gptr=yptr+ylines*64; + const int xptr=gptr+glines*64; + + int gix,xix,yix; + int nsum; + int lst; + + if(mode<2){ + auto T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + yix=thrust::get<0>(T); + gix=thrust::get<1>(T); + xix=thrust::get<2>(T); + nsum=1; + }else{ + yix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); + } + + if(mode==1){ + if(t<32){ + for(int i=0; i<2*ylines; i++) + shared[yptr+i*32+t]=0; + } + }else{ + SO3part_load_lines(shared+yptr,yarr+yix*ystride,ylines,t); + SO3part_load_lines(shared+yptr+ylines*32,yarrc+yix*ystride,ylines,t); + } + + for(int s=0; s(T); + xix=thrust::get<1>(T); + } + + // hack: gwidth assumed to be <=32 
+ for(int i=0; il2) upper=l2; + const float x_r=shared[xpr+xwidth*(m1+l1)]; + const float x_i=shared[xpi+xwidth*(m1+l1)]; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*ry+m2+l2]; + const float g_r=shared[gpr+gwidth*(m1+m2+l)]; + const float g_i=shared[gpi+gwidth*(m1+m2+l)]; + shared[ypr+ywidth*(m2+l2)]+=c*(g_r*x_r+g_i*x_i); + shared[ypi+ywidth*(m2+l2)]+=c*(-g_r*x_i+g_i*x_r); + } + } + } + __syncthreads(); + + } + + } + + SO3part_save_lines(shared+yptr,yarr+yix*ystride,ylines,t); + SO3part_save_lines(shared+yptr+ylines*32,yarrc+yix*ystride,ylines,t); + + __syncthreads(); + +} + + +template +__global__ void SO3partA_CGproduct_back1_kernel_big(float* yarr, float* yarrc, float* garr, float* garrc, + float* xarr, float* xarrc, const int xstride, const int ystride, const int gstride, const IMAP cmap, + const int xn, const int yn, const int gn, const int l1, const int l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + //const int rg=2*l+1; + const int rx=2*l1+1; + const int ry=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + //const int gwidth=xn*yn*nch; + const int global_gwidth=gn*nch; + + //const int glines=((rg*gwidth-1)/32+1); + const int xlines=((rx*xwidth-1)/32+1); + const int ylines=((ry*ywidth-1)/32+1); + + const int yptr=0; + const int xptr=yptr+ylines*64; + //const int xptr=gptr+glines*64; + + int gix,xix,yix; + int nsum; + int lst; + + if(mode<2){ + auto T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + yix=thrust::get<0>(T); + gix=thrust::get<1>(T); + xix=thrust::get<2>(T); + nsum=1; + }else{ + yix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); + } + + if(mode==1){ + if(t<32){ + for(int i=0; i<2*ylines; i++) + shared[yptr+i*32+t]=0; + } + }else{ + 
SO3part_load_lines(shared+yptr,yarr+yix*ystride,ylines,t); + SO3part_load_lines(shared+yptr+ylines*32,yarrc+yix*ystride,ylines,t); + } + + for(int s=0; s(T); + xix=thrust::get<1>(T); + } + + // hack: gwidth assumed to be <=32 + //for(int i=0; il2) upper=l2; + const float x_r=shared[xpr+xwidth*(m1+l1)]; + const float x_i=shared[xpi+xwidth*(m1+l1)]; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*ry+m2+l2]; + //const float g_r=shared[gpr+gwidth*(m1+m2+l)]; + //const float g_i=shared[gpi+gwidth*(m1+m2+l)]; + const float g_r=garr[gix*gstride+_offs+ywidth*xcol+t+(m1+m2+l)*global_gwidth]; + const float g_i=garrc[gix*gstride+_offs+ywidth*xcol+t+(m1+m2+l)*global_gwidth]; + shared[ypr+ywidth*(m2+l2)]+=c*(g_r*x_r+g_i*x_i); + shared[ypi+ywidth*(m2+l2)]+=c*(-g_r*x_i+g_i*x_r); + } + } + } + __syncthreads(); + + } + + } + + SO3part_save_lines(shared+yptr,yarr+yix*ystride,ylines,t); + SO3part_save_lines(shared+yptr+ylines*32,yarrc+yix*ystride,ylines,t); + + __syncthreads(); + +} + + +// ----------------------------------------------------------------------------------------------------------- + + +namespace GElib{ + + + template + void SO3partA_CGproduct_cu(const CMAP& map, SO3partArrayA& r, const SO3partArrayA& x, + const SO3partArrayA& y, const cudaStream_t& stream, const int offs, const int mode){ + + const int xl=x.getl(); + const int yl=y.getl(); + const int l=r.getl(); + const int _nch=1; + assert(x.nbu==r.nbu); + assert(y.nbu==r.nbu); + int _nbu=1; if(_nbu<0) _nbu=1; + + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + //int nlines=x.cellstride/16+y.cellstride/16+r.cellstride/16; // should be smaller than this! 
+ int nlines=x.cellstride/16+y.cellstride/16+cnine::roundup(x.getn()*y.getn()*_nch*(2*l+1),32)/16; + // nlines/=_nbu; + + cout<<"nlines="<>> + (r.arrg,r.arrgc,x.arrg,x.arrgc,y.arrg,y.arrgc, + r.cellstride,x.cellstride,y.cellstride,map, + x.getn(),y.getn(),r.getn(),xl,yl,l,offs,_nch,Cptr,mode); + + }else{ + + int nlines=x.cellstride/16+cnine::roundup(_nch*(2*yl+1),32)/16+cnine::roundup(x.getn()*_nch*(2*l+1),32)/16; + + cout<<"GElib: large CGproduct"<384){ + cout<<"GElib error: CGproduct too big for shared memory"<>> + (r.arrg,r.arrgc,x.arrg,x.arrgc,y.arrg,y.arrgc, + r.cellstride,x.cellstride,y.cellstride,map, + x.getn(),y.getn(),r.getn(),xl,yl,l,offs,_nch,Cptr,mode); + } + } + + } + + + void SO3partA_CGproduct_cu(SO3partA& r, const SO3partA& x, const SO3partA& y, const int offs, + const cudaStream_t& stream,const int mode){ + + const int xl=x.getl(); + const int yl=y.getl(); + const int l=r.getl(); + const int _nch=1; + assert(x.nbu==r.nbu); + assert(y.nbu==r.nbu); + int _nbu=1; if(_nbu<0) _nbu=1; + cnine::CellwiseBinaryCmap map; + + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + int nlines=cnine::roundup(x.memsize,32)/32+cnine::roundup(y.memsize,32)/32+ + cnine::roundup(x.getn()*y.getn()*_nch*(2*l+1),32)/16; + + //cout<<"nlines="<>> + (r.arrg,r.arrgc,x.arrg,x.arrgc,y.arrg,y.arrgc, + 0,0,0,map, + x.getn(),y.getn(),r.getn(),xl,yl,l,offs,_nch,Cptr,mode); + + }else{ + + int nlines=cnine::roundup(x.memsize,32)/32+cnine::roundup(y.memsize,32)/32+ + cnine::roundup(x.getn()*_nch*(2*l+1),32)/16; + + cout<<"GElib: large CGproduct"<384){ + cout<<"GElib error: CGproduct too big for shared memory"<>> + (r.arrg,r.arrgc,x.arrg,x.arrgc,y.arrg,y.arrgc, + 0,0,0,map, + x.getn(),y.getn(),r.getn(),xl,yl,l,offs,_nch,Cptr,mode); + } + } + + } + + + template + void SO3partA_CGproduct_back0_cu(const CMAP& map, SO3partArrayA& x, const SO3partArrayA& g, + const SO3partArrayA& y, const cudaStream_t& stream, const int offs, const int mode){ + + const int xl=x.getl(); + const int yl=y.getl(); + const 
int l=g.getl(); + + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + int nlines=x.cellstride/16+y.cellstride/16+g.cellstride/16; + assert(x.nbu==g.nbu); + assert(y.nbu==g.nbu); + + const int _nch=1; + int _nbu=1; if(_nbu<0) _nbu=1; + nlines/=_nbu; + + cout<<"nlines="<>> + (x.arrg,x.arrgc,g.arrg,g.arrgc,y.arrg,y.arrgc, + x.cellstride,y.cellstride,g.cellstride,map, + x.getn(),y.getn(),g.getn(),xl,yl,l,offs,_nch,Cptr,mode); + + }else{ + + int nlines=x.cellstride/16+y.cellstride/16; + + cout<<"GElib: large CGproduct_back0"<384){ + cout<<"GElib error: CGproduct too big for shared memory"<>> + (x.arrg,x.arrgc,g.arrg,g.arrgc,y.arrg,y.arrgc, + x.cellstride,y.cellstride,g.cellstride,map, + x.getn(),y.getn(),g.getn(),xl,yl,l,offs,_nch,Cptr,mode); + } + + } + + } + + + template + void SO3partA_CGproduct_back1_cu(const CMAP& map, SO3partArrayA& y, const SO3partArrayA& g, + const SO3partArrayA& x, const cudaStream_t& stream, const int offs, const int mode){ + + const int xl=x.getl(); + const int yl=y.getl(); + const int l=g.getl(); + + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + int nlines=x.cellstride/16+y.cellstride/16+g.cellstride/16; + assert(x.nbu==g.nbu); + assert(y.nbu==g.nbu); + + const int _nch=1; + int _nbu=1; if(_nbu<0) _nbu=1; + nlines/=_nbu; + + cout<<"nlines="<>> + (y.arrg,y.arrgc,g.arrg,g.arrgc,x.arrg,x.arrgc, + x.cellstride,y.cellstride,g.cellstride,map, + x.getn(),y.getn(),g.getn(),xl,yl,l,offs,_nch,Cptr,mode); + + }else{ + + int nlines=x.cellstride/16+y.cellstride/16; + + cout<<"GElib: large CGproduct_back1"<384){ + cout<<"GElib error: CGproduct too big for shared memory"<>> + (y.arrg,y.arrgc,g.arrg,g.arrgc,x.arrg,x.arrgc, + x.cellstride,y.cellstride,g.cellstride,map, + x.getn(),y.getn(),g.getn(),xl,yl,l,offs,_nch,Cptr,mode); + } + + } + + } + + + template void SO3partA_CGproduct_cu(const cnine::CellwiseBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void 
SO3partA_CGproduct_cu(const cnine::BroadcastBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_CGproduct_cu(const cnine::OuterCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_CGproduct_cu(const cnine::InnerCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_CGproduct_cu(const cnine::MVprodCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_CGproduct_cu(const cnine::Convolve2Cmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + + + template void SO3partA_CGproduct_back0_cu(const cnine::CellwiseBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_CGproduct_back0_cu(const cnine::BroadcastBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_CGproduct_back0_cu(const cnine::OuterCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + + template void SO3partA_CGproduct_back1_cu(const cnine::CellwiseBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_CGproduct_back1_cu(const cnine::BroadcastBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_CGproduct_back1_cu(const 
cnine::OuterCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + + + +} + +#endif + + + + + + /* + void SO3partA_CGproduct_cu(SO3partArrayA& r, const SO3partArrayA& x, const SO3partArrayA& y, + const int mode, const cudaStream_t& stream, const int offs){ + + const int xl=x.getl(); + const int yl=y.getl(); + const int l=r.getl(); + + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + int nlines=x.cellstride/16+y.cellstride/16+r.cellstride/16; + assert(x.nbu==r.nbu); + assert(y.nbu==r.nbu); + + const int _nch=1; + int _nbu=1; if(_nbu<0) _nbu=1; + nlines/=_nbu; + + if(mode==0){ + dim3 blocks(r.aasize,1,1); + cnine::CellwiseImap imap; + SO3partA_CGproduct_kernel<<>> + (r.arrg,r.arrgc,x.arrg,x.arrgc,y.arrg,y.arrgc, + r.cellstride,x.cellstride,y.cellstride,imap, + x.getn(),y.getn(),r.getn(),xl,yl,l,offs,_nch,Cptr); // + } + + if(mode==1){ + dim3 blocks(x.aasize,y.aasize,1); + cnine::OuterImap imap(r.adims[1]); + SO3partA_CGproduct_kernel<<>> + (r.arrg,r.arrgc,x.arrg,x.arrgc,y.arrg,y.arrgc, + r.cellstride,x.cellstride,y.cellstride,imap, + x.getn(),y.getn(),r.getn(),xl,yl,l,offs,_nch,Cptr); + } + + + } + */ + + + /* + void SO3partA_CGproduct_cu(SO3partArrayA& r, const SO3partArrayA& x, const SO3partArrayA& y, + const int rN, const int xN, const int yN, + const int ris, const int rjs, const int rks, + const int xis, const int xjs, const int xks, + const int yis, const int yjs, const int yks, + const cudaStream_t& stream, const int offs){ + + const int xl=x.getl(); + const int yl=y.getl(); + const int l=r.getl(); + + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + int nlines=x.cellstride/16+y.cellstride/16+r.cellstride/16; + assert(x.nbu==r.nbu); + assert(y.nbu==r.nbu); + + const int _nch=1; + int _nbu=1; if(_nbu<0) _nbu=1; + dim3 blocks(rN,xN,yN); + nlines/=_nbu; + + + SO3partA_CGproduct_kernel<<>> + (r.arrg,r.arrgc,x.arrg,x.arrgc,y.arrg,y.arrgc, + ris*r.cellstride,rjs*r.cellstride, rks*r.cellstride, + 
xis*x.cellstride,xjs*x.cellstride, xks*x.cellstride, + yis*y.cellstride,yjs*y.cellstride, yks*y.cellstride, + x.getn(),y.getn(),r.getn(),xl,yl,l,offs,_nch,Cptr); + } + */ + +/* +__global__ void SO3partA_CGproduct_kernel(float* rarr, float* rarrc, float* xarr, float* xarrc, + float* yarr, float* yarrc, + const int ristride, const int xistride, const int yistride, + const int rjstride, const int xjstride, const int yjstride, + const int rkstride, const int xkstride, const int ykstride, + const int xfrags, const int yfrags, const int rfrags, + const int l1, const int l2, const int l, const int _offs, const int nch, const int Cptr){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + + const int iix=blockIdx.x; + const int jix=blockIdx.y; + const int kix=blockIdx.z; + + const int t=threadIdx.x; + + const int r1=2*l1+1; + const int r2=2*l2+1; + const int r=2*l+1; + + const int xwidth=xfrags*nch; + const int ywidth=yfrags*nch; + const int rwidth=xfrags*yfrags*nch; + const int global_rwidth=rfrags*nch; + + int offs=0; + + int xptr=32*offs; + SO3part_load(offs,shared,xarr,xarrc,l1,xwidth,iix*xistride+jix*xjstride+kix*xkstride,t); + + const int yptr=32*offs; + SO3part_load(offs,shared,yarr,yarrc,l2,ywidth,iix*yistride+jix*yjstride+kix*ykstride,t); + + const int rpr=32*offs+t; + const int rpi=rpr+((r*rwidth-1)/32+1)*32; + float* _rptr=rarr+iix*ristride+jix*rjstride+kix*rkstride+_offs; + float* _rptri=rarrc+iix*ristride+jix*rjstride+kix*rkstride+_offs; + + if(tl2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*r2+m2+l2]; + const float y_r=shared[ypr+ywidth*(m2+l2)]; + const float y_i=shared[ypi+ywidth*(m2+l2)]; + shared[rpr+rwidth*(m1+m2+l)]+=c*(x_r*y_r-x_i*y_i); + shared[rpi+rwidth*(m1+m2+l)]+=c*(x_r*y_i+x_i*y_r); + } + } + } + + __syncthreads(); + + if(t +#include +#include +#include + +//__device__ __constant__ unsigned char cg_cmem[32276]; + +#include 
"SO3partArrayA.hpp" +#include "SO3_CGbank.hpp" +#include "SO3partA.hpp" + +#include "CellwiseBinaryCmap.hpp" +#include "BroadcastBinaryCmap.hpp" +#include "InnerCmap.hpp" +#include "OuterCmap.hpp" +#include "MVprodCmap.hpp" +#include "VMprodCmap.hpp" +#include "Convolve2Cmap.hpp" + + +// should move these elsewhere + +__device__ void SO3part_load_lines2(float* dest, const float* source, const int nlines, const int t){ + if(t<32){ + for(int i=0; i +__global__ void SO3partA_DiagCGproduct_kernel(float* rarr, float* rarrc, float* xarr, float* xarrc, + float* yarr, float* yarrc, const int rstride, const int xstride, const int ystride, const IMAP cmap, + const int xn, const int yn, const int rn, const int l1, const int l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + const int r=2*l+1; + const int r1=2*l1+1; + const int r2=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + const int rwidth=xn*nch; + const int global_rwidth=rn*nch; + + const int rlines=((r*rwidth-1)/32+1); + const int xlines=((r1*xwidth-1)/32+1); + const int ylines=((r2*ywidth-1)/32+1); + + const int rptr=0; + const int xptr=rptr+rlines*64; + const int yptr=xptr+xlines*64; + + int rix,xix,yix; + int nsum; + int lst; + + + if(mode<2){ + auto T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + rix=thrust::get<0>(T); + xix=thrust::get<1>(T); + yix=thrust::get<2>(T); + nsum=1; + //if(t==0) printf("foop1\n"); + }else{ + rix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); + } + + if(mode==1){ + if(t<32){ + for(int i=0; i<2*rlines; i++) + shared[rptr+i*32+t]=0; + } + }else{ + if(t(T); + yix=thrust::get<1>(T); + } + + SO3part_load_lines2(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_load_lines2(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + 
SO3part_load_lines2(shared+yptr,yarr+yix*ystride,ylines,t); + SO3part_load_lines2(shared+yptr+ylines*32,yarrc+yix*ystride,ylines,t); + + //if(t==0) printf("foop3\n"); + + __syncthreads(); + + const int rpr=rptr+t; + const int rpi=rpr+rlines*32; + + const int xpr=xptr+t; + const int xpi=xpr+xlines*32; + + const int ypr=yptr+t; + const int ypi=ypr+ylines*32; + + + if(tl2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*r2+m2+l2]; + const float y_r=shared[ypr+ywidth*(m2+l2)]; + const float y_i=shared[ypi+ywidth*(m2+l2)]; + shared[rpr+rwidth*(m1+m2+l)]+=c*(x_r*y_r-x_i*y_i); + shared[rpi+rwidth*(m1+m2+l)]+=c*(x_r*y_i+x_i*y_r); + } + } + } + + //if(t==0) printf("foop4\n"); + + __syncthreads(); + } + + //if(t==0) printf("fooq\n"); + + if(t // TODO +__global__ void SO3partA_DiagCGproduct_kernel_L(float* rarr, float* rarrc, float* xarr, float* xarrc, + float* yarr, float* yarrc, const int rstride, const int xstride, const int ystride, const IMAP cmap, + const int xn, const int yn, const int rn, const int l1, const int l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + const int r=2*l+1; + const int r1=2*l1+1; + const int r2=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + const int rwidth=xn*nch; + const int global_rwidth=rn*nch; + + const int rlines=((r*rwidth-1)/32+1); + const int xlines=((r1*xwidth-1)/32+1); + const int ylines=((r2*1-1)/32+1); + + const int rptr=0; + const int xptr=rptr+rlines*64; + const int yptr=xptr+xlines*64; + + int rix,xix,yix; + int nsum; + int lst; + + if(mode<2){ + auto T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + rix=thrust::get<0>(T); + xix=thrust::get<1>(T); + yix=thrust::get<2>(T); + nsum=1; + }else{ + rix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); 
+ } + + + for(int s=0; s(T); + yix=thrust::get<1>(T); + } + + SO3part_load_lines2(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_load_lines2(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + + + for(int ycol=0; ycoll2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*r2+m2+l2]; + const float y_r=shared[ypr+1*(m2+l2)]; + const float y_i=shared[ypi+1*(m2+l2)]; + shared[rpr+rwidth*(m1+m2+l)]+=c*(x_r*y_r-x_i*y_i); + shared[rpi+rwidth*(m1+m2+l)]+=c*(x_r*y_i+x_i*y_r); + } + } + } + + //if(t==0) printf("foop4\n"); + + __syncthreads(); + + //if(t==0) printf("fooq\n"); + + if(t // TODO +__global__ void SO3partA_DiagCGproduct_back0_kernel(float* xarr, float* xarrc, float* garr, float* garrc, + float* yarr, float* yarrc, const int xstride, const int ystride, const int gstride, const IMAP cmap, + const int xn, const int yn, const int gn, const int l1, const int l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + const int rg=2*l+1; + const int rx=2*l1+1; + const int ry=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + const int gwidth=xn*yn*nch; + const int global_gwidth=gn*nch; + + const int glines=((rg*gwidth-1)/32+1); + const int xlines=((rx*xwidth-1)/32+1); + const int ylines=((ry*ywidth-1)/32+1); + + const int xptr=0; + const int gptr=xptr+xlines*64; + const int yptr=gptr+glines*64; + + int gix,xix,yix; + int nsum; + int lst; + + if(mode<2){ + auto T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + xix=thrust::get<0>(T); + gix=thrust::get<1>(T); + yix=thrust::get<2>(T); + nsum=1; + }else{ + xix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); + } + + if(mode==1){ + if(t<32){ + for(int i=0; i<2*xlines; i++){ + shared[xptr+i*32+t]=0; + } + } + }else{ + 
SO3part_load_lines2(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_load_lines2(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + } + + for(int s=0; s(T); + yix=thrust::get<1>(T); + } + + // hack: gwidth assumed to be <=32 + for(int i=0; il2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*ry+m2+l2]; + const float y_r=shared[ypr+ywidth*(m2+l2)]; + const float y_i=shared[ypi+ywidth*(m2+l2)]; + const float g_r=shared[gpr+gwidth*(m1+m2+l)]; + const float g_i=shared[gpi+gwidth*(m1+m2+l)]; + shared[xpr+xwidth*(m1+l1)]+=c*(g_r*y_r+g_i*y_i); + shared[xpi+xwidth*(m1+l1)]+=c*(-g_r*y_i+g_i*y_r); + } + } + } + __syncthreads(); + + } + + } + + SO3part_save_lines2(shared+xptr,xarr+xix*xstride,xlines,t); + SO3part_save_lines2(shared+xptr+xlines*32,xarrc+xix*xstride,xlines,t); + + __syncthreads(); + +} + + +// ---- back1 ------------------------------------------------------------------------------------------------ + + +template // TODO +__global__ void SO3partA_DiagCGproduct_back1_kernel(float* yarr, float* yarrc, float* garr, float* garrc, + float* xarr, float* xarrc, const int xstride, const int ystride, const int gstride, const IMAP cmap, + const int xn, const int yn, const int gn, const int l1, const int l2, const int l, + const int _offs, const int nch, const int Cptr, const int mode=0){ + + extern __shared__ unsigned char _shared[]; + float* shared=reinterpret_cast(_shared); + + const float* C_ptr=reinterpret_cast(cg_cmem)+Cptr; + const int t=threadIdx.x; + + const int rg=2*l+1; + const int rx=2*l1+1; + const int ry=2*l2+1; + + const int xwidth=xn*nch; + const int ywidth=yn*nch; + const int gwidth=xn*yn*nch; + const int global_gwidth=gn*nch; + + const int glines=((rg*gwidth-1)/32+1); + const int xlines=((rx*xwidth-1)/32+1); + const int ylines=((ry*ywidth-1)/32+1); + + const int yptr=0; + const int gptr=yptr+ylines*64; + const int xptr=gptr+glines*64; + + int gix,xix,yix; + int nsum; + int lst; + + if(mode<2){ + auto 
T=cmap(blockIdx.x,blockIdx.y,blockIdx.z); + yix=thrust::get<0>(T); + gix=thrust::get<1>(T); + xix=thrust::get<2>(T); + nsum=1; + }else{ + yix=cmap.target(blockIdx.x); + nsum=cmap.n_accum(blockIdx.x); + lst=cmap.lst_ptr(blockIdx.x); + } + + if(mode==1){ + if(t<32){ + for(int i=0; i<2*ylines; i++) + shared[yptr+i*32+t]=0; + } + }else{ + SO3part_load_lines2(shared+yptr,yarr+yix*ystride,ylines,t); + SO3part_load_lines2(shared+yptr+ylines*32,yarrc+yix*ystride,ylines,t); + } + + for(int s=0; s(T); + xix=thrust::get<1>(T); + } + + // hack: gwidth assumed to be <=32 + for(int i=0; il2) upper=l2; + const float x_r=shared[xpr+xwidth*(m1+l1)]; + const float x_i=shared[xpi+xwidth*(m1+l1)]; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*ry+m2+l2]; + const float g_r=shared[gpr+gwidth*(m1+m2+l)]; + const float g_i=shared[gpi+gwidth*(m1+m2+l)]; + shared[ypr+ywidth*(m2+l2)]+=c*(g_r*x_r+g_i*x_i); + shared[ypi+ywidth*(m2+l2)]+=c*(-g_r*x_i+g_i*x_r); + } + } + } + __syncthreads(); + + } + + } + + SO3part_save_lines2(shared+yptr,yarr+yix*ystride,ylines,t); + SO3part_save_lines2(shared+yptr+ylines*32,yarrc+yix*ystride,ylines,t); + + __syncthreads(); + +} + + +// ----------------------------------------------------------------------------------------------------------- + + +namespace GElib{ + + + template + void SO3partA_DiagCGproduct_cu(const CMAP& map, SO3partArrayA& r, const SO3partArrayA& x, + const SO3partArrayA& y, const cudaStream_t& stream, const int offs, const int mode){ + + const int xl=x.getl(); + const int yl=y.getl(); + const int l=r.getl(); + const int _nch=1; + assert(x.nbu==r.nbu); + assert(y.nbu==r.nbu); + int _nbu=1; if(_nbu<0) _nbu=1; + + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + //int nlines=x.cellstride/16+y.cellstride/16+r.cellstride/16; // should be smaller than this! 
+ int nlines=x.cellstride/16+y.cellstride/16+cnine::roundup(x.getn()*_nch*(2*l+1),32)/16; + // nlines/=_nbu; + + if(nlines<=384){ + + SO3partA_DiagCGproduct_kernel<<>> + (r.arrg,r.arrgc,x.arrg,x.arrgc,y.arrg,y.arrgc, + r.cellstride,x.cellstride,y.cellstride,map, + x.getn(),y.getn(),r.getn(),xl,yl,l,offs,_nch,Cptr,mode); + + }else{ // TODO + + int nlines=x.cellstride/16+cnine::roundup(_nch*(2*yl+1),32)/16+cnine::roundup(x.getn()*_nch*(2*l+1),32)/16; + + if(nlines>384){ + cout<<"GElib error: DiagCGproduct too big for shared memory"<>> + (r.arrg,r.arrgc,x.arrg,x.arrgc,y.arrg,y.arrgc, + r.cellstride,x.cellstride,y.cellstride,map, + x.getn(),y.getn(),r.getn(),xl,yl,l,offs,_nch,Cptr,mode); + } + } + + } + + + template // TODO + void SO3partA_DiagCGproduct_back0_cu(const CMAP& map, SO3partArrayA& x, const SO3partArrayA& g, + const SO3partArrayA& y, const cudaStream_t& stream, const int offs, const int mode){ + + const int xl=x.getl(); + const int yl=y.getl(); + const int l=g.getl(); + + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + int nlines=x.cellstride/16+y.cellstride/16+g.cellstride/16; + assert(x.nbu==g.nbu); + assert(y.nbu==g.nbu); + + const int _nch=1; + int _nbu=1; if(_nbu<0) _nbu=1; + nlines/=_nbu; + + SO3partA_DiagCGproduct_back0_kernel<<>> + (x.arrg,x.arrgc,g.arrg,g.arrgc,y.arrg,y.arrgc, + x.cellstride,y.cellstride,g.cellstride,map, + x.getn(),y.getn(),g.getn(),xl,yl,l,offs,_nch,Cptr,mode); + + } + + + template // TODO + void SO3partA_DiagCGproduct_back1_cu(const CMAP& map, SO3partArrayA& y, const SO3partArrayA& g, + const SO3partArrayA& x, const cudaStream_t& stream, const int offs, const int mode){ + + const int xl=x.getl(); + const int yl=y.getl(); + const int l=g.getl(); + + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + int nlines=x.cellstride/16+y.cellstride/16+g.cellstride/16; + assert(x.nbu==g.nbu); + assert(y.nbu==g.nbu); + + const int _nch=1; + int _nbu=1; if(_nbu<0) _nbu=1; + nlines/=_nbu; + + SO3partA_DiagCGproduct_back1_kernel<<>> + 
(y.arrg,y.arrgc,g.arrg,g.arrgc,x.arrg,x.arrgc, + x.cellstride,y.cellstride,g.cellstride,map, + x.getn(),y.getn(),g.getn(),xl,yl,l,offs,_nch,Cptr,mode); + + } + + + template void SO3partA_DiagCGproduct_cu(const cnine::CellwiseBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_DiagCGproduct_cu(const cnine::BroadcastBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_DiagCGproduct_cu(const cnine::OuterCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_DiagCGproduct_cu(const cnine::InnerCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_DiagCGproduct_cu(const cnine::MVprodCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_DiagCGproduct_cu(const cnine::Convolve2Cmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + + + template void SO3partA_DiagCGproduct_back0_cu(const cnine::CellwiseBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_DiagCGproduct_back0_cu(const cnine::BroadcastBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_DiagCGproduct_back0_cu(const cnine::OuterCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + + template void 
SO3partA_DiagCGproduct_back1_cu(const cnine::CellwiseBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_DiagCGproduct_back1_cu(const cnine::BroadcastBinaryCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + template void SO3partA_DiagCGproduct_back1_cu(const cnine::OuterCmap& map, + SO3partArrayA&, const SO3partArrayA&, const SO3partArrayA&, const cudaStream_t&, const int offs, + const int mode); + + + + +} + +#endif + + + + + diff --git a/cuda/SO3partB_addCGproduct.cu b/cuda/SO3partB_addCGproduct.cu new file mode 100644 index 0000000..706999d --- /dev/null +++ b/cuda/SO3partB_addCGproduct.cu @@ -0,0 +1,280 @@ +/* + * This file is part of GElib, a C++/CUDA library for group equivariant + * tensor operations. + * + * Copyright (c) 2023, Imre Risi Kondor + * + * This source code file is subject to the terms of the noncommercial + * license distributed with GElib in the file NONCOMMERICAL.TXT. Commercial + * use is prohibited. All redistributed versions of this file (in orginal + * or modified form) must retain this copyright notice and must be + * accompanied by a verbatim copy of the license. 
+ * + */ + +#ifndef _SO3partB_addCGproduct_cu +#define _SO3partB_addCGproduct_cu + +#include +#include + +#include "SO3_CGbank.hpp" +#include "GElibConfig.hpp" +#include "Ctensor3_view.hpp" +#include "Ctensor4_view.hpp" +#include "cuda_loaders.cu" + + +extern GElib::SO3_CGbank SO3_cgbank; +extern GElib::GElibConfig* gelib_config; + +#define maxl1_explicit 2 +#define maxl_explicit 4 + +#include "SO3part_addCGproduct_subkernels.inc" + + +__global__ void SO3partB_addCGproduct_tiled_kernel(const cnine::Ctensor3_view r, const cnine::Ctensor4_view_t3 x, + const cnine::Ctensor4_view_t3 y, const int Cptr, float* cptr_global, const bool preloadCG){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int L2=y.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+x.n1*x.n3; + float* ypr=xpr+((2*x.n1*x.n3-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n3; + + int xs1=x.n3; + int ys1=y.n3; + int rs1=r.s1; + int ytot=(y.n2-1)*y.n3+y.last; + + for(int i=0; i +__global__ void SO3part_addCGproduct_explicit(const cnine::Ctensor3_view r, const cnine::Ctensor3_view x, + const cnine::Ctensor3_view y){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + //int l1=(x.n1-1)/2; + //int l2=(y.n1-1)/2; + //int l=(r.n1-1)/2; + //int L2=y.n1; + //int L2=y.n1; + + float* xpr=reinterpret_cast(_shared); + float* xpi=xpr+16; //xpr+x.n1; + float* ypr=xpr+32; //xpr+((2*x.n1-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n2; + loadg(y,ypr,b,t); + + for(int i=0; iSO3part_CGkernels_explicit && xl<=maxl1_explicit && yl<=maxl1_explicit && 
l<=maxl_explicit){ + cout<<"Explicit!"<>> + (r,xtiled,ytiled,Cptr,cptr,preloadCG); + return; + } + + cout<<"error"<(_shared); + float* xpi=xpr+loadg(x,xpr,b,t); + + float* ypr=xpr+((2*x.n1*xn-1)/32+1)*32; + float* ypi=ypr+loadg(y,ypr,b,t); + + float* rpr=ypr+((2*y.n1*yn-1)/32+1)*32; + float* rpi=rpr+loadg(r,rpr,b,t); + + float* cptr; + const float C_ptr=reinterpret_cast(cg_cmem)+Cptr; + if(preloadCG){ + cptr=rpr+((2*r.n1*rn-1)/32+1)*32; + loadf(cptr,C_ptr,x.n1*y.n1,t); + }else cptr=C_ptr; + + __syncthreads(); + + if(tl2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*L2+m2+l2]; + const float y_r=ypr[yn*(m2+l2)]; + const float y_i=ypi[yn*(m2+l2)]; + _rpr[rn*(m1+m2+l)]+=c*(x_r*y_r-x_i*y_i); + _rpi[rn*(m1+m2+l)]+=c*(x_r*y_i+x_i*y_r); + } + } + } + + __syncthreads(); + saveg(r,rpr,b,t); +} +*/ diff --git a/cuda/SO3partB_addCGproduct_back0.cu b/cuda/SO3partB_addCGproduct_back0.cu new file mode 100644 index 0000000..f42dc81 --- /dev/null +++ b/cuda/SO3partB_addCGproduct_back0.cu @@ -0,0 +1,323 @@ +/* + * This file is part of GElib, a C++/CUDA library for group equivariant + * tensor operations. + * + * Copyright (c) 2023, Imre Risi Kondor + * + * This source code file is subject to the terms of the noncommercial + * license distributed with GElib in the file NONCOMMERICAL.TXT. Commercial + * use is prohibited. All redistributed versions of this file (in orginal + * or modified form) must retain this copyright notice and must be + * accompanied by a verbatim copy of the license. 
+ * + */ + +#ifndef _SO3partB_addCGproduct_back0_cu +#define _SO3partB_addCGproduct_back0_cu + +#include +#include + +#include "SO3_CGbank.hpp" +#include "Ctensor3_view.hpp" +#include "cuda_loaders.cu" + + +extern GElib::SO3_CGbank SO3_cgbank; + + + + +__global__ void SO3partB_addCGproduct_back0_tiled_kernel(const cnine::Ctensor4_view_t3 x, const cnine::Ctensor3_view r, + const cnine::Ctensor4_view_t3 y, const int Cptr, float* cptr_global, const bool preloadCG){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int L2=y.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+x.n1*x.n3; + float* ypr=xpr+((2*x.n1*x.n3-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n3; + + int xs1=x.n3; + int ys1=y.n3; + int rs1=r.s1; + int ytot=(y.n2-1)*y.n3+y.last; + + + for(int i=0; il2) upper=l2; + float x_r=0; + float x_i=0; + + for(int ycol=0; ycol>> + (xtiled,r,ytiled,Cptr,cptr,preloadCG); + return; + } + + cout<<"error"<l2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*r2+m2+l2]; + const float y_r=shared[ypr+ywidth*(m2+l2)]; + const float y_i=shared[ypi+ywidth*(m2+l2)]; + shared[rpr+rwidth*(m1+m2+l)]+=c*(x_r*y_r-x_i*y_i); + shared[rpi+rwidth*(m1+m2+l)]+=c*(x_r*y_i+x_i*y_r); + } + } + } + */ +/* +__device__ int loadg1(const cnine::Ctensor3_view& x, float* dest, const int b, const int t){ + int I=x.n1; + int J=x.n2; + int s1=x.s1; + int s2=x.s2; + int offs=I*J; + float* destc=dest+offs; + float* source=x.arr+x.s0*b; + float* sourcec=x.arrc+x.s0*b; + if(t(cg_cmem)+Cptr; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int 
l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int xn=x.n2; + int yn=y.n2; + int rn=xn*yn; + int L2=y.n1; + + float* xpr=reinterpret_cast(_shared); + float* xpi=xpr+loadg(x,xpr,b,t); + + float* ypr=xpr+((2*x.n1*xn-1)/32+1)*32; + float* ypi=ypr+loadg(y,ypr,b,t); + + float* rpr=ypr+((2*y.n1*yn-1)/32+1)*32; + float* rpi=rpr+loadg(r,rpr,b,t); + + __syncthreads(); + + + float* _xpr=xpr+t; + float* _xpi=xpi+t; + + for(int ycol=0; ycoll2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*L2+m2+l2]; + const float y_r=_ypr[yn*(m2+l2)]; + const float y_i=_ypi[yn*(m2+l2)]; + const float g_r=_rpr[rn*(m1+m2+l)]; + const float g_i=_rpi[rn*(m1+m2+l)]; + _xpr[xn*(m1+l1)]+=c*(g_r*y_r+g_i*y_i); + _xpi[xn*(m1+l1)]+=c*(-g_r*y_i+g_i*y_r); + } + } + } + __syncthreads(); + } + + + __syncthreads(); + + saveg(x,xpr,b,t); + +} +*/ + + /* + if(nlines<=384){ + SO3partB_addCGproduct_back0_kernel<<>> + (xg,rg,y,Cptr); + }else{ + cout<<"error"< +#include + +#include "SO3_CGbank.hpp" +#include "Ctensor3_view.hpp" +#include "cuda_loaders.cu" + + +extern GElib::SO3_CGbank SO3_cgbank; + + + +__global__ void SO3partB_addCGproduct_back1_tiled_kernel(const cnine::Ctensor4_view_t3 y, const cnine::Ctensor3_view r, + const cnine::Ctensor4_view_t3 x, const int Cptr, float* cptr_global, const bool preloadCG){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int L2=y.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+x.n1*x.n3; + float* ypr=xpr+((2*x.n1*x.n3-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n3; + + int xs1=x.n3; + int ys1=y.n3; + int 
rs1=r.s1; + int ytot=(y.n2-1)*y.n3+y.last; + + + for(int j=0; jl1) upper=l1; + float y_r=0; + float y_i=0; + + for(int xcol=0; xcol>> + (ytiled,r,xtiled,Cptr,cptr,preloadCG); + return; + } + + cout<<"error"<>> + (yg,g,x,Cptr); + }else{ + cout<<"error"<(cg_cmem)+Cptr; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int xn=x.n2; + int yn=y.n2; + int rn=xn*yn; + int L2=y.n1; + + float* xpr=reinterpret_cast(_shared); + float* xpi=xpr+loadg(x,xpr,b,t); + + float* ypr=xpr+((2*x.n1*xn-1)/32+1)*32; + float* ypi=ypr+loadg(y,ypr,b,t); + + float* rpr=ypr+((2*y.n1*yn-1)/32+1)*32; + float* rpi=rpr+loadg(r,rpr,b,t); + + __syncthreads(); + + + for(int xcol=0; xcoll2) upper=l2; + for(int m2=lower; m2<=upper; m2++){ + float c=C_ptr[(m1+l1)*L2+m2+l2]; + const float g_r=_rpr[rn*(m1+m2+l)]; + const float g_i=_rpi[rn*(m1+m2+l)]; + _ypr[yn*(m2+l2)]+=c*(g_r*x_r+g_i*x_i); + _ypi[yn*(m2+l2)]+=c*(-g_r*x_i+g_i*x_r); + } + } + } + __syncthreads(); + } + + __syncthreads(); + saveg(y,ypr,b,t); + +} +*/ diff --git a/cuda/SO3partB_addCGsquare.cu b/cuda/SO3partB_addCGsquare.cu new file mode 100644 index 0000000..27a65d7 --- /dev/null +++ b/cuda/SO3partB_addCGsquare.cu @@ -0,0 +1,156 @@ +/* + * This file is part of GElib, a C++/CUDA library for group equivariant + * tensor operations. + * + * Copyright (c) 2023, Imre Risi Kondor + * + * This source code file is subject to the terms of the noncommercial + * license distributed with GElib in the file NONCOMMERICAL.TXT. Commercial + * use is prohibited. All redistributed versions of this file (in orginal + * or modified form) must retain this copyright notice and must be + * accompanied by a verbatim copy of the license. 
+ * + */ + +#ifndef _SO3partB_addCGsquare_cu +#define _SO3partB_addCGsquare_cu + +#include +#include + +#include "SO3_CGbank.hpp" +#include "Ctensor3_view.hpp" +#include "Ctensor4_view.hpp" +#include "cuda_loaders.cu" + + +extern GElib::SO3_CGbank SO3_cgbank; + + +__global__ void SO3partB_addCGsquare_tiled_kernel(const cnine::Ctensor3_view r, const cnine::Ctensor4_view_t3 x, + const int Cptr, float* cptr_global, const bool preloadCG){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l=(r.n1-1)/2; + int L2=x.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*x.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*x.n1); + else loadf(cptr,cptr_global,x.n1*x.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+x.n1*x.n3; + float* ypr=xpr+((2*x.n1*x.n3-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n3; + + int xs1=x.n3; + int ys1=y.n3; + int rs1=r.s1; + int ytot=(y.n2-1)*y.n3+y.last; + + for(int i=0; i>> + (r,xtiled,Cptr,cptr,preloadCG); + return; + } + + cout<<"error"< +#include + +#include "SO3_CGbank.hpp" +#include "Ctensor3_view.hpp" +#include "Ctensor4_view.hpp" +#include "cuda_loaders.cu" + + +extern GElib::SO3_CGbank SO3_cgbank; + + +__global__ void SO3partB_addCGtransform_kernel(const cnine::Ctensor3_view r, const cnine::Ctensor4_view x, + const int Cptr, float* cptr_global, const bool preloadCG){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(x.n2-1)/2; + int l=(r.n1-1)/2; + //int L2=y.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*x.n2-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*x.n2); + else loadf(cptr,cptr_global,x.n1*x.n2); + }else{ + if(Cptr>=0) 
cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + + +} + + +namespace GElib{ + + + void SO3partB_addCGtransform_cu(cnine::Ctensor3_view r, const cnine::Ctensor4_view& x, + const int offs, const cudaStream_t& stream){ + + const int xl=(x.n1-1)/2; + const int yl=(x.n2-1)/2; + const int l=(r.n1-1)/2; + const int b=r.n0; + + r.arr+=r.s2*offs; + r.arrc+=r.s2*offs; + r.n2=x.n2; + GELIB_CHECK(x.n2==y.n2,"Diag mismatch."); + //GELIB_CHECK(x.n2*y.n2<=1024,"Number of ouput channels can be at most 1024.") + + float* cptr=nullptr; + int Cptr=SO3_cgbank.getfC(xl,yl,l)/4; + if(Cptr<0) cptr=SO3_cgbank.getf(CGindex(xl,yl,l),r.dev).arrg; + int clines=cnine::roundup(x.n1*y.n1,32)/32; + + const int tilesize=std::min(x.n2,32); + cnine::Ctensor4_view_t3 xtiled(x,tilesize); + cnine::Ctensor4_view_t3 ytiled(y,tilesize); + + int nlines=cnine::roundup(xtiled.n1*tilesize*2,32)/32+ + cnine::roundup(ytiled.n1*tilesize*2,32)/32; + + if(nlines<=384){ + bool preloadCG=(nlines+clines<=384); + //preloadCG=false; + SO3partB_addDiagCGproduct_tiled_kernel<<>> + (r,xtiled,ytiled,Cptr,cptr,preloadCG); + return; + } + + cout<<"error"< +#include + +#include "SO3_CGbank.hpp" +#include "Ctensor3_view.hpp" +#include "Ctensor4_view.hpp" +#include "cuda_loaders.cu" + + +extern GElib::SO3_CGbank SO3_cgbank; +//extern long int opcount; + +// Process ncells number of cells in one call +__global__ void SO3partB_addDiagCGproduct_kernel(const cnine::Ctensor3_view r, const cnine::Ctensor3_view x, + const cnine::Ctensor3_view y, const int Cptr, float* cptr_global, const bool preloadCG, const int ncells){ + + bool loadr=false; // does not work because of striding of r + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + const int t0=t/x.n2; // cell selector + const int t1=t%x.n2; // channel selector within cell + const int actual_ncells=min(ncells,r.n0-b*ncells); + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int 
l=(r.n1-1)/2; + int L2=y.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+actual_ncells*x.n1*x.n2; + float* ypr=xpr+((2*actual_ncells*x.n1*x.n2-1)/32+1)*32; + float* ypi=ypr+actual_ncells*y.n1*y.n2; + float* rpr=ypr+((2*actual_ncells*y.n1*y.n2-1)/32+1)*32; + float* rpi=rpr+actual_ncells*r.n1*r.n2; // should be x.n2?? + + int xs1=x.s1/2; + int ys1=y.s1/2; + int rs1=r.s1; // changed! + if(loadr) rs1=r.n2; // changed! + + //if(t==0) printf("%d %d %d\n",r.n1,r.n2,r.s0); + + loadf_strided(xpr,x.arr+b*ncells*x.s0,actual_ncells*x.n1*x.n2,2); + loadf_strided(xpi,x.arrc+b*ncells*x.s0,actual_ncells*x.n1*x.n2,2); + loadf_strided(ypr,y.arr+b*ncells*y.s0,actual_ncells*y.n1*y.n2,2); + loadf_strided(ypi,y.arrc+b*ncells*y.s0,actual_ncells*y.n1*y.n2,2); + if(loadr) + for(int i=0; i(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+x.n1*x.n3; + float* ypr=xpr+((2*x.n1*x.n3-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n3; + + int xs1=x.n3; + int ys1=y.n3; + int rs1=r.s1; + + assert(x.n2==y.n2); + + for(int i=0; i +#include + +#include "SO3_CGbank.hpp" +#include "Ctensor3_view.hpp" +#include "cuda_loaders.cu" + + +extern GElib::SO3_CGbank SO3_cgbank; + + + + +// Process ncells number of cells in one call +__global__ void SO3partB_addDiagCGproduct_back0_kernel(const cnine::Ctensor3_view x, const cnine::Ctensor3_view r, + const cnine::Ctensor3_view y, const int Cptr, float* cptr_global, const bool preloadCG, const int ncells){ + + 
extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + const int t0=t/x.n2; // cell selector + const int t1=t%x.n2; // channel selector within cell + const int actual_ncells=min(ncells,r.n0-b*ncells); + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int L2=y.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+actual_ncells*x.n1*x.n2; + float* ypr=xpr+((2*actual_ncells*x.n1*x.n2-1)/32+1)*32; + float* ypi=ypr+actual_ncells*y.n1*y.n2; + + int xs1=x.s1/2; + int ys1=y.s1/2; + int rs1=r.s1; + + loadf_strided(xpr,x.arr+b*ncells*x.s0,actual_ncells*x.n1*x.n2,2); + loadf_strided(xpi,x.arrc+b*ncells*x.s0,actual_ncells*x.n1*x.n2,2); + loadf_strided(ypr,y.arr+b*ncells*y.s0,actual_ncells*y.n1*y.n2,2); + loadf_strided(ypi,y.arrc+b*ncells*y.s0,actual_ncells*y.n1*y.n2,2); + __syncthreads(); + + + // this handles both the padding of the number of threads to a multiple of 32 + // and the padding of the number of blocks to a multiple of ncells + if(t0l2) upper=l2; + float x_r=0; + float x_i=0; + + for(int m2=lower; m2<=upper; m2++){ + float c=cptr[(m1+l1)*L2+m2+l2]; + const float y_r=_ypr[ys1*(m2+l2)]; + const float y_i=_ypi[ys1*(m2+l2)]; + const float g_r=_rpr[rs1*(m1+m2+l)]; + const float g_i=_rpi[rs1*(m1+m2+l)]; + x_r+=c*(g_r*y_r+g_i*y_i); + x_i+=c*(-g_r*y_i+g_i*y_r); + } + + _xpr[xs1*(m1+l1)]+=x_r; + _xpi[xs1*(m1+l1)]+=x_i; + } + } + + __syncthreads(); + savef_strided(xpr,x.arr+b*ncells*x.s0,actual_ncells*x.n1*x.n2,2); + savef_strided(xpi,x.arrc+b*ncells*x.s0,actual_ncells*x.n1*x.n2,2); + +} + + +__global__ void SO3partB_addDiagCGproduct_back0_tiled_kernel(const cnine::Ctensor4_view_t3 x, const 
cnine::Ctensor3_view r, + const cnine::Ctensor4_view_t3 y, const int Cptr, float* cptr_global, const bool preloadCG){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int L2=y.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+x.n1*x.n3; + float* ypr=xpr+((2*x.n1*x.n3-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n3; + + int xs1=x.n3; + int ys1=y.n3; + int rs1=r.s1; + assert(x.n2==y.n2); + + for(int i=0; il2) upper=l2; + float x_r=0; + float x_i=0; + + float* _ypr=ypr+t; + float* _ypi=ypi+t; + float* _rpr=r.arr+r.s0*b+r.s2*(i*x.n3+t); + float* _rpi=r.arrc+r.s0*b+r.s2*(i*x.n3+t); + + for(int m2=lower; m2<=upper; m2++){ + float c=cptr[(m1+l1)*L2+m2+l2]; + const float y_r=_ypr[ys1*(m2+l2)]; + const float y_i=_ypi[ys1*(m2+l2)]; + const float g_r=_rpr[rs1*(m1+m2+l)]; + const float g_i=_rpi[rs1*(m1+m2+l)]; + x_r+=c*(g_r*y_r+g_i*y_i); + x_i+=c*(-g_r*y_i+g_i*y_r); + } + + _xpr[xs1*(m1+l1)]+=x_r; + _xpi[xs1*(m1+l1)]+=x_i; + } + + }// end t0 && nlines<=384){ + bool preloadCG=(nlines+clines<=384); + //cout<<"Launching addDiagCGproduct_kernel_back0 with ncells="< +#include + +#include "SO3_CGbank.hpp" +#include "Ctensor3_view.hpp" +#include "cuda_loaders.cu" + + +extern GElib::SO3_CGbank SO3_cgbank; + + + +__global__ void SO3partB_addDiagCGproduct_back1_kernel(const cnine::Ctensor3_view y, const cnine::Ctensor3_view r, + const cnine::Ctensor3_view x, const int Cptr, float* cptr_global, const bool preloadCG, const int ncells){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + const int t0=t/x.n2; // cell 
selector + const int t1=t%x.n2; // channel selector within cell + const int actual_ncells=min(ncells,r.n0-b*ncells); + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int L2=y.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+actual_ncells*x.n1*x.n2; + float* ypr=xpr+((2*actual_ncells*x.n1*x.n2-1)/32+1)*32; + float* ypi=ypr+actual_ncells*y.n1*y.n2; + + int xs1=x.s1/2; + int ys1=y.s1/2; + int rs1=r.s1; + + loadf_strided(xpr,x.arr+b*ncells*x.s0,actual_ncells*x.n1*x.n2,2); + loadf_strided(xpi,x.arrc+b*ncells*x.s0,actual_ncells*x.n1*x.n2,2); + loadf_strided(ypr,y.arr+b*ncells*y.s0,actual_ncells*y.n1*y.n2,2); + loadf_strided(ypi,y.arrc+b*ncells*y.s0,actual_ncells*y.n1*y.n2,2); + __syncthreads(); + + + // this handles both the padding of the number of threads to a multiple of 32 + // and the padding of the number of blocks to a multiple of ncells + if(t0l1) upper=l1; + float y_r=0; + float y_i=0; + + for(int m1=lower; m1<=upper; m1++){ + float c=cptr[(m1+l1)*L2+m2+l2]; + const float x_r=_xpr[xs1*(m1+l1)]; + const float x_i=_xpi[xs1*(m1+l1)]; + const float g_r=_rpr[rs1*(m1+m2+l)]; + const float g_i=_rpi[rs1*(m1+m2+l)]; + y_r+=c*(g_r*x_r+g_i*x_i); + y_i+=c*(-g_r*x_i+g_i*x_r); + } + + _ypr[ys1*(m2+l2)]+=y_r; + _ypi[ys1*(m2+l2)]+=y_i; + } + } + + __syncthreads(); + savef_strided(ypr,y.arr+b*ncells*y.s0,actual_ncells*y.n1*y.n2,2); + savef_strided(ypi,y.arrc+b*ncells*y.s0,actual_ncells*y.n1*y.n2,2); + +} + + +__global__ void SO3partB_addDiagCGproduct_back1_tiled_kernel(const cnine::Ctensor4_view_t3 y, const cnine::Ctensor3_view r, + const cnine::Ctensor4_view_t3 x, const int Cptr, float* cptr_global, const bool preloadCG){ + + extern __shared__ 
unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + + int l1=(x.n1-1)/2; + int l2=(y.n1-1)/2; + int l=(r.n1-1)/2; + int L2=y.n1; + + float* cptr; + float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+x.n1*x.n3; + float* ypr=xpr+((2*x.n1*x.n3-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n3; + + int xs1=x.n3; + int ys1=y.n3; + int rs1=r.s1; + assert(x.n2==y.n2); + + + for(int j=0; jl1) upper=l1; + float y_r=0; + float y_i=0; + + float* _xpr=xpr+t; + float* _xpi=xpi+t; + float* _rpr=r.arr+r.s0*b+r.s2*(j*x.n3+t); + float* _rpi=r.arrc+r.s0*b+r.s2*(j*x.n3+t); + + for(int m1=lower; m1<=upper; m1++){ + float c=cptr[(m1+l1)*L2+m2+l2]; + const float x_r=_xpr[xs1*(m1+l1)]; + const float x_i=_xpi[xs1*(m1+l1)]; + const float g_r=_rpr[rs1*(m1+m2+l)]; + const float g_i=_rpi[rs1*(m1+m2+l)]; + y_r+=c*(g_r*x_r+g_i*x_i); + y_i+=c*(-g_r*x_i+g_i*x_r); + } + + _ypr[ys1*(m2+l2)]+=y_r; + _ypi[ys1*(m2+l2)]+=y_i; + + } + + }// end t0 && nlines<=384){ + bool preloadCG=(nlines+clines<=384); + //cout<<"Launching addDiagCGproduct_kernel_back1 with ncells="<<<>>(r,x,y); break; + } + break; + case 1: + switch(l){ + case 1: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + } + break; + case 2: + switch(l){ + case 2: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + } + break; + } + + break; + case 1: + switch(l2){ + case 0: + switch(l){ + case 1: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + } + break; + case 1: + switch(l){ + case 0: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 1: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 2: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + } + break; + case 2: + switch(l){ + case 1: 
SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 2: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 3: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + } + break; + } + + break; + case 2: + switch(l2){ + case 0: + switch(l){ + case 2: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + } + break; + case 1: + switch(l){ + case 1: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 2: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 3: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + } + break; + case 2: + switch(l){ + case 0: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 1: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 2: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 3: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + case 4: SO3part_addCGproduct_explicit<<>>(r,x,y); break; + } + break; + } + + break; + } diff --git a/cuda/SO3part_addCGproduct_subkernels.inc b/cuda/SO3part_addCGproduct_subkernels.inc new file mode 100644 index 0000000..7cc45c9 --- /dev/null +++ b/cuda/SO3part_addCGproduct_subkernels.inc @@ -0,0 +1,609 @@ +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_0_0_0(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (1.000000f)*(xpr[0]*ypr[0*ys]-xpi[0]*ypi[0*ys]); + rpi[0*rs]+= + (1.000000f)*(xpr[0]*ypi[0*ys]+xpi[0]*ypr[0*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_0_1_1(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (1.000000f)*(xpr[0]*ypr[0*ys]-xpi[0]*ypi[0*ys]); + rpi[0*rs]+= + (1.000000f)*(xpr[0]*ypi[0*ys]+xpi[0]*ypr[0*ys]); + rpr[1*rs]+= + (1.000000f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys]); + rpi[1*rs]+= + (1.000000f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys]); + rpr[2*rs]+= + (1.000000f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys]); + rpi[2*rs]+= + 
(1.000000f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_0_2_2(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (1.000000f)*(xpr[0]*ypr[0*ys]-xpi[0]*ypi[0*ys]); + rpi[0*rs]+= + (1.000000f)*(xpr[0]*ypi[0*ys]+xpi[0]*ypr[0*ys]); + rpr[1*rs]+= + (1.000000f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys]); + rpi[1*rs]+= + (1.000000f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys]); + rpr[2*rs]+= + (1.000000f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys]); + rpi[2*rs]+= + (1.000000f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys]); + rpr[3*rs]+= + (1.000000f)*(xpr[0]*ypr[3*ys]-xpi[0]*ypi[3*ys]); + rpi[3*rs]+= + (1.000000f)*(xpr[0]*ypi[3*ys]+xpi[0]*ypr[3*ys]); + rpr[4*rs]+= + (1.000000f)*(xpr[0]*ypr[4*ys]-xpi[0]*ypi[4*ys]); + rpi[4*rs]+= + (1.000000f)*(xpr[0]*ypi[4*ys]+xpi[0]*ypr[4*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_1_0_1(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (1.000000f)*(xpr[0]*ypr[0*ys]-xpi[0]*ypi[0*ys]); + rpi[0*rs]+= + (1.000000f)*(xpr[0]*ypi[0*ys]+xpi[0]*ypr[0*ys]); + rpr[1*rs]+= + (1.000000f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[1*rs]+= + (1.000000f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); + rpr[2*rs]+= + (1.000000f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[2*rs]+= + (1.000000f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_1_1_0(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (0.577350f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (-0.577350f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.577350f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[0*rs]+= + (0.577350f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (-0.577350f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + 
(0.577350f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_1_1_1(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (-0.707107f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys])+ + (0.707107f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[0*rs]+= + (-0.707107f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys])+ + (0.707107f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); + rpr[1*rs]+= + (-0.707107f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (0.000000f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.707107f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[1*rs]+= + (-0.707107f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (0.000000f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.707107f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[2*rs]+= + (-0.707107f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (0.707107f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys]); + rpi[2*rs]+= + (-0.707107f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (0.707107f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_1_1_2(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (1.000000f)*(xpr[0]*ypr[0*ys]-xpi[0]*ypi[0*ys]); + rpi[0*rs]+= + (1.000000f)*(xpr[0]*ypi[0*ys]+xpi[0]*ypr[0*ys]); + rpr[1*rs]+= + (0.707107f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys])+ + (0.707107f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[1*rs]+= + (0.707107f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys])+ + (0.707107f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); + rpr[2*rs]+= + (0.408248f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (0.816497f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.408248f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[2*rs]+= + (0.408248f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (0.816497f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.408248f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[3*rs]+= + 
(0.707107f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (0.707107f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys]); + rpi[3*rs]+= + (0.707107f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (0.707107f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys]); + rpr[4*rs]+= + (1.000000f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys]); + rpi[4*rs]+= + (1.000000f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_1_2_1(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (0.316228f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (-0.547723f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.774597f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[0*rs]+= + (0.316228f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (-0.547723f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.774597f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[1*rs]+= + (0.547723f)*(xpr[0]*ypr[3*ys]-xpi[0]*ypi[3*ys])+ + (-0.632456f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (0.547723f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys]); + rpi[1*rs]+= + (0.547723f)*(xpr[0]*ypi[3*ys]+xpi[0]*ypr[3*ys])+ + (-0.632456f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (0.547723f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys]); + rpr[2*rs]+= + (0.774597f)*(xpr[0]*ypr[4*ys]-xpi[0]*ypi[4*ys])+ + (-0.547723f)*(xpr[1]*ypr[3*ys]-xpi[1]*ypi[3*ys])+ + (0.316228f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys]); + rpi[2*rs]+= + (0.774597f)*(xpr[0]*ypi[4*ys]+xpi[0]*ypr[4*ys])+ + (-0.547723f)*(xpr[1]*ypi[3*ys]+xpi[1]*ypr[3*ys])+ + (0.316228f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_1_2_2(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (-0.577350f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys])+ + (0.816497f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[0*rs]+= + (-0.577350f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys])+ + (0.816497f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); 
+ rpr[1*rs]+= + (-0.707107f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (0.408248f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.577350f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[1*rs]+= + (-0.707107f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (0.408248f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.577350f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[2*rs]+= + (-0.707107f)*(xpr[0]*ypr[3*ys]-xpi[0]*ypi[3*ys])+ + (0.000000f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (0.707107f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys]); + rpi[2*rs]+= + (-0.707107f)*(xpr[0]*ypi[3*ys]+xpi[0]*ypr[3*ys])+ + (0.000000f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (0.707107f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys]); + rpr[3*rs]+= + (-0.577350f)*(xpr[0]*ypr[4*ys]-xpi[0]*ypi[4*ys])+ + (-0.408248f)*(xpr[1]*ypr[3*ys]-xpi[1]*ypi[3*ys])+ + (0.707107f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys]); + rpi[3*rs]+= + (-0.577350f)*(xpr[0]*ypi[4*ys]+xpi[0]*ypr[4*ys])+ + (-0.408248f)*(xpr[1]*ypi[3*ys]+xpi[1]*ypr[3*ys])+ + (0.707107f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys]); + rpr[4*rs]+= + (-0.816497f)*(xpr[1]*ypr[4*ys]-xpi[1]*ypi[4*ys])+ + (0.577350f)*(xpr[2]*ypr[3*ys]-xpi[2]*ypi[3*ys]); + rpi[4*rs]+= + (-0.816497f)*(xpr[1]*ypi[4*ys]+xpi[1]*ypr[4*ys])+ + (0.577350f)*(xpr[2]*ypi[3*ys]+xpi[2]*ypr[3*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_1_2_3(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (1.000000f)*(xpr[0]*ypr[0*ys]-xpi[0]*ypi[0*ys]); + rpi[0*rs]+= + (1.000000f)*(xpr[0]*ypi[0*ys]+xpi[0]*ypr[0*ys]); + rpr[1*rs]+= + (0.816497f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys])+ + (0.577350f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[1*rs]+= + (0.816497f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys])+ + (0.577350f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); + rpr[2*rs]+= + (0.632456f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (0.730297f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.258199f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[2*rs]+= + 
(0.632456f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (0.730297f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.258199f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[3*rs]+= + (0.447214f)*(xpr[0]*ypr[3*ys]-xpi[0]*ypi[3*ys])+ + (0.774597f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (0.447214f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys]); + rpi[3*rs]+= + (0.447214f)*(xpr[0]*ypi[3*ys]+xpi[0]*ypr[3*ys])+ + (0.774597f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (0.447214f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys]); + rpr[4*rs]+= + (0.258199f)*(xpr[0]*ypr[4*ys]-xpi[0]*ypi[4*ys])+ + (0.730297f)*(xpr[1]*ypr[3*ys]-xpi[1]*ypi[3*ys])+ + (0.632456f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys]); + rpi[4*rs]+= + (0.258199f)*(xpr[0]*ypi[4*ys]+xpi[0]*ypr[4*ys])+ + (0.730297f)*(xpr[1]*ypi[3*ys]+xpi[1]*ypr[3*ys])+ + (0.632456f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys]); + rpr[5*rs]+= + (0.577350f)*(xpr[1]*ypr[4*ys]-xpi[1]*ypi[4*ys])+ + (0.816497f)*(xpr[2]*ypr[3*ys]-xpi[2]*ypi[3*ys]); + rpi[5*rs]+= + (0.577350f)*(xpr[1]*ypi[4*ys]+xpi[1]*ypr[4*ys])+ + (0.816497f)*(xpr[2]*ypi[3*ys]+xpi[2]*ypr[3*ys]); + rpr[6*rs]+= + (1.000000f)*(xpr[2]*ypr[4*ys]-xpi[2]*ypi[4*ys]); + rpi[6*rs]+= + (1.000000f)*(xpr[2]*ypi[4*ys]+xpi[2]*ypr[4*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_2_0_2(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (1.000000f)*(xpr[0]*ypr[0*ys]-xpi[0]*ypi[0*ys]); + rpi[0*rs]+= + (1.000000f)*(xpr[0]*ypi[0*ys]+xpi[0]*ypr[0*ys]); + rpr[1*rs]+= + (1.000000f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[1*rs]+= + (1.000000f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); + rpr[2*rs]+= + (1.000000f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[2*rs]+= + (1.000000f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[3*rs]+= + (1.000000f)*(xpr[3]*ypr[0*ys]-xpi[3]*ypi[0*ys]); + rpi[3*rs]+= + (1.000000f)*(xpr[3]*ypi[0*ys]+xpi[3]*ypr[0*ys]); + rpr[4*rs]+= + (1.000000f)*(xpr[4]*ypr[0*ys]-xpi[4]*ypi[0*ys]); + rpi[4*rs]+= + 
(1.000000f)*(xpr[4]*ypi[0*ys]+xpi[4]*ypr[0*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_2_1_1(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (0.774597f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (-0.547723f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.316228f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[0*rs]+= + (0.774597f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (-0.547723f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.316228f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[1*rs]+= + (0.547723f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (-0.632456f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys])+ + (0.547723f)*(xpr[3]*ypr[0*ys]-xpi[3]*ypi[0*ys]); + rpi[1*rs]+= + (0.547723f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (-0.632456f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys])+ + (0.547723f)*(xpr[3]*ypi[0*ys]+xpi[3]*ypr[0*ys]); + rpr[2*rs]+= + (0.316228f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys])+ + (-0.547723f)*(xpr[3]*ypr[1*ys]-xpi[3]*ypi[1*ys])+ + (0.774597f)*(xpr[4]*ypr[0*ys]-xpi[4]*ypi[0*ys]); + rpi[2*rs]+= + (0.316228f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys])+ + (-0.547723f)*(xpr[3]*ypi[1*ys]+xpi[3]*ypr[1*ys])+ + (0.774597f)*(xpr[4]*ypi[0*ys]+xpi[4]*ypr[0*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_2_1_2(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (-0.816497f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys])+ + (0.577350f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[0*rs]+= + (-0.816497f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys])+ + (0.577350f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); + rpr[1*rs]+= + (-0.577350f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (-0.408248f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.707107f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[1*rs]+= + (-0.577350f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (-0.408248f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + 
(0.707107f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[2*rs]+= + (-0.707107f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (0.000000f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys])+ + (0.707107f)*(xpr[3]*ypr[0*ys]-xpi[3]*ypi[0*ys]); + rpi[2*rs]+= + (-0.707107f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (0.000000f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys])+ + (0.707107f)*(xpr[3]*ypi[0*ys]+xpi[3]*ypr[0*ys]); + rpr[3*rs]+= + (-0.707107f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys])+ + (0.408248f)*(xpr[3]*ypr[1*ys]-xpi[3]*ypi[1*ys])+ + (0.577350f)*(xpr[4]*ypr[0*ys]-xpi[4]*ypi[0*ys]); + rpi[3*rs]+= + (-0.707107f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys])+ + (0.408248f)*(xpr[3]*ypi[1*ys]+xpi[3]*ypr[1*ys])+ + (0.577350f)*(xpr[4]*ypi[0*ys]+xpi[4]*ypr[0*ys]); + rpr[4*rs]+= + (-0.577350f)*(xpr[3]*ypr[2*ys]-xpi[3]*ypi[2*ys])+ + (0.816497f)*(xpr[4]*ypr[1*ys]-xpi[4]*ypi[1*ys]); + rpi[4*rs]+= + (-0.577350f)*(xpr[3]*ypi[2*ys]+xpi[3]*ypr[2*ys])+ + (0.816497f)*(xpr[4]*ypi[1*ys]+xpi[4]*ypr[1*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_2_1_3(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (1.000000f)*(xpr[0]*ypr[0*ys]-xpi[0]*ypi[0*ys]); + rpi[0*rs]+= + (1.000000f)*(xpr[0]*ypi[0*ys]+xpi[0]*ypr[0*ys]); + rpr[1*rs]+= + (0.577350f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys])+ + (0.816497f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[1*rs]+= + (0.577350f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys])+ + (0.816497f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); + rpr[2*rs]+= + (0.258199f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (0.730297f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.632456f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[2*rs]+= + (0.258199f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (0.730297f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.632456f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[3*rs]+= + (0.447214f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (0.774597f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys])+ + 
(0.447214f)*(xpr[3]*ypr[0*ys]-xpi[3]*ypi[0*ys]); + rpi[3*rs]+= + (0.447214f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (0.774597f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys])+ + (0.447214f)*(xpr[3]*ypi[0*ys]+xpi[3]*ypr[0*ys]); + rpr[4*rs]+= + (0.632456f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys])+ + (0.730297f)*(xpr[3]*ypr[1*ys]-xpi[3]*ypi[1*ys])+ + (0.258199f)*(xpr[4]*ypr[0*ys]-xpi[4]*ypi[0*ys]); + rpi[4*rs]+= + (0.632456f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys])+ + (0.730297f)*(xpr[3]*ypi[1*ys]+xpi[3]*ypr[1*ys])+ + (0.258199f)*(xpr[4]*ypi[0*ys]+xpi[4]*ypr[0*ys]); + rpr[5*rs]+= + (0.816497f)*(xpr[3]*ypr[2*ys]-xpi[3]*ypi[2*ys])+ + (0.577350f)*(xpr[4]*ypr[1*ys]-xpi[4]*ypi[1*ys]); + rpi[5*rs]+= + (0.816497f)*(xpr[3]*ypi[2*ys]+xpi[3]*ypr[2*ys])+ + (0.577350f)*(xpr[4]*ypi[1*ys]+xpi[4]*ypr[1*ys]); + rpr[6*rs]+= + (1.000000f)*(xpr[4]*ypr[2*ys]-xpi[4]*ypi[2*ys]); + rpi[6*rs]+= + (1.000000f)*(xpr[4]*ypi[2*ys]+xpi[4]*ypr[2*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_2_2_0(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (0.447214f)*(xpr[0]*ypr[4*ys]-xpi[0]*ypi[4*ys])+ + (-0.447214f)*(xpr[1]*ypr[3*ys]-xpi[1]*ypi[3*ys])+ + (0.447214f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys])+ + (-0.447214f)*(xpr[3]*ypr[1*ys]-xpi[3]*ypi[1*ys])+ + (0.447214f)*(xpr[4]*ypr[0*ys]-xpi[4]*ypi[0*ys]); + rpi[0*rs]+= + (0.447214f)*(xpr[0]*ypi[4*ys]+xpi[0]*ypr[4*ys])+ + (-0.447214f)*(xpr[1]*ypi[3*ys]+xpi[1]*ypr[3*ys])+ + (0.447214f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys])+ + (-0.447214f)*(xpr[3]*ypi[1*ys]+xpi[3]*ypr[1*ys])+ + (0.447214f)*(xpr[4]*ypi[0*ys]+xpi[4]*ypr[0*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_2_2_1(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (-0.447214f)*(xpr[0]*ypr[3*ys]-xpi[0]*ypi[3*ys])+ + (0.547723f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + 
(-0.547723f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys])+ + (0.447214f)*(xpr[3]*ypr[0*ys]-xpi[3]*ypi[0*ys]); + rpi[0*rs]+= + (-0.447214f)*(xpr[0]*ypi[3*ys]+xpi[0]*ypr[3*ys])+ + (0.547723f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (-0.547723f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys])+ + (0.447214f)*(xpr[3]*ypi[0*ys]+xpi[3]*ypr[0*ys]); + rpr[1*rs]+= + (-0.632456f)*(xpr[0]*ypr[4*ys]-xpi[0]*ypi[4*ys])+ + (0.316228f)*(xpr[1]*ypr[3*ys]-xpi[1]*ypi[3*ys])+ + (0.000000f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys])+ + (-0.316228f)*(xpr[3]*ypr[1*ys]-xpi[3]*ypi[1*ys])+ + (0.632456f)*(xpr[4]*ypr[0*ys]-xpi[4]*ypi[0*ys]); + rpi[1*rs]+= + (-0.632456f)*(xpr[0]*ypi[4*ys]+xpi[0]*ypr[4*ys])+ + (0.316228f)*(xpr[1]*ypi[3*ys]+xpi[1]*ypr[3*ys])+ + (0.000000f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys])+ + (-0.316228f)*(xpr[3]*ypi[1*ys]+xpi[3]*ypr[1*ys])+ + (0.632456f)*(xpr[4]*ypi[0*ys]+xpi[4]*ypr[0*ys]); + rpr[2*rs]+= + (-0.447214f)*(xpr[1]*ypr[4*ys]-xpi[1]*ypi[4*ys])+ + (0.547723f)*(xpr[2]*ypr[3*ys]-xpi[2]*ypi[3*ys])+ + (-0.547723f)*(xpr[3]*ypr[2*ys]-xpi[3]*ypi[2*ys])+ + (0.447214f)*(xpr[4]*ypr[1*ys]-xpi[4]*ypi[1*ys]); + rpi[2*rs]+= + (-0.447214f)*(xpr[1]*ypi[4*ys]+xpi[1]*ypr[4*ys])+ + (0.547723f)*(xpr[2]*ypi[3*ys]+xpi[2]*ypr[3*ys])+ + (-0.547723f)*(xpr[3]*ypi[2*ys]+xpi[3]*ypr[2*ys])+ + (0.447214f)*(xpr[4]*ypi[1*ys]+xpi[4]*ypr[1*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_2_2_2(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (0.534522f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (-0.654654f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.534522f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[0*rs]+= + (0.534522f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (-0.654654f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.534522f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[1*rs]+= + (0.654654f)*(xpr[0]*ypr[3*ys]-xpi[0]*ypi[3*ys])+ + (-0.267261f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + 
(-0.267261f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys])+ + (0.654654f)*(xpr[3]*ypr[0*ys]-xpi[3]*ypi[0*ys]); + rpi[1*rs]+= + (0.654654f)*(xpr[0]*ypi[3*ys]+xpi[0]*ypr[3*ys])+ + (-0.267261f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (-0.267261f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys])+ + (0.654654f)*(xpr[3]*ypi[0*ys]+xpi[3]*ypr[0*ys]); + rpr[2*rs]+= + (0.534522f)*(xpr[0]*ypr[4*ys]-xpi[0]*ypi[4*ys])+ + (0.267261f)*(xpr[1]*ypr[3*ys]-xpi[1]*ypi[3*ys])+ + (-0.534522f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys])+ + (0.267261f)*(xpr[3]*ypr[1*ys]-xpi[3]*ypi[1*ys])+ + (0.534522f)*(xpr[4]*ypr[0*ys]-xpi[4]*ypi[0*ys]); + rpi[2*rs]+= + (0.534522f)*(xpr[0]*ypi[4*ys]+xpi[0]*ypr[4*ys])+ + (0.267261f)*(xpr[1]*ypi[3*ys]+xpi[1]*ypr[3*ys])+ + (-0.534522f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys])+ + (0.267261f)*(xpr[3]*ypi[1*ys]+xpi[3]*ypr[1*ys])+ + (0.534522f)*(xpr[4]*ypi[0*ys]+xpi[4]*ypr[0*ys]); + rpr[3*rs]+= + (0.654654f)*(xpr[1]*ypr[4*ys]-xpi[1]*ypi[4*ys])+ + (-0.267261f)*(xpr[2]*ypr[3*ys]-xpi[2]*ypi[3*ys])+ + (-0.267261f)*(xpr[3]*ypr[2*ys]-xpi[3]*ypi[2*ys])+ + (0.654654f)*(xpr[4]*ypr[1*ys]-xpi[4]*ypi[1*ys]); + rpi[3*rs]+= + (0.654654f)*(xpr[1]*ypi[4*ys]+xpi[1]*ypr[4*ys])+ + (-0.267261f)*(xpr[2]*ypi[3*ys]+xpi[2]*ypr[3*ys])+ + (-0.267261f)*(xpr[3]*ypi[2*ys]+xpi[3]*ypr[2*ys])+ + (0.654654f)*(xpr[4]*ypi[1*ys]+xpi[4]*ypr[1*ys]); + rpr[4*rs]+= + (0.534522f)*(xpr[2]*ypr[4*ys]-xpi[2]*ypi[4*ys])+ + (-0.654654f)*(xpr[3]*ypr[3*ys]-xpi[3]*ypi[3*ys])+ + (0.534522f)*(xpr[4]*ypr[2*ys]-xpi[4]*ypi[2*ys]); + rpi[4*rs]+= + (0.534522f)*(xpr[2]*ypi[4*ys]+xpi[2]*ypr[4*ys])+ + (-0.654654f)*(xpr[3]*ypi[3*ys]+xpi[3]*ypr[3*ys])+ + (0.534522f)*(xpr[4]*ypi[2*ys]+xpi[4]*ypr[2*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_2_2_3(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (-0.707107f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys])+ + (0.707107f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[0*rs]+= + 
(-0.707107f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys])+ + (0.707107f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); + rpr[1*rs]+= + (-0.707107f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (0.000000f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.707107f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[1*rs]+= + (-0.707107f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (0.000000f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.707107f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[2*rs]+= + (-0.547723f)*(xpr[0]*ypr[3*ys]-xpi[0]*ypi[3*ys])+ + (-0.447214f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (0.447214f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys])+ + (0.547723f)*(xpr[3]*ypr[0*ys]-xpi[3]*ypi[0*ys]); + rpi[2*rs]+= + (-0.547723f)*(xpr[0]*ypi[3*ys]+xpi[0]*ypr[3*ys])+ + (-0.447214f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (0.447214f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys])+ + (0.547723f)*(xpr[3]*ypi[0*ys]+xpi[3]*ypr[0*ys]); + rpr[3*rs]+= + (-0.316228f)*(xpr[0]*ypr[4*ys]-xpi[0]*ypi[4*ys])+ + (-0.632456f)*(xpr[1]*ypr[3*ys]-xpi[1]*ypi[3*ys])+ + (0.000000f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys])+ + (0.632456f)*(xpr[3]*ypr[1*ys]-xpi[3]*ypi[1*ys])+ + (0.316228f)*(xpr[4]*ypr[0*ys]-xpi[4]*ypi[0*ys]); + rpi[3*rs]+= + (-0.316228f)*(xpr[0]*ypi[4*ys]+xpi[0]*ypr[4*ys])+ + (-0.632456f)*(xpr[1]*ypi[3*ys]+xpi[1]*ypr[3*ys])+ + (0.000000f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys])+ + (0.632456f)*(xpr[3]*ypi[1*ys]+xpi[3]*ypr[1*ys])+ + (0.316228f)*(xpr[4]*ypi[0*ys]+xpi[4]*ypr[0*ys]); + rpr[4*rs]+= + (-0.547723f)*(xpr[1]*ypr[4*ys]-xpi[1]*ypi[4*ys])+ + (-0.447214f)*(xpr[2]*ypr[3*ys]-xpi[2]*ypi[3*ys])+ + (0.447214f)*(xpr[3]*ypr[2*ys]-xpi[3]*ypi[2*ys])+ + (0.547723f)*(xpr[4]*ypr[1*ys]-xpi[4]*ypi[1*ys]); + rpi[4*rs]+= + (-0.547723f)*(xpr[1]*ypi[4*ys]+xpi[1]*ypr[4*ys])+ + (-0.447214f)*(xpr[2]*ypi[3*ys]+xpi[2]*ypr[3*ys])+ + (0.447214f)*(xpr[3]*ypi[2*ys]+xpi[3]*ypr[2*ys])+ + (0.547723f)*(xpr[4]*ypi[1*ys]+xpi[4]*ypr[1*ys]); + rpr[5*rs]+= + (-0.707107f)*(xpr[2]*ypr[4*ys]-xpi[2]*ypi[4*ys])+ + (0.000000f)*(xpr[3]*ypr[3*ys]-xpi[3]*ypi[3*ys])+ + 
(0.707107f)*(xpr[4]*ypr[2*ys]-xpi[4]*ypi[2*ys]); + rpi[5*rs]+= + (-0.707107f)*(xpr[2]*ypi[4*ys]+xpi[2]*ypr[4*ys])+ + (0.000000f)*(xpr[3]*ypi[3*ys]+xpi[3]*ypr[3*ys])+ + (0.707107f)*(xpr[4]*ypi[2*ys]+xpi[4]*ypr[2*ys]); + rpr[6*rs]+= + (-0.707107f)*(xpr[3]*ypr[4*ys]-xpi[3]*ypi[4*ys])+ + (0.707107f)*(xpr[4]*ypr[3*ys]-xpi[4]*ypi[3*ys]); + rpi[6*rs]+= + (-0.707107f)*(xpr[3]*ypi[4*ys]+xpi[3]*ypr[4*ys])+ + (0.707107f)*(xpr[4]*ypi[3*ys]+xpi[4]*ypr[3*ys]); +} + +__forceinline__ __device__ void SO3part_addCGproduct_explicit_kernel_2_2_4(const float* xpr, const float* xpi, const float* ypr, const float* ypi, const int ys, float* rpr, float* rpi, const int rs){ + rpr[0*rs]+= + (1.000000f)*(xpr[0]*ypr[0*ys]-xpi[0]*ypi[0*ys]); + rpi[0*rs]+= + (1.000000f)*(xpr[0]*ypi[0*ys]+xpi[0]*ypr[0*ys]); + rpr[1*rs]+= + (0.707107f)*(xpr[0]*ypr[1*ys]-xpi[0]*ypi[1*ys])+ + (0.707107f)*(xpr[1]*ypr[0*ys]-xpi[1]*ypi[0*ys]); + rpi[1*rs]+= + (0.707107f)*(xpr[0]*ypi[1*ys]+xpi[0]*ypr[1*ys])+ + (0.707107f)*(xpr[1]*ypi[0*ys]+xpi[1]*ypr[0*ys]); + rpr[2*rs]+= + (0.462910f)*(xpr[0]*ypr[2*ys]-xpi[0]*ypi[2*ys])+ + (0.755929f)*(xpr[1]*ypr[1*ys]-xpi[1]*ypi[1*ys])+ + (0.462910f)*(xpr[2]*ypr[0*ys]-xpi[2]*ypi[0*ys]); + rpi[2*rs]+= + (0.462910f)*(xpr[0]*ypi[2*ys]+xpi[0]*ypr[2*ys])+ + (0.755929f)*(xpr[1]*ypi[1*ys]+xpi[1]*ypr[1*ys])+ + (0.462910f)*(xpr[2]*ypi[0*ys]+xpi[2]*ypr[0*ys]); + rpr[3*rs]+= + (0.267261f)*(xpr[0]*ypr[3*ys]-xpi[0]*ypi[3*ys])+ + (0.654654f)*(xpr[1]*ypr[2*ys]-xpi[1]*ypi[2*ys])+ + (0.654654f)*(xpr[2]*ypr[1*ys]-xpi[2]*ypi[1*ys])+ + (0.267261f)*(xpr[3]*ypr[0*ys]-xpi[3]*ypi[0*ys]); + rpi[3*rs]+= + (0.267261f)*(xpr[0]*ypi[3*ys]+xpi[0]*ypr[3*ys])+ + (0.654654f)*(xpr[1]*ypi[2*ys]+xpi[1]*ypr[2*ys])+ + (0.654654f)*(xpr[2]*ypi[1*ys]+xpi[2]*ypr[1*ys])+ + (0.267261f)*(xpr[3]*ypi[0*ys]+xpi[3]*ypr[0*ys]); + rpr[4*rs]+= + (0.119523f)*(xpr[0]*ypr[4*ys]-xpi[0]*ypi[4*ys])+ + (0.478091f)*(xpr[1]*ypr[3*ys]-xpi[1]*ypi[3*ys])+ + (0.717137f)*(xpr[2]*ypr[2*ys]-xpi[2]*ypi[2*ys])+ + 
(0.478091f)*(xpr[3]*ypr[1*ys]-xpi[3]*ypi[1*ys])+ + (0.119523f)*(xpr[4]*ypr[0*ys]-xpi[4]*ypi[0*ys]); + rpi[4*rs]+= + (0.119523f)*(xpr[0]*ypi[4*ys]+xpi[0]*ypr[4*ys])+ + (0.478091f)*(xpr[1]*ypi[3*ys]+xpi[1]*ypr[3*ys])+ + (0.717137f)*(xpr[2]*ypi[2*ys]+xpi[2]*ypr[2*ys])+ + (0.478091f)*(xpr[3]*ypi[1*ys]+xpi[3]*ypr[1*ys])+ + (0.119523f)*(xpr[4]*ypi[0*ys]+xpi[4]*ypr[0*ys]); + rpr[5*rs]+= + (0.267261f)*(xpr[1]*ypr[4*ys]-xpi[1]*ypi[4*ys])+ + (0.654654f)*(xpr[2]*ypr[3*ys]-xpi[2]*ypi[3*ys])+ + (0.654654f)*(xpr[3]*ypr[2*ys]-xpi[3]*ypi[2*ys])+ + (0.267261f)*(xpr[4]*ypr[1*ys]-xpi[4]*ypi[1*ys]); + rpi[5*rs]+= + (0.267261f)*(xpr[1]*ypi[4*ys]+xpi[1]*ypr[4*ys])+ + (0.654654f)*(xpr[2]*ypi[3*ys]+xpi[2]*ypr[3*ys])+ + (0.654654f)*(xpr[3]*ypi[2*ys]+xpi[3]*ypr[2*ys])+ + (0.267261f)*(xpr[4]*ypi[1*ys]+xpi[4]*ypr[1*ys]); + rpr[6*rs]+= + (0.462910f)*(xpr[2]*ypr[4*ys]-xpi[2]*ypi[4*ys])+ + (0.755929f)*(xpr[3]*ypr[3*ys]-xpi[3]*ypi[3*ys])+ + (0.462910f)*(xpr[4]*ypr[2*ys]-xpi[4]*ypi[2*ys]); + rpi[6*rs]+= + (0.462910f)*(xpr[2]*ypi[4*ys]+xpi[2]*ypr[4*ys])+ + (0.755929f)*(xpr[3]*ypi[3*ys]+xpi[3]*ypr[3*ys])+ + (0.462910f)*(xpr[4]*ypi[2*ys]+xpi[4]*ypr[2*ys]); + rpr[7*rs]+= + (0.707107f)*(xpr[3]*ypr[4*ys]-xpi[3]*ypi[4*ys])+ + (0.707107f)*(xpr[4]*ypr[3*ys]-xpi[4]*ypi[3*ys]); + rpi[7*rs]+= + (0.707107f)*(xpr[3]*ypi[4*ys]+xpi[3]*ypr[4*ys])+ + (0.707107f)*(xpr[4]*ypi[3*ys]+xpi[4]*ypr[3*ys]); + rpr[8*rs]+= + (1.000000f)*(xpr[4]*ypr[4*ys]-xpi[4]*ypi[4*ys]); + rpi[8*rs]+= + (1.000000f)*(xpr[4]*ypi[4*ys]+xpi[4]*ypr[4*ys]); +} + diff --git a/cuda/SO3part_addCGtransform.cu b/cuda/SO3part_addCGtransform.cu new file mode 100644 index 0000000..b0314e0 --- /dev/null +++ b/cuda/SO3part_addCGtransform.cu @@ -0,0 +1,238 @@ +/* + * This file is part of GElib, a C++/CUDA library for group equivariant + * tensor operations. + * + * Copyright (c) 2023, Imre Risi Kondor + * + * This source code file is subject to the terms of the noncommercial + * license distributed with GElib in the file NONCOMMERICAL.TXT. 
Commercial + * use is prohibited. All redistributed versions of this file (in orginal + * or modified form) must retain this copyright notice and must be + * accompanied by a verbatim copy of the license. + * + */ + +#ifndef _SO3part_addCGtransform_cu +#define _SO3part_addCGtransform_cu + +#include +#include + +#include "SO3_CGbank.hpp" +#include "Ctensor3_view.hpp" +#include "Ctensor4_view.hpp" +#include "cuda_loaders.cu" + + +extern GElib::SO3_CGbank SO3_cgbank; +//extern long int opcount; + +// Process ncells number of cells in one call +__global__ void SO3part_addCGtransform_kernel(const cnine::Ctensor3_view r, const cnine::Ctensor4_view x, + const int Cptr, float* cptr_global, const bool preloadCG, const int ncells){ + + extern __shared__ unsigned char _shared[]; + const int b=blockIdx.x; + const int t=threadIdx.x; + const int t0=t/x.n3; // cell selector + const int t1=t%x.n3; // channel selector within cell + const int actual_ncells=min(ncells,r.n0-b*ncells); + + int l1=(x.n1-1)/2; + int l2=(x.n2-1)/2; + int l=(r.n1-1)/2; + int L2=x.n2; + + float* cptr; + //float* xpr; + if(preloadCG){ + cptr=reinterpret_cast(_shared); + //xpr=cptr+((x.n1*x.n2-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*x.n2); + else loadf(cptr,cptr_global,x.n1*x.n2); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + //xpr=reinterpret_cast(_shared); + } + + //loadf(xpr,x.arr+b*ncells*x.s0,actual_ncells*x.n1*x.n2*x.n3); + __syncthreads(); + + if(t0(_shared); + xpr=cptr+((x.n1*y.n1-1)/32+1)*32; + if(Cptr>=0) loadf(cptr,reinterpret_cast(cg_cmem)+Cptr,x.n1*y.n1); + else loadf(cptr,cptr_global,x.n1*y.n1); + }else{ + if(Cptr>=0) cptr=reinterpret_cast(cg_cmem)+Cptr; + else cptr=cptr_global; + xpr=reinterpret_cast(_shared); + } + + float* xpi=xpr+x.n1*x.n3; + float* ypr=xpr+((2*x.n1*x.n3-1)/32+1)*32; + float* ypi=ypr+y.n1*y.n3; + + int xs1=x.n3; + int ys1=y.n3; + int rs1=r.s1; + + assert(x.n2==y.n2); + + for(int i=0; i0 && 
nlines<=384){ + bool preloadCG=(nlines+clines<=384); + SO3part_addCGtransform_kernel<<>> + (r,x,Cptr,cptr,preloadCG,ncells); + return; + } + } + + + /* + // Otherwise tile the inputs to chunks of width 32 + const int tilesize=std::min(x.n2,32); + cnine::Ctensor4_view_t3 xtiled(x,tilesize); + cnine::Ctensor4_view_t3 ytiled(y,tilesize); + int nlines=cnine::roundup(xtiled.n1*tilesize*2,32)/32+ + cnine::roundup(ytiled.n1*tilesize*2,32)/32; + + if(nlines<=384){ + bool preloadCG=(nlines+clines<=384); + SO3part_addCGtransform_tiled_kernel<<>> + (r,xtiled,ytiled,Cptr,cptr,preloadCG); + return; + } + */ + + GELIB_ERROR("Inputs too large to load in shared memory."); + } + + +} + + +#endif + + + diff --git a/cuda/cuda_loaders.cu b/cuda/cuda_loaders.cu new file mode 100644 index 0000000..b4a75bc --- /dev/null +++ b/cuda/cuda_loaders.cu @@ -0,0 +1,189 @@ +/* + * This file is part of GElib, a C++/CUDA library for group equivariant + * tensor operations. + * + * Copyright (c) 2023, Imre Risi Kondor + * + * This source code file is subject to the terms of the noncommercial + * license distributed with GElib in the file NONCOMMERICAL.TXT. Commercial + * use is prohibited. All redistributed versions of this file (in orginal + * or modified form) must retain this copyright notice and must be + * accompanied by a verbatim copy of the license. + * + */ + +#ifndef _GElib_cuda_loaders +#define _GElib_cuda_loaders + +#include +#include +#include "Ctensor3_view.hpp" +#include "Ctensor4_view.hpp" + +#define tix threadIdx.x + +/* +__forceinline__ __device__ unsigned dynamic_smem_size(){ + unsigned ret; + asm volatile ("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret)); + return ret; +} +*/ + +/* +__forceinline__ __device__ void loadf(float* dest, const float* src, const int n, const int t){ + int nthreads=blockDim.x; + int I=n/nthreads; + for(int i=0; i