From cf6a9ab2b6de49e8cab4ef64f3168a35932fab76 Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Wed, 17 May 2023 10:49:51 +0800
Subject: [PATCH 01/54] test sparse_ed, ed2, moved productbasis into P4ML

---
 src/Polynomials4ML.jl      |   1 +
 src/productbasis.jl        |  92 ++++++++++++++++++++++
 src/sparseproduct.jl       | 151 +++++++++++++++++++++++++++++++++++--
 src/staticprod.jl          |  33 +++++++-
 test/test_sparseproduct.jl |  57 +++++++++++++-
 5 files changed, 324 insertions(+), 10 deletions(-)
 create mode 100644 src/productbasis.jl

diff --git a/src/Polynomials4ML.jl b/src/Polynomials4ML.jl
index d4da670..0fedaba 100644
--- a/src/Polynomials4ML.jl
+++ b/src/Polynomials4ML.jl
@@ -62,6 +62,7 @@ include("atomicorbitalsradials/atomicorbitalsradials.jl")
 # generating product bases (generalisation of tensor products)
 include("staticprod.jl")
 include("sparseproduct.jl")
+include("productbasis.jl")
 
 # generic machinery for wrapping poly4ml bases into lux layers 
 include("lux.jl")
diff --git a/src/productbasis.jl b/src/productbasis.jl
new file mode 100644
index 0000000..ad3171d
--- /dev/null
+++ b/src/productbasis.jl
@@ -0,0 +1,92 @@
+#
+# Ordering of the embedding 
+# nuc | 1 2 3  1 2 3  1 2 3
+#   k | 1 1 1  2 2 2  2 2 2
+#
+"""
+This constructs the specification of all the atomic orbitals for one
+nucleus. 
+
+* bRnl : radial basis 
+* Ylm : angular basis, assumed to be spherical harmonics 
+* admissible : a filter, default is a total degree 
+"""
+function make_nlms_spec(bRnl, bYlm;
+            totaldegree::Integer = -1,
+            admissible = ( (br, by) -> degree(bRnl, br) +
+                           degree(bYlm, by) <= totaldegree), 
+            nnuc = 0)
+   
+   spec_Rnl = natural_indices(bRnl) # copy(basis.spec)
+   spec_Ylm = natural_indices(bYlm) 
+   
+   spec1 = []
+   for (iR, br) in enumerate(spec_Rnl), (iY, by) in enumerate(spec_Ylm)
+      if br.l != by.l 
+         continue 
+      end
+      if admissible(br, by)
+         push!(spec1, (br..., m = by.m))
+      end
+   end
+   return spec1 
+end
+
+
+# Jerry: This is just a specific case of a general ProductBasis, this should go to Polynomials4ML later with a general implementation
+# I will do that after reconfiming this is what we want
+mutable struct ProductBasis{NB, TR, TY, TS}
+   spec1::Vector{TS}
+   bRnl::TR
+   bYlm::TY
+   # ---- evaluation kernel from Polynomials4ML ---- 
+   sparsebasis::SparseProduct{NB}
+end
+
+function _invmap(a::AbstractVector)
+   inva = Dict{eltype(a), Int}()
+   for i = 1:length(a) 
+      inva[a[i]] = i 
+   end
+   return inva 
+end
+
+function dropnames(namedtuple::NamedTuple, names::Tuple{Vararg{Symbol}}) 
+   keepnames = Base.diff_names(Base._nt_names(namedtuple), names)
+   return NamedTuple{keepnames}(namedtuple)
+end
+
+function ProductBasis(spec1, bRnl, bYlm)
+   spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1)) 
+   spec_Rnl = bRnl.spec; inv_Rnl = _invmap(spec_Rnl)
+   spec_Ylm = natural_indices(bYlm); inv_Ylm = _invmap(spec_Ylm)
+
+   spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1))
+   for (i, b) in enumerate(spec1)
+      spec1idx[i] = (inv_Rnl[dropnames(b,(:m,))], inv_Ylm[(l=b.l, m=b.m)])
+   end
+   sparsebasis = SparseProduct(spec1idx)
+   return ProductBasis(spec1, bRnl, bYlm, sparsebasis)
+end
+
+
+function evaluate(basis::ProductBasis, X::AbstractVector{<: AbstractVector})
+   Nel = length(X)
+   T = promote_type(eltype(X[1]))
+   VT = SVector{3, T}
+   
+   # create all the shifted configurations 
+   xx = zeros(eltype(VT), Nel)
+   for i = 1:Nel
+      xx[i] = norm(X[i])
+   end
+
+   # evaluate the radial and angular components on all the shifted particles 
+   Rnl = reshape(evaluate(basis.bRnl, xx[:]), (Nel, length(basis.bRnl)))
+   Ylm = reshape(evaluate(basis.bYlm, X[:]), (Nel, length(basis.bYlm)))
+
+   # evaluate all the atomic orbitals as ϕ_nlm = Rnl * Ylm 
+   ϕnlm = evaluate(basis.sparsebasis, (Rnl, Ylm))
+
+   return ϕnlm
+end
\ No newline at end of file
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index 77768e0..d5e2cbe 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -34,12 +34,36 @@ function evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}})
    evaluate!(A, basis, BB::Tuple)
    return A 
 end
-
-test_evaluate(basis::SparseProduct, BB::Tuple) = 
-       [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
-            for i = 1:length(basis) ]
    
+function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) 
+   VT = mapreduce(eltype, promote_type, ∂BB)
+   A = zeros(VT, length(basis))
+   evaluate_ed!(A, basis, BB::Tuple, ∂BB::Tuple)
+   return A 
+end
+
+function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}) 
+   VT = mapreduce(eltype, promote_type, ∂BB)
+   nX = size(∂BB[1], 1)
+   A = zeros(VT, nX, length(basis))
+   evaluate_ed!(A, basis, BB::Tuple, ∂BB::Tuple)
+   return A 
+end
+
+function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) 
+   VT = mapreduce(eltype, promote_type, ∂∂BB)
+   A = zeros(VT, length(basis))
+   evaluate_ed2!(A, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
+   return A 
+end
 
+function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) 
+   VT = mapreduce(eltype, promote_type, ∂∂BB)
+   nX = size(∂∂BB[1], 1)
+   A = zeros(VT, nX, length(basis))
+   evaluate_ed2!(A, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
+   return A 
+end
 # ----------------------- evaluation kernels 
 
 function evaluate!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}) where {NB}
@@ -51,7 +75,6 @@ function evaluate!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}
    return nothing 
 end
 
-
 function evaluate!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
    nX = size(BB[1], 1)
    @assert all(B->size(B, 1) == nX, BB)
@@ -65,6 +88,88 @@ function evaluate!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}
    return nothing
 end
 
+function evaluate_ed!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
+   @assert length(BB) == NB
+   @assert length(∂BB) == NB
+   spec = basis.spec
+   for (iA, ϕ) in enumerate(spec)
+      b = ntuple(Val(NB)) do i 
+         @inbounds BB[i][ϕ[i]] 
+      end 
+      g = _prod_grad(b, Val(NB))
+      for i = 1:NB 
+         A[iA] = muladd(∂BB[i][ϕ[i]], g[i], A[iA])
+      end
+   end 
+   return nothing 
+end
+
+function evaluate_ed!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
+   nX = size(BB[1], 1)
+   @assert all(B->size(B, 1) == nX, BB)
+   @assert all(∂B->size(∂B, 1) == nX, ∂BB)
+   spec = basis.spec
+   @inbounds for (iA, ϕ) in enumerate(spec)
+      @simd ivdep for j = 1:nX 
+        b = ntuple(Val(NB)) do i 
+           @inbounds BB[i][j, ϕ[i]] 
+        end 
+        g = _prod_grad(b, Val(NB))
+        for i = 1:NB 
+           A[j, iA] = muladd(∂BB[i][j, ϕ[i]], g[i], A[j, iA])
+        end
+      end 
+   end
+   return nothing
+end
+
+function evaluate_ed2!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
+   @assert length(BB) == NB
+   @assert length(∂BB) == NB
+   @assert length(∂∂BB) == NB
+   spec = basis.spec
+   for (iA, ϕ) in enumerate(spec)
+      b = ntuple(Val(NB)) do i 
+         @inbounds BB[i][ϕ[i]] 
+      end 
+      dg = _prod_grad(b, Val(NB))
+      for i = 1:NB 
+         A[iA] = muladd(∂∂BB[i][ϕ[i]], dg[i], A[iA])
+      end
+      for m = 1:NB-1
+         for n = m+1:NB
+            @inbounds A[iA] += 2 * BB2_prod(ϕ, BB, ∂BB, m, n)
+         end
+      end
+   end 
+   return nothing 
+end
+
+function evaluate_ed2!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
+   nX = size(BB[1], 1)
+   @assert all(B->size(B, 1) == nX, BB)
+   @assert all(∂B->size(∂B, 1) == nX, ∂BB)
+   @assert all(∂∂B->size(∂∂B, 1) == nX, ∂∂BB)
+   spec = basis.spec
+
+   @inbounds for (iA, ϕ) in enumerate(spec)
+      @simd ivdep for j = 1:nX
+         b = ntuple(Val(NB)) do i 
+            @inbounds BB[i][j, ϕ[i]] 
+         end 
+         g = _prod_grad(b, Val(NB))
+         for i = 1:NB 
+            A[j, iA] = muladd(∂∂BB[i][j, ϕ[i]], g[i], A[j, iA])
+         end
+         for m = 1:NB-1
+            for n = m+1:NB
+               @inbounds A[j, iA] += 2 * BB2_prod(ϕ, BB, ∂BB, j, m, n)
+            end
+         end
+      end
+   end
+   return nothing
+end
 # -------------------- reverse mode gradient
 
 function _rrule_evaluate(basis::SparseProduct{NB}, BB::Tuple) where {NB}
@@ -104,4 +209,38 @@ function _pullback_evaluate!(∂BB, ∂A, basis::SparseProduct{NB}, BB::Tuple) w
       end 
    end
    return nothing 
-end
\ No newline at end of file
+end
+
+test_evaluate(basis::SparseProduct, BB::Tuple) = 
+       [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
+            for i = 1:length(basis) ]
+
+
+function test_evaluate_ed(basis::SparseProduct, BB::Tuple, ∂BB::Tuple) 
+   A = zeros(length(basis))
+   eval = [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
+            for i = 1:length(basis) ]
+   for i = 1:length(basis)
+      for j = 1:length(BB)
+         A[i] += eval[i]/BB[j][basis.spec[i][j]] * ∂BB[j][basis.spec[i][j]]
+      end
+   end
+   return A
+end 
+
+function test_evaluate_ed2(basis::SparseProduct, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple) 
+   A = zeros(length(basis))
+   eval = [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
+            for i = 1:length(basis) ]
+   for i = 1:length(basis)
+      for j = 1:length(BB)
+         A[i] += eval[i]/BB[j][basis.spec[i][j]] * ∂∂BB[j][basis.spec[i][j]]
+      end
+      for j = 1:length(BB)-1
+         for z = j+1:length(BB)
+            A[i] += 2 * eval[i]/(BB[j][basis.spec[i][j]]*BB[z][basis.spec[i][z]]) * ∂BB[j][basis.spec[i][j]] * ∂BB[z][basis.spec[i][z]]
+         end
+      end
+   end
+   return A
+end 
\ No newline at end of file
diff --git a/src/staticprod.jl b/src/staticprod.jl
index ce07c99..ed7f2f3 100644
--- a/src/staticprod.jl
+++ b/src/staticprod.jl
@@ -1,4 +1,3 @@
-
 @inline function BB_prod(ϕ::NTuple{NB}, BB) where NB
    reduce(Base.FastMath.mul_fast, ntuple(Val(NB)) do i
       @inline 
@@ -15,6 +14,35 @@ end
 end
 
 
+@inline function BB2_prod(ϕ::NTuple{NB}, BB1, BB2, j, z) where NB
+   reduce(Base.FastMath.mul_fast, ntuple(Val(NB)) do i 
+      if i == j 
+         @inline 
+         @inbounds BB2[i][ϕ[i]]
+      elseif i == z
+         @inline 
+         @inbounds BB2[i][ϕ[i]]
+      else
+         @inline 
+         @inbounds BB1[i][ϕ[i]]
+      end
+   end)
+end
+
+@inline function BB2_prod(ϕ::NTuple{NB}, BB1, BB2, j, m, n) where NB
+   reduce(Base.FastMath.mul_fast, ntuple(Val(NB)) do i
+      if i == m
+         @inline 
+         @inbounds BB2[i][j, ϕ[i]]
+      elseif i == n
+         @inline 
+         @inbounds BB2[i][j, ϕ[i]]
+      else
+         @inline 
+         @inbounds BB1[i][j, ϕ[i]]
+      end
+   end)
+end
 
 @inline function _prod_grad(b, ::Val{1})
    return (one(eltype(b)),)
@@ -56,5 +84,4 @@ end
          $(code...)
       end
    end
-end
-
+end
\ No newline at end of file
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 297cc03..c276e8b 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -1,6 +1,6 @@
 using Test
 using Polynomials4ML.Testing: println_slim, print_tf
-using Polynomials4ML: SparseProduct, evaluate, test_evaluate
+using Polynomials4ML: SparseProduct, evaluate, evaluate_ed, evaluate_ed2, test_evaluate, test_evaluate_ed, test_evaluate_ed2
 using LinearAlgebra: norm
 using Polynomials4ML
 using ACEbase.Testing: fdtest
@@ -15,6 +15,15 @@ B1 = randn(N1)
 B2 = randn(N2)
 B3 = randn(N3)
 
+∂B1 = randn(N1)
+∂B2 = randn(N2)
+∂B3 = randn(N3)
+
+∂∂B1 = randn(N1)
+∂∂B2 = randn(N2)
+∂∂B3 = randn(N3)
+
+
 spec = sort([ (rand(1:N1), rand(1:N2), rand(1:N3)) for i = 1:100 ])
 
 basis = SparseProduct(spec)
@@ -31,8 +40,26 @@ A2 = evaluate(basis, BB)
 
 println_slim(@test A1 ≈ A2 )
 
+@info("Test serial evaluation_ed")
+BB = (B1, B2, B3)
+∂BB = (∂B1, ∂B2, ∂B3)
+
+A1 = test_evaluate_ed(basis, BB, ∂BB)
+A2 = evaluate_ed(basis, BB, ∂BB)
+
+println_slim(@test A1 ≈ A2 )
 ##
 
+@info("Test serial evaluation_d2")
+BB = (B1, B2, B3)
+∂BB = (∂B1, ∂B2, ∂B3)
+∂∂BB = (∂∂B1, ∂∂B2, ∂∂B3)
+
+A1 = test_evaluate_ed2(basis, BB, ∂BB, ∂∂BB)
+A2 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)
+
+println_slim(@test A1 ≈ A2 )
+
 @info("Test batch evaluation")
 
 nX = 64 
@@ -47,8 +74,36 @@ bA2 = evaluate(basis, bBB)
 
 println_slim(@test bA1 ≈ bA2)
 
+@info("Test batch evaluate_ed")
+
+nX = 64 
+bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
+bdBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
+bA1 = zeros(ComplexF64, nX, length(basis))
 
+for j = 1:nX
+    bA1[j, :] = evaluate_ed(basis, (bBB[1][j, :], bBB[2][j, :], bBB[3][j, :]), (bdBB[1][j, :], bdBB[2][j, :], bdBB[3][j, :]))
+end
+
+bA2 = evaluate_ed(basis, bBB, bdBB)
+
+println_slim(@test bA1 ≈ bA2)
 ## 
+@info("Test batch evaluate_d2")
+
+nX = 64 
+bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
+bdBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
+bddBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
+bA1 = zeros(ComplexF64, nX, length(basis))
+
+for j = 1:nX
+    bA1[j, :] = evaluate_ed2(basis, (bBB[1][j, :], bBB[2][j, :], bBB[3][j, :]), (bdBB[1][j, :], bdBB[2][j, :], bdBB[3][j, :]), (bddBB[1][j, :], bddBB[2][j, :], bddBB[3][j, :]))
+end
+
+bA2 = evaluate_ed2(basis, bBB, bdBB, bddBB)
+
+println_slim(@test bA1 ≈ bA2)
 
 @info("Testing _rrule_evaluate")
 using LinearAlgebra: dot 

From 5b6ce16f0016a6ed53a1b4386003fbec31c7bb5d Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Tue, 16 May 2023 20:15:29 -0700
Subject: [PATCH 02/54] minor fix

---
 src/productbasis.jl | 103 +++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 50 deletions(-)

diff --git a/src/productbasis.jl b/src/productbasis.jl
index ad3171d..16cbf32 100644
--- a/src/productbasis.jl
+++ b/src/productbasis.jl
@@ -3,71 +3,74 @@
 # nuc | 1 2 3  1 2 3  1 2 3
 #   k | 1 1 1  2 2 2  2 2 2
 #
-"""
-This constructs the specification of all the atomic orbitals for one
-nucleus. 
+# """
+# This constructs the specification of all the atomic orbitals for one
+# nucleus. 
 
-* bRnl : radial basis 
-* Ylm : angular basis, assumed to be spherical harmonics 
-* admissible : a filter, default is a total degree 
-"""
-function make_nlms_spec(bRnl, bYlm;
-            totaldegree::Integer = -1,
-            admissible = ( (br, by) -> degree(bRnl, br) +
-                           degree(bYlm, by) <= totaldegree), 
-            nnuc = 0)
+# * bRnl : radial basis 
+# * Ylm : angular basis, assumed to be spherical harmonics 
+# * admissible : a filter, default is a total degree 
+# """
+# function make_nlms_spec(bRnl, bYlm;
+#             totaldegree::Integer = -1,
+#             admissible = ( (br, by) -> degree(bRnl, br) +
+#                            degree(bYlm, by) <= totaldegree), 
+#             nnuc = 0)
    
-   spec_Rnl = natural_indices(bRnl) # copy(basis.spec)
-   spec_Ylm = natural_indices(bYlm) 
+#    spec_Rnl = natural_indices(bRnl) # copy(basis.spec)
+#    spec_Ylm = natural_indices(bYlm) 
    
-   spec1 = []
-   for (iR, br) in enumerate(spec_Rnl), (iY, by) in enumerate(spec_Ylm)
-      if br.l != by.l 
-         continue 
-      end
-      if admissible(br, by)
-         push!(spec1, (br..., m = by.m))
-      end
-   end
-   return spec1 
-end
+#    spec1 = []
+#    for (iR, br) in enumerate(spec_Rnl), (iY, by) in enumerate(spec_Ylm)
+#       if br.l != by.l 
+#          continue 
+#       end
+#       if admissible(br, by)
+#          push!(spec1, (br..., m = by.m))
+#       end
+#    end
+#    return spec1 
+# end
 
 
-# Jerry: This is just a specific case of a general ProductBasis, this should go to Polynomials4ML later with a general implementation
-# I will do that after reconfiming this is what we want
+# Jerry: This is just a specific case of a general ProductBasis
+# I will do that later expanding this to a general case
 mutable struct ProductBasis{NB, TR, TY, TS}
    spec1::Vector{TS}
    bRnl::TR
    bYlm::TY
-   # ---- evaluation kernel from Polynomials4ML ---- 
+   # ---- evaluation kernel ---- 
    sparsebasis::SparseProduct{NB}
 end
 
-function _invmap(a::AbstractVector)
-   inva = Dict{eltype(a), Int}()
-   for i = 1:length(a) 
-      inva[a[i]] = i 
-   end
-   return inva 
-end
+# function _invmap(a::AbstractVector)
+#    inva = Dict{eltype(a), Int}()
+#    for i = 1:length(a) 
+#       inva[a[i]] = i 
+#    end
+#    return inva 
+# end
 
-function dropnames(namedtuple::NamedTuple, names::Tuple{Vararg{Symbol}}) 
-   keepnames = Base.diff_names(Base._nt_names(namedtuple), names)
-   return NamedTuple{keepnames}(namedtuple)
-end
+# function dropnames(namedtuple::NamedTuple, names::Tuple{Vararg{Symbol}}) 
+#    keepnames = Base.diff_names(Base._nt_names(namedtuple), names)
+#    return NamedTuple{keepnames}(namedtuple)
+# end
 
-function ProductBasis(spec1, bRnl, bYlm)
-   spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1)) 
-   spec_Rnl = bRnl.spec; inv_Rnl = _invmap(spec_Rnl)
-   spec_Ylm = natural_indices(bYlm); inv_Ylm = _invmap(spec_Ylm)
+# function ProductBasis(spec1, bRnl, bYlm)
+#    spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1)) 
+#    spec_Rnl = bRnl.spec; inv_Rnl = _invmap(spec_Rnl)
+#    spec_Ylm = natural_indices(bYlm); inv_Ylm = _invmap(spec_Ylm)
 
-   spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1))
-   for (i, b) in enumerate(spec1)
-      spec1idx[i] = (inv_Rnl[dropnames(b,(:m,))], inv_Ylm[(l=b.l, m=b.m)])
-   end
-   sparsebasis = SparseProduct(spec1idx)
-   return ProductBasis(spec1, bRnl, bYlm, sparsebasis)
-end
+#    spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1))
+#    for (i, b) in enumerate(spec1)
+#       spec1idx[i] = (inv_Rnl[dropnames(b,(:m,))], inv_Ylm[(l=b.l, m=b.m)])
+#    end
+#    sparsebasis = SparseProduct(spec1idx)
+#    return ProductBasis(spec1, bRnl, bYlm, sparsebasis)
+# end
+
+
+(pbasis::ProductBasis)(args...) = evaluate(pbasis, args...)
 
 
 function evaluate(basis::ProductBasis, X::AbstractVector{<: AbstractVector})

From eee90b25f776371c829c72318d6b104e1b2d1fcb Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Wed, 17 May 2023 05:51:14 -0700
Subject: [PATCH 03/54] clean up

---
 src/productbasis.jl | 75 +++++----------------------------------------
 1 file changed, 7 insertions(+), 68 deletions(-)

diff --git a/src/productbasis.jl b/src/productbasis.jl
index 16cbf32..e69109e 100644
--- a/src/productbasis.jl
+++ b/src/productbasis.jl
@@ -1,85 +1,23 @@
-#
-# Ordering of the embedding 
-# nuc | 1 2 3  1 2 3  1 2 3
-#   k | 1 1 1  2 2 2  2 2 2
-#
-# """
-# This constructs the specification of all the atomic orbitals for one
-# nucleus. 
-
-# * bRnl : radial basis 
-# * Ylm : angular basis, assumed to be spherical harmonics 
-# * admissible : a filter, default is a total degree 
-# """
-# function make_nlms_spec(bRnl, bYlm;
-#             totaldegree::Integer = -1,
-#             admissible = ( (br, by) -> degree(bRnl, br) +
-#                            degree(bYlm, by) <= totaldegree), 
-#             nnuc = 0)
-   
-#    spec_Rnl = natural_indices(bRnl) # copy(basis.spec)
-#    spec_Ylm = natural_indices(bYlm) 
-   
-#    spec1 = []
-#    for (iR, br) in enumerate(spec_Rnl), (iY, by) in enumerate(spec_Ylm)
-#       if br.l != by.l 
-#          continue 
-#       end
-#       if admissible(br, by)
-#          push!(spec1, (br..., m = by.m))
-#       end
-#    end
-#    return spec1 
-# end
-
-
 # Jerry: This is just a specific case of a general ProductBasis
-# I will do that later expanding this to a general case
-mutable struct ProductBasis{NB, TR, TY, TS}
+# I will do that later expanding this to a general case, but it is unclear
+# to me how to allow the basis to distinguish whether to use norm(x) or x efficiently
+struct ProductBasis{NB, TR, TY, TS}
    spec1::Vector{TS}
    bRnl::TR
    bYlm::TY
    # ---- evaluation kernel ---- 
    sparsebasis::SparseProduct{NB}
+   @reqfields
 end
 
-# function _invmap(a::AbstractVector)
-#    inva = Dict{eltype(a), Int}()
-#    for i = 1:length(a) 
-#       inva[a[i]] = i 
-#    end
-#    return inva 
-# end
-
-# function dropnames(namedtuple::NamedTuple, names::Tuple{Vararg{Symbol}}) 
-#    keepnames = Base.diff_names(Base._nt_names(namedtuple), names)
-#    return NamedTuple{keepnames}(namedtuple)
-# end
-
-# function ProductBasis(spec1, bRnl, bYlm)
-#    spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1)) 
-#    spec_Rnl = bRnl.spec; inv_Rnl = _invmap(spec_Rnl)
-#    spec_Ylm = natural_indices(bYlm); inv_Ylm = _invmap(spec_Ylm)
-
-#    spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1))
-#    for (i, b) in enumerate(spec1)
-#       spec1idx[i] = (inv_Rnl[dropnames(b,(:m,))], inv_Ylm[(l=b.l, m=b.m)])
-#    end
-#    sparsebasis = SparseProduct(spec1idx)
-#    return ProductBasis(spec1, bRnl, bYlm, sparsebasis)
-# end
-
-
 (pbasis::ProductBasis)(args...) = evaluate(pbasis, args...)
 
-
 function evaluate(basis::ProductBasis, X::AbstractVector{<: AbstractVector})
    Nel = length(X)
    T = promote_type(eltype(X[1]))
-   VT = SVector{3, T}
    
    # create all the shifted configurations 
-   xx = zeros(eltype(VT), Nel)
+   xx = zeros(eltype(T), Nel)
    for i = 1:Nel
       xx[i] = norm(X[i])
    end
@@ -92,4 +30,5 @@ function evaluate(basis::ProductBasis, X::AbstractVector{<: AbstractVector})
    ϕnlm = evaluate(basis.sparsebasis, (Rnl, Ylm))
 
    return ϕnlm
-end
\ No newline at end of file
+end
+

From 7c07059bb816609700518908b242bee94f12bcf9 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Thu, 18 May 2023 01:19:57 -0700
Subject: [PATCH 04/54] fix evaluate_ed and fix returns

---
 src/sparseproduct.jl       | 76 +++++++++++++++++++++++---------------
 src/staticprod.jl          | 35 +++++++++++++++++-
 test/test_sparseproduct.jl | 12 +++---
 3 files changed, 86 insertions(+), 37 deletions(-)

diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index d5e2cbe..88b5773 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -37,32 +37,36 @@ end
    
 function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) 
    VT = mapreduce(eltype, promote_type, ∂BB)
-   A = zeros(VT, length(basis))
-   evaluate_ed!(A, basis, BB::Tuple, ∂BB::Tuple)
-   return A 
+   A, dA = zeros(VT, length(basis)), zeros(VT, length(basis))
+   evaluate_ed!(A, dA, basis, BB::Tuple, ∂BB::Tuple)
+   return A, dA
 end
 
 function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}) 
    VT = mapreduce(eltype, promote_type, ∂BB)
    nX = size(∂BB[1], 1)
-   A = zeros(VT, nX, length(basis))
-   evaluate_ed!(A, basis, BB::Tuple, ∂BB::Tuple)
-   return A 
+   A, dA = zeros(VT, nX, length(basis)), zeros(VT, nX, length(basis))
+   evaluate_ed!(A, dA, basis, BB::Tuple, ∂BB::Tuple)
+   return A, dA
 end
 
 function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) 
    VT = mapreduce(eltype, promote_type, ∂∂BB)
    A = zeros(VT, length(basis))
-   evaluate_ed2!(A, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
-   return A 
+   dA = zeros(VT, length(basis))
+   ddA = zeros(VT, length(basis))
+   evaluate_ed2!(A, dA, ddA, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
+   return A, dA, ddA
 end
 
 function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) 
    VT = mapreduce(eltype, promote_type, ∂∂BB)
    nX = size(∂∂BB[1], 1)
    A = zeros(VT, nX, length(basis))
-   evaluate_ed2!(A, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
-   return A 
+   dA = zeros(VT, nX, length(basis))
+   ddA = zeros(VT, nX, length(basis))
+   evaluate_ed2!(A, dA, ddA, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
+   return A, dA, ddA
 end
 # ----------------------- evaluation kernels 
 
@@ -72,7 +76,7 @@ function evaluate!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}
    for (iA, ϕ) in enumerate(spec)
        @inbounds A[iA] = BB_prod(ϕ, BB)
    end
-   return nothing 
+   return A 
 end
 
 function evaluate!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
@@ -85,73 +89,85 @@ function evaluate!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}
          A[j, iA] = BB_prod(ϕ, BB, j)
       end
    end
-   return nothing
+   return A
 end
 
-function evaluate_ed!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
+# Not sure whether we can everything below
+# faster by eval and diff at the same time from prod_grad
+
+function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
    @assert length(BB) == NB
    @assert length(∂BB) == NB
    spec = basis.spec
+   # evaluate!(A, basis, BB)
    for (iA, ϕ) in enumerate(spec)
       b = ntuple(Val(NB)) do i 
          @inbounds BB[i][ϕ[i]] 
       end 
-      g = _prod_grad(b, Val(NB))
-      for i = 1:NB 
-         A[iA] = muladd(∂BB[i][ϕ[i]], g[i], A[iA])
+      g = _prod_grad_ed(b, Val(NB))
+      A[iA] = g[1]
+      for i = 1:NB
+         dA[iA] = muladd(∂BB[i][ϕ[i]], g[i + 1], dA[iA])
       end
    end 
-   return nothing 
+   return A, dA 
 end
 
-function evaluate_ed!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
+function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
    nX = size(BB[1], 1)
    @assert all(B->size(B, 1) == nX, BB)
    @assert all(∂B->size(∂B, 1) == nX, ∂BB)
    spec = basis.spec
+   # evaluate!(A, basis, BB)
    @inbounds for (iA, ϕ) in enumerate(spec)
       @simd ivdep for j = 1:nX 
         b = ntuple(Val(NB)) do i 
            @inbounds BB[i][j, ϕ[i]] 
         end 
-        g = _prod_grad(b, Val(NB))
-        for i = 1:NB 
-           A[j, iA] = muladd(∂BB[i][j, ϕ[i]], g[i], A[j, iA])
+        g = _prod_grad_ed(b, Val(NB))
+        A[j, iA] = g[1] 
+        for i = 1:NB
+           dA[j, iA] = muladd(∂BB[i][j, ϕ[i]], g[i + 1], dA[j, iA])
         end
       end 
    end
-   return nothing
+   return A, dA
 end
 
-function evaluate_ed2!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
+function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
    @assert length(BB) == NB
    @assert length(∂BB) == NB
    @assert length(∂∂BB) == NB
    spec = basis.spec
+
+   evaluate_ed!(A, dA, basis, BB, ∂BB)
+
    for (iA, ϕ) in enumerate(spec)
       b = ntuple(Val(NB)) do i 
          @inbounds BB[i][ϕ[i]] 
       end 
       dg = _prod_grad(b, Val(NB))
       for i = 1:NB 
-         A[iA] = muladd(∂∂BB[i][ϕ[i]], dg[i], A[iA])
+         ddA[iA] = muladd(∂∂BB[i][ϕ[i]], dg[i], ddA[iA])
       end
       for m = 1:NB-1
          for n = m+1:NB
-            @inbounds A[iA] += 2 * BB2_prod(ϕ, BB, ∂BB, m, n)
+            @inbounds ddA[iA] += 2 * BB2_prod(ϕ, BB, ∂BB, m, n)
          end
       end
    end 
-   return nothing 
+   return A, dA, ddA 
 end
 
-function evaluate_ed2!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
+function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
    nX = size(BB[1], 1)
    @assert all(B->size(B, 1) == nX, BB)
    @assert all(∂B->size(∂B, 1) == nX, ∂BB)
    @assert all(∂∂B->size(∂∂B, 1) == nX, ∂∂BB)
    spec = basis.spec
 
+   evaluate_ed!(A, dA, basis, BB, ∂BB)
+   
    @inbounds for (iA, ϕ) in enumerate(spec)
       @simd ivdep for j = 1:nX
          b = ntuple(Val(NB)) do i 
@@ -159,16 +175,16 @@ function evaluate_ed2!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMat
          end 
          g = _prod_grad(b, Val(NB))
          for i = 1:NB 
-            A[j, iA] = muladd(∂∂BB[i][j, ϕ[i]], g[i], A[j, iA])
+            ddA[j, iA] = muladd(∂∂BB[i][j, ϕ[i]], g[i], ddA[j, iA])
          end
          for m = 1:NB-1
             for n = m+1:NB
-               @inbounds A[j, iA] += 2 * BB2_prod(ϕ, BB, ∂BB, j, m, n)
+               @inbounds ddA[j, iA] += 2 * BB2_prod(ϕ, BB, ∂BB, j, m, n)
             end
          end
       end
    end
-   return nothing
+   return A, dA, ddA
 end
 # -------------------- reverse mode gradient
 
diff --git a/src/staticprod.jl b/src/staticprod.jl
index ed7f2f3..2b97ffc 100644
--- a/src/staticprod.jl
+++ b/src/staticprod.jl
@@ -84,4 +84,37 @@ end
          $(code...)
       end
    end
-end
\ No newline at end of file
+end
+
+function _code_prod_grad_ed(NB)
+   code = Expr[]
+   # g[2] = b[1]
+   push!(code, :(g2 = b[1]))
+   for i = 3:NB
+      # g[i] = g[i-1] * b[i-1]
+      push!(code, Meta.parse("g$i = g$(i-1) * b[$(i-1)]"))
+   end
+   # h = b[N]
+   push!(code, Meta.parse("h = b[$NB]"))
+   for i = NB-1:-1:2
+      # g[i] *= h
+      push!(code, Meta.parse("g$i *= h"))
+      # h *= b[i]
+      push!(code, Meta.parse("h *= b[$i]"))
+   end
+   # g[1] = h
+   push!(code, :(g1 = h))
+   # return (g[1], g[2], ..., g[N])
+   push!(code, :(g0 = g1 * b[1]))
+   push!(code, Meta.parse(
+            "return (" * join([ "g$i" for i = 0:NB ], ", ") * ")" ))
+end
+
+@inline @generated function _prod_grad_ed(b, ::Val{NB}) where {NB}
+   code = _code_prod_grad_ed(NB)
+   quote
+      @fastmath begin 
+         $(code...)
+      end
+   end
+end
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index c276e8b..efea2e9 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -45,7 +45,7 @@ BB = (B1, B2, B3)
 ∂BB = (∂B1, ∂B2, ∂B3)
 
 A1 = test_evaluate_ed(basis, BB, ∂BB)
-A2 = evaluate_ed(basis, BB, ∂BB)
+A2 = evaluate_ed(basis, BB, ∂BB)[2]
 
 println_slim(@test A1 ≈ A2 )
 ##
@@ -56,7 +56,7 @@ BB = (B1, B2, B3)
 ∂∂BB = (∂∂B1, ∂∂B2, ∂∂B3)
 
 A1 = test_evaluate_ed2(basis, BB, ∂BB, ∂∂BB)
-A2 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)
+A2 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)[3]
 
 println_slim(@test A1 ≈ A2 )
 
@@ -82,10 +82,10 @@ bdBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
 bA1 = zeros(ComplexF64, nX, length(basis))
 
 for j = 1:nX
-    bA1[j, :] = evaluate_ed(basis, (bBB[1][j, :], bBB[2][j, :], bBB[3][j, :]), (bdBB[1][j, :], bdBB[2][j, :], bdBB[3][j, :]))
+    bA1[j, :] = evaluate_ed(basis, (bBB[1][j, :], bBB[2][j, :], bBB[3][j, :]), (bdBB[1][j, :], bdBB[2][j, :], bdBB[3][j, :]))[2]
 end
 
-bA2 = evaluate_ed(basis, bBB, bdBB)
+bA2 = evaluate_ed(basis, bBB, bdBB)[2]
 
 println_slim(@test bA1 ≈ bA2)
 ## 
@@ -98,10 +98,10 @@ bddBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
 bA1 = zeros(ComplexF64, nX, length(basis))
 
 for j = 1:nX
-    bA1[j, :] = evaluate_ed2(basis, (bBB[1][j, :], bBB[2][j, :], bBB[3][j, :]), (bdBB[1][j, :], bdBB[2][j, :], bdBB[3][j, :]), (bddBB[1][j, :], bddBB[2][j, :], bddBB[3][j, :]))
+    bA1[j, :] = evaluate_ed2(basis, (bBB[1][j, :], bBB[2][j, :], bBB[3][j, :]), (bdBB[1][j, :], bdBB[2][j, :], bdBB[3][j, :]), (bddBB[1][j, :], bddBB[2][j, :], bddBB[3][j, :]))[3]
 end
 
-bA2 = evaluate_ed2(basis, bBB, bdBB, bddBB)
+bA2 = evaluate_ed2(basis, bBB, bdBB, bddBB)[3]
 
 println_slim(@test bA1 ≈ bA2)
 

From 5880b2efbd4abeca1aef08bf89e65e652b6903af Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Thu, 18 May 2023 22:22:57 +0800
Subject: [PATCH 05/54] calculate sparseproduct ed2 by _code_prod_ed2

---
 src/sparseproduct.jl       |  28 +++++-----
 src/staticprod.jl          | 105 +++++++++++++++++++++++++------------
 test/test_sparseproduct.jl |  87 ++++++++++++++++--------------
 3 files changed, 136 insertions(+), 84 deletions(-)

diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index 88b5773..8d6e345 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -104,7 +104,7 @@ function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Abstract
       b = ntuple(Val(NB)) do i 
          @inbounds BB[i][ϕ[i]] 
       end 
-      g = _prod_grad_ed(b, Val(NB))
+      g = _prod_ed(b, Val(NB))
       A[iA] = g[1]
       for i = 1:NB
          dA[iA] = muladd(∂BB[i][ϕ[i]], g[i + 1], dA[iA])
@@ -124,7 +124,7 @@ function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Abstract
         b = ntuple(Val(NB)) do i 
            @inbounds BB[i][j, ϕ[i]] 
         end 
-        g = _prod_grad_ed(b, Val(NB))
+        g = _prod_ed(b, Val(NB))
         A[j, iA] = g[1] 
         for i = 1:NB
            dA[j, iA] = muladd(∂BB[i][j, ϕ[i]], g[i + 1], dA[j, iA])
@@ -140,19 +140,21 @@ function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Ab
    @assert length(∂∂BB) == NB
    spec = basis.spec
 
-   evaluate_ed!(A, dA, basis, BB, ∂BB)
-
    for (iA, ϕ) in enumerate(spec)
       b = ntuple(Val(NB)) do i 
          @inbounds BB[i][ϕ[i]] 
       end 
-      dg = _prod_grad(b, Val(NB))
+      g = _prod_ed2(b, Val(NB))
+      A[iA] = g[1]
       for i = 1:NB 
-         ddA[iA] = muladd(∂∂BB[i][ϕ[i]], dg[i], ddA[iA])
+         dA[iA] = muladd(∂BB[i][ϕ[i]], g[i + 1], dA[iA])
+         ddA[iA] = muladd(∂∂BB[i][ϕ[i]], g[i + 1], ddA[iA])
       end
+      t = 1
       for m = 1:NB-1
          for n = m+1:NB
-            @inbounds ddA[iA] += 2 * BB2_prod(ϕ, BB, ∂BB, m, n)
+            ddA[iA] = muladd(2 * ∂BB[m][ϕ[m]] * ∂BB[n][ϕ[n]], g[t + 1 + NB], ddA[iA])
+            t += 1
          end
       end
    end 
@@ -165,21 +167,23 @@ function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Ab
    @assert all(∂B->size(∂B, 1) == nX, ∂BB)
    @assert all(∂∂B->size(∂∂B, 1) == nX, ∂∂BB)
    spec = basis.spec
-
-   evaluate_ed!(A, dA, basis, BB, ∂BB)
    
    @inbounds for (iA, ϕ) in enumerate(spec)
       @simd ivdep for j = 1:nX
          b = ntuple(Val(NB)) do i 
             @inbounds BB[i][j, ϕ[i]] 
          end 
-         g = _prod_grad(b, Val(NB))
+         g = _prod_ed2(b, Val(NB))
+         A[j, iA] = g[1]
          for i = 1:NB 
-            ddA[j, iA] = muladd(∂∂BB[i][j, ϕ[i]], g[i], ddA[j, iA])
+            dA[j, iA] = muladd(∂BB[i][j, ϕ[i]], g[i + 1], dA[j, iA])
+            ddA[j, iA] = muladd(∂∂BB[i][j, ϕ[i]], g[i + 1], ddA[j, iA])
          end
+         t = 1
          for m = 1:NB-1
             for n = m+1:NB
-               @inbounds ddA[j, iA] += 2 * BB2_prod(ϕ, BB, ∂BB, j, m, n)
+               ddA[j, iA] = muladd(2 * ∂BB[m][j, ϕ[m]] * ∂BB[n][j, ϕ[n]], g[t + 1 + NB], ddA[j, iA])
+               t += 1
             end
          end
       end
diff --git a/src/staticprod.jl b/src/staticprod.jl
index 2b97ffc..0610cd2 100644
--- a/src/staticprod.jl
+++ b/src/staticprod.jl
@@ -13,37 +13,6 @@ end
    end)
 end
 
-
-@inline function BB2_prod(ϕ::NTuple{NB}, BB1, BB2, j, z) where NB
-   reduce(Base.FastMath.mul_fast, ntuple(Val(NB)) do i 
-      if i == j 
-         @inline 
-         @inbounds BB2[i][ϕ[i]]
-      elseif i == z
-         @inline 
-         @inbounds BB2[i][ϕ[i]]
-      else
-         @inline 
-         @inbounds BB1[i][ϕ[i]]
-      end
-   end)
-end
-
-@inline function BB2_prod(ϕ::NTuple{NB}, BB1, BB2, j, m, n) where NB
-   reduce(Base.FastMath.mul_fast, ntuple(Val(NB)) do i
-      if i == m
-         @inline 
-         @inbounds BB2[i][j, ϕ[i]]
-      elseif i == n
-         @inline 
-         @inbounds BB2[i][j, ϕ[i]]
-      else
-         @inline 
-         @inbounds BB1[i][j, ϕ[i]]
-      end
-   end)
-end
-
 @inline function _prod_grad(b, ::Val{1})
    return (one(eltype(b)),)
 end
@@ -86,7 +55,23 @@ end
    end
 end
 
-function _code_prod_grad_ed(NB)
+@inline function _prod_ed(b, ::Val{1})
+   return (one(eltype(b)),)
+end
+
+@inline function _prod_ed(b::SVector{1, T}) where {T} 
+   return b[1], SVector(one(T))
+end
+
+@inline function _prod_ed2(b, ::Val{1})
+   return (one(eltype(b)),)
+end
+
+@inline function _prod_ed2(b::SVector{1, T}) where {T} 
+   return b[1], SVector(one(T))
+end
+
+function _code_prod_ed(NB)
    code = Expr[]
    # g[2] = b[1]
    push!(code, :(g2 = b[1]))
@@ -110,11 +95,63 @@ function _code_prod_grad_ed(NB)
             "return (" * join([ "g$i" for i = 0:NB ], ", ") * ")" ))
 end
 
-@inline @generated function _prod_grad_ed(b, ::Val{NB}) where {NB}
-   code = _code_prod_grad_ed(NB)
+@inline @generated function _prod_ed(b, ::Val{NB}) where {NB}
+   code = _code_prod_ed(NB)
    quote
       @fastmath begin 
          $(code...)
       end
    end
 end
+
+function _code_prod_ed2(NB)
+   code = Expr[] 
+   push!(code, Meta.parse("g$(2 * NB) = b[1]"))
+   push!(code, Meta.parse("g$(NB + 2) = b[2]"))
+   j = 2 * NB
+   for i = 3:NB-1
+       push!(code, Meta.parse("g$(2 * NB + i-2) = g$(2 * NB + i-3) * b[$i]"))
+       m = j + NB - i + 1
+       push!(code, Meta.parse("g$m = g$j * b[$(i - 1)]"))
+       push!(code, Meta.parse("g$(NB+i) = g$(NB + i-1) * b[$i]"))
+       for z = 1:NB - i - 1
+          push!(code, Meta.parse("g$(m+z) = g$(m+z-1) * b[$(i+z)]"))
+       end
+       j = m
+   end
+
+   for i = 1:NB-1
+      j = Int((i + 1) * NB - i/2 - i^2/2)
+      push!(code, Meta.parse("g$i = g$j * b[$NB]"))
+   end
+   push!(code, Meta.parse("g$NB = g$j * b[$(NB-1)]"))
+   # h = b[N]
+   push!(code, Meta.parse("h = b[$NB]"))
+
+   for i = NB-1:-1:3
+      for z = 1:i-1
+         # g[i] *= h
+         j = Int(NB + i - 1 + (2 * NB - 2 - z) * (z-1)/2)
+         push!(code, Meta.parse("g$j *= h"))
+      end
+      # h *= b[i]
+      push!(code, Meta.parse("h *= b[$i]"))
+   end
+   
+   # g[1] = h
+   push!(code, Meta.parse("g$(NB + 1) = h"))
+   push!(code, :(g0 = g1 * b[1]))
+   # return (g[1], g[2], ..., g[N])
+   push!(code, Meta.parse(
+            "return (" * join([ "g$i" for i = 0:Int(NB+NB*(NB-1)/2) ], ", ") * ")" ))
+end
+
+@inline @generated function _prod_ed2(b, ::Val{NB}) where {NB}
+   code = _code_prod_ed2(NB)
+   quote
+      @fastmath begin 
+         $(code...)
+      end
+   end
+end
+
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index efea2e9..6e9df35 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -6,25 +6,14 @@ using Polynomials4ML
 using ACEbase.Testing: fdtest
 
 ##
+NB = rand(collect(5:30))
+N = [i * 4 for i = 1:NB]
 
-N1 = 10
-N2 = 20
-N3 = 30
+B = [randn(N[i]) for i = 1:NB]
+∂B = [randn(N[i]) for i = 1:NB]
+∂∂B = [randn(N[i]) for i = 1:NB]
 
-B1 = randn(N1)
-B2 = randn(N2)
-B3 = randn(N3)
-
-∂B1 = randn(N1)
-∂B2 = randn(N2)
-∂B3 = randn(N3)
-
-∂∂B1 = randn(N1)
-∂∂B2 = randn(N2)
-∂∂B3 = randn(N3)
-
-
-spec = sort([ (rand(1:N1), rand(1:N2), rand(1:N3)) for i = 1:100 ])
+spec = sort([ Tuple([rand(1:N[i]) for i = 1:NB]) for _ = 1:100 ])
 
 basis = SparseProduct(spec)
 
@@ -33,7 +22,7 @@ basis = SparseProduct(spec)
 
 @info("Test serial evaluation")
 
-BB = (B1, B2, B3)
+BB = Tuple(B)
 
 A1 = test_evaluate(basis, BB)
 A2 = evaluate(basis, BB)
@@ -41,33 +30,43 @@ A2 = evaluate(basis, BB)
 println_slim(@test A1 ≈ A2 )
 
 @info("Test serial evaluation_ed")
-BB = (B1, B2, B3)
-∂BB = (∂B1, ∂B2, ∂B3)
+BB = Tuple(B)
+∂BB = Tuple(∂B)
 
-A1 = test_evaluate_ed(basis, BB, ∂BB)
+A = test_evaluate_ed(basis, BB, ∂BB)
+
+AA = evaluate(basis, BB)
+A1 = evaluate_ed(basis, BB, ∂BB)[1]
 A2 = evaluate_ed(basis, BB, ∂BB)[2]
 
-println_slim(@test A1 ≈ A2 )
+println_slim(@test A ≈ A2 )
+println_slim(@test AA ≈ A1 )
 ##
 
 @info("Test serial evaluation_d2")
-BB = (B1, B2, B3)
-∂BB = (∂B1, ∂B2, ∂B3)
-∂∂BB = (∂∂B1, ∂∂B2, ∂∂B3)
+BB = Tuple(B)
+∂BB = Tuple(∂B)
+∂∂BB = Tuple(∂∂B)
 
-A1 = test_evaluate_ed2(basis, BB, ∂BB, ∂∂BB)
-A2 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)[3]
+A = test_evaluate_ed2(basis, BB, ∂BB, ∂∂BB)
 
-println_slim(@test A1 ≈ A2 )
+AA = evaluate(basis, BB)
+dA = evaluate_ed(basis, BB, ∂BB)[2]
+A1 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)[1]
+A2 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)[2]
+A3 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)[3]
 
+println_slim(@test A ≈ A3 )
+println_slim(@test AA ≈ A1 )
+println_slim(@test dA ≈ A2 )
 @info("Test batch evaluation")
 
 nX = 64 
-bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
+bBB = Tuple([randn(nX, N[i]) for i = 1:NB])
 bA1 = zeros(ComplexF64, nX, length(basis))
 
 for j = 1:nX
-    bA1[j, :] = evaluate(basis, (bBB[1][j, :], bBB[2][j, :], bBB[3][j, :]))
+    bA1[j, :] = evaluate(basis, Tuple([bBB[i][j, :] for i = 1:NB]))
 end
 
 bA2 = evaluate(basis, bBB)
@@ -77,33 +76,45 @@ println_slim(@test bA1 ≈ bA2)
 @info("Test batch evaluate_ed")
 
 nX = 64 
-bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
-bdBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
+bBB = Tuple([randn(nX, N[i]) for i = 1:NB])
+bdBB = Tuple([randn(nX, N[i]) for i = 1:NB])
+A1 = zeros(ComplexF64, nX, length(basis))
 bA1 = zeros(ComplexF64, nX, length(basis))
 
 for j = 1:nX
-    bA1[j, :] = evaluate_ed(basis, (bBB[1][j, :], bBB[2][j, :], bBB[3][j, :]), (bdBB[1][j, :], bdBB[2][j, :], bdBB[3][j, :]))[2]
+    A1[j, :] = evaluate_ed(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bdBB[i][j, :] for i = 1:NB]))[1]
+    bA1[j, :] = evaluate_ed(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bdBB[i][j, :] for i = 1:NB]))[2]
 end
 
+A2 = evaluate_ed(basis, bBB, bdBB)[1]
 bA2 = evaluate_ed(basis, bBB, bdBB)[2]
 
+println_slim(@test A1 ≈ A2)
 println_slim(@test bA1 ≈ bA2)
 ## 
 @info("Test batch evaluate_d2")
 
 nX = 64 
-bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
-bdBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
-bddBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
+bBB = Tuple([randn(nX, N[i]) for i = 1:NB])
+bdBB = Tuple([randn(nX, N[i]) for i = 1:NB])
+bddBB = Tuple([randn(nX, N[i]) for i = 1:NB])
+A1 = zeros(ComplexF64, nX, length(basis))
 bA1 = zeros(ComplexF64, nX, length(basis))
+bbA1 = zeros(ComplexF64, nX, length(basis))
 
 for j = 1:nX
-    bA1[j, :] = evaluate_ed2(basis, (bBB[1][j, :], bBB[2][j, :], bBB[3][j, :]), (bdBB[1][j, :], bdBB[2][j, :], bdBB[3][j, :]), (bddBB[1][j, :], bddBB[2][j, :], bddBB[3][j, :]))[3]
+    A1[j, :] = evaluate_ed2(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bddBB[i][j, :] for i = 1:NB]))[1]
+    bA1[j, :] = evaluate_ed2(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bdBB[i][j, :] for i = 1:NB]), Tuple([bddBB[i][j, :] for i = 1:NB]))[2]
+    bbA1[j, :] = evaluate_ed2(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bdBB[i][j, :] for i = 1:NB]), Tuple([bddBB[i][j, :] for i = 1:NB]))[3]
 end
 
-bA2 = evaluate_ed2(basis, bBB, bdBB, bddBB)[3]
+A2 = evaluate_ed2(basis, bBB, bdBB, bddBB)[1]
+bA2 = evaluate_ed2(basis, bBB, bdBB, bddBB)[2]
+bbA2 = evaluate_ed2(basis, bBB, bdBB, bddBB)[3]
 
+println_slim(@test A1 ≈ A2)
 println_slim(@test bA1 ≈ bA2)
+println_slim(@test bbA1 ≈ bbA2)
 
 @info("Testing _rrule_evaluate")
 using LinearAlgebra: dot 

From 97725370cb91e6a8cb0364491a2b97fbaf5e2182 Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Sat, 20 May 2023 10:50:27 +0800
Subject: [PATCH 06/54] add actual ed and ed2

---
 src/sparseproduct.jl | 162 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 144 insertions(+), 18 deletions(-)

diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index 8d6e345..e153bdd 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -35,37 +35,81 @@ function evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}})
    return A 
 end
    
-function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) 
-   VT = mapreduce(eltype, promote_type, ∂BB)
-   A, dA = zeros(VT, length(basis)), zeros(VT, length(basis))
-   evaluate_ed!(A, dA, basis, BB::Tuple, ∂BB::Tuple)
+function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}) 
+   VT = mapreduce(eltype, promote_type, BB)
+   A = zeros(VT, length(basis))
+   _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
+   dA = [_similar(BB) for _ = 1:length(basis)]
+   evaluate_ed!(A, dA, basis, BB::Tuple)
+   return A, dA
+end
+
+function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) 
+   VT = mapreduce(eltype, promote_type, BB)
+   nX = size(BB[1], 1)
+   A = zeros(VT, nX, length(basis))
+   _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
+   dA = [_similar(BB) for i = 1:nX, j = 1:length(basis)]
+   evaluate_ed!(A, dA, basis, BB::Tuple)
+   return A, dA
+end
+
+function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}) 
+   VT = mapreduce(eltype, promote_type, BB)
+   A = zeros(VT, length(basis))
+   _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
+   dA, ddA = ([_similar(BB) for _ = 1:length(basis)], [_similar(BB) for _ = 1:length(basis)])
+   evaluate_ed2!(A, dA, ddA, basis, BB::Tuple)
+   return A, dA, ddA
+end
+
+function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) 
+   VT = mapreduce(eltype, promote_type, BB)
+   nX = size(∂∂BB[1], 1)
+   A = zeros(VT, nX, length(basis))
+   _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
+   dA, ddA = ([_similar(BB) for i = 1:nX, j = 1:length(basis)], [_similar(BB) for i = 1:nX, j = 1:length(basis)])
+   evaluate_ed2!(A, dA, ddA, basis, BB::Tuple)
+   return A, dA, ddA
+end
+
+function _frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) 
+   VT = mapreduce(eltype, promote_type, BB)
+   A = zeros(VT, length(basis))
+   # ∂BB: Vector of SVector{3, Float64}
+   # dA: Matrix 3 * length(basis)
+   dA = zeros(VT, length(∂BB[1][1]), length(basis)) 
+   _frule_evaluate!(A, dA, basis, BB::Tuple, ∂BB::Tuple)
    return A, dA
 end
 
-function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}) 
-   VT = mapreduce(eltype, promote_type, ∂BB)
+function _frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}) 
+   VT = mapreduce(eltype, promote_type, BB)
    nX = size(∂BB[1], 1)
-   A, dA = zeros(VT, nX, length(basis)), zeros(VT, nX, length(basis))
-   evaluate_ed!(A, dA, basis, BB::Tuple, ∂BB::Tuple)
+   # BB: Matrix Nel * length(basis)
+   # ∂BB: Matrix of SVector{3, Float64}: Nel * length(basis)
+   A = zeros(VT, nX, length(basis))
+   dA = [zeros(VT, nX, length(basis)) for _ = 1:length(basis)]
+   _frule_evaluate!(A, dA, basis, BB::Tuple, ∂BB::Tuple)
    return A, dA
 end
 
-function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) 
+function _frule_frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) 
    VT = mapreduce(eltype, promote_type, ∂∂BB)
    A = zeros(VT, length(basis))
    dA = zeros(VT, length(basis))
    ddA = zeros(VT, length(basis))
-   evaluate_ed2!(A, dA, ddA, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
+   _frule_frule_evaluate!(A, dA, ddA, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
    return A, dA, ddA
 end
 
-function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) 
+function _frule_frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) 
    VT = mapreduce(eltype, promote_type, ∂∂BB)
    nX = size(∂∂BB[1], 1)
    A = zeros(VT, nX, length(basis))
    dA = zeros(VT, nX, length(basis))
    ddA = zeros(VT, nX, length(basis))
-   evaluate_ed2!(A, dA, ddA, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
+   _frule_frule_evaluate!(A, dA, ddA, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
    return A, dA, ddA
 end
 # ----------------------- evaluation kernels 
@@ -95,7 +139,88 @@ end
 # Not sure whether we can everything below
 # faster by eval and diff at the same time from prod_grad
 
-function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
+function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}) where {NB}
+   @assert length(BB) == NB
+   spec = basis.spec
+   # evaluate!(A, basis, BB)
+   for (iA, ϕ) in enumerate(spec)
+      b = ntuple(Val(NB)) do i 
+         @inbounds BB[i][ϕ[i]] 
+      end 
+      g = _prod_ed(b, Val(NB))
+      A[iA] = g[1]
+      fill!.(dA[iA], 0.0)
+      for i = 1:NB
+         dA[iA][i][ϕ[i]] += g[i + 1]
+      end
+   end 
+   return A, dA 
+end
+
+function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
+   nX = size(BB[1], 1)
+   @assert all(B->size(B, 1) == nX, BB)
+   spec = basis.spec
+   # evaluate!(A, basis, BB)
+   @inbounds for (iA, ϕ) in enumerate(spec)
+      @simd ivdep for j = 1:nX 
+        b = ntuple(Val(NB)) do i 
+           @inbounds BB[i][j, ϕ[i]] 
+        end 
+        g = _prod_ed(b, Val(NB))
+        A[j, iA] = g[1] 
+        fill!.(dA[j, iA], 0.0)
+        for i = 1:NB
+           dA[j, iA][i][j, ϕ[i]] += g[i + 1]
+        end
+      end 
+   end
+   return A, dA
+end
+
+function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}) where {NB}
+   @assert length(BB) == NB
+   spec = basis.spec
+
+   for (iA, ϕ) in enumerate(spec)
+      b = ntuple(Val(NB)) do i 
+         @inbounds BB[i][ϕ[i]] 
+      end 
+      g = _prod_ed2(b, Val(NB))
+      A[iA] = g[1]
+      fill!.(dA[iA], 0.0)
+      fill!.(ddA[iA], 0.0)
+      for i = 1:NB 
+         dA[iA][i][ϕ[i]] += g[i + 1]
+      end
+   end 
+   return A, dA, ddA 
+end
+
+
+function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
+   nX = size(BB[1], 1)
+   @assert all(B->size(B, 1) == nX, BB)
+   spec = basis.spec
+   # evaluate!(A, basis, BB)
+   @inbounds for (iA, ϕ) in enumerate(spec)
+      @simd ivdep for j = 1:nX 
+        b = ntuple(Val(NB)) do i 
+           @inbounds BB[i][j, ϕ[i]] 
+        end 
+        g = _prod_ed(b, Val(NB))
+        A[j, iA] = g[1] 
+        fill!.(dA[j, iA], 0.0)
+        fill!.(ddA[j, iA], 0.0)
+        for i = 1:NB
+           dA[j, iA][i][j, ϕ[i]] += g[i + 1]
+        end
+      end 
+   end
+   return A, dA
+end
+
+function _frule_evaluate!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
    @assert length(BB) == NB
    @assert length(∂BB) == NB
    spec = basis.spec
@@ -107,13 +232,15 @@ function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Abstract
       g = _prod_ed(b, Val(NB))
       A[iA] = g[1]
       for i = 1:NB
-         dA[iA] = muladd(∂BB[i][ϕ[i]], g[i + 1], dA[iA])
+         for j = 1:length(∂BB[1][1])
+            dA[j, iA] = muladd(∂BB[i][ϕ[i]][j], g[i + 1], dA[iA])
+         end
       end
    end 
    return A, dA 
 end
 
-function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
+function _frule_evaluate!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
    nX = size(BB[1], 1)
    @assert all(B->size(B, 1) == nX, BB)
    @assert all(∂B->size(∂B, 1) == nX, ∂BB)
@@ -134,7 +261,7 @@ function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Abstract
    return A, dA
 end
 
-function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
+function _frule_frule_evaluate!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) where {NB}
    @assert length(BB) == NB
    @assert length(∂BB) == NB
    @assert length(∂∂BB) == NB
@@ -161,7 +288,7 @@ function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Ab
    return A, dA, ddA 
 end
 
-function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
+function _frule_frule_evaluate!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) where {NB}
    nX = size(BB[1], 1)
    @assert all(B->size(B, 1) == nX, BB)
    @assert all(∂B->size(∂B, 1) == nX, ∂BB)
@@ -235,7 +362,6 @@ test_evaluate(basis::SparseProduct, BB::Tuple) =
        [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
             for i = 1:length(basis) ]
 
-
 function test_evaluate_ed(basis::SparseProduct, BB::Tuple, ∂BB::Tuple) 
    A = zeros(length(basis))
    eval = [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 

From 6e1715a0572ef18ccaf57b9e8d5589197fbcff40 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Fri, 19 May 2023 21:22:44 -0700
Subject: [PATCH 07/54] update sparseproduct serial evaluation interface

---
 src/interface.jl     | 11 +++++++++++
 src/sparseproduct.jl | 36 ++++++++++++++++++++----------------
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/src/interface.jl b/src/interface.jl
index 4711f5c..367e33f 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -107,9 +107,20 @@ _laplacetype(basis::AbstractPoly4MLBasis, X::BATCH) =
 
 _out_size(basis::AbstractPoly4MLBasis, x::SINGLE) = (length(basis),)
 _out_size(basis::AbstractPoly4MLBasis, X::BATCH) = (length(X), length(basis))
+
+# specfically for SparseProduct
+_out_size(basis::AbstractPoly4MLBasis, x::NTuple) = (length(basis), )
+
+
 _outsym(x::SINGLE) = :out 
 _outsym(X::BATCH) = :outb
 
+# this is just for temporary use and we should think about how to do it generally...
+
+# speccially for SparseProduct
+_outsym(x::NTuple) = :out
+
+
 _alloc(basis::AbstractPoly4MLBasis, X) = 
       acquire!(basis.pool, _outsym(X), _out_size(basis, X), _valtype(basis, X) )
 
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index e153bdd..53183e3 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -1,6 +1,7 @@
-struct SparseProduct{NB}
+struct SparseProduct{NB} <: AbstractPoly4MLBasis
    spec::Vector{NTuple{NB, Int}}
-   # ---- temporaries & caches 
+   # ---- temporaries & caches
+   @reqfields()   
 end
 
 function SparseProduct()
@@ -16,24 +17,27 @@ end
  
 Base.length(basis::SparseProduct) = length(basis.spec)
 
+SparseProduct(spec) = SparseProduct(spec, _make_reqfields()...)
 
-# ----------------------- evaluation interfaces 
+_valtype(basis::SparseProduct{T1}, TX::NTuple{NB, Vector{T2}}) where {T1, T2, NB} = T2
 
+# ----------------------- evaluation interfaces 
 
-function evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}) 
-   VT = mapreduce(eltype, promote_type, BB)
-   A = zeros(VT, length(basis))
-   evaluate!(A, basis, BB::Tuple)
-   return A 
-end
 
-function evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) 
-   VT = mapreduce(eltype, promote_type, BB)
-   nX = size(BB[1], 1)
-   A = zeros(VT, nX, length(basis))
-   evaluate!(A, basis, BB::Tuple)
-   return A 
-end
+# function evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}) 
+#    VT = mapreduce(eltype, promote_type, BB)
+#    A = zeros(VT, length(basis))
+#    evaluate!(A, basis, BB::Tuple)
+#    return A
+# end
+
+# function evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) 
+#    VT = mapreduce(eltype, promote_type, BB)
+#    nX = size(BB[1], 1)
+#    A = zeros(VT, nX, length(basis))
+#    evaluate!(A, basis, BB::Tuple)
+#    return A 
+# end
    
 function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}) 
    VT = mapreduce(eltype, promote_type, BB)

From 66b5ca98e4b48d76a2f3f9d22d9037d85feb5432 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Fri, 19 May 2023 21:31:50 -0700
Subject: [PATCH 08/54] sparse product batch evaluate with ObjectPools

---
 src/interface.jl           | 6 ++++--
 src/sparseproduct.jl       | 2 +-
 test/test_sparseproduct.jl | 5 +++--
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/interface.jl b/src/interface.jl
index 367e33f..5522fdb 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -109,7 +109,8 @@ _out_size(basis::AbstractPoly4MLBasis, x::SINGLE) = (length(basis),)
 _out_size(basis::AbstractPoly4MLBasis, X::BATCH) = (length(X), length(basis))
 
 # specfically for SparseProduct
-_out_size(basis::AbstractPoly4MLBasis, x::NTuple) = (length(basis), )
+_out_size(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVector{T}}) where {NB, T} = (length(basis), )
+_out_size(basis::AbstractPoly4MLBasis, X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = (size(X[1], 1), length(basis), )
 
 
 _outsym(x::SINGLE) = :out 
@@ -118,7 +119,8 @@ _outsym(X::BATCH) = :outb
 # this is just for temporary use and we should think about how to do it generally...
 
 # speccially for SparseProduct
-_outsym(x::NTuple) = :out
+_outsym(x::NTuple{NB, AbstractVector{T}}) where {NB, T} = :out
+_outsym(X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = :outb
 
 
 _alloc(basis::AbstractPoly4MLBasis, X) = 
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index 53183e3..085638e 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -19,7 +19,7 @@ Base.length(basis::SparseProduct) = length(basis.spec)
 
 SparseProduct(spec) = SparseProduct(spec, _make_reqfields()...)
 
-_valtype(basis::SparseProduct{T1}, TX::NTuple{NB, Vector{T2}}) where {T1, T2, NB} = T2
+_valtype(basis::SparseProduct{T1}, TX::NTuple{NB, AbstractVecOrMat{T2}}) where {T1, T2, NB} = T2
 
 # ----------------------- evaluation interfaces 
 
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 6e9df35..09d9c44 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -6,7 +6,8 @@ using Polynomials4ML
 using ACEbase.Testing: fdtest
 
 ##
-NB = rand(collect(5:30))
+# NB = rand(collect(5:30))
+NB = 3
 N = [i * 4 for i = 1:NB]
 
 B = [randn(N[i]) for i = 1:NB]
@@ -61,7 +62,7 @@ println_slim(@test AA ≈ A1 )
 println_slim(@test dA ≈ A2 )
 @info("Test batch evaluation")
 
-nX = 64 
+nX = 64
 bBB = Tuple([randn(nX, N[i]) for i = 1:NB])
 bA1 = zeros(ComplexF64, nX, length(basis))
 

From 29dcbe26f84fbcb134209587ad8badb885f1ea72 Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Sat, 20 May 2023 20:58:05 +0800
Subject: [PATCH 09/54] changed frule and ed and ed2

---
 src/sparseproduct.jl       | 146 ++++++++++++++++++++++++++-----------
 test/test_sparseproduct.jl |  55 +++++++-------
 2 files changed, 132 insertions(+), 69 deletions(-)

diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index 085638e..16b964c 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -53,7 +53,7 @@ function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}})
    nX = size(BB[1], 1)
    A = zeros(VT, nX, length(basis))
    _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
-   dA = [_similar(BB) for i = 1:nX, j = 1:length(basis)]
+   dA = [_similar(BB) for i = 1:nX, j = 1:length(basis)] # nX * basis
    evaluate_ed!(A, dA, basis, BB::Tuple)
    return A, dA
 end
@@ -93,26 +93,30 @@ function _frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}
    # BB: Matrix Nel * length(basis)
    # ∂BB: Matrix of SVector{3, Float64}: Nel * length(basis)
    A = zeros(VT, nX, length(basis))
-   dA = [zeros(VT, nX, length(basis)) for _ = 1:length(basis)]
+   dA = [zeros(VT, length(∂BB[1][1])) for i = 1:nX, j = 1:length(basis)]
    _frule_evaluate!(A, dA, basis, BB::Tuple, ∂BB::Tuple)
    return A, dA
 end
 
 function _frule_frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}, ∂∂BB::Tuple{Vararg{AbstractVector}}) 
-   VT = mapreduce(eltype, promote_type, ∂∂BB)
+   VT = mapreduce(eltype, promote_type, BB)
    A = zeros(VT, length(basis))
-   dA = zeros(VT, length(basis))
-   ddA = zeros(VT, length(basis))
+   # ∂BB: Vector of SVector{3, Float64}
+   # dA: Matrix 3 * length(basis)
+   dA = zeros(VT, length(∂BB[1][1]), length(basis)) 
+   ddA = zeros(VT, length(∂BB[1][1]), length(basis)) 
    _frule_frule_evaluate!(A, dA, ddA, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
    return A, dA, ddA
 end
 
 function _frule_frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}, ∂BB::Tuple{Vararg{AbstractMatrix}}, ∂∂BB::Tuple{Vararg{AbstractMatrix}}) 
-   VT = mapreduce(eltype, promote_type, ∂∂BB)
-   nX = size(∂∂BB[1], 1)
+   VT = mapreduce(eltype, promote_type, BB)
+   nX = size(∂BB[1], 1)
+   # BB: Matrix Nel * length(basis)
+   # ∂BB: Matrix of SVector{3, Float64}: Nel * length(basis)
    A = zeros(VT, nX, length(basis))
-   dA = zeros(VT, nX, length(basis))
-   ddA = zeros(VT, nX, length(basis))
+   dA = [zeros(VT, length(∂BB[1][1])) for i = 1:nX, j = 1:length(basis)]
+   ddA = [zeros(VT, length(∂BB[1][1])) for i = 1:nX, j = 1:length(basis)]
    _frule_frule_evaluate!(A, dA, ddA, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
    return A, dA, ddA
 end
@@ -258,7 +262,9 @@ function _frule_evaluate!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Abst
         g = _prod_ed(b, Val(NB))
         A[j, iA] = g[1] 
         for i = 1:NB
-           dA[j, iA] = muladd(∂BB[i][j, ϕ[i]], g[i + 1], dA[j, iA])
+            for k = 1:length(∂BB[1][1])
+               dA[j, iA][k] = muladd(∂BB[i][j, ϕ[i]][k], g[i + 1], dA[j, iA])
+            end
         end
       end 
    end
@@ -278,13 +284,17 @@ function _frule_frule_evaluate!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{
       g = _prod_ed2(b, Val(NB))
       A[iA] = g[1]
       for i = 1:NB 
-         dA[iA] = muladd(∂BB[i][ϕ[i]], g[i + 1], dA[iA])
-         ddA[iA] = muladd(∂∂BB[i][ϕ[i]], g[i + 1], ddA[iA])
+         for j = 1:length(∂BB[1][1])
+            dA[iA, j] = muladd(∂BB[i][ϕ[i]][j], g[i + 1], dA[iA])
+            ddA[iA, j] = muladd(∂∂BB[i][ϕ[i]][j], g[i + 1], ddA[iA])
+         end
       end
       t = 1
       for m = 1:NB-1
          for n = m+1:NB
-            ddA[iA] = muladd(2 * ∂BB[m][ϕ[m]] * ∂BB[n][ϕ[n]], g[t + 1 + NB], ddA[iA])
+            for j = 1:length(∂BB[1][1])
+               ddA[iA, j] = muladd(2 * ∂BB[m][ϕ[m]][j] * ∂BB[n][ϕ[n]][j], g[t + 1 + NB], ddA[iA])
+            end
             t += 1
          end
       end
@@ -307,15 +317,17 @@ function _frule_frule_evaluate!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{
          g = _prod_ed2(b, Val(NB))
          A[j, iA] = g[1]
          for i = 1:NB 
-            dA[j, iA] = muladd(∂BB[i][j, ϕ[i]], g[i + 1], dA[j, iA])
-            ddA[j, iA] = muladd(∂∂BB[i][j, ϕ[i]], g[i + 1], ddA[j, iA])
+            for k = 1:length(∂BB[1][1])
+               dA[j, iA][k] = muladd(∂BB[i][j, ϕ[i]][k], g[i + 1], dA[j, iA])
+               ddA[j, iA][k] = muladd(∂∂BB[i][j, ϕ[i]][k], g[i + 1], ddA[j, iA])
+            end
          end
          t = 1
          for m = 1:NB-1
             for n = m+1:NB
-               ddA[j, iA] = muladd(2 * ∂BB[m][j, ϕ[m]] * ∂BB[n][j, ϕ[n]], g[t + 1 + NB], ddA[j, iA])
-               t += 1
+               ddA[j, iA][k] = muladd(2 * ∂BB[m][j, ϕ[m]][k] * ∂BB[n][j, ϕ[n]][k], g[t + 1 + NB], ddA[j, iA])
             end
+            t += 1
          end
       end
    end
@@ -366,31 +378,81 @@ test_evaluate(basis::SparseProduct, BB::Tuple) =
        [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
             for i = 1:length(basis) ]
 
-function test_evaluate_ed(basis::SparseProduct, BB::Tuple, ∂BB::Tuple) 
-   A = zeros(length(basis))
-   eval = [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
-            for i = 1:length(basis) ]
-   for i = 1:length(basis)
-      for j = 1:length(BB)
-         A[i] += eval[i]/BB[j][basis.spec[i][j]] * ∂BB[j][basis.spec[i][j]]
-      end
+function test_evaluate_ed(basis, BB)
+    A = evaluate_ed(basis, BB)[1]
+    dA = evaluate_ed(basis, BB)[2]
+    errors = Float64[]
+    # loop through finite-difference step-lengths
+    @printf("---------|----------- \n")
+    @printf("    h    | error \n")
+    @printf("---------|----------- \n")
+    for p = 2:11
+        h = 0.1^p
+        dAh = deepcopy(dA)
+        Δ = deepcopy(dA)
+        for n = 1:length(dAh) # basis
+            for i = 1:length(dAh[n]) #NB
+                for j = 1:length(dAh[n][i]) #BB[i]
+                    BB[i][j] += h
+                    dAh[n][i][j] = (evaluate(basis, BB)[n] - A[n])/h
+                    Δ[n][i][j] = dA[n][i][j] - dAh[n][i][j]
+                    BB[i][j] -= h
+                end
+            end
+        end
+        push!(errors, maximum([norm(Δ[i][j], Inf) for i = 1:length(Δ), j = 1:length(Δ[i])] ))
+        @printf(" %1.1e | %4.2e  \n", h, errors[end])
+    end
+    @printf("---------|----------- \n")
+    if minimum(errors) <= 1e-3 * maximum(errors)
+        println("passed")
+        return true
+   else
+        @warn("""It seems the finite-difference test has failed, which indicates
+        that there is an inconsistency between the function and gradient
+        evaluation. Please double-check this manually / visually. (It is
+        also possible that the function being tested is poorly scaled.)""")
+        return false
    end
-   return A
-end 
+end
 
-function test_evaluate_ed2(basis::SparseProduct, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple) 
-   A = zeros(length(basis))
-   eval = [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
-            for i = 1:length(basis) ]
-   for i = 1:length(basis)
-      for j = 1:length(BB)
-         A[i] += eval[i]/BB[j][basis.spec[i][j]] * ∂∂BB[j][basis.spec[i][j]]
-      end
-      for j = 1:length(BB)-1
-         for z = j+1:length(BB)
-            A[i] += 2 * eval[i]/(BB[j][basis.spec[i][j]]*BB[z][basis.spec[i][z]]) * ∂BB[j][basis.spec[i][j]] * ∂BB[z][basis.spec[i][z]]
-         end
-      end
+function test_evaluate_ed2(basis, BB)
+   A = evaluate_ed2(basis, BB)[1]
+   ddA = evaluate_ed2(basis, BB)[3]
+   errors = Float64[]
+   # loop through finite-difference step-lengths
+   @printf("---------|----------- \n")
+   @printf("    h    | error \n")
+   @printf("---------|----------- \n")
+   for p = 2:11
+       h = 0.1^p
+       ddAh = deepcopy(ddA)
+       Δ = deepcopy(ddA)
+       for n = 1:length(ddAh) # basis
+           for i = 1:length(ddAh[n]) #NB
+               for j = 1:length(ddAh[n][i]) #BB[i]
+                   BB[i][j] += h
+                   ddAh[n][i][j] = evaluate(basis, BB)[n] - 2 * A[n]
+                   BB[i][j] -= 2*h
+                   ddAh[n][i][j] = (ddAh[n][i][j] + evaluate(basis, BB)[n])/h^2
+                   BB[i][j] += h 
+                   Δ[n][i][j] = ddA[n][i][j] - ddAh[n][i][j]
+               end
+           end
+       end
+       push!(errors, maximum([norm(Δ[i][j], Inf) for i = 1:length(Δ), j = 1:length(Δ[i])] ))
+       @printf(" %1.1e | %4.2e  \n", h, errors[end])
    end
-   return A
-end 
\ No newline at end of file
+   @printf("---------|----------- \n")
+   if minimum(errors) <= 1e-3 * maximum(errors)
+       println("passed")
+       return true
+  else
+       @warn("""It seems the finite-difference test has failed, which indicates
+       that there is an inconsistency between the function and gradient
+       evaluation. Please double-check this manually / visually. (It is
+       also possible that the function being tested is poorly scaled.)""")
+       return false
+  end
+end
+
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 09d9c44..42eda64 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -4,17 +4,15 @@ using Polynomials4ML: SparseProduct, evaluate, evaluate_ed, evaluate_ed2, test_e
 using LinearAlgebra: norm
 using Polynomials4ML
 using ACEbase.Testing: fdtest
-
+using Printf
 ##
-# NB = rand(collect(5:30))
 NB = 3
+
 N = [i * 4 for i = 1:NB]
 
 B = [randn(N[i]) for i = 1:NB]
-∂B = [randn(N[i]) for i = 1:NB]
-∂∂B = [randn(N[i]) for i = 1:NB]
 
-spec = sort([ Tuple([rand(1:N[i]) for i = 1:NB]) for _ = 1:100 ])
+spec = sort([ Tuple([rand(1:N[i]) for i = 1:NB]) for _ = 1:6])
 
 basis = SparseProduct(spec)
 
@@ -32,37 +30,31 @@ println_slim(@test A1 ≈ A2 )
 
 @info("Test serial evaluation_ed")
 BB = Tuple(B)
-∂BB = Tuple(∂B)
 
-A = test_evaluate_ed(basis, BB, ∂BB)
+A = test_evaluate_ed(basis, BB)
 
 AA = evaluate(basis, BB)
-A1 = evaluate_ed(basis, BB, ∂BB)[1]
-A2 = evaluate_ed(basis, BB, ∂BB)[2]
+A1 = evaluate_ed(basis, BB)[1]
 
-println_slim(@test A ≈ A2 )
 println_slim(@test AA ≈ A1 )
 ##
 
-@info("Test serial evaluation_d2")
+@info("Test serial evaluation_ed2")
 BB = Tuple(B)
-∂BB = Tuple(∂B)
-∂∂BB = Tuple(∂∂B)
 
-A = test_evaluate_ed2(basis, BB, ∂BB, ∂∂BB)
+A = test_evaluate_ed2(basis, BB)
 
 AA = evaluate(basis, BB)
-dA = evaluate_ed(basis, BB, ∂BB)[2]
-A1 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)[1]
-A2 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)[2]
-A3 = evaluate_ed2(basis, BB, ∂BB, ∂∂BB)[3]
+dA = evaluate_ed(basis, BB)[2]
+A1 = evaluate_ed2(basis, BB)[1]
+A2 = evaluate_ed2(basis, BB)[2]
 
-println_slim(@test A ≈ A3 )
 println_slim(@test AA ≈ A1 )
-println_slim(@test dA ≈ A2 )
+Δ = maximum([norm(dA[i][j] - A2[i][j], Inf) for i = 1:length(dA), j = 1:length(dA[1])])
+println_slim(@test Δ ≈ 0.0)
 @info("Test batch evaluation")
 
-nX = 64
+nX = 5
 bBB = Tuple([randn(nX, N[i]) for i = 1:NB])
 bA1 = zeros(ComplexF64, nX, length(basis))
 
@@ -76,19 +68,28 @@ println_slim(@test bA1 ≈ bA2)
 
 @info("Test batch evaluate_ed")
 
-nX = 64 
+nX = 2
 bBB = Tuple([randn(nX, N[i]) for i = 1:NB])
-bdBB = Tuple([randn(nX, N[i]) for i = 1:NB])
 A1 = zeros(ComplexF64, nX, length(basis))
 bA1 = zeros(ComplexF64, nX, length(basis))
+_similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
+dA = [Tuple([similar(BB[i]) for i = 1:length(BB)]) for i = 1:nX, j = 1:length(basis)]  # nX * basis
 
+Δ = []
 for j = 1:nX
-    A1[j, :] = evaluate_ed(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bdBB[i][j, :] for i = 1:NB]))[1]
-    bA1[j, :] = evaluate_ed(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bdBB[i][j, :] for i = 1:NB]))[2]
+    A1[j, :] = evaluate_ed(basis, Tuple([bBB[i][j, :] for i = 1:NB]))[1]
 end
+#for i = 1:length(basis)
+#    for j = 1:nX
+#        for z = 1:NB
+#            dA[j,i][z] = evaluate_ed(basis, Tuple([bBB[i][j, :] for i = 1:NB]))[2][i][z]
+#        end
+#    end
+#end
+
 
-A2 = evaluate_ed(basis, bBB, bdBB)[1]
-bA2 = evaluate_ed(basis, bBB, bdBB)[2]
+A2 = evaluate_ed(basis, bBB)[1]
+bA2 = evaluate_ed(basis, bBB)[2]
 
 println_slim(@test A1 ≈ A2)
 println_slim(@test bA1 ≈ bA2)

From 0d44ea17d50f9cd76d1a3a683713b2f23a4247fe Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Sun, 21 May 2023 00:14:29 +0800
Subject: [PATCH 10/54] fix ed test

---
 src/sparseproduct.jl       |  97 ++----------------------
 test/test_sparseproduct.jl | 148 +++++++++++++++++++++++++++++--------
 2 files changed, 127 insertions(+), 118 deletions(-)

diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index 16b964c..ab5bc28 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -53,7 +53,7 @@ function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}})
    nX = size(BB[1], 1)
    A = zeros(VT, nX, length(basis))
    _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
-   dA = [_similar(BB) for i = 1:nX, j = 1:length(basis)] # nX * basis
+   dA = [_similar(BB) for i = 1:length(basis)] # one tuple of BB-shaped matrices per basis function
    evaluate_ed!(A, dA, basis, BB::Tuple)
    return A, dA
 end
@@ -69,10 +69,10 @@ end
 
 function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) 
    VT = mapreduce(eltype, promote_type, BB)
-   nX = size(∂∂BB[1], 1)
+   nX = size(BB[1], 1)
    A = zeros(VT, nX, length(basis))
    _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
-   dA, ddA = ([_similar(BB) for i = 1:nX, j = 1:length(basis)], [_similar(BB) for i = 1:nX, j = 1:length(basis)])
+   dA, ddA = ([_similar(BB) for _ = 1:length(basis)], [_similar(BB) for _ = 1:length(basis)])
    evaluate_ed2!(A, dA, ddA, basis, BB::Tuple)
    return A, dA, ddA
 end
@@ -171,15 +171,15 @@ function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Abstract
    spec = basis.spec
    # evaluate!(A, basis, BB)
    @inbounds for (iA, ϕ) in enumerate(spec)
+      fill!.(dA[iA], 0.0)
       @simd ivdep for j = 1:nX 
         b = ntuple(Val(NB)) do i 
            @inbounds BB[i][j, ϕ[i]] 
         end 
         g = _prod_ed(b, Val(NB))
         A[j, iA] = g[1] 
-        fill!.(dA[j, iA], 0.0)
         for i = 1:NB
-           dA[j, iA][i][j, ϕ[i]] += g[i + 1]
+           dA[iA][i][j, ϕ[i]] += g[i + 1]
         end
       end 
    end
@@ -212,16 +212,16 @@ function evaluate_ed2!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{Vararg{Ab
    spec = basis.spec
    # evaluate!(A, basis, BB)
    @inbounds for (iA, ϕ) in enumerate(spec)
+      fill!.(dA[iA], 0.0)
+      fill!.(ddA[iA], 0.0)
       @simd ivdep for j = 1:nX 
         b = ntuple(Val(NB)) do i 
            @inbounds BB[i][j, ϕ[i]] 
         end 
         g = _prod_ed(b, Val(NB))
         A[j, iA] = g[1] 
-        fill!.(dA[j, iA], 0.0)
-        fill!.(ddA[j, iA], 0.0)
         for i = 1:NB
-           dA[j, iA][i][j, ϕ[i]] += g[i + 1]
+           dA[iA][i][j, ϕ[i]] += g[i + 1]
         end
       end 
    end
@@ -374,85 +374,4 @@ function _pullback_evaluate!(∂BB, ∂A, basis::SparseProduct{NB}, BB::Tuple) w
    return nothing 
 end
 
-test_evaluate(basis::SparseProduct, BB::Tuple) = 
-       [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
-            for i = 1:length(basis) ]
-
-function test_evaluate_ed(basis, BB)
-    A = evaluate_ed(basis, BB)[1]
-    dA = evaluate_ed(basis, BB)[2]
-    errors = Float64[]
-    # loop through finite-difference step-lengths
-    @printf("---------|----------- \n")
-    @printf("    h    | error \n")
-    @printf("---------|----------- \n")
-    for p = 2:11
-        h = 0.1^p
-        dAh = deepcopy(dA)
-        Δ = deepcopy(dA)
-        for n = 1:length(dAh) # basis
-            for i = 1:length(dAh[n]) #NB
-                for j = 1:length(dAh[n][i]) #BB[i]
-                    BB[i][j] += h
-                    dAh[n][i][j] = (evaluate(basis, BB)[n] - A[n])/h
-                    Δ[n][i][j] = dA[n][i][j] - dAh[n][i][j]
-                    BB[i][j] -= h
-                end
-            end
-        end
-        push!(errors, maximum([norm(Δ[i][j], Inf) for i = 1:length(Δ), j = 1:length(Δ[i])] ))
-        @printf(" %1.1e | %4.2e  \n", h, errors[end])
-    end
-    @printf("---------|----------- \n")
-    if minimum(errors) <= 1e-3 * maximum(errors)
-        println("passed")
-        return true
-   else
-        @warn("""It seems the finite-difference test has failed, which indicates
-        that there is an inconsistency between the function and gradient
-        evaluation. Please double-check this manually / visually. (It is
-        also possible that the function being tested is poorly scaled.)""")
-        return false
-   end
-end
-
-function test_evaluate_ed2(basis, BB)
-   A = evaluate_ed2(basis, BB)[1]
-   ddA = evaluate_ed2(basis, BB)[3]
-   errors = Float64[]
-   # loop through finite-difference step-lengths
-   @printf("---------|----------- \n")
-   @printf("    h    | error \n")
-   @printf("---------|----------- \n")
-   for p = 2:11
-       h = 0.1^p
-       ddAh = deepcopy(ddA)
-       Δ = deepcopy(ddA)
-       for n = 1:length(ddAh) # basis
-           for i = 1:length(ddAh[n]) #NB
-               for j = 1:length(ddAh[n][i]) #BB[i]
-                   BB[i][j] += h
-                   ddAh[n][i][j] = evaluate(basis, BB)[n] - 2 * A[n]
-                   BB[i][j] -= 2*h
-                   ddAh[n][i][j] = (ddAh[n][i][j] + evaluate(basis, BB)[n])/h^2
-                   BB[i][j] += h 
-                   Δ[n][i][j] = ddA[n][i][j] - ddAh[n][i][j]
-               end
-           end
-       end
-       push!(errors, maximum([norm(Δ[i][j], Inf) for i = 1:length(Δ), j = 1:length(Δ[i])] ))
-       @printf(" %1.1e | %4.2e  \n", h, errors[end])
-   end
-   @printf("---------|----------- \n")
-   if minimum(errors) <= 1e-3 * maximum(errors)
-       println("passed")
-       return true
-  else
-       @warn("""It seems the finite-difference test has failed, which indicates
-       that there is an inconsistency between the function and gradient
-       evaluation. Please double-check this manually / visually. (It is
-       also possible that the function being tested is poorly scaled.)""")
-       return false
-  end
-end
 
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 42eda64..24782e7 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -1,10 +1,10 @@
 using Test
 using Polynomials4ML.Testing: println_slim, print_tf
-using Polynomials4ML: SparseProduct, evaluate, evaluate_ed, evaluate_ed2, test_evaluate, test_evaluate_ed, test_evaluate_ed2
+using Printf
+using Polynomials4ML: SparseProduct, evaluate, evaluate_ed, evaluate_ed2
 using LinearAlgebra: norm
 using Polynomials4ML
 using ACEbase.Testing: fdtest
-using Printf
 ##
 NB = 3
 
@@ -16,8 +16,88 @@ spec = sort([ Tuple([rand(1:N[i]) for i = 1:NB]) for _ = 1:6])
 
 basis = SparseProduct(spec)
 
+test_evaluate(basis::SparseProduct, BB::Tuple) = 
+       [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
+            for i = 1:length(basis) ]
+
+function test_evaluate_ed(basis, BB)
+    A = evaluate_ed(basis, BB)[1]
+    dA = evaluate_ed(basis, BB)[2]
+    errors = Float64[]
+    # loop through finite-difference step-lengths
+    @printf("---------|----------- \n")
+    @printf("    h    | error \n")
+    @printf("---------|----------- \n")
+    for p = 2:11
+        h = 0.1^p
+        dAh = deepcopy(dA)
+        Δ = deepcopy(dA)
+        for n = 1:length(dAh) # basis
+            for i = 1:length(dAh[n]) #NB
+                for j = 1:length(dAh[n][i]) #BB[i]
+                    BB[i][j] += h
+                    dAh[n][i][j] = (evaluate(basis, BB)[n] - A[n])/h
+                    Δ[n][i][j] = dA[n][i][j] - dAh[n][i][j]
+                    BB[i][j] -= h
+                end
+            end
+        end
+        push!(errors, maximum([norm(Δ[i][j], Inf) for i = 1:length(Δ) for j = 1:length(Δ[i])] ))
+        @printf(" %1.1e | %4.2e  \n", h, errors[end])
+    end
+    @printf("---------|----------- \n")
+    if minimum(errors) <= 1e-3 * maximum(errors)
+        println("passed")
+        return true
+   else
+        @warn("""It seems the finite-difference test has failed, which indicates
+        that there is an inconsistency between the function and gradient
+        evaluation. Please double-check this manually / visually. (It is
+        also possible that the function being tested is poorly scaled.)""")
+        return false
+   end
+end
+
+function test_evaluate_ed2(basis, BB)
+   A = evaluate_ed2(basis, BB)[1]
+   ddA = evaluate_ed2(basis, BB)[3]
+   errors = Float64[]
+   # loop through finite-difference step-lengths
+   @printf("---------|----------- \n")
+   @printf("    h    | error \n")
+   @printf("---------|----------- \n")
+   for p = 2:11
+       h = 0.1^p
+       ddAh = deepcopy(ddA)
+       Δ = deepcopy(ddA)
+       for n = 1:length(ddAh) # basis
+           for i = 1:length(ddAh[n]) #NB
+               for j = 1:length(ddAh[n][i]) #BB[i]
+                   BB[i][j] += h
+                   ddAh[n][i][j] = evaluate(basis, BB)[n] - 2 * A[n]
+                   BB[i][j] -= 2*h
+                   ddAh[n][i][j] = (ddAh[n][i][j] + evaluate(basis, BB)[n])/h^2
+                   BB[i][j] += h 
+                   Δ[n][i][j] = ddA[n][i][j] - ddAh[n][i][j]
+               end
+           end
+       end
+       push!(errors, maximum([norm(Δ[i][j], Inf) for i = 1:length(Δ) for j = 1:length(Δ[i])] ))
+       @printf(" %1.1e | %4.2e  \n", h, errors[end])
+   end
+   @printf("---------|----------- \n")
+   if minimum(errors) <= 1e-3 * maximum(errors)
+       println("passed")
+       return true
+  else
+       @warn("""It seems the finite-difference test has failed, which indicates
+       that there is an inconsistency between the function and gradient
+       evaluation. Please double-check this manually / visually. (It is
+       also possible that the function being tested is poorly scaled.)""")
+       return false
+  end
+end
 
-## 
 
 @info("Test serial evaluation")
 
@@ -50,7 +130,7 @@ A1 = evaluate_ed2(basis, BB)[1]
 A2 = evaluate_ed2(basis, BB)[2]
 
 println_slim(@test AA ≈ A1 )
-Δ = maximum([norm(dA[i][j] - A2[i][j], Inf) for i = 1:length(dA), j = 1:length(dA[1])])
+Δ = maximum([norm(dA[i][j] - A2[i][j], Inf) for i = 1:length(dA) for j = 1:length(dA[i])])
 println_slim(@test Δ ≈ 0.0)
 @info("Test batch evaluation")
 
@@ -71,56 +151,66 @@ println_slim(@test bA1 ≈ bA2)
 nX = 2
 bBB = Tuple([randn(nX, N[i]) for i = 1:NB])
 A1 = zeros(ComplexF64, nX, length(basis))
-bA1 = zeros(ComplexF64, nX, length(basis))
 _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
-dA = [Tuple([similar(BB[i]) for i = 1:length(BB)]) for i = 1:nX, j = 1:length(basis)]  # nX * basis
+bA1 = [_similar(bBB) for j = 1:length(basis)]  # one tuple of bBB-shaped matrices per basis function
 
-Δ = []
 for j = 1:nX
     A1[j, :] = evaluate_ed(basis, Tuple([bBB[i][j, :] for i = 1:NB]))[1]
 end
-#for i = 1:length(basis)
-#    for j = 1:nX
-#        for z = 1:NB
-#            dA[j,i][z] = evaluate_ed(basis, Tuple([bBB[i][j, :] for i = 1:NB]))[2][i][z]
-#        end
-#    end
-#end
-
+for i = 1:length(basis)
+    for j = 1:NB
+        for z = 1:nX
+            bA1[i][j][z,:] = (evaluate_ed(basis, Tuple([bBB[i][z, :] for i = 1:NB]))[2][i][j])
+        end
+    end
+end
 
 A2 = evaluate_ed(basis, bBB)[1]
 bA2 = evaluate_ed(basis, bBB)[2]
 
 println_slim(@test A1 ≈ A2)
-println_slim(@test bA1 ≈ bA2)
+
+Δ = maximum([norm(bA1[i][j] - bA2[i][j], Inf) for i = 1:length(bA1) for j = 1:length(bA1[i])])
+println_slim(@test Δ ≈ 0)
 ## 
-@info("Test batch evaluate_d2")
+
+@info("Test batch evaluate_ed2")
 
 nX = 64 
 bBB = Tuple([randn(nX, N[i]) for i = 1:NB])
-bdBB = Tuple([randn(nX, N[i]) for i = 1:NB])
-bddBB = Tuple([randn(nX, N[i]) for i = 1:NB])
 A1 = zeros(ComplexF64, nX, length(basis))
-bA1 = zeros(ComplexF64, nX, length(basis))
-bbA1 = zeros(ComplexF64, nX, length(basis))
+_similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
+bA1 = [_similar(bBB) for j = 1:length(basis)]  # one tuple of bBB-shaped matrices per basis function
+bbA1 = [_similar(bBB) for j = 1:length(basis)] 
 
 for j = 1:nX
-    A1[j, :] = evaluate_ed2(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bddBB[i][j, :] for i = 1:NB]))[1]
-    bA1[j, :] = evaluate_ed2(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bdBB[i][j, :] for i = 1:NB]), Tuple([bddBB[i][j, :] for i = 1:NB]))[2]
-    bbA1[j, :] = evaluate_ed2(basis, Tuple([bBB[i][j, :] for i = 1:NB]), Tuple([bdBB[i][j, :] for i = 1:NB]), Tuple([bddBB[i][j, :] for i = 1:NB]))[3]
+    A1[j, :] = evaluate_ed2(basis, Tuple([bBB[i][j, :] for i = 1:NB]))[1]
+end
+for i = 1:length(basis)
+    for j = 1:NB
+        for z = 1:nX
+            bA1[i][j][z,:] = (evaluate_ed2(basis, Tuple([bBB[i][z, :] for i = 1:NB]))[2][i][j])
+            bbA1[i][j][z,:] = (evaluate_ed2(basis, Tuple([bBB[i][z, :] for i = 1:NB]))[3][i][j])
+        end
+    end
 end
 
-A2 = evaluate_ed2(basis, bBB, bdBB, bddBB)[1]
-bA2 = evaluate_ed2(basis, bBB, bdBB, bddBB)[2]
-bbA2 = evaluate_ed2(basis, bBB, bdBB, bddBB)[3]
+A2 = evaluate_ed2(basis, bBB)[1]
+bA2 = evaluate_ed2(basis, bBB)[2]
+bbA2 = evaluate_ed2(basis, bBB)[3]
 
 println_slim(@test A1 ≈ A2)
-println_slim(@test bA1 ≈ bA2)
-println_slim(@test bbA1 ≈ bbA2)
+Δ = maximum([norm(bA1[i][j] - bA2[i][j], Inf) for i = 1:length(bA1) for j = 1:length(bA1[i])])
+println_slim(@test Δ ≈ 0)
+Δ = maximum([norm(bbA1[i][j] - bbA2[i][j], Inf) for i = 1:length(bbA1) for j = 1:length(bbA1[i])])
+println_slim(@test Δ ≈ 0)
 
 @info("Testing _rrule_evaluate")
 using LinearAlgebra: dot 
 
+N1 = 10
+N2 = 20
+N3 = 30
 for ntest = 1:30
     local bBB
     local bUU

From ce0a6e73a7addb4c53043016d0443db2952045dc Mon Sep 17 00:00:00 2001
From: cortner <christohortner@gmail.com>
Date: Sat, 20 May 2023 21:06:43 -0700
Subject: [PATCH 11/54] bringing sparseprodpool in line with P4ML

---
 Project.toml                                  |   8 +-
 src/ace/sparseprodpool.jl                     | 195 +++++++++---------
 ...test_1pbasis.jl => test_sparseprodpool.jl} |  23 ++-
 3 files changed, 120 insertions(+), 106 deletions(-)
 rename test/ace/{test_1pbasis.jl => test_sparseprodpool.jl} (70%)

diff --git a/Project.toml b/Project.toml
index 6960570..ff639fa 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
@@ -25,9 +26,9 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 [compat]
 ACEbase = "0.4.2"
 Adapt = "3.5"
+BenchmarkTools = "1"
 ChainRulesCore = "1"
 Combinatorics = "1"
-BenchmarkTools = "1"
 ForwardDiff = "0.10"
 LoopVectorization = "0.12"
 LuxCore = "0.1.3"
@@ -35,14 +36,13 @@ ObjectPools = "0.2.1"
 QuadGK = "2"
 SpecialFunctions = "2.2"
 StaticArrays = "1.5"
-julia = "1.8"
 StrideArrays = "0.1.25"
-
+julia = "1.8"
 
 [extras]
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 test = ["Test", "Lux", "Printf"]
diff --git a/src/ace/sparseprodpool.jl b/src/ace/sparseprodpool.jl
index ca8bb39..0dd9bc6 100644
--- a/src/ace/sparseprodpool.jl
+++ b/src/ace/sparseprodpool.jl
@@ -1,12 +1,17 @@
 
 
-struct PooledSparseProduct{NB}
+struct PooledSparseProduct{NB} <: AbstractPoly4MLBasis
    spec::Vector{NTuple{NB, Int}}
    # ---- temporaries & caches 
+   @reqfields
 end
 
 function PooledSparseProduct()
-   return PooledSparseProduct(bases, NTuple{NB, Int}[])
+   return PooledSparseProduct(NTuple{NB, Int}[], _make_reqfields()...)
+end
+
+function PooledSparseProduct(spect::AbstractVector{<: Tuple})
+   return PooledSparseProduct(spect, _make_reqfields()...)
 end
 
 # each column defines a basis element
@@ -18,37 +23,35 @@ end
 
 Base.length(basis::PooledSparseProduct) = length(basis.spec)
 
-# function Base.show(io::IO, basis::PooledSparseProduct)
-#    print(io, "PooledSparseProduct(")
-#    print(io, basis.bases)
-# end
+function Base.show(io::IO, basis::PooledSparseProduct{NB}) where {NB}
+   print(io, "PooledSparseProduct{$NB}(...)")
+end
 
 
 # ----------------------- evaluation interfaces 
 
+const TupVec = Tuple{Vararg{<: AbstractVector}}
+const TupMat = Tuple{Vararg{<: AbstractMatrix}}
+const TupVecMat = Union{TupVec, TupMat}
 
-function evaluate(basis::PooledSparseProduct, BB::Tuple) 
-   VT = mapreduce(eltype, promote_type, BB)
-   A = zeros(VT, length(basis))
-   evaluate!(A, basis, BB::Tuple)
-   return A 
-end
+_valtype(basis::PooledSparseProduct, BB::Tuple) = 
+      mapreduce(eltype, promote_type, BB)
 
-function evalpool(basis::PooledSparseProduct, BB::Tuple)
-   VT = mapreduce(eltype, promote_type, BB)
-   A = zeros(VT, length(basis))
-   evalpool!(A, basis, BB::Tuple)
-   return A
-end
+_alloc(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
+      acquire!(basis.pool, :A, (length(basis),), _valtype(basis, BB) )
 
+_alloc_d(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
+      acquire!(basis.pool, _outsym(BB), length(basis), _gradtype(basis, BB) )
 
-test_evaluate(basis::PooledSparseProduct, BB::Tuple) = 
-       [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
-            for i = 1:length(basis) ]
+_alloc_dd(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
+      acquire!(basis.pool, _outsym(BB), length(basis), _gradtype(basis, BB) )
+
+_alloc_ed(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
+      _alloc(basis, BB), _alloc_d(basis, BB)
+
+_alloc_ed2(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
+      _alloc(basis, BB), _alloc_d(basis, BB), _alloc_dd(basis, BB)
 
-test_evalpool(basis::PooledSparseProduct, BB::Tuple) = 
-      sum( test_evaluate(basis, ntuple(i -> BB[i][j, :], length(BB)))
-         for j = 1:size(BB[1], 1) )            
 
 # ----------------------- evaluation kernels 
 
@@ -76,7 +79,7 @@ import Base.Cartesian: @nexprs
 #    return nothing 
 # end
 
-function evaluate!(A, basis::PooledSparseProduct{NB}, BB) where {NB}
+function evaluate!(A, basis::PooledSparseProduct{NB}, BB::TupVec) where {NB}
    @assert length(BB) == NB
    # evaluate the 1p product basis functions and add/write into _A
    spec = basis.spec
@@ -107,7 +110,7 @@ end
 # end
 
 # BB::tuple of matrices 
-function evalpool!(A, basis::PooledSparseProduct{NB}, BB, 
+function evaluate!(A, basis::PooledSparseProduct{NB}, BB::TupMat, 
                    nX = size(BB[1], 1)) where {NB}
    @assert all(B->size(B, 1) >= nX, BB)
    spec = basis.spec
@@ -124,71 +127,73 @@ function evalpool!(A, basis::PooledSparseProduct{NB}, BB,
 end
 
 
-struct LinearBatch
-   groups::Vector{Int}
-end
 
-function linearbatch(target::AbstractVector{<: Integer})
-   @assert issorted(target)
-   @assert minimum(target) > 0 
-   ngroups = target[end] 
-   groups = zeros(Int, ngroups+1)
-   gidx = 1 
-   i = 1
-   groups[1] = 1 
-   for gidx = 1:ngroups 
-      while (i <= length(target)) && (target[i] == gidx)
-         i += 1
-      end
-      groups[gidx+1] = i
-   end
-   return LinearBatch(groups)
-end
 
-evalpool_batch!(A, basis::PooledSparseProduct, BB, 
-                   target::AbstractVector{<: Integer}) = 
-    evalpool_batch!(A, basis, BB, linearbatch(target))
+# struct LinearBatch
+#    groups::Vector{Int}
+# end
 
-function evalpool_batch!(A, basis::PooledSparseProduct{NB}, BB, 
-                         target::LinearBatch) where {NB}
-   nX = size(BB[1], 1)
-   nA = size(A, 1)
-   @assert length(target.groups)-1 <= nA 
-   @assert all(B->size(B, 1) == nX, BB)
-   spec = basis.spec
+# function linearbatch(target::AbstractVector{<: Integer})
+#    @assert issorted(target)
+#    @assert minimum(target) > 0 
+#    ngroups = target[end] 
+#    groups = zeros(Int, ngroups+1)
+#    gidx = 1 
+#    i = 1
+#    groups[1] = 1 
+#    for gidx = 1:ngroups 
+#       while (i <= length(target)) && (target[i] == gidx)
+#          i += 1
+#       end
+#       groups[gidx+1] = i
+#    end
+#    return LinearBatch(groups)
+# end
 
-   @inbounds for (iA, ϕ) in enumerate(spec)
-      for t = 1:length(target.groups)-1
-         a_t = zero(eltype(A))
-         @simd ivdep for j = target.groups[t]:target.groups[t+1]-1
-            a_t += BB_prod(ϕ, BB, j)
-         end
-         A[t, iA] = a_t 
-      end
-   end
-   return nothing
-end
+# evalpool_batch!(A, basis::PooledSparseProduct, BB, 
+#                    target::AbstractVector{<: Integer}) = 
+#     evalpool_batch!(A, basis, BB, linearbatch(target))
+
+# function evalpool_batch!(A, basis::PooledSparseProduct{NB}, BB, 
+#                          target::LinearBatch) where {NB}
+#    nX = size(BB[1], 1)
+#    nA = size(A, 1)
+#    @assert length(target.groups)-1 <= nA 
+#    @assert all(B->size(B, 1) == nX, BB)
+#    spec = basis.spec
+
+#    @inbounds for (iA, ϕ) in enumerate(spec)
+#       for t = 1:length(target.groups)-1
+#          a_t = zero(eltype(A))
+#          @simd ivdep for j = target.groups[t]:target.groups[t+1]-1
+#             a_t += BB_prod(ϕ, BB, j)
+#          end
+#          A[t, iA] = a_t 
+#       end
+#    end
+#    return nothing
+# end
 
 
-function evalpool!(A::VA, basis::PooledSparseProduct{2}, BB) where {VA}
-   nX = size(BB[1], 1)
-   @assert size(BB[2], 1) >= nX 
-   @assert length(A) == length(basis)
-   spec = basis.spec
-   BB1 = BB[1] 
-   BB2 = BB[2] 
+# function evalpool!(A::VA, basis::PooledSparseProduct{2}, BB) where {VA}
+#    nX = size(BB[1], 1)
+#    @assert size(BB[2], 1) >= nX 
+#    @assert length(A) == length(basis)
+#    spec = basis.spec
+#    BB1 = BB[1] 
+#    BB2 = BB[2] 
 
-   @inbounds for (iA, ϕ) in enumerate(spec)
-      a = zero(eltype(A))
-      ϕ1 = ϕ[1]; ϕ2 = ϕ[2]
-      @simd ivdep for j = 1:nX
-         a = muladd(BB1[j, ϕ1], BB2[j, ϕ2], a)
-      end
-      A[iA] = a
-   end
+#    @inbounds for (iA, ϕ) in enumerate(spec)
+#       a = zero(eltype(A))
+#       ϕ1 = ϕ[1]; ϕ2 = ϕ[2]
+#       @simd ivdep for j = 1:nX
+#          a = muladd(BB1[j, ϕ1], BB2[j, ϕ2], a)
+#       end
+#       A[iA] = a
+#    end
 
-   return nothing
-end
+#    return nothing
+# end
 
 # this code should never be used, we keep it just for testing 
 # the performance of the generated code. 
@@ -325,8 +330,8 @@ end
 # --------------------- connect with ChainRules 
 # todo ... 
 
-function rrule(::typeof(evalpool), basis::PooledSparseProduct{NB}, BB::Tuple) where {NB}
-   A = evalpool(basis, BB)
+function rrule(::typeof(evaluate), basis::PooledSparseProduct{NB}, BB::Tuple) where {NB}
+   A = evaluate(basis, BB)
 
    function pb(Δ)
       ∂BB = _pullback_evalpool(Δ, basis, BB)
@@ -339,17 +344,17 @@ end
 # --------------------- connect with Lux 
 
 
-struct PooledSparseProductLayer{NB} <: AbstractExplicitLayer 
-   basis::PooledSparseProduct{NB}
-end
+# struct PooledSparseProductLayer{NB} <: AbstractExplicitLayer 
+#    basis::PooledSparseProduct{NB}
+# end
 
-lux(basis::PooledSparseProduct) = PooledSparseProductLayer(basis)
+# lux(basis::PooledSparseProduct) = PooledSparseProductLayer(basis)
 
-initialparameters(rng::AbstractRNG, layer::PooledSparseProductLayer) = 
-      NamedTuple() 
+# initialparameters(rng::AbstractRNG, layer::PooledSparseProductLayer) = 
+#       NamedTuple() 
 
-initialstates(rng::AbstractRNG, layer::PooledSparseProductLayer) = 
-      NamedTuple()
+# initialstates(rng::AbstractRNG, layer::PooledSparseProductLayer) = 
+#       NamedTuple()
 
-(l::PooledSparseProductLayer)(BB, ps, st) = 
-      evalpool(l.basis, BB), st 
+# (l::PooledSparseProductLayer)(BB, ps, st) = 
+#       evaluate(l.basis, BB), st 
diff --git a/test/ace/test_1pbasis.jl b/test/ace/test_sparseprodpool.jl
similarity index 70%
rename from test/ace/test_1pbasis.jl
rename to test/ace/test_sparseprodpool.jl
index 4d13cd8..9ac02c0 100644
--- a/test/ace/test_1pbasis.jl
+++ b/test/ace/test_sparseprodpool.jl
@@ -1,11 +1,20 @@
 
 using BenchmarkTools, Test, Polynomials4ML
-using Polynomials4ML:  PooledSparseProduct, test_evaluate, evaluate, evaluate!, 
-                evalpool, test_evalpool, evalpool!
+using Polynomials4ML:  PooledSparseProduct, evaluate, evaluate! 
 using ACEbase.Testing: fdtest, println_slim, print_tf
+       
+test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{<: AbstractVector}}) = 
+       [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
+            for i = 1:length(basis) ]
+
+test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{<: AbstractMatrix}}) = 
+      sum( test_evaluate(basis, ntuple(i -> BB[i][j, :], length(BB)))
+         for j = 1:size(BB[1], 1) )            
 
 P4ML = Polynomials4ML
 
+##
+
 N1 = 10 
 N2 = 20 
 N3 = 50 
@@ -20,7 +29,7 @@ basis = PooledSparseProduct(spec)
 
 ## 
 
-@info("Test serial evaluation")
+@info("Test evaluation with a single input (no pooling)")
 
 BB = (B1, B2, B3)
 
@@ -31,16 +40,16 @@ println_slim(@test A1 ≈ A2 )
 
 ## 
 
-@info("Test batched evaluation")
+@info("Test pooling of multiple inputs")
 nX = 64 
 bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
 
 # using the naive evaluation code 
-bA1 = test_evalpool(basis, bBB)
-bA2 = evalpool(basis, bBB)
+bA1 = test_evaluate(basis, bBB)
+bA2 = evaluate(basis, bBB)
 
 bA3 = copy(bA2)
-evalpool!(bA3, basis, bBB)
+evaluate!(bA3, basis, bBB)
 
 println_slim(@test bA1 ≈ bA2 ≈ bA3 )
 

From 7732a092ce81a0a889081e57d22f0d71d4207b07 Mon Sep 17 00:00:00 2001
From: cortner <christohortner@gmail.com>
Date: Sat, 20 May 2023 21:15:35 -0700
Subject: [PATCH 12/54] fixed sparseprodpool rrule

---
 src/ace/sparseprodpool.jl       | 80 ++++++++++++++++-----------------
 test/ace/test_sparseprodpool.jl |  6 +--
 2 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/src/ace/sparseprodpool.jl b/src/ace/sparseprodpool.jl
index 0dd9bc6..ff1e52c 100644
--- a/src/ace/sparseprodpool.jl
+++ b/src/ace/sparseprodpool.jl
@@ -224,22 +224,22 @@ using StaticArrays
 
 
 
-function _rrule_evalpool(basis::PooledSparseProduct{NB}, BB::Tuple) where {NB}
-   A = evalpool(basis, BB)
-   return A, ∂A -> _pullback_evalpool(∂A, basis, BB)
+function _rrule_evaluate(basis::PooledSparseProduct{NB}, BB::TupMat) where {NB}
+   A = evaluate(basis, BB)
+   return A, ∂A -> _pullback_evaluate(∂A, basis, BB)
 end
 
 
-function _pullback_evalpool(∂A, basis::PooledSparseProduct{NB}, BB::Tuple) where {NB}
+function _pullback_evaluate(∂A, basis::PooledSparseProduct{NB}, BB::TupMat) where {NB}
    nX = size(BB[1], 1)
    TA = promote_type(eltype.(BB)...)
    ∂BB = ntuple(i -> zeros(TA, size(BB[i])...), NB)
-   _pullback_evalpool!(∂BB, ∂A, basis, BB)
+   _pullback_evaluate!(∂BB, ∂A, basis, BB)
    return ∂BB
 end
 
 
-function _pullback_evalpool!(∂BB, ∂A, basis::PooledSparseProduct{NB}, BB::Tuple) where {NB}
+function _pullback_evaluate!(∂BB, ∂A, basis::PooledSparseProduct{NB}, BB::TupMat) where {NB}
    nX = size(BB[1], 1)
    @assert all(nX <= size(BB[i], 1) for i = 1:NB)
    @assert all(nX <= size(∂BB[i], 1) for i = 1:NB)
@@ -268,7 +268,7 @@ end
 #       a cruder code generation strategy. This specialized code 
 #       confirms this. 
 
-function _pullback_evalpool!(∂BB, ∂A, basis::PooledSparseProduct{2}, BB::Tuple)
+function _pullback_evaluate!(∂BB, ∂A, basis::PooledSparseProduct{2}, BB::Tuple)
    nX = size(BB[1], 1)
    NB = 2 
    @assert length(∂A) == length(basis)
@@ -293,48 +293,48 @@ end
 
 
 
-function _pullback_evalpool!(∂BB, ∂A, basis::PooledSparseProduct{NB}, 
-                             BB::Tuple, target::AbstractVector{<: Integer}) where {NB}
-   nX = size(BB[1], 1)
-   nT = size(∂A, 1)
-   mint, maxt = extrema(target)
-   @assert 0 < mint <= maxt <= nT
-   @assert all(nX <= size(BB[i], 1) for i = 1:NB)
-   @assert all(nX <= size(∂BB[i], 1) for i = 1:NB)
-   @assert all(size(∂BB[i], 2) >= size(BB[i], 2) for i = 1:NB)
-   @assert size(∂A, 2) == length(basis)
-   @assert length(BB) == NB 
-   @assert length(∂BB) == NB 
-
-   # ∂A_loc = zeros(eltype(∂A), nT)
+# function _pullback_evaluate!(∂BB, ∂A, basis::PooledSparseProduct{NB}, 
+#                              BB::Tuple, target::AbstractVector{<: Integer}) where {NB}
+#    nX = size(BB[1], 1)
+#    nT = size(∂A, 1)
+#    mint, maxt = extrema(target)
+#    @assert 0 < mint <= maxt <= nT
+#    @assert all(nX <= size(BB[i], 1) for i = 1:NB)
+#    @assert all(nX <= size(∂BB[i], 1) for i = 1:NB)
+#    @assert all(size(∂BB[i], 2) >= size(BB[i], 2) for i = 1:NB)
+#    @assert size(∂A, 2) == length(basis)
+#    @assert length(BB) == NB 
+#    @assert length(∂BB) == NB 
+
+#    # ∂A_loc = zeros(eltype(∂A), nT)
 
-   @inbounds for (iA, ϕ) in enumerate(basis.spec)
-      # @simd ivdep for t = 1:nT 
-      #    ∂A_loc[t] = ∂A[t, iA]
-      # end
-      @simd ivdep for j = 1:nX 
-         ∂A_iA = ∂A[target[j], iA] # ∂A_loc[target[j] ] 
-         b = ntuple(Val(NB)) do i 
-            @inbounds BB[i][j, ϕ[i]] 
-         end 
-         g = _prod_grad(b, Val(NB))
-         for i = 1:NB 
-            ∂BB[i][j, ϕ[i]] = muladd(∂A_iA, g[i], ∂BB[i][j, ϕ[i]])
-         end
-      end 
-   end
-   return nothing 
-end
+#    @inbounds for (iA, ϕ) in enumerate(basis.spec)
+#       # @simd ivdep for t = 1:nT 
+#       #    ∂A_loc[t] = ∂A[t, iA]
+#       # end
+#       @simd ivdep for j = 1:nX 
+#          ∂A_iA = ∂A[target[j], iA] # ∂A_loc[target[j] ] 
+#          b = ntuple(Val(NB)) do i 
+#             @inbounds BB[i][j, ϕ[i]] 
+#          end 
+#          g = _prod_grad(b, Val(NB))
+#          for i = 1:NB 
+#             ∂BB[i][j, ϕ[i]] = muladd(∂A_iA, g[i], ∂BB[i][j, ϕ[i]])
+#          end
+#       end 
+#    end
+#    return nothing 
+# end
 
 
 # --------------------- connect with ChainRules 
 # todo ... 
 
-function rrule(::typeof(evaluate), basis::PooledSparseProduct{NB}, BB::Tuple) where {NB}
+function rrule(::typeof(evaluate), basis::PooledSparseProduct{NB}, BB::TupMat) where {NB}
    A = evaluate(basis, BB)
 
    function pb(Δ)
-      ∂BB = _pullback_evalpool(Δ, basis, BB)
+      ∂BB = _pullback_evaluate(Δ, basis, BB)
       return NoTangent(), NoTangent(), ∂BB
    end 
 
diff --git a/test/ace/test_sparseprodpool.jl b/test/ace/test_sparseprodpool.jl
index 9ac02c0..ca4680b 100644
--- a/test/ace/test_sparseprodpool.jl
+++ b/test/ace/test_sparseprodpool.jl
@@ -83,11 +83,11 @@ for ntest = 1:30
    bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
    bUU = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
    _BB(t) = ( bBB[1] + t * bUU[1], bBB[2] + t * bUU[2], bBB[3] + t * bUU[3] )
-   bA2 = evalpool(basis, bBB)
+   bA2 = evaluate(basis, bBB)
    u = randn(size(bA2))
-   F(t) = dot(u, evalpool(basis, _BB(t)))
+   F(t) = dot(u, evaluate(basis, _BB(t)))
    dF(t) = begin
-      val, pb = P4ML._rrule_evalpool(basis, _BB(t))
+      val, pb = P4ML._rrule_evaluate(basis, _BB(t))
       ∂BB = pb(u)
       return sum( dot(∂BB[i], bUU[i]) for i = 1:length(bUU) )
    end

From b1c990534c5bb25e110571c2013337777bf85710 Mon Sep 17 00:00:00 2001
From: cortner <christohortner@gmail.com>
Date: Sat, 20 May 2023 22:30:07 -0700
Subject: [PATCH 13/54] some lux bugfixes

---
 src/lux.jl        | 23 +++++++++++------------
 test/runtests.jl  |  2 +-
 test/test_flex.jl |  1 +
 test/test_lux.jl  |  2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/lux.jl b/src/lux.jl
index cfad8b6..59e48f0 100644
--- a/src/lux.jl
+++ b/src/lux.jl
@@ -9,7 +9,11 @@ that the basis accepts a number or short vector as input and produces an output
 that is a vector. It also assumes that batched operations are implemented, 
 as well as some other functionality. 
 """
-function lux(basis::AbstractPoly4MLBasis; meta = Dict{String, Any}())
+function lux(basis::AbstractPoly4MLBasis; 
+               name = String(nameof(typeof(basis))), 
+               meta = Dict{String, Any}("name" => name), 
+            )
+   @assert haskey(meta, "name")
    return PolyLuxLayer(basis, meta)
 end
 
@@ -35,6 +39,11 @@ struct PolyLuxLayer{TB} <: AbstractExplicitLayer
    meta::Dict{String, Any}
 end
 
+function Base.show(io::IO, l::PolyLuxLayer)
+   print(io, "PolyLuxLayer($(l.meta["name"]))")
+end
+
+
 Base.length(l::PolyLuxLayer) = length(l.basis)
 
 initialparameters(rng::AbstractRNG, l::PolyLuxLayer) = _init_luxparams(rng, l.basis)
@@ -43,16 +52,6 @@ initialstates(rng::AbstractRNG, l::PolyLuxLayer) = _init_luxstate(rng, l.basis)
 
 (l::PolyLuxLayer)(args...) = evaluate(l, args...)
 
-function evaluate(l::PolyLuxLayer, x::SINGLE, ps, st)
-   B = acquire!(st.cache, :B, (length(l.basis), ), _valtype(l.basis, x))
-   evaluate!(parent(B), l.basis, x)
-   return B 
-end 
-
-function evaluate(l::PolyLuxLayer, X::AbstractArray{<: SINGLE}, ps, st)
-   B = acquire!(st.cache[:Bbatch], (length(l.basis), length(X)), _valtype(l.basis, X[1]))
-   evaluate!(parent(B), l.basis, X)
-   return B 
-end
+evaluate(l::PolyLuxLayer, X, ps, st) = evaluate(l.basis, X), st
 
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 4e8dda5..08f65d4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -21,7 +21,7 @@ using Test
     @testset "Atomic Orbitals Radials" begin include("test_atorbrad.jl"); end
 
     # ACE 
-    @testset "SparsePooledProduct" begin include("ace/test_1pbasis.jl"); end 
+    @testset "SparsePooledProduct" begin include("ace/test_sparseprodpool.jl"); end 
     @testset "Sparse Symmetric Product" begin include("ace/test_prodbasis1.jl"); end 
 
     # Misc
diff --git a/test/test_flex.jl b/test/test_flex.jl
index 0fff56c..2e36c60 100644
--- a/test/test_flex.jl
+++ b/test/test_flex.jl
@@ -15,6 +15,7 @@ tests = [
 
 for (basis, rnd) in tests   
    for ntest = 1:5 
+      local B1, B2 
       x = rnd()
       B0 = zeros(Polynomials4ML._valtype(basis, x), length(basis))
       evaluate!(B0, basis, x)
diff --git a/test/test_lux.jl b/test/test_lux.jl
index 96184a8..930ffbb 100644
--- a/test/test_lux.jl
+++ b/test/test_lux.jl
@@ -22,6 +22,6 @@ for (basis, rnd) in test_bases
    B1 = evaluate(basis, x)
    l = lux(basis)
    ps, st = Lux.setup(rng, l)
-   B2 = l(x, ps, st)
+   B2, _ = l(x, ps, st)
    println_slim(@test B1 == parent(B2))
 end

From 2bbc1210e592fe252ad33ee8e4840a1e43f9fe7d Mon Sep 17 00:00:00 2001
From: cortner <christohortner@gmail.com>
Date: Sun, 21 May 2023 22:46:33 -0700
Subject: [PATCH 14/54] several more bugfixes throughout

---
 Project.toml                |  1 +
 src/ace/ACEcore.jl          | 25 --------------------
 src/ace/sparseprodpool.jl   |  4 ++++
 src/ace/sparsesymmprod.jl   | 46 ++++++++++++++++++++-----------------
 src/ace/symmprod_dag.jl     | 24 ++++++++++---------
 src/lux.jl                  | 11 ++++++++-
 test/ace/test_prodbasis1.jl |  9 +++++---
 test/test_sparseproduct.jl  | 11 ++++++++-
 8 files changed, 69 insertions(+), 62 deletions(-)
 delete mode 100644 src/ace/ACEcore.jl

diff --git a/Project.toml b/Project.toml
index ff639fa..a54ed05 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
+NamedTupleTools = "d9ec5142-1e00-5aa0-9d6a-321866360f50"
 ObjectPools = "658cac36-ff0f-48ad-967c-110375d98c9d"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc"
diff --git a/src/ace/ACEcore.jl b/src/ace/ACEcore.jl
deleted file mode 100644
index 85ba77a..0000000
--- a/src/ace/ACEcore.jl
+++ /dev/null
@@ -1,25 +0,0 @@
-module ACE
-
-using ObjectPools: FlexArrayCache, FlexArray, 
-                   ArrayPool, TSafe, 
-                   acquire!, release!
-
-import ChainRulesCore: rrule, NoTangent 
-
-import LuxCore: AbstractExplicitLayer, AbstractExplicitContainerLayer, 
-                 initialparameters, initialstates                 
-
-using Random: AbstractRNG                 
-
-include("sparseprod.jl")
-
-include("symmprod_dag.jl")
-include("symmprod_dag_kernels.jl")
-
-include("simpleprodbasis.jl")
-include("sparsesymmprod.jl")
-
-include("utils/utils.jl")
-include("testing.jl")
-
-end
diff --git a/src/ace/sparseprodpool.jl b/src/ace/sparseprodpool.jl
index ff1e52c..aa993b3 100644
--- a/src/ace/sparseprodpool.jl
+++ b/src/ace/sparseprodpool.jl
@@ -341,7 +341,11 @@ function rrule(::typeof(evaluate), basis::PooledSparseProduct{NB}, BB::TupMat) w
    return A, pb 
 end
 
+
 # --------------------- connect with Lux 
+# it looks like we could use the standard P4ML basis wrapper 
+# but technically the pooling operation changes the behaviour in
+# a few ways and we need to be very careful about this
 
 
 # struct PooledSparseProductLayer{NB} <: AbstractExplicitLayer 
diff --git a/src/ace/sparsesymmprod.jl b/src/ace/sparsesymmprod.jl
index d5f6af0..ad8bc19 100644
--- a/src/ace/sparsesymmprod.jl
+++ b/src/ace/sparsesymmprod.jl
@@ -1,15 +1,15 @@
 
 using LoopVectorization
 
-struct SparseSymmProd{T} 
-   dag::SparseSymmProdDAG{T} 
+struct SparseSymmProd <: AbstractPoly4MLBasis
+   dag::SparseSymmProdDAG
    proj::Vector{Int}
-   pool::ArrayPool{FlexArrayCache}
+   @reqfields
 end
 
-function SparseSymmProd(spec::AbstractVector{<: AbstractVector}; T = Float64, kwargs...)
-   dag = SparseSymmProdDAG(spec; T=T, kwargs...)
-   return SparseSymmProd(dag, dag.projection, ArrayPool(FlexArrayCache))
+function SparseSymmProd(spec::AbstractVector{<: Union{Tuple, AbstractVector}}; kwargs...)
+   dag = SparseSymmProdDAG(spec; kwargs...)
+   return SparseSymmProd(dag, dag.projection, _make_reqfields()... )
 end
 
 Base.length(basis::SparseSymmProd) = length(basis.proj)
@@ -20,16 +20,20 @@ reconstruct_spec(basis::SparseSymmProd) = reconstruct_spec(basis.dag)[basis.proj
 
 # -------------- evaluation interfaces 
 
+_valtype(basis::SparseSymmProd, ::Type{T}) where {T} = T
+
+(basis::SparseSymmProd)(args...) = evaluate(basis, args...)
+
 function evaluate(basis::SparseSymmProd, A::AbstractVector{T}) where {T}
    AA = acquire!(basis.pool, :AA, (length(basis),), T)
-   evaluate!(parent(AA), basis, A)
+   evaluate!(AA, basis, A)
    return AA
 end
 
 function evaluate(basis::SparseSymmProd, A::AbstractMatrix{T}) where {T}
    nX = size(A, 1)
    AA = acquire!(basis.pool, :AAbatch, (nX, length(basis)), T)
-   evaluate!(parent(AA), basis, A)
+   evaluate!(AA, basis, A)
    return AA
 end
 
@@ -45,7 +49,7 @@ function evaluate!(AA, basis::SparseSymmProd, A)
 end
 
 # serial projection 
-function _project!(BB, proj::Vector{<: Integer}, AA::AbstractVector)
+function _project!(BB, proj::Vector{<: Integer}, AA::AbstractVector{<: Number})
    @inbounds for i = 1:length(proj)
       BB[i] = AA[proj[i]]
    end
@@ -53,12 +57,12 @@ function _project!(BB, proj::Vector{<: Integer}, AA::AbstractVector)
 end
 
 # batched projection 
-function _project!(BB, proj::Vector{<: Integer}, AA::AbstractMatrix)
+function _project!(BB, proj::Vector{<: Integer}, AA::AbstractMatrix{<: Number})
    nX = size(AA, 1)
    @assert size(BB, 1) >= nX
    @inbounds for i = 1:length(proj)
       p_i = proj[i]
-      for j = 1:nX
+      @simd ivdep for j = 1:nX
          BB[j, i] = AA[j, p_i]
       end
    end
@@ -94,21 +98,21 @@ end
 
 # -------------- Lux integration 
 
-struct SparseSymmProdLayer{T} <: AbstractExplicitLayer
-   basis::SparseSymmProd{T}
-end
+# struct SparseSymmProdLayer{T} <: AbstractExplicitLayer
+#    basis::SparseSymmProd{T}
+# end
 
-function lux(basis::SparseSymmProd) 
-   return SparseSymmProdLayer(basis)
-end
+# function lux(basis::SparseSymmProd) 
+#    return SparseSymmProdLayer(basis)
+# end
 
-Base.length(l::SparseSymmProdLayer) = length(l.basis)
+# Base.length(l::SparseSymmProdLayer) = length(l.basis)
 
-initialparameters(rng::AbstractRNG, l::SparseSymmProdLayer) = NamedTuple() 
+# initialparameters(rng::AbstractRNG, l::SparseSymmProdLayer) = NamedTuple() 
 
-initialstates(rng::AbstractRNG, l::SparseSymmProdLayer) = NamedTuple()
+# initialstates(rng::AbstractRNG, l::SparseSymmProdLayer) = NamedTuple()
 
-(l::SparseSymmProdLayer)(A, ps, st) = evaluate(l.basis, A), st 
+# (l::SparseSymmProdLayer)(A, ps, st) = evaluate(l.basis, A), st 
 
 
 
diff --git a/src/ace/symmprod_dag.jl b/src/ace/symmprod_dag.jl
index 1e33f1e..8382f6c 100644
--- a/src/ace/symmprod_dag.jl
+++ b/src/ace/symmprod_dag.jl
@@ -5,26 +5,29 @@ using Combinatorics: combinations, partitions
 
 const BinDagNode = Tuple{Int, Int}
 
-struct SparseSymmProdDAG{T}
+struct SparseSymmProdDAG <: AbstractPoly4MLBasis
    nodes::Vector{BinDagNode}
    has0::Bool
    num1::Int
    numstore::Int
    projection::Vector{Int}
    # ---- temps
-   pool::ArrayPool{FlexArrayCache}
+   @reqfields
 end
 
-# warning: this is not the length of the basis!!! 
+# warning: if SparseSymmProdDAG is an extended basis, then `length` will be 
+# the extended length and not the length of the actual basis. 
+
 length(dag::SparseSymmProdDAG) = length(dag.nodes)
 
 # ==(dag1::SparseSymmProdDAG, dag2::SparseSymmProdDAG) = ACE1._allfieldsequal(dag1, dag2)
 
-SparseSymmProdDAG(; T=Float64) = SparseSymmProdDAG{T}(Vector{BinDagNode}(undef, 0), 0, 0)
+SparseSymmProdDAG() = SparseSymmProdDAG(Vector{BinDagNode}(undef, 0), false, 0, 0)
+
+SparseSymmProdDAG(nodes, has0, num1, numstore, proj)   = 
+         SparseSymmProdDAG(nodes, has0, num1, numstore, proj, 
+                              _make_reqfields()...)
 
-SparseSymmProdDAG{T}(nodes, has0, num1, numstore, projection)  where {T} = 
-               SparseSymmProdDAG{T}(nodes, has0, num1, numstore, projection, 
-                                    ArrayPool(FlexArrayCache))
 
 # # -------------- FIO
 
@@ -128,8 +131,7 @@ Kwargs:
 """
 function SparseSymmProdDAG(spec::AbstractVector; 
                            filter = _->true, 
-                           verbose = false, 
-                           T = Float64)
+                           verbose = false)
    @assert issorted(length.(spec))
    @assert all(issorted, spec)
    # we need to separate them into 0-corr, 1-corr and N-corr
@@ -179,7 +181,7 @@ function SparseSymmProdDAG(spec::AbstractVector;
    # re-organise the dag layout to minimise numstore
    # nodesfinal, num1, numstore = _reorder_dag!(nodes)
 
-   return SparseSymmProdDAG{T}(nodes, has0, num1, numstore, projection)
+   return SparseSymmProdDAG(nodes, has0, num1, numstore, projection)
 end
 
 
@@ -231,7 +233,7 @@ end
 
 
 # ------------------------------------------------------------------
-# reconstruct the specification without the tree ... 
+# reconstruct the specification from the DAG ... 
 
 
 function reconstruct_spec(dag::SparseSymmProdDAG)
diff --git a/src/lux.jl b/src/lux.jl
index 59e48f0..07a7609 100644
--- a/src/lux.jl
+++ b/src/lux.jl
@@ -3,6 +3,8 @@ import LuxCore
 import LuxCore: initialparameters, initialstates, AbstractExplicitLayer
 using Random: AbstractRNG
 
+using ChainRulesCore
+
 """
 lux(basis) : convert a basis / embedding object into a lux layer. This assumes 
 that the basis accepts a number or short vector as input and produces an output 
@@ -34,6 +36,8 @@ _init_default_luxstate() = ( tmp = ArrayPool(FlexArray),
 # ---------- PolyLuxLayer
 # the simplest lux layer implementation 
 
+
+
 struct PolyLuxLayer{TB} <: AbstractExplicitLayer
    basis::TB
    meta::Dict{String, Any}
@@ -52,6 +56,11 @@ initialstates(rng::AbstractRNG, l::PolyLuxLayer) = _init_luxstate(rng, l.basis)
 
 (l::PolyLuxLayer)(args...) = evaluate(l, args...)
 
-evaluate(l::PolyLuxLayer, X, ps, st) = evaluate(l.basis, X), st
+function evaluate(l::PolyLuxLayer, X, ps, st) 
+   B = ChainRulesCore.ignore_derivatives() do 
+      evaluate(l.basis, X)
+   end
+   return B, st 
+end 
 
 
diff --git a/test/ace/test_prodbasis1.jl b/test/ace/test_prodbasis1.jl
index 0e2f518..9c944a6 100644
--- a/test/ace/test_prodbasis1.jl
+++ b/test/ace/test_prodbasis1.jl
@@ -18,19 +18,22 @@ A = randn(ComplexF64, 2*M+1)
 basis1 = SimpleProdBasis(spec)
 AA1 = basis1(A)
 
-basis2 = SparseSymmProd(spec; T = ComplexF64)
+basis2 = SparseSymmProd(spec)
 AA2 = basis2(A)
 
+@info("check against simple implementation")
+println_slim(@test AA1 ≈ AA2)
+
+@info("reconstruct spec")
 spec_ = P4ML.reconstruct_spec(basis2)
 println_slim(@test spec_ == spec)
-println_slim(@test AA1 ≈ AA2)
 
 ##
 
 @info("Test with a constant")
 spec_c = [ [Int[],]; spec]
 basis1_c = SimpleProdBasis(spec_c)
-basis2_c = SparseSymmProd(spec_c; T = ComplexF64)
+basis2_c = SparseSymmProd(spec_c)
 
 spec_c_ = P4ML.reconstruct_spec(basis2_c)
 println_slim(@test spec_c_ == spec_c)
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 297cc03..f5d008a 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -1,10 +1,19 @@
 using Test
 using Polynomials4ML.Testing: println_slim, print_tf
-using Polynomials4ML: SparseProduct, evaluate, test_evaluate
+using Polynomials4ML: SparseProduct, evaluate
 using LinearAlgebra: norm
 using Polynomials4ML
 using ACEbase.Testing: fdtest
 
+test_evaluate(basis::SparseProduct, BB::Tuple{Vararg{<: AbstractVector}}) = 
+       [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
+            for i = 1:length(basis) ]
+
+# test_evaluate(basis::SparseProduct, BB::Tuple{Vararg{<: AbstractMatrix}}) = 
+#         [ test_evaluate(basis, ntuple(i -> BB[i][j, :], length(BB)))
+#          for j = 1:size(BB[1], 1) )            
+
+
 ##
 
 N1 = 10

From e4148d7ef4732a08a598d592d4c269359d479940 Mon Sep 17 00:00:00 2001
From: cortner <christohortner@gmail.com>
Date: Sun, 21 May 2023 22:55:32 -0700
Subject: [PATCH 15/54] added hyper dual number experiment

---
 temp/hypers.jl | 258 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 258 insertions(+)
 create mode 100644 temp/hypers.jl

diff --git a/temp/hypers.jl b/temp/hypers.jl
new file mode 100644
index 0000000..5b06c58
--- /dev/null
+++ b/temp/hypers.jl
@@ -0,0 +1,258 @@
+
+#
+# this script explores HyperDualNumbers to implement the 
+# laplacian operator 
+#
+
+using Polynomials4ML, HyperDualNumbers, Lux, LuxCore, Random
+P4ML = Polynomials4ML
+rng = Random.default_rng()
+
+##
+
+Pn = legendre_basis(10)
+l_Pn = P4ML.lux(Pn)
+
+l_embed = BranchLayer(;Pn = l_Pn,)
+
+bA = P4ML.PooledSparseProduct([ (n,) for n = 1:length(Pn) ])
+l_bA = P4ML.lux(bA)
+
+ch1 = Chain(embed = l_embed, A = l_bA)
+
+##
+
+xx = 2*rand(5) .- 1
+
+ps, st = Lux.setup(rng, ch1)
+o, _ = ch1(xx, ps, st)
+
+P = Pn(xx)
+A = bA( (P,) )
+
+## 
+
+hxx = [ Hyper(xx[i], i==1, i==1, 0) for i = 1:length(xx) ] 
+hA1 = bA( (Pn(hxx),) )
+
+hA2, _ = ch1(hxx, ps, st)
+hA1 == hA2
+
+##
+
+using BenchmarkTools
+
+@btime $bA( ($Pn( $xx ),) )
+@btime $ch1( $xx, $ps, $st )
+@btime $bA( ($Pn( $hxx ),) )
+@btime $ch1( $hxx, $ps, $st )
+
+
+##
+
+# add another layer to the chain - n-correlations
+
+spec = [ [ [n1,] for n1 = 1:length(Pn) ]; 
+         [ [n1, n2] for n1 = 1:length(Pn) for n2 = n1:length(Pn) ] ]
+bAA = P4ML.SparseSymmProd(spec)
+l_bAA = P4ML.lux(bAA)
+
+ch2 = Chain(embed = l_embed, A = l_bA, AA = l_bAA)
+ps, st = Lux.setup(rng, ch2)
+
+ch2(xx, ps, st)
+ch2(hxx, ps, st)
+
+
+## 
+# Most important test: make a model, take a gradient, then run the 
+# Hypers through the gradient ... 
+
+module M1
+   using LuxCore, LinearAlgebra, Random 
+   import LuxCore:  AbstractExplicitLayer, initialparameters, initialstates
+   struct DotL <: AbstractExplicitLayer
+      nin::Int
+   end
+   function (l::DotL)(x::AbstractVector{<: Number}, ps, st)
+      return dot(x, ps.W), st
+   end
+   initialparameters(rng::AbstractRNG, l::DotL) = ( W = randn(rng, l.nin), )
+   initialstates(rng::AbstractRNG, l::DotL) = NamedTuple()
+end
+
+ch3 = Chain(embed = l_embed, A = l_bA, AA = l_bAA, dot = M1.DotL(length(bAA)))
+ps, st = Lux.setup(rng, ch3)
+
+ch3(xx, ps, st)
+ch3(hxx, ps, st)
+
+
+## 
+
+using Zygote
+
+g_ch3 = xx -> Zygote.gradient(p -> ch3(xx, p, st)[1], ps)[1]
+g_ch3(xx)
+g_ch3(hxx)
+
+
+##
+
+module NTarrays
+   # using NamedTupleTools
+
+   struct NTarr{NTT}
+      nt::NTT
+   end
+
+   export array
+
+   array(nt::NamedTuple) = NTarr(nt)
+
+   # ------------------------------
+   #  0 
+
+   zero!(a::AbstractArray) = fill!(a, zero(eltype(a)))
+   zero!(a::Nothing) = nothing 
+
+   function zero!(nt::NamedTuple)
+      for k in keys(nt)
+         zero!(nt[k])
+      end
+      return nt
+   end 
+
+   Base.zero(nt::NamedTuple) = zero!(deepcopy(nt))
+
+   Base.zero(nt::NTarr) = NTarr(zero(nt.nt))
+
+   # ------------------------------
+   #  + 
+
+
+   function _add!(a1::AbstractArray, a2::AbstractArray) 
+      a1[:] .= a1[:] .+ a2[:]
+      return nothing 
+   end
+
+   _add!(at::Nothing, args...) = nothing 
+
+   function _add!(nt1::NamedTuple, nt2)
+      for k in keys(nt1)
+         _add!(nt1[k], nt2[k])
+      end
+      return nothing 
+   end
+
+   function _add(nt1::NamedTuple, nt2::NamedTuple)
+      nt = deepcopy(nt1)
+      _add!(nt, nt2)
+      return nt
+   end
+
+   Base.:+(nt1::NTarr, nt2::NTarr) = NTarr(_add(nt1.nt, nt2.nt))
+
+   # ------------------------------
+   #  * 
+
+   _mul!(::Nothing, args... ) = nothing 
+
+   function _mul!(a::AbstractArray, λ::Number)
+      a[:] .= a[:] .* λ
+      return nothing 
+   end
+
+   function _mul!(nt::NamedTuple, λ::Number)
+      for k in keys(nt)
+         _mul!(nt[k], λ)
+      end
+      return nothing 
+   end
+
+   function _mul(nt::NamedTuple, λ::Number)
+      nt = deepcopy(nt)
+      _mul!(nt, λ)
+      return nt
+   end
+
+   Base.:*(λ::Number, nt::NTarr) = NTarr(_mul(nt.nt, λ))
+   Base.:*(nt::NTarr, λ::Number) = NTarr(_mul(nt.nt, λ))
+
+   # ------------------------------
+   #   map 
+
+   _map!(f, a::AbstractArray) = map!(f, a, a) 
+
+   _map!(f, ::Nothing) = nothing 
+
+   function _map!(f, nt::NamedTuple)
+      for k in keys(nt)
+         _map!(f, nt[k])
+      end
+      return nothing 
+   end
+
+   function Base.map!(f, dest::NTarr, src::NTarr)  # dest <- f.(src), leaf-wise; returns dest
+      dest === src || (zero!(dest.nt); _add!(dest.nt, src.nt))  # copy src into dest unless aliased
+      _map!(f, dest.nt); return dest                            # then apply f in place
+   end
+
+end 
+
+using Main.NTarrays
+
+function laplacian(gfun, xx)
+   function _mapadd!(f, dest::NamedTuple, src::NamedTuple) 
+      for k in keys(dest)
+         _mapadd!(f, dest[k], src[k])
+      end
+      return nothing 
+   end
+   _mapadd!(f, dest::Nothing, src) = nothing
+   _mapadd!(f, dest::AbstractArray, src::AbstractArray) = 
+            map!((s, d) -> d + f(s), dest, src, dest)
+
+   Δ = NTarrays.zero!(gfun(xx))
+   for i = 1:length(xx) 
+      hxx = [ Hyper(xx[j], j==i, j==i, 0) for j = 1:length(xx) ]
+      _mapadd!(ε₁ε₂part, Δ, gfun(hxx))
+   end
+   return Δ
+end
+
+Δ1 = laplacian(g_ch3, xx)
+
+# test the correctness of the implementation 
+
+using LinearAlgebra
+
+function laplacian_fd(gfun, xx; h = 1e-4)
+   Nx = length(xx)
+   g0 = array(gfun(xx))
+   Δ = g0 * (-2 * Nx)
+   for i = 1:Nx
+      xxp = [ xx[j] + h * (i==j) for j = 1:Nx ]
+      xxm = [ xx[j] - h * (i==j) for j = 1:Nx ]      
+      Δ = Δ + (array(gfun(xxp)) + array(gfun(xxm)))
+   end
+   return (Δ * (1/h^2)).nt
+end
+
+for h in [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
+   Δ2 = laplacian_fd(g_ch3, xx, h = h)
+   println("h = $h, error = $(norm(Δ1.dot.W - Δ2.dot.W, Inf))")
+end
+
+# another test via ForwardDiff
+
+using ForwardDiff: hessian
+
+laplace_fwd(gfun, xx) =   # Laplacian of each gfun(x).dot.W[i] via the trace of its ForwardDiff Hessian
+         [ tr( hessian(x -> gfun(x).dot.W[i], xx) )   # use the gfun argument, not the global g_ch3
+            for i = 1:length(gfun(xx).dot.W) ]         # size from gfun itself, not the global Δ1
+
+Δ3 = laplace_fwd(g_ch3, xx)
+
+Δ1.dot.W ≈ Δ3
+

From 6749691ceb50787c7874b886c65209f4a24b526f Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Mon, 22 May 2023 08:25:05 -0700
Subject: [PATCH 16/54] Moving evaluate, ed and ed2 to general Interface with
 ObjectPools

---
 src/interface.jl           | 17 +++++++++++-
 src/sparseproduct.jl       | 55 --------------------------------------
 test/test_sparseproduct.jl | 11 +++++---
 3 files changed, 23 insertions(+), 60 deletions(-)

diff --git a/src/interface.jl b/src/interface.jl
index 5522fdb..e92366b 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -110,7 +110,7 @@ _out_size(basis::AbstractPoly4MLBasis, X::BATCH) = (length(X), length(basis))
 
 # specfically for SparseProduct
 _out_size(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVector{T}}) where {NB, T} = (length(basis), )
-_out_size(basis::AbstractPoly4MLBasis, X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = (size(X[1], 1), length(basis), )
+_out_size(basis::AbstractPoly4MLBasis, X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = (size(X[1], 1), length(basis))
 
 
 _outsym(x::SINGLE) = :out 
@@ -139,6 +139,21 @@ _alloc_ed2(basis::AbstractPoly4MLBasis, x) =
       _alloc(basis, x), _alloc_d(basis, x), _alloc_dd(basis, x)
 
 
+# special functions for SparseProduct
+function _alloc_d(basis::AbstractPoly4MLBasis, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
+      BBs_size = [size(bb) for bb in BBs]
+      return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
+end
+
+function _alloc_dd(basis::AbstractPoly4MLBasis, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
+      BBs_size = [size(bb) for bb in BBs]
+      return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
+end
+
+_alloc_ed(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x)
+_alloc_ed2(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x), _alloc_dd(basis, x)
+
+
 # OLD ARRAY BASED INTERFACE 
 
 # _alloc(basis::AbstractPoly4MLBasis, X) = 
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index ab5bc28..ce0b523 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -22,61 +22,6 @@ SparseProduct(spec) = SparseProduct(spec, _make_reqfields()...)
 _valtype(basis::SparseProduct{T1}, TX::NTuple{NB, AbstractVecOrMat{T2}}) where {T1, T2, NB} = T2
 
 # ----------------------- evaluation interfaces 
-
-
-# function evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}) 
-#    VT = mapreduce(eltype, promote_type, BB)
-#    A = zeros(VT, length(basis))
-#    evaluate!(A, basis, BB::Tuple)
-#    return A
-# end
-
-# function evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) 
-#    VT = mapreduce(eltype, promote_type, BB)
-#    nX = size(BB[1], 1)
-#    A = zeros(VT, nX, length(basis))
-#    evaluate!(A, basis, BB::Tuple)
-#    return A 
-# end
-   
-function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}) 
-   VT = mapreduce(eltype, promote_type, BB)
-   A = zeros(VT, length(basis))
-   _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
-   dA = [_similar(BB) for _ = 1:length(basis)]
-   evaluate_ed!(A, dA, basis, BB::Tuple)
-   return A, dA
-end
-
-function evaluate_ed(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) 
-   VT = mapreduce(eltype, promote_type, BB)
-   nX = size(BB[1], 1)
-   A = zeros(VT, nX, length(basis))
-   _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
-   dA = [_similar(BB) for i = 1:length(basis)] # nX * basis
-   evaluate_ed!(A, dA, basis, BB::Tuple)
-   return A, dA
-end
-
-function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}) 
-   VT = mapreduce(eltype, promote_type, BB)
-   A = zeros(VT, length(basis))
-   _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
-   dA, ddA = ([_similar(BB) for _ = 1:length(basis)], [_similar(BB) for _ = 1:length(basis)])
-   evaluate_ed2!(A, dA, ddA, basis, BB::Tuple)
-   return A, dA, ddA
-end
-
-function evaluate_ed2(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) 
-   VT = mapreduce(eltype, promote_type, BB)
-   nX = size(BB[1], 1)
-   A = zeros(VT, nX, length(basis))
-   _similar(BB::Tuple) = Tuple([similar(BB[i]) for i = 1:length(BB)])
-   dA, ddA = ([_similar(BB) for _ = 1:length(basis)], [_similar(BB) for _ = 1:length(basis)])
-   evaluate_ed2!(A, dA, ddA, basis, BB::Tuple)
-   return A, dA, ddA
-end
-
 function _frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}, ∂BB::Tuple{Vararg{AbstractVector}}) 
    VT = mapreduce(eltype, promote_type, BB)
    A = zeros(VT, length(basis))
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 24782e7..3203d1e 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -5,8 +5,10 @@ using Polynomials4ML: SparseProduct, evaluate, evaluate_ed, evaluate_ed2
 using LinearAlgebra: norm
 using Polynomials4ML
 using ACEbase.Testing: fdtest
+
 ##
-NB = 3
+NB = 3 # For _rrule_evaluate test we need NB = 3, fix later by generalizing the test case
+
 
 N = [i * 4 for i = 1:NB]
 
@@ -131,7 +133,7 @@ A2 = evaluate_ed2(basis, BB)[2]
 
 println_slim(@test AA ≈ A1 )
 Δ = maximum([norm(dA[i][j] - A2[i][j], Inf) for i = 1:length(dA) for j = 1:length(dA[i])])
-println_slim(@test Δ ≈ 0.0)
+println_slim(@test norm(Δ) <= 1e-15)
 @info("Test batch evaluation")
 
 nX = 5
@@ -201,9 +203,9 @@ bbA2 = evaluate_ed2(basis, bBB)[3]
 
 println_slim(@test A1 ≈ A2)
 Δ = maximum([norm(bA1[i][j] - bA2[i][j], Inf) for i = 1:length(bA1) for j = 1:length(bA1[i])])
-println_slim(@test Δ ≈ 0)
+println_slim(@test norm(Δ) <= 1e-15)
 Δ = maximum([norm(bbA1[i][j] - bbA2[i][j], Inf) for i = 1:length(bbA1) for j = 1:length(bbA1[i])])
-println_slim(@test Δ ≈ 0)
+println_slim(@test norm(Δ) <= 1e-15)
 
 @info("Testing _rrule_evaluate")
 using LinearAlgebra: dot 
@@ -211,6 +213,7 @@ using LinearAlgebra: dot
 N1 = 10
 N2 = 20
 N3 = 30
+
 for ntest = 1:30
     local bBB
     local bUU

From d6e899ac84c1fd41d9c96dafcc44c3abdee95ebb Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Tue, 23 May 2023 21:52:40 +0800
Subject: [PATCH 17/54] fix test_ed, ed2

---
 test/test_sparseproduct.jl | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 3203d1e..085364c 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -23,7 +23,7 @@ test_evaluate(basis::SparseProduct, BB::Tuple) =
             for i = 1:length(basis) ]
 
 function test_evaluate_ed(basis, BB)
-    A = evaluate_ed(basis, BB)[1]
+    A = deepcopy(evaluate_ed(basis, BB)[1])
     dA = evaluate_ed(basis, BB)[2]
     errors = Float64[]
     # loop through finite-difference step-lengths
@@ -32,19 +32,17 @@ function test_evaluate_ed(basis, BB)
     @printf("---------|----------- \n")
     for p = 2:11
         h = 0.1^p
-        dAh = deepcopy(dA)
-        Δ = deepcopy(dA)
-        for n = 1:length(dAh) # basis
-            for i = 1:length(dAh[n]) #NB
-                for j = 1:length(dAh[n][i]) #BB[i]
+        Δ = []
+        for n = 1:length(dA) # basis
+            for i = 1:length(dA[n]) #NB
+                for j = 1:length(dA[n][i]) #BB[i]
                     BB[i][j] += h
-                    dAh[n][i][j] = (evaluate(basis, BB)[n] - A[n])/h
-                    Δ[n][i][j] = dA[n][i][j] - dAh[n][i][j]
+                    push!(Δ, dA[n][i][j] - (evaluate(basis, BB)[n] - A[n])/h)
                     BB[i][j] -= h
                 end
             end
         end
-        push!(errors, maximum([norm(Δ[i][j], Inf) for i = 1:length(Δ) for j = 1:length(Δ[i])] ))
+        push!(errors, norm(Δ, Inf))
         @printf(" %1.1e | %4.2e  \n", h, errors[end])
     end
     @printf("---------|----------- \n")
@@ -70,21 +68,20 @@ function test_evaluate_ed2(basis, BB)
    @printf("---------|----------- \n")
    for p = 2:11
        h = 0.1^p
-       ddAh = deepcopy(ddA)
-       Δ = deepcopy(ddA)
-       for n = 1:length(ddAh) # basis
-           for i = 1:length(ddAh[n]) #NB
-               for j = 1:length(ddAh[n][i]) #BB[i]
+       Δ = []
+       for n = 1:length(ddA) # basis
+           for i = 1:length(ddA[n]) #NB
+               for j = 1:length(ddA[n][i]) #BB[i]
                    BB[i][j] += h
-                   ddAh[n][i][j] = evaluate(basis, BB)[n] - 2 * A[n]
+                   AA = evaluate(basis, BB)[n] - 2 * A[n]
                    BB[i][j] -= 2*h
-                   ddAh[n][i][j] = (ddAh[n][i][j] + evaluate(basis, BB)[n])/h^2
+                   AA = (AA + evaluate(basis, BB)[n])/h^2
                    BB[i][j] += h 
-                   Δ[n][i][j] = ddA[n][i][j] - ddAh[n][i][j]
+                   push!(Δ, ddA[n][i][j] - AA)
                end
            end
        end
-       push!(errors, maximum([norm(Δ[i][j], Inf) for i = 1:length(Δ) for j = 1:length(Δ[i])] ))
+       push!(errors, norm(Δ, Inf))
        @printf(" %1.1e | %4.2e  \n", h, errors[end])
    end
    @printf("---------|----------- \n")
@@ -100,7 +97,6 @@ function test_evaluate_ed2(basis, BB)
   end
 end
 
-
 @info("Test serial evaluation")
 
 BB = Tuple(B)
@@ -113,7 +109,7 @@ println_slim(@test A1 ≈ A2 )
 @info("Test serial evaluation_ed")
 BB = Tuple(B)
 
-A = test_evaluate_ed(basis, BB)
+test_evaluate_ed(basis, BB)
 
 AA = evaluate(basis, BB)
 A1 = evaluate_ed(basis, BB)[1]
@@ -124,7 +120,7 @@ println_slim(@test AA ≈ A1 )
 @info("Test serial evaluation_ed2")
 BB = Tuple(B)
 
-A = test_evaluate_ed2(basis, BB)
+test_evaluate_ed2(basis, BB)
 
 AA = evaluate(basis, BB)
 dA = evaluate_ed(basis, BB)[2]

From c10db23e2bbe34bbb30f7507d5284b5a90c0f838 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Tue, 23 May 2023 07:22:31 -0700
Subject: [PATCH 18/54] use rrule with ChainRulesCore

---
 Project.toml               |  8 ++++----
 src/sparseproduct.jl       | 16 ++++++++++++++--
 test/test_sparseproduct.jl | 12 +++++++++---
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/Project.toml b/Project.toml
index 6960570..efeb3cc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -21,13 +21,14 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StrideArrays = "d1fa6d79-ef01-42a6-86c9-f7c551f8593b"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
 ACEbase = "0.4.2"
 Adapt = "3.5"
+BenchmarkTools = "1"
 ChainRulesCore = "1"
 Combinatorics = "1"
-BenchmarkTools = "1"
 ForwardDiff = "0.10"
 LoopVectorization = "0.12"
 LuxCore = "0.1.3"
@@ -35,14 +36,13 @@ ObjectPools = "0.2.1"
 QuadGK = "2"
 SpecialFunctions = "2.2"
 StaticArrays = "1.5"
-julia = "1.8"
 StrideArrays = "0.1.25"
-
+julia = "1.8"
 
 [extras]
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 test = ["Test", "Lux", "Printf"]
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index ce0b523..9716647 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -1,3 +1,6 @@
+using ChainRulesCore
+using ChainRulesCore: NoTangent
+
 struct SparseProduct{NB} <: AbstractPoly4MLBasis
    spec::Vector{NTuple{NB, Int}}
    # ---- temporaries & caches
@@ -280,12 +283,21 @@ function _frule_frule_evaluate!(A, dA, ddA, basis::SparseProduct{NB}, BB::Tuple{
 end
 # -------------------- reverse mode gradient
 
-function _rrule_evaluate(basis::SparseProduct{NB}, BB::Tuple) where {NB}
+function ChainRulesCore.rrule(::typeof(evaluate), basis::SparseProduct{NB}, BB::Tuple) where {NB}
    A = evaluate(basis, BB)
-   return A, ∂A -> _pullback_evaluate(∂A, basis, BB)
+   function pb(∂A)
+      return NoTangent(), NoTangent(), _pullback_evaluate(∂A, basis, BB)
+   end
+   return A, pb
 end
 
 
+# function _rrule_evaluate(basis::SparseProduct{NB}, BB::Tuple) where {NB}
+#    A = evaluate(basis, BB)
+#    return A, ∂A -> _pullback_evaluate(∂A, basis, BB)
+# end
+
+
 function _pullback_evaluate(∂A, basis::SparseProduct{NB}, BB::Tuple) where {NB}
    TA = promote_type(eltype.(BB)...)
    ∂BB = ntuple(i -> zeros(TA, size(BB[i])...), NB)
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 3203d1e..f1a54fa 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -5,6 +5,7 @@ using Polynomials4ML: SparseProduct, evaluate, evaluate_ed, evaluate_ed2
 using LinearAlgebra: norm
 using Polynomials4ML
 using ACEbase.Testing: fdtest
+using Zygote
 
 ##
 NB = 3 # For _rrule_evaluate test we need NB = 3, fix later by generalizing the test case
@@ -225,12 +226,17 @@ for ntest = 1:30
     u = randn(size(bA2))
     F(t) = dot(u, Polynomials4ML.evaluate(basis, _BB(t)))
     dF(t) = begin
-        val, pb = Polynomials4ML._rrule_evaluate(basis, _BB(t))
-        ∂BB = pb(u)
+        val, pb = Zygote.pullback(evaluate, basis, _BB(t))
+        ∂BB = pb(u)[2] # pb(u)[1] returns NoTangent() for basis argument
         return sum( dot(∂BB[i], bUU[i]) for i = 1:length(bUU) )
     end
     print_tf(@test fdtest(F, dF, 0.0; verbose=false))
 end
 println()
 
-##
\ No newline at end of file
+##
+
+# try with rrule
+u, pb = Zygote.pullback(evaluate, basis, bBB)
+# u1, pb1 = Polynomials4ML._rrule_evaluate(basis, bBB)
+

From cee889f10603cfcfb097cca48fd4659c38bab246 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Tue, 23 May 2023 11:01:56 -0700
Subject: [PATCH 19/54] batched pullback sparsesymmprod connection with
 ChainRulesCore

---
 src/ace/sparsesymmprod.jl | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/ace/sparsesymmprod.jl b/src/ace/sparsesymmprod.jl
index d5f6af0..6127c83 100644
--- a/src/ace/sparsesymmprod.jl
+++ b/src/ace/sparsesymmprod.jl
@@ -68,7 +68,7 @@ end
 
 # -------------- Chainrules integration 
 
-function _pullback(Δ, basis::SparseSymmProd, A, AA, AAdag)
+function _pullback(Δ, basis::SparseSymmProd, A::AbstractVector, AA, AAdag)
    Δdag = zeros(eltype(Δ), length(AAdag))
    Δdag[basis.proj] .= Δ
    T = promote_type(eltype(Δdag), eltype(AAdag))
@@ -77,6 +77,17 @@ function _pullback(Δ, basis::SparseSymmProd, A, AA, AAdag)
    return ΔA
 end
 
+
+function _pullback(Δ, basis::SparseSymmProd, A::AbstractMatrix, AA, AAdag)
+   Δdag = zeros(eltype(Δ), size(AAdag)...)
+   Δdag[:, basis.proj] .= Δ
+   T = promote_type(eltype(Δdag), eltype(AAdag))
+   ΔA = zeros(T, size(A)...)
+   pullback_arg!(ΔA, Δdag, basis.dag, AAdag)
+   return ΔA
+end
+
+
 function rrule(::typeof(evaluate), basis::SparseSymmProd, A::AbstractVector)
    AAdag = evaluate(basis.dag, A)
    AA = AAdag[basis.proj]
@@ -92,6 +103,12 @@ function rrule(::typeof(evaluate), basis::SparseSymmProd, A::AbstractVector)
    return AA, Δ -> (NoTangent(), NoTangent(), _pullback(Δ, basis, A, AA, AAdag))
 end
 
+function rrule(::typeof(evaluate), basis::SparseSymmProd, A::AbstractMatrix)
+   AAdag = evaluate(basis.dag, A)
+   AA = AAdag[:, basis.proj]
+   return AA, Δ -> (NoTangent(), NoTangent(), _pullback(Δ, basis, A, AA, AAdag))
+end
+
 # -------------- Lux integration 
 
 struct SparseSymmProdLayer{T} <: AbstractExplicitLayer

From e1c91ca11593609fc0ffd37b3b7f6e91a85ef7bc Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Wed, 24 May 2023 03:29:58 +0800
Subject: [PATCH 20/54] add Placeholder for atobradials and rylm

---
 .../atomicorbitalsradials.jl                    | 17 +++++++++++++++++
 src/sphericalharmonics/rylm.jl                  |  9 +++++++++
 2 files changed, 26 insertions(+)

diff --git a/src/atomicorbitalsradials/atomicorbitalsradials.jl b/src/atomicorbitalsradials/atomicorbitalsradials.jl
index 983be48..6c4edb0 100644
--- a/src/atomicorbitalsradials/atomicorbitalsradials.jl
+++ b/src/atomicorbitalsradials/atomicorbitalsradials.jl
@@ -1,4 +1,6 @@
 export AtomicOrbitalsRadials, GaussianBasis, SlaterBasis, STO_NG
+using ChainRulesCore
+using ChainRulesCore: NoTangent
 
 const NLM{T} = NamedTuple{(:n1, :n2, :l, :m), Tuple{T, T, T, T}}
 const NL{T} = NamedTuple{(:n1, :n2, :l), Tuple{T, T, T}}
@@ -91,6 +93,21 @@ function evaluate_ed2!(Rnl, dRnl, ddRnl, basis::AtomicOrbitalsRadials, R)
     return Rnl, dRnl, ddRnl
 end
 
+# NOTE: not tested yet
+function ChainRulesCore.rrule(::typeof(evaluate), basis::AtomicOrbitalsRadials, R::AbstractVector{<: Real})
+   A  = evaluate(basis, R)
+   ∂R = similar(R)
+   dR = evaluate_ed(basis, R)[2]
+   function pb(∂A)
+        @assert size(∂A) == (length(R), length(basis))
+        for i = 1:length(R)
+            ∂R[i] = dot(∂A[i,:], dR[i,:])
+        end
+        return NoTangent(), NoTangent(), ∂R
+   end
+   return A, pb
+end
+
 include("gaussian.jl")
 include("slater.jl")
 include("sto_ng.jl")
diff --git a/src/sphericalharmonics/rylm.jl b/src/sphericalharmonics/rylm.jl
index 681c57e..27bee47 100644
--- a/src/sphericalharmonics/rylm.jl
+++ b/src/sphericalharmonics/rylm.jl
@@ -333,3 +333,12 @@ function eval_grad_laplace(basis::RYlmBasis, X)
 	ΔY = _lap(basis, Y)
 	return Y, dY, ΔY
 end
+
+# Placeholder for now
+function ChainRulesCore.rrule(::typeof(evaluate), basis::RYlmBasis, X)
+	A  = evaluate(basis, X)
+	function pb(∂A)
+		return NoTangent(), NoTangent(), X
+	end
+	return A, pb
+end
\ No newline at end of file

From 2af961dee0881221c6a7d0284b96d198d66461dd Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Tue, 23 May 2023 18:20:24 -0700
Subject: [PATCH 21/54] fix atomicorbitalradials pullback and add test

---
 .../atomicorbitalsradials.jl                  |  2 +-
 src/sphericalharmonics/rylm.jl                |  2 ++
 test/test_atorbrad.jl                         | 30 ++++++++++++++++++-
 test/test_sparseproduct.jl                    |  4 +++
 4 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/src/atomicorbitalsradials/atomicorbitalsradials.jl b/src/atomicorbitalsradials/atomicorbitalsradials.jl
index 6c4edb0..127f7dd 100644
--- a/src/atomicorbitalsradials/atomicorbitalsradials.jl
+++ b/src/atomicorbitalsradials/atomicorbitalsradials.jl
@@ -101,7 +101,7 @@ function ChainRulesCore.rrule(::typeof(evaluate), basis::AtomicOrbitalsRadials,
    function pb(∂A)
         @assert size(∂A) == (length(R), length(basis))
         for i = 1:length(R)
-            ∂R[i] = dot(∂A[i,:], dR[i,:])
+            ∂R[i] = dot(@view(∂A[i, :]), @view(dR[i, :]))
         end
         return NoTangent(), NoTangent(), ∂R
    end
diff --git a/src/sphericalharmonics/rylm.jl b/src/sphericalharmonics/rylm.jl
index 27bee47..f91f351 100644
--- a/src/sphericalharmonics/rylm.jl
+++ b/src/sphericalharmonics/rylm.jl
@@ -1,3 +1,5 @@
+using ChainRulesCore
+
 export RYlmBasis 
 
 """
diff --git a/test/test_atorbrad.jl b/test/test_atorbrad.jl
index f96a0a6..ff7076c 100644
--- a/test/test_atorbrad.jl
+++ b/test/test_atorbrad.jl
@@ -3,6 +3,9 @@ using Polynomials4ML, Polynomials4ML.Testing
 using Polynomials4ML: evaluate, evaluate_d, evaluate_ed 
 using Polynomials4ML.Testing: print_tf, println_slim 
 using ForwardDiff
+using ChainRulesTestUtils
+using ACEbase.Testing: fdtest
+using Zygote
 
 ##
 
@@ -77,4 +80,29 @@ fddRnl = vcat([ ForwardDiff.derivative(r -> evaluate_ed(bRnl, [r,])[2], r)
 
 println_slim(@test  Rnl ≈ Rnl1 ≈ Rnl2  )
 println_slim(@test  dRnl1 ≈ dRnl2 ≈ fdRnl )
-println_slim(@test  ddRnl2 ≈ fddRnl )
\ No newline at end of file
+println_slim(@test  ddRnl2 ≈ fddRnl )
+
+
+@info("Test rrule")
+using LinearAlgebra: dot 
+
+for ntest = 1:30
+    local rr
+    local uu
+    local Rnl
+    local u
+    
+    rr = 2 .* randn(10) .- 1
+    uu = 2 .* randn(10) .- 1
+    _rr(t) = rr + t * uu
+    Rnl = evaluate(bRnl, rr)
+    u = randn(size(Rnl))
+    F(t) = dot(u, evaluate(bRnl, _rr(t)))
+    dF(t) = begin
+        val, pb = Zygote.pullback(evaluate, bRnl, _rr(t))
+        ∂BB = pb(u)[2] # pb(u)[1] returns NoTangent() for basis argument
+        return sum( dot(∂BB[i], uu[i]) for i = 1:length(uu) )
+    end
+    print_tf(@test fdtest(F, dF, 0.0; verbose = false))
+end
+println()
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 8a24164..bf5e22f 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -236,3 +236,7 @@ println()
 u, pb = Zygote.pullback(evaluate, basis, bBB)
 # u1, pb1 = Polynomials4ML._rrule_evaluate(basis, bBB)
 
+
+# TODO: look into why this is failing
+# using ChainRulesTestUtils
+# test_rrule(evaluate, basis, bBB)

From cf21eb7761f411db98846f4a584159f1c3f914a2 Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Wed, 24 May 2023 09:51:39 +0800
Subject: [PATCH 22/54] change rrule of rylm, test fail

---
 src/sphericalharmonics/rylm.jl       |  8 +++++++-
 test/sphericalharmonics/test_rylm.jl | 26 +++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/sphericalharmonics/rylm.jl b/src/sphericalharmonics/rylm.jl
index f91f351..b3097f3 100644
--- a/src/sphericalharmonics/rylm.jl
+++ b/src/sphericalharmonics/rylm.jl
@@ -339,8 +339,14 @@ end
 # Placeholder for now
 function ChainRulesCore.rrule(::typeof(evaluate), basis::RYlmBasis, X)
 	A  = evaluate(basis, X)
+	∂X = similar(X)
+   	dX = evaluate_ed(basis, X)[2]
 	function pb(∂A)
-		return NoTangent(), NoTangent(), X
+		@assert size(∂A) == (length(X), length(basis))
+		for i = 1:length(X)
+            ∂X[i] = sum([∂A[i,j] * X[i,j] for j = 1:length(X[i,:])])
+        end
+		return NoTangent(), NoTangent(), ∂X
 	end
 	return A, pb
 end
\ No newline at end of file
diff --git a/test/sphericalharmonics/test_rylm.jl b/test/sphericalharmonics/test_rylm.jl
index 1cff365..ef6f423 100644
--- a/test/sphericalharmonics/test_rylm.jl
+++ b/test/sphericalharmonics/test_rylm.jl
@@ -6,7 +6,7 @@ using Polynomials4ML: SphericalCoords, index_y,
                       dspher_to_dcart, cart2spher, spher2cart, rand_sphere
 using Polynomials4ML: evaluate, evaluate_d, evaluate_ed 
 using Polynomials4ML.Testing: print_tf, println_slim 
-
+using ACEbase.Testing: fdtest
 verbose = false
 
 ##
@@ -146,3 +146,27 @@ println_slim(@test Y1 ≈ Y2)
 println_slim(@test dY1 ≈ dY2)
 println_slim(@test ΔY1 ≈ ΔY2)
 
+using Zygote
+@info("Test rrule")
+using LinearAlgebra: dot 
+rSH = RYlmBasis(10)
+for ntest = 1:30
+    local X
+    local Y
+    local Rnl
+    local u
+    
+    X = [ rand_sphere() for i = 1:21 ]
+    Y = X = [ rand_sphere() for i = 1:21 ]
+    _x(t) = X + t * Y
+    A = evaluate(rSH, X)
+    u = randn(size(A))
+    F(t) = dot(u, evaluate(rSH, _x(t)))
+    dF(t) = begin
+        val, pb = Zygote.pullback(rSH, _x(t))
+        ∂BB = pb1(u)[1] # pb(u)[1] returns NoTangent() for basis argument
+        return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
+    end
+    print_tf(@test fdtest(F, dF, 0.0; verbose = false))
+end
+println()

From 567fdde436a672f346ba0ea1b3791a1095a30ee3 Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Wed, 24 May 2023 09:55:06 +0800
Subject: [PATCH 23/54] fix rrule of rylm

---
 src/sphericalharmonics/rylm.jl       | 2 +-
 test/sphericalharmonics/test_rylm.jl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sphericalharmonics/rylm.jl b/src/sphericalharmonics/rylm.jl
index b3097f3..63c4ead 100644
--- a/src/sphericalharmonics/rylm.jl
+++ b/src/sphericalharmonics/rylm.jl
@@ -344,7 +344,7 @@ function ChainRulesCore.rrule(::typeof(evaluate), basis::RYlmBasis, X)
 	function pb(∂A)
 		@assert size(∂A) == (length(X), length(basis))
 		for i = 1:length(X)
-            ∂X[i] = sum([∂A[i,j] * X[i,j] for j = 1:length(X[i,:])])
+            ∂X[i] = sum([∂A[i,j] * dX[i,j] for j = 1:length(dX[i,:])])
         end
 		return NoTangent(), NoTangent(), ∂X
 	end
diff --git a/test/sphericalharmonics/test_rylm.jl b/test/sphericalharmonics/test_rylm.jl
index ef6f423..279cf26 100644
--- a/test/sphericalharmonics/test_rylm.jl
+++ b/test/sphericalharmonics/test_rylm.jl
@@ -164,7 +164,7 @@ for ntest = 1:30
     F(t) = dot(u, evaluate(rSH, _x(t)))
     dF(t) = begin
         val, pb = Zygote.pullback(rSH, _x(t))
-        ∂BB = pb1(u)[1] # pb(u)[1] returns NoTangent() for basis argument
+        ∂BB = pb(u)[1] # single-argument pullback: pb(u)[1] is the cotangent w.r.t. the input X
         return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
     end
     print_tf(@test fdtest(F, dF, 0.0; verbose = false))

From c4cee08753d8e541340d32ae5c4fe9bacbab8de6 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Tue, 23 May 2023 20:04:47 -0700
Subject: [PATCH 24/54] fix productbasis to be Zygote-friendly

---
 src/productbasis.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/productbasis.jl b/src/productbasis.jl
index e69109e..37f2725 100644
--- a/src/productbasis.jl
+++ b/src/productbasis.jl
@@ -17,13 +17,12 @@ function evaluate(basis::ProductBasis, X::AbstractVector{<: AbstractVector})
    T = promote_type(eltype(X[1]))
    
    # create all the shifted configurations 
-   xx = zeros(eltype(T), Nel)
-   for i = 1:Nel
-      xx[i] = norm(X[i])
-   end
-
+   # for i = 1:Nel
+   #    xx[i] = norm(X[i])
+   # end
+   
    # evaluate the radial and angular components on all the shifted particles 
-   Rnl = reshape(evaluate(basis.bRnl, xx[:]), (Nel, length(basis.bRnl)))
+   Rnl = reshape(evaluate(basis.bRnl, (norm.(X))[:]), (Nel, length(basis.bRnl)))
    Ylm = reshape(evaluate(basis.bYlm, X[:]), (Nel, length(basis.bYlm)))
 
    # evaluate all the atomic orbitals as ϕ_nlm = Rnl * Ylm 
@@ -32,3 +31,4 @@ function evaluate(basis::ProductBasis, X::AbstractVector{<: AbstractVector})
    return ϕnlm
 end
 
+

From 68d3e3489e0851c553c329fce9a519d5b494300f Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Wed, 24 May 2023 10:09:50 -0700
Subject: [PATCH 25/54] make sure the merge is done nicely, all test passed

---
 src/ace/sparseprodpool.jl       | 26 +++++++++++++++-----------
 src/ace/sparsesymmprod.jl       |  2 --
 src/interface.jl                | 32 ++++++++++++++++----------------
 src/sparseproduct.jl            | 33 +++++++++++++++++++++++++++++++--
 test/ace/test_prodpool_mult.jl  |  2 +-
 test/ace/test_sparseprodpool.jl |  1 +
 test/test_atorbrad.jl           |  1 -
 7 files changed, 64 insertions(+), 33 deletions(-)

diff --git a/src/ace/sparseprodpool.jl b/src/ace/sparseprodpool.jl
index aa993b3..c893406 100644
--- a/src/ace/sparseprodpool.jl
+++ b/src/ace/sparseprodpool.jl
@@ -34,23 +34,27 @@ const TupVec = Tuple{Vararg{<: AbstractVector}}
 const TupMat = Tuple{Vararg{<: AbstractMatrix}}
 const TupVecMat = Union{TupVec, TupMat}
 
-_valtype(basis::PooledSparseProduct, BB::Tuple) = 
+_valtype(basis::AbstractPoly4MLBasis, BB::Tuple) = 
       mapreduce(eltype, promote_type, BB)
 
-_alloc(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
-      acquire!(basis.pool, :A, (length(basis),), _valtype(basis, BB) )
+_gradtype(basis::AbstractPoly4MLBasis, BB::Tuple) = 
+      mapreduce(eltype, promote_type, BB)
+
+_alloc(basis::PooledSparseProduct, BB::TupVecMat) = 
+      acquire!(basis.pool, :A, (length(basis), ), _valtype(basis, BB) )
+
+# _alloc_d(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
+#       acquire!(basis.pool, _outsym(BB), (length(basis), ), _gradtype(basis, BB) )
 
-_alloc_d(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
-      acquire!(basis.pool, _outsym(BB), length(basis), _gradtype(basis, BB) )
+# _alloc_dd(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
+#       acquire!(basis.pool, _outsym(BB), (length(basis), ), _gradtype(basis, BB) )
 
-_alloc_dd(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
-      acquire!(basis.pool, _outsym(BB), length(basis), _gradtype(basis, BB) )
+# _alloc_ed(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
+#       _alloc(basis, BB), _alloc_d(basis, BB)
 
-_alloc_ed(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
-      _alloc(basis, BB), _alloc_d(basis, BB)
+# _alloc_ed2(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
+#       _alloc(basis, BB), _alloc_d(basis, BB), _alloc_dd(basis, BB)
 
-_alloc_ed2(basis::AbstractPoly4MLBasis, BB::TupVecMat) = 
-      _alloc(basis, BB), _alloc_d(basis, BB), _alloc_dd(basis, BB)
 
 
 # ----------------------- evaluation kernels 
diff --git a/src/ace/sparsesymmprod.jl b/src/ace/sparsesymmprod.jl
index 9b9df83..5dea90a 100644
--- a/src/ace/sparsesymmprod.jl
+++ b/src/ace/sparsesymmprod.jl
@@ -14,8 +14,6 @@ end
 
 Base.length(basis::SparseSymmProd) = length(basis.proj)
 
-(basis::SparseSymmProd)(args...) = evaluate(basis, args...)
-
 reconstruct_spec(basis::SparseSymmProd) = reconstruct_spec(basis.dag)[basis.proj]
 
 # -------------- evaluation interfaces 
diff --git a/src/interface.jl b/src/interface.jl
index e92366b..e5d3095 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -109,8 +109,8 @@ _out_size(basis::AbstractPoly4MLBasis, x::SINGLE) = (length(basis),)
 _out_size(basis::AbstractPoly4MLBasis, X::BATCH) = (length(X), length(basis))
 
 # specfically for SparseProduct
-_out_size(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVector{T}}) where {NB, T} = (length(basis), )
-_out_size(basis::AbstractPoly4MLBasis, X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = (size(X[1], 1), length(basis))
+# _out_size(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVector{T}}) where {NB, T} = (length(basis), )
+# _out_size(basis::AbstractPoly4MLBasis, X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = (size(X[1], 1), length(basis))
 
 
 _outsym(x::SINGLE) = :out 
@@ -119,8 +119,8 @@ _outsym(X::BATCH) = :outb
 # this is just for temporary use and we should think about how to do it generally...
 
 # speccially for SparseProduct
-_outsym(x::NTuple{NB, AbstractVector{T}}) where {NB, T} = :out
-_outsym(X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = :outb
+# _outsym(x::NTuple{NB, AbstractVector{T}}) where {NB, T} = :out
+# _outsym(X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = :outb
 
 
 _alloc(basis::AbstractPoly4MLBasis, X) = 
@@ -140,18 +140,18 @@ _alloc_ed2(basis::AbstractPoly4MLBasis, x) =
 
 
 # special functions for SparseProduct
-function _alloc_d(basis::AbstractPoly4MLBasis, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
-      BBs_size = [size(bb) for bb in BBs]
-      return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
-end
-
-function _alloc_dd(basis::AbstractPoly4MLBasis, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
-      BBs_size = [size(bb) for bb in BBs]
-      return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
-end
-
-_alloc_ed(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x)
-_alloc_ed2(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x), _alloc_dd(basis, x)
+# function _alloc_d(basis::AbstractPoly4MLBasis, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
+#       BBs_size = [size(bb) for bb in BBs]
+#       return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
+# end
+
+# function _alloc_dd(basis::AbstractPoly4MLBasis, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
+#       BBs_size = [size(bb) for bb in BBs]
+#       return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
+# end
+
+# _alloc_ed(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x)
+# _alloc_ed2(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x), _alloc_dd(basis, x)
 
 
 # OLD ARRAY BASED INTERFACE 
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index 9716647..5041685 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -68,6 +68,37 @@ function _frule_frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMa
    _frule_frule_evaluate!(A, dA, ddA, basis, BB::Tuple, ∂BB::Tuple, ∂∂BB::Tuple)
    return A, dA, ddA
 end
+
+# ----------------------- overiding alloc functions
+const TupVec = Tuple{Vararg{<: AbstractVector}}
+const TupMat = Tuple{Vararg{<: AbstractMatrix}}
+const TupVecMat = Union{TupVec, TupMat}
+
+# specifically for SparseProduct/PooledSparseProduct
+_outsym(x::NTuple{NB, AbstractVector{T}}) where {NB, T} = :out
+_outsym(X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = :outb
+
+_alloc(basis::SparseProduct, BB::TupVec) = 
+      acquire!(basis.pool, :out, (length(basis), ), _valtype(basis, BB) )
+
+_alloc(basis::SparseProduct, BB::TupMat) = 
+      acquire!(basis.pool, :outb, (size(BB[1], 1), length(basis) ), _valtype(basis, BB) )
+
+function _alloc_d(basis::SparseProduct, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
+      BBs_size = [size(bb) for bb in BBs]
+      return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
+end
+
+function _alloc_dd(basis::SparseProduct, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
+      BBs_size = [size(bb) for bb in BBs]
+      return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
+end
+
+_alloc_ed(basis::SparseProduct, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x)
+_alloc_ed2(basis::SparseProduct, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x), _alloc_dd(basis, x)
+
+
+
 # ----------------------- evaluation kernels 
 
 function evaluate!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}) where {NB}
@@ -92,8 +123,6 @@ function evaluate!(A, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractMatrix}
    return A
 end
 
-# Not sure whether we can everything below
-# faster by eval and diff at the same time from prod_grad
 
 function evaluate_ed!(A, dA, basis::SparseProduct{NB}, BB::Tuple{Vararg{AbstractVector}}) where {NB}
    @assert length(BB) == NB
diff --git a/test/ace/test_prodpool_mult.jl b/test/ace/test_prodpool_mult.jl
index 7daee8c..49c2108 100644
--- a/test/ace/test_prodpool_mult.jl
+++ b/test/ace/test_prodpool_mult.jl
@@ -7,7 +7,7 @@
 
 using BenchmarkTools, Test, Polynomials4ML
 using ACEbase.Testing: println_slim, print_tf
-using Polynomials4ML:  PooledSparseProduct, test_evaluate, evaluate , 
+using Polynomials4ML:  PooledSparseProduct, evaluate, 
                        evaluate!, evalpool!
 
 function evalpool_multi!(A, bA, BBB)
diff --git a/test/ace/test_sparseprodpool.jl b/test/ace/test_sparseprodpool.jl
index ca4680b..e6b3ec7 100644
--- a/test/ace/test_sparseprodpool.jl
+++ b/test/ace/test_sparseprodpool.jl
@@ -80,6 +80,7 @@ using LinearAlgebra: dot
 
 for ntest = 1:30 
    local bBB, bA2 
+   local u
    bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
    bUU = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
    _BB(t) = ( bBB[1] + t * bUU[1], bBB[2] + t * bUU[2], bBB[3] + t * bUU[3] )
diff --git a/test/test_atorbrad.jl b/test/test_atorbrad.jl
index ff7076c..126ce4b 100644
--- a/test/test_atorbrad.jl
+++ b/test/test_atorbrad.jl
@@ -3,7 +3,6 @@ using Polynomials4ML, Polynomials4ML.Testing
 using Polynomials4ML: evaluate, evaluate_d, evaluate_ed 
 using Polynomials4ML.Testing: print_tf, println_slim 
 using ForwardDiff
-using ChainRulesTestUtils
 using ACEbase.Testing: fdtest
 using Zygote
 

From c3fad70ffa33bf6b520c6fcb23830fa7269137f4 Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Thu, 25 May 2023 14:57:02 +0800
Subject: [PATCH 26/54] add natural_indices for radials

---
 src/atomicorbitalsradials/atomicorbitalsradials.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/atomicorbitalsradials/atomicorbitalsradials.jl b/src/atomicorbitalsradials/atomicorbitalsradials.jl
index 127f7dd..5219bf3 100644
--- a/src/atomicorbitalsradials/atomicorbitalsradials.jl
+++ b/src/atomicorbitalsradials/atomicorbitalsradials.jl
@@ -123,3 +123,5 @@ evaluate_ed(basis::ExponentialType, ζ::Vector, r::Number) = evaluate_ed(basis,
 evaluate_ed2(basis::ExponentialType, ζ::Number, r::Number) = evaluate_ed2(basis, [ζ,], [r,])[:]
 evaluate_ed2(basis::ExponentialType, ζ::Number, r::Vector) = evaluate_ed2(basis, [ζ,], r)
 evaluate_ed2(basis::ExponentialType, ζ::Vector, r::Number) = evaluate_ed2(basis, ζ, [r,])
+
+natural_indices(basis::AtomicOrbitalsRadials) = copy(basis.spec)
\ No newline at end of file

From e22e9a9afc7f8291e55cba9ac9b82fa93c25609a Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Thu, 25 May 2023 18:26:07 +0800
Subject: [PATCH 27/54] add degree

---
 src/atomicorbitalsradials/atomicorbitalsradials.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/atomicorbitalsradials/atomicorbitalsradials.jl b/src/atomicorbitalsradials/atomicorbitalsradials.jl
index 5219bf3..d742a12 100644
--- a/src/atomicorbitalsradials/atomicorbitalsradials.jl
+++ b/src/atomicorbitalsradials/atomicorbitalsradials.jl
@@ -124,4 +124,5 @@ evaluate_ed2(basis::ExponentialType, ζ::Number, r::Number) = evaluate_ed2(basis
 evaluate_ed2(basis::ExponentialType, ζ::Number, r::Vector) = evaluate_ed2(basis, [ζ,], r)
 evaluate_ed2(basis::ExponentialType, ζ::Vector, r::Number) = evaluate_ed2(basis, ζ, [r,])
 
-natural_indices(basis::AtomicOrbitalsRadials) = copy(basis.spec)
\ No newline at end of file
+natural_indices(basis::AtomicOrbitalsRadials) = copy(basis.spec)
+degree(basis::AtomicOrbitalsRadials, b::NamedTuple) = b.n1
\ No newline at end of file

From afadc2d66f55d86eb76735b43462f2ab03ea4381 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Mon, 29 May 2023 18:48:55 -0700
Subject: [PATCH 28/54] draft HyperDual

---
 Project.toml                                 |  2 +
 src/interface.jl                             | 12 ---
 src/productbasis.jl                          |  6 +-
 src/sparseproduct.jl                         | 21 ++++--
 src/sphericalharmonics/rylm.jl               |  6 +-
 src/sphericalharmonics/sphericalharmonics.jl |  6 +-
 test/test_acemodel.jl                        | 79 ++++++++++++++++++++
 test/test_sparseproduct.jl                   | 12 ++-
 8 files changed, 119 insertions(+), 25 deletions(-)
 create mode 100644 test/test_acemodel.jl

diff --git a/Project.toml b/Project.toml
index ca63dc9..0037541 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "0.1.3"
 
 [deps]
 ACEbase = "14bae519-eb20-449c-a949-9c58ed33163e"
+ACEcore = "44c1e890-45d1-48ea-94d6-c2ea5b573f71"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -13,6 +14,7 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
+Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
 NamedTupleTools = "d9ec5142-1e00-5aa0-9d6a-321866360f50"
 ObjectPools = "658cac36-ff0f-48ad-967c-110375d98c9d"
diff --git a/src/interface.jl b/src/interface.jl
index e5d3095..a561bd0 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -108,21 +108,9 @@ _laplacetype(basis::AbstractPoly4MLBasis, X::BATCH) =
 _out_size(basis::AbstractPoly4MLBasis, x::SINGLE) = (length(basis),)
 _out_size(basis::AbstractPoly4MLBasis, X::BATCH) = (length(X), length(basis))
 
-# specfically for SparseProduct
-# _out_size(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVector{T}}) where {NB, T} = (length(basis), )
-# _out_size(basis::AbstractPoly4MLBasis, X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = (size(X[1], 1), length(basis))
-
-
 _outsym(x::SINGLE) = :out 
 _outsym(X::BATCH) = :outb
 
-# this is just for temporary use and we should think about how to do it generally...
-
-# speccially for SparseProduct
-# _outsym(x::NTuple{NB, AbstractVector{T}}) where {NB, T} = :out
-# _outsym(X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = :outb
-
-
 _alloc(basis::AbstractPoly4MLBasis, X) = 
       acquire!(basis.pool, _outsym(X), _out_size(basis, X), _valtype(basis, X) )
 
diff --git a/src/productbasis.jl b/src/productbasis.jl
index 37f2725..53ac5c2 100644
--- a/src/productbasis.jl
+++ b/src/productbasis.jl
@@ -22,9 +22,9 @@ function evaluate(basis::ProductBasis, X::AbstractVector{<: AbstractVector})
    # end
    
    # evaluate the radial and angular components on all the shifted particles 
-   Rnl = reshape(evaluate(basis.bRnl, (norm.(X))[:]), (Nel, length(basis.bRnl)))
-   Ylm = reshape(evaluate(basis.bYlm, X[:]), (Nel, length(basis.bYlm)))
-
+   Rnl = evaluate(basis.bRnl, (norm.(X))[:])
+   Ylm = evaluate(basis.bYlm, X[:])
+   
    # evaluate all the atomic orbitals as ϕ_nlm = Rnl * Ylm 
    ϕnlm = evaluate(basis.sparsebasis, (Rnl, Ylm))
 
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index 5041685..dfdd8c8 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -75,14 +75,23 @@ const TupMat = Tuple{Vararg{<: AbstractMatrix}}
 const TupVecMat = Union{TupVec, TupMat}
 
 # specifically for SparseProduct/PooledSparseProduct
-_outsym(x::NTuple{NB, AbstractVector{T}}) where {NB, T} = :out
-_outsym(X::NTuple{NB, AbstractMatrix{T}}) where {NB, T} = :outb
+_outsym(x::NTuple{NB, TupVec}) where {NB} = :out
+_outsym(X::NTuple{NB, TupMat}) where {NB} = :outb
 
-_alloc(basis::SparseProduct, BB::TupVec) = 
-      acquire!(basis.pool, :out, (length(basis), ), _valtype(basis, BB) )
+_outsym(x::Tuple{AbstractVector, AbstractVector}) = :out
+_outsym(X::Tuple{AbstractMatrix, AbstractMatrix}) = :outb
 
-_alloc(basis::SparseProduct, BB::TupMat) = 
-      acquire!(basis.pool, :outb, (size(BB[1], 1), length(basis) ), _valtype(basis, BB) )
+# _alloc(basis::SparseProduct, BB::TupVec) = 
+#       acquire!(basis.pool, :out, (length(basis), ), _valtype(basis, BB) )
+
+# _alloc(basis::SparseProduct, BB::TupMat) = 
+#       acquire!(basis.pool, :outb, (size(BB[1], 1), length(basis) ), _valtype(basis, BB) )
+
+_out_size(basis::SparseProduct, BB::TupVec) = (length(basis), )
+_out_size(basis::SparseProduct, BB::TupMat) = (size(BB[1],1), length(basis))
+
+_out_size(basis::SparseProduct, BB::Tuple{AbstractVector, AbstractVector}) = (length(basis), )
+_out_size(basis::SparseProduct, BB::Tuple{AbstractMatrix, AbstractMatrix}) = (size(BB[1],1), length(basis))
 
 function _alloc_d(basis::SparseProduct, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
       BBs_size = [size(bb) for bb in BBs]
diff --git a/src/sphericalharmonics/rylm.jl b/src/sphericalharmonics/rylm.jl
index 63c4ead..feb19cd 100644
--- a/src/sphericalharmonics/rylm.jl
+++ b/src/sphericalharmonics/rylm.jl
@@ -1,4 +1,5 @@
 using ChainRulesCore
+using HyperDualNumbers: Hyper
 
 export RYlmBasis 
 
@@ -26,6 +27,9 @@ RYlmBasis(alp::ALPolynomials{T}) where {T} =
 _valtype(sh::RYlmBasis{T}, ::Type{<: StaticVector{3, S}}) where {T <: Real, S <: Real} = 
 		promote_type(T, S)
 
+_valtype(sh::RYlmBasis{T}, ::Type{<: StaticVector{3, Hyper{S}}}) where {T <: Real, S <: Real} = 
+		promote_type(T, S)
+
 Base.show(io::IO, basis::RYlmBasis) = 
       print(io, "RYlmBasis(L=$(maxL(basis)))")		
 
@@ -33,7 +37,7 @@ Base.show(io::IO, basis::RYlmBasis) =
 
 function evaluate!(Y::AbstractArray, basis::RYlmBasis, X)
 	L = maxL(basis)
-   S = cart2spher(basis, X)
+    S = cart2spher(basis, X)
 	_P = _acqu_P!(basis, S)
 	P = evaluate!(_P, basis.alp, S)
 	rYlm!(Y, maxL(basis), S, parent(P), basis)
diff --git a/src/sphericalharmonics/sphericalharmonics.jl b/src/sphericalharmonics/sphericalharmonics.jl
index e4a58d8..4fe4086 100644
--- a/src/sphericalharmonics/sphericalharmonics.jl
+++ b/src/sphericalharmonics/sphericalharmonics.jl
@@ -1,7 +1,6 @@
 
-
+using HyperDualNumbers: Hyper
 using StaticArrays, LinearAlgebra, LoopVectorization
-
 export CYlmBasis, RYlmBasis, CRlmBasis, RRlmBasis 
 
 
@@ -13,6 +12,9 @@ export CYlmBasis, RYlmBasis, CRlmBasis, RRlmBasis
 
 spher2cart(S::SphericalCoords) = S.r * SVector(S.cosφ*S.sinθ, S.sinφ*S.sinθ, S.cosθ)
 
+## ---------- HyperDualNumbers utils ---------
+Base.atan(y::Hyper{T}, x::Hyper{T}) where {T} = atan(y / x)
+
 function cart2spher(R::AbstractVector) # ; SH = true)
 	@assert length(R) == 3
 	r = norm(R)
diff --git a/test/test_acemodel.jl b/test/test_acemodel.jl
new file mode 100644
index 0000000..94037c7
--- /dev/null
+++ b/test/test_acemodel.jl
@@ -0,0 +1,79 @@
+using LinearAlgebra, StaticArrays, Test, Printf
+using Polynomials4ML
+using Polynomials4ML: evaluate, evaluate_d, evaluate_ed, legendre_basis, RYlmBasis, rand_sphere
+using Polynomials4ML.Utils: gensparse
+using Polynomials4ML.Testing: print_tf, println_slim 
+using ForwardDiff
+using ACEbase.Testing: fdtest
+using Zygote
+using Lux
+using Lux
+using Random
+
+
+P4ML = Polynomials4ML
+rng = Random.default_rng()
+
+# simple Dot product layer with weight for testing
+module M1
+   using LuxCore, LinearAlgebra, Random 
+   import LuxCore:  AbstractExplicitLayer, initialparameters, initialstates
+   struct DotL <: AbstractExplicitLayer
+      nin::Int
+   end
+   function (l::DotL)(x::AbstractVector{<: Number}, ps, st)
+      return dot(x, ps.W), st
+   end
+   initialparameters(rng::AbstractRNG, l::DotL) = ( W = randn(rng, l.nin), )
+   initialstates(rng::AbstractRNG, l::DotL) = NamedTuple()
+end
+
+## 
+totdeg = 8
+maxL = 3
+
+# Radial embedding and spherical harmonics
+Rn = legendre_basis(totdeg)
+Ylm = RYlmBasis(maxL)
+ν = 2
+
+# Pooling and SparseProduct + n-corr 
+spec1p = [(i, y) for i = 1:totdeg for y = 1:maxL]
+bA = P4ML.PooledSparseProduct(spec1p)
+
+# define n-corr spec
+tup2b = vv -> [ spec1p[v] for v in vv[vv .> 0]  ]
+admissible = bb -> ((length(bb) == 0) || (sum(b[1] - 1 for b in bb ) < totdeg)) # cannot use <= since we cannot approximate the poly basis corresponding to (2, 15) with (15)
+filter = bb -> (length(bb) == 0 || sum(idx2lm(b[2])[1] for b in bb) <= maxL)
+specAA = gensparse(; NU = ν, tup2b = tup2b, admissible = admissible, filter = filter, minvv = fill(0, ν), maxvv = fill(length(spec1p), ν), ordered = true)
+spec = [ vv[vv .> 0] for vv in specAA if !(isempty(vv[vv .> 0]))]
+
+# correlation layer
+bAA = P4ML.SparseSymmProd(spec)
+
+
+# wrapping into lux layers
+l_Rn = P4ML.lux(Rn)
+l_Ylm = P4ML.lux(Ylm)
+l_bA = P4ML.lux(bA)
+l_bAA = P4ML.lux(bAA)
+
+# forming the model with a Lux Chain
+_norm(x) = norm.(x)
+
+l_xnx = Lux.Parallel(nothing; normx = WrappedFunction(_norm), x = WrappedFunction(identity))
+l_embed = Lux.Parallel(nothing; Rn = l_Rn, Ylm = l_Ylm)
+
+
+simpleacemodel = Chain(xnx = l_xnx, embed = l_embed, A = l_bA, AA = l_bAA, out = M1.DotL(length(bAA)))
+ps, st = Lux.setup(rng, simpleacemodel)
+
+bX = [ rand_sphere() for _ = 1:32 ] 
+simpleacemodel(bX, ps, st)
+
+F(X) = simpleacemodel(X, ps, st)[1]
+(l, st_), pb = pullback(x -> Lux.apply(simpleacemodel, x, ps, st), bX)
+gs = pb((l, nothing))[1]
+
+fdtest(F, dF, bX, verbose = true)
+
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 6f2b492..070918e 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -6,6 +6,7 @@ using LinearAlgebra: norm
 using Polynomials4ML
 using ACEbase.Testing: fdtest
 using Zygote
+using HyperDualNumbers: Hyper
 
 test_evaluate(basis::SparseProduct, BB::Tuple{Vararg{<: AbstractVector}}) = 
        [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
@@ -24,6 +25,9 @@ N = [i * 4 for i = 1:NB]
 
 B = [randn(N[i]) for i = 1:NB]
 
+hB = [Hyper.(bb, 1.0, 1.0, 0) for bb in B]
+
+
 spec = sort([ Tuple([rand(1:N[i]) for i = 1:NB]) for _ = 1:6])
 
 basis = SparseProduct(spec)
@@ -110,14 +114,18 @@ end
 @info("Test serial evaluation")
 
 BB = Tuple(B)
+hBB = Tuple(hB)
 
 A1 = test_evaluate(basis, BB)
 A2 = evaluate(basis, BB)
+hA2 = evaluate(basis, hBB)
+hA2_val = [x.value for x in hA2]
 
 println_slim(@test A1 ≈ A2 )
+println_slim(@test A2 ≈ hA2_val )
+
 
 @info("Test serial evaluation_ed")
-BB = Tuple(B)
 
 test_evaluate_ed(basis, BB)
 
@@ -243,6 +251,8 @@ println()
 
 # try with rrule
 u, pb = Zygote.pullback(evaluate, basis, bBB)
+ll = pb(u)
+
 # u1, pb1 = Polynomials4ML._rrule_evaluate(basis, bBB)
 
 

From 448a4b8b8aec285eec51f398d9e4b3fcbe70f4f1 Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Tue, 30 May 2023 15:17:27 +0800
Subject: [PATCH 29/54] add rrule for orth and delete ignore_derivatives

---
 src/lux.jl            |  4 +---
 src/orthopolybasis.jl | 15 ++++++++-------
 test/test_acemodel.jl |  2 +-
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/lux.jl b/src/lux.jl
index 07a7609..6cc5073 100644
--- a/src/lux.jl
+++ b/src/lux.jl
@@ -57,9 +57,7 @@ initialstates(rng::AbstractRNG, l::PolyLuxLayer) = _init_luxstate(rng, l.basis)
 (l::PolyLuxLayer)(args...) = evaluate(l, args...)
 
 function evaluate(l::PolyLuxLayer, X, ps, st) 
-   B = ChainRulesCore.ignore_derivatives() do 
-      evaluate(l.basis, X)
-   end
+   B = evaluate(l.basis, X)
    return B, st 
 end 
 
diff --git a/src/orthopolybasis.jl b/src/orthopolybasis.jl
index 72e719a..f517e21 100644
--- a/src/orthopolybasis.jl
+++ b/src/orthopolybasis.jl
@@ -1,5 +1,6 @@
 using LoopVectorization
-
+using ChainRulesCore
+using ChainRulesCore: NoTangent
 
 @doc raw"""
 `OrthPolyBasis1D3T:` defines a basis of polynomials in terms of a 3-term recursion, 
@@ -219,16 +220,16 @@ end
 
 
 # ------------------   rrules 
-
 # 
 # ∂_xa ( ∂P : P ) = ∑_ij ∂_xa ( ∂P_ij * P_ij ) 
 #                 = ∑_ij ∂P_ij * ∂_xa ( P_ij )
 #                 = ∑_ij ∂P_ij * dP_ij δ_ia
 #
-function rrule_evaluate!(P::AbstractArray, basis::OrthPolyBasis1D3T, X::AbstractVector)
-   nX = length(X) 
+function ChainRulesCore.rrule(::typeof(evaluate), basis::OrthPolyBasis1D3T, x::AbstractVector)
+   P = _alloc(basis, x)
+   nX = length(x) 
    dP = similar(P)
-   evaluate_ed!(P, dP, basis, X)
+   evaluate_ed!(P, dP, basis, x)
 
    function pb(∂P)
       ∂X = zeros(nX)
@@ -237,8 +238,8 @@ function rrule_evaluate!(P::AbstractArray, basis::OrthPolyBasis1D3T, X::Abstract
             ∂X[i] += ∂P[i, j] * dP[i, j]
          end
       end
-      return ∂X 
+      return NoTangent(), NoTangent(), ∂X 
    end
 
-   return P, pb 
+   return P, pb
 end
diff --git a/test/test_acemodel.jl b/test/test_acemodel.jl
index 94037c7..cacfd78 100644
--- a/test/test_acemodel.jl
+++ b/test/test_acemodel.jl
@@ -65,7 +65,7 @@ l_xnx = Lux.Parallel(nothing; normx = WrappedFunction(_norm), x = WrappedFunctio
 l_embed = Lux.Parallel(nothing; Rn = l_Rn, Ylm = l_Ylm)
 
 
-simpleacemodel = Chain(xnx = l_xnx, embed = l_embed, A = l_bA, AA = l_bAA, out = M1.DotL(length(bAA)))
+simpleacemodel = Chain(xnx = l_xnx, embed = l_embed)#, A = l_bA, AA = l_bAA, out = M1.DotL(length(bAA)))
 ps, st = Lux.setup(rng, simpleacemodel)
 
 bX = [ rand_sphere() for _ = 1:32 ] 

From cf19f362f106bcb5f4ff7a82035e11597abbf1bf Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Tue, 30 May 2023 00:49:51 -0700
Subject: [PATCH 30/54] add simpleacemodel fdtest, clean up later...

---
 src/ace/sparseprodpool.jl | 6 ++++--
 src/productbasis.jl       | 5 ++++-
 test/test_acemodel.jl     | 8 +++++---
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/ace/sparseprodpool.jl b/src/ace/sparseprodpool.jl
index c893406..ef15d18 100644
--- a/src/ace/sparseprodpool.jl
+++ b/src/ace/sparseprodpool.jl
@@ -1,3 +1,5 @@
+using ChainRulesCore
+using ChainRulesCore: NoTangent
 
 
 struct PooledSparseProduct{NB} <: AbstractPoly4MLBasis
@@ -272,7 +274,7 @@ end
 #       a cruder code generation strategy. This specialized code 
 #       confirms this. 
 
-function _pullback_evaluate!(∂BB, ∂A, basis::PooledSparseProduct{2}, BB::Tuple)
+function _pullback_evaluate!(∂BB, ∂A, basis::PooledSparseProduct{2}, BB::TupMat)
    nX = size(BB[1], 1)
    NB = 2 
    @assert length(∂A) == length(basis)
@@ -334,7 +336,7 @@ end
 # --------------------- connect with ChainRules 
 # todo ... 
 
-function rrule(::typeof(evaluate), basis::PooledSparseProduct{NB}, BB::TupMat) where {NB}
+function ChainRulesCore.rrule(::typeof(evaluate), basis::PooledSparseProduct{NB}, BB::TupMat) where {NB}
    A = evaluate(basis, BB)
 
    function pb(Δ)
diff --git a/src/productbasis.jl b/src/productbasis.jl
index 53ac5c2..f5206d8 100644
--- a/src/productbasis.jl
+++ b/src/productbasis.jl
@@ -1,7 +1,7 @@
 # Jerry: This is just a specific case of a general ProductBasis
 # I will do that later expanding this to a general case, but it is unclear
 # to me how to allow the basis to distinguish whether to use norm(x) or x efficiently
-struct ProductBasis{NB, TR, TY, TS}
+struct ProductBasis{NB, TR, TY, TS} <: AbstractPoly4MLBasis
    spec1::Vector{TS}
    bRnl::TR
    bYlm::TY
@@ -10,6 +10,9 @@ struct ProductBasis{NB, TR, TY, TS}
    @reqfields
 end
 
+ProductBasis(spec1, bRnl, bYlm) = 
+      ProductBasis(spec1, bRnl, bYlm, SparseProduct(spec1), _make_reqfields()...)
+
 (pbasis::ProductBasis)(args...) = evaluate(pbasis, args...)
 
 function evaluate(basis::ProductBasis, X::AbstractVector{<: AbstractVector})
diff --git a/test/test_acemodel.jl b/test/test_acemodel.jl
index cacfd78..3f2366e 100644
--- a/test/test_acemodel.jl
+++ b/test/test_acemodel.jl
@@ -65,15 +65,17 @@ l_xnx = Lux.Parallel(nothing; normx = WrappedFunction(_norm), x = WrappedFunctio
 l_embed = Lux.Parallel(nothing; Rn = l_Rn, Ylm = l_Ylm)
 
 
-simpleacemodel = Chain(xnx = l_xnx, embed = l_embed)#, A = l_bA, AA = l_bAA, out = M1.DotL(length(bAA)))
+simpleacemodel = Chain(xnx = l_xnx, embed = l_embed, A = l_bA , AA = l_bAA, out = M1.DotL(length(bAA)))
 ps, st = Lux.setup(rng, simpleacemodel)
 
 bX = [ rand_sphere() for _ = 1:32 ] 
 simpleacemodel(bX, ps, st)
 
 F(X) = simpleacemodel(X, ps, st)[1]
-(l, st_), pb = pullback(x -> Lux.apply(simpleacemodel, x, ps, st), bX)
-gs = pb((l, nothing))[1]
+dF(X) = Zygote.gradient(x -> Lux.apply(simpleacemodel, x, ps, st)[1], X)[1]
+#(l, st_), pb = pullback(x -> Lux.apply(simpleacemodel, x, ps, st), bX)
+# gs = pb((l, nothing))[1]
+
 
 fdtest(F, dF, bX, verbose = true)
 

From b74d6e3efd1c2400d9153e467d9783575779138a Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Tue, 30 May 2023 01:12:04 -0700
Subject: [PATCH 31/54] minor fix

---
 src/productbasis.jl | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/productbasis.jl b/src/productbasis.jl
index f5206d8..64e8265 100644
--- a/src/productbasis.jl
+++ b/src/productbasis.jl
@@ -10,8 +10,32 @@ struct ProductBasis{NB, TR, TY, TS} <: AbstractPoly4MLBasis
    @reqfields
 end
 
-ProductBasis(spec1, bRnl, bYlm) = 
-      ProductBasis(spec1, bRnl, bYlm, SparseProduct(spec1), _make_reqfields()...)
+function _invmap(a::AbstractVector)
+   inva = Dict{eltype(a), Int}()
+   for i = 1:length(a) 
+      inva[a[i]] = i 
+   end
+   return inva 
+end
+
+function dropnames(namedtuple::NamedTuple, names::Tuple{Vararg{Symbol}}) 
+   keepnames = Base.diff_names(Base._nt_names(namedtuple), names)
+   return NamedTuple{keepnames}(namedtuple)
+end
+
+function ProductBasis(spec1, bRnl, bYlm)
+   # Inverse lookups (label -> position); each b in spec1 is split into its radial part (all fields but :m) and its (l, m) part.
+   spec_Rnl = bRnl.spec; inv_Rnl = _invmap(spec_Rnl)
+   spec_Ylm = natural_indices(bYlm); inv_Ylm = _invmap(spec_Ylm)
+
+   spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1))
+   for (i, b) in enumerate(spec1)
+      spec1idx[i] = (inv_Rnl[dropnames(b,(:m,))], inv_Ylm[(l=b.l, m=b.m)])
+   end
+   sparsebasis = SparseProduct(spec1idx)
+   return ProductBasis(spec1, bRnl, bYlm, sparsebasis, _make_reqfields()...)
+end
+
 
 (pbasis::ProductBasis)(args...) = evaluate(pbasis, args...)
 

From 4b5a8627876dbe9b5582c740575def6028bb5808 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Tue, 30 May 2023 02:10:01 -0700
Subject: [PATCH 32/54] fix sphericalHarmonicsHyperDual issue

---
 src/sphericalharmonics/rylm.jl | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/sphericalharmonics/rylm.jl b/src/sphericalharmonics/rylm.jl
index feb19cd..d1614d4 100644
--- a/src/sphericalharmonics/rylm.jl
+++ b/src/sphericalharmonics/rylm.jl
@@ -28,7 +28,7 @@ _valtype(sh::RYlmBasis{T}, ::Type{<: StaticVector{3, S}}) where {T <: Real, S <:
 		promote_type(T, S)
 
 _valtype(sh::RYlmBasis{T}, ::Type{<: StaticVector{3, Hyper{S}}}) where {T <: Real, S <: Real} = 
-		promote_type(T, S)
+		promote_type(T, Hyper{S})
 
 Base.show(io::IO, basis::RYlmBasis) = 
       print(io, "RYlmBasis(L=$(maxL(basis)))")		
@@ -40,6 +40,7 @@ function evaluate!(Y::AbstractArray, basis::RYlmBasis, X)
     S = cart2spher(basis, X)
 	_P = _acqu_P!(basis, S)
 	P = evaluate!(_P, basis.alp, S)
+	@show typeof(parent(P))
 	rYlm!(Y, maxL(basis), S, parent(P), basis)
 	return Y
 end
@@ -141,10 +142,10 @@ end
 function rYlm!(Y::AbstractMatrix, L, S::AbstractVector{SphericalCoords{T}}, 
 				   P::AbstractMatrix, basis::RYlmBasis) where {T} 
    nX = length(S) 
-	@assert size(P, 1) >= nX
+   @assert size(P, 1) >= nX
    @assert size(P, 2) >= sizeP(L)
    @assert size(Y, 1) >= nX
-	@assert size(Y, 2) >= sizeY(L)
+   @assert size(Y, 2) >= sizeY(L)
 
    sinφ = acquire!(basis.tmp, :sin, (nX,), T)
    cosφ = acquire!(basis.tmp, :cos, (nX,), T)
@@ -153,19 +154,19 @@ function rYlm!(Y::AbstractMatrix, L, S::AbstractVector{SphericalCoords{T}},
 
    @inbounds begin 
       for i = 1:nX 
-         sinφ[i] = S[i].sinφ
-         cosφ[i] = S[i].cosφ
-         sinmφ[i] = 0.0
-         cosmφ[i] = 1.0
+        sinφ[i] = S[i].sinφ
+        cosφ[i] = S[i].cosφ
+        sinmφ[i] = 0.0
+        cosmφ[i] = 1.0
       end
 
       oort2 = 1 / sqrt(2)
       for l = 0:L
-         i_yl0 = index_y(l, 0)
-         i_pl0 = index_p(l, 0)
-         @avx for i = 1:nX
-            Y[i, i_yl0] = P[i, i_pl0] * oort2
-         end
+        i_yl0 = index_y(l, 0)
+        i_pl0 = index_p(l, 0)
+        @avx for i = 1:nX
+           Y[i, i_yl0] = P[i, i_pl0] * oort2
+        end
       end
 
       for m in 1:L

From d4840fb6abbbc26256f0dc793667297cbaa7c070 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Tue, 30 May 2023 02:12:56 -0700
Subject: [PATCH 33/54] remove debug message

---
 src/sphericalharmonics/rylm.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/sphericalharmonics/rylm.jl b/src/sphericalharmonics/rylm.jl
index d1614d4..1d15b09 100644
--- a/src/sphericalharmonics/rylm.jl
+++ b/src/sphericalharmonics/rylm.jl
@@ -40,7 +40,6 @@ function evaluate!(Y::AbstractArray, basis::RYlmBasis, X)
     S = cart2spher(basis, X)
 	_P = _acqu_P!(basis, S)
 	P = evaluate!(_P, basis.alp, S)
-	@show typeof(parent(P))
 	rYlm!(Y, maxL(basis), S, parent(P), basis)
 	return Y
 end

From 6d6f443bfc8ce8655d64458966cccdc89e7dca49 Mon Sep 17 00:00:00 2001
From: dhan-02 <dhanush.anantharaman@gmail.com>
Date: Wed, 31 May 2023 00:01:22 -0700
Subject: [PATCH 34/54] Added ChebBasis

---
 src/Polynomials4ML.jl |   1 +
 src/chebbasis.jl      | 173 ++++++++++++++++++++++++++++++++++++++++++
 test/test_cheb.jl     |  30 ++++++++
 3 files changed, 204 insertions(+)
 create mode 100644 src/chebbasis.jl
 create mode 100644 test/test_cheb.jl

diff --git a/src/Polynomials4ML.jl b/src/Polynomials4ML.jl
index 0fedaba..c6f150a 100644
--- a/src/Polynomials4ML.jl
+++ b/src/Polynomials4ML.jl
@@ -52,6 +52,7 @@ include("monomials.jl")
 # 2d harmonics / trigonometric polynomials 
 include("trig.jl")
 include("rtrig.jl")
+include("chebbasis.jl")
 
 # 3d harmonics 
 include("sphericalharmonics/sphericalharmonics.jl")
diff --git a/src/chebbasis.jl b/src/chebbasis.jl
new file mode 100644
index 0000000..643703d
--- /dev/null
+++ b/src/chebbasis.jl
@@ -0,0 +1,173 @@
+export ChebBasis
+
+"""
+`ChebBasis(N)`: 
+
+Chebyshev polynomials of the first kind up to degree `N-1` (inclusive), i.e. a basis of length `N`. The basis is ordered as 
+```
+[1, x, 2x^2-1, 4x^3-3x, ..., T_{N-1}(x)]   with   T_k(x) = 2x*T_{k-1}(x) - T_{k-2}(x)
+```
+where `x` is input variable. 
+"""
+struct ChebBasis <: AbstractPoly4MLBasis
+   N::Int
+   @reqfields
+end
+
+ChebBasis(N::Integer) = ChebBasis(N, _make_reqfields()...)
+
+Base.length(basis::ChebBasis) = basis.N
+
+natural_indices(basis::ChebBasis) = 0:length(basis)-1
+
+_valtype(basis::ChebBasis, T::Type{<: Real}) = T
+
+
+function evaluate!(P::AbstractVector, basis::ChebBasis, x::Real)
+   N = basis.N 
+   @assert N  >= 2
+   @assert length(P) >= length(basis) # N
+
+   P[1] = 1
+   P[2] = x
+   for k = 3:N 
+      @inbounds P[k] = 2*x*P[k-1] - P[k-2]
+   end
+   return P 
+end 
+
+
+
+function evaluate!(P::AbstractMatrix, basis::ChebBasis, 
+                   x::AbstractVector{<: Real})
+   N = basis.N 
+   nX = length(x)
+   @assert N  >= 2
+   @assert size(P, 2) >= length(basis) # N
+   @assert size(P, 1) >= nX
+
+   @inbounds begin 
+      @simd ivdep for i = 1:nX 
+         P[i, 1] = 1
+         P[i, 2] = x[i]
+      end
+
+      for k = 3:N 
+         @simd ivdep for i = 1:nX 
+            P[i,k] = 2*x[i]*P[i,k-1]-P[i,k-2]
+         end
+      end
+   end
+   return P 
+end 
+
+# Scalar-input kernel: fills P[k] = T_{k-1}(x) and dP[k] = d/dx T_{k-1}(x) for k = 1..N; returns (P, dP).
+function evaluate_ed!(P::AbstractVector, dP::AbstractVector, basis::ChebBasis, x::Real)
+   N = basis.N 
+   # the recurrence below is seeded with two entries, hence N >= 2 is required
+   @assert N  >= 2
+   @assert length(P) >= length(basis)  
+   @assert length(dP) >= length(basis) 
+
+   @inbounds begin 
+      P[1] = 1    # T_0
+      dP[1] = 0
+      P[2] = x    # T_1
+      dP[2] = 1
+      for k = 3:N 
+        P[k] = 2*x*P[k-1] - P[k-2]
+        dP[k] = 2*P[k-1] + 2*x*dP[k-1] - dP[k-2]   # derivative of the line above
+      end
+   end
+   return P, dP 
+end 
+
+
+function evaluate_ed!(P::AbstractMatrix, dP::AbstractMatrix, basis::ChebBasis, 
+                      x::AbstractVector{<: Real})
+   N = basis.N 
+   nX = length(x)
+   @assert N  >= 2
+   @assert size(P, 2) >= length(basis) # N
+   @assert size(P, 1) >= nX
+   @assert size(dP, 2) >= length(basis) # N
+   @assert size(dP, 1) >= nX
+
+   @inbounds begin 
+      @simd ivdep for i = 1:nX 
+         P[i, 1] = 1
+         dP[i, 1] = 0
+         P[i, 2] = x[i]
+         dP[i, 2] = 1
+      end
+
+      for k = 3:N 
+         @simd ivdep for i = 1:nX 
+            P[i,k] = 2*x[i]*P[i,k-1]-P[i,k-2]
+            dP[i,k] = 2*P[i,k-1] + 2*x[i]*dP[i,k-1] - dP[i,k-2]
+         end
+      end
+   end
+   return P, dP 
+end 
+
+
+function evaluate_ed2!(P::AbstractVector, dP::AbstractVector, ddP::AbstractVector,
+                       basis::ChebBasis, x::Real)
+   N = basis.N 
+   @assert N  >= 2
+   @assert length(P) >= length(basis) # N
+   @assert length(dP) >= length(basis) # N
+   @assert length(ddP) >= length(basis) # N
+
+   @inbounds begin 
+      P[1] = 1
+      P[2] = x
+      dP[1] = 0
+      dP[2] = 1
+      ddP[1] = 0
+      ddP[2] = 0
+
+      for k = 3:N 
+        P[k] = 2*x*P[k-1] - P[k-2]
+        dP[k] = 2*P[k-1] + 2*x*dP[k-1] - dP[k-2]
+        ddP[k] = 2*dP[k-1] + 2*dP[k-1] + 2*x*ddP[k-1] - ddP[k-2]
+      end
+   end
+   return P, dP, ddP 
+end 
+
+
+
+function evaluate_ed2!(P::AbstractMatrix, dP::AbstractMatrix, ddP::AbstractMatrix, basis::ChebBasis, 
+                      x::AbstractVector{<: Real})
+   N = basis.N 
+   nX = length(x)
+   @assert N  >= 2
+   @assert size(P, 2) >= length(basis) # N
+   @assert size(P, 1) >= nX
+   @assert size(dP, 2) >= length(basis) # N
+   @assert size(dP, 1) >= nX
+   @assert size(ddP, 2) >= length(basis) # N
+   @assert size(ddP, 1) >= nX
+
+   @inbounds begin 
+      @simd ivdep for i = 1:nX 
+         P[i, 1] = 1
+         P[i, 2] = x[i]
+         dP[i, 1] = 0
+         dP[i, 2] = 1
+         ddP[i, 1] = 0
+         ddP[i, 2] = 0     
+      end
+
+      for k = 3:N 
+         @simd ivdep for i = 1:nX 
+            P[i,k] = 2*x[i]*P[i,k-1] - P[i,k-2]
+            dP[i,k] = 2*P[i,k-1] + 2*x[i]*dP[i,k-1] - dP[i,k-2]
+            ddP[i,k] = 2*dP[i,k-1] + 2*dP[i,k-1] + 2*x[i]*ddP[i,k-1] - ddP[i,k-2]
+         end
+      end
+   end
+   return P, dP, ddP 
+end 
diff --git a/test/test_cheb.jl b/test/test_cheb.jl
new file mode 100644
index 0000000..bff88bf
--- /dev/null
+++ b/test/test_cheb.jl
@@ -0,0 +1,30 @@
+using Polynomials4ML, Test
+using Polynomials4ML: evaluate, evaluate_d, evaluate_dd
+using Polynomials4ML.Testing: println_slim, print_tf, test_derivatives
+
+
+##
+
+@info("Testing Real Chebyshev Polynomials (ChebBasis)")
+N = 10
+basis = ChebBasis(N) 
+
+@info("      correctness")
+mm = natural_indices(basis)
+print_tf(@test mm == 0:N-1)
+
+for ntest = 1:30
+   θ = 2*π * rand()
+   x = cos(θ)
+   P = basis(x)
+   P2 = [ cos(m*θ) for m in mm ]
+   print_tf(@test P ≈ P2)
+end
+println() 
+
+
+##
+
+@info("      test derivatives")
+generate_x = () -> 2*rand()-1
+test_derivatives(basis, generate_x)

From 5251d93dbf4e7b128333066d94c3d394db63d13b Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Wed, 31 May 2023 21:20:11 +0800
Subject: [PATCH 35/54] fix atan for hyperdual

---
 src/sphericalharmonics/sphericalharmonics.jl | 6 ++----
 test/test_acemodel.jl                        | 1 -
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/sphericalharmonics/sphericalharmonics.jl b/src/sphericalharmonics/sphericalharmonics.jl
index 4fe4086..718de5e 100644
--- a/src/sphericalharmonics/sphericalharmonics.jl
+++ b/src/sphericalharmonics/sphericalharmonics.jl
@@ -13,7 +13,7 @@ export CYlmBasis, RYlmBasis, CRlmBasis, RRlmBasis
 spher2cart(S::SphericalCoords) = S.r * SVector(S.cosφ*S.sinθ, S.sinφ*S.sinθ, S.cosθ)
 
 ## ---------- HyperDualNumbers utils ---------
-Base.atan(y::Hyper{T}, x::Hyper{T}) where {T} = atan(y / x)
+Base.atan(y::Hyper{T}, x::Hyper{T}) where {T} = atan(y/x)*(x != 0) + (1-2*(y<0))*(pi*(x<0) + 1/2*pi*(x==0))
 
 function cart2spher(R::AbstractVector) # ; SH = true)
 	@assert length(R) == 3
@@ -153,6 +153,4 @@ _acqu_ddP!(basis::XlmBasis, S) = _acqu_alp!(:alpddP, basis, S)
 function rand_sphere() 
 	r = @SVector randn(3)
 	return r / norm(r)
-end
-
-
+end
\ No newline at end of file
diff --git a/test/test_acemodel.jl b/test/test_acemodel.jl
index 3f2366e..fdd6cc7 100644
--- a/test/test_acemodel.jl
+++ b/test/test_acemodel.jl
@@ -7,7 +7,6 @@ using ForwardDiff
 using ACEbase.Testing: fdtest
 using Zygote
 using Lux
-using Lux
 using Random
 
 

From 4b8b1c9235adc56b9d5f2ac8aaab8364bfc14b0d Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Wed, 31 May 2023 22:11:00 +0800
Subject: [PATCH 36/54] minor fix

---
 src/sphericalharmonics/sphericalharmonics.jl | 5 -----
 src/utils/hyper.jl                           | 5 +++++
 src/utils/utils.jl                           | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)
 create mode 100644 src/utils/hyper.jl

diff --git a/src/sphericalharmonics/sphericalharmonics.jl b/src/sphericalharmonics/sphericalharmonics.jl
index 718de5e..7392ded 100644
--- a/src/sphericalharmonics/sphericalharmonics.jl
+++ b/src/sphericalharmonics/sphericalharmonics.jl
@@ -1,5 +1,3 @@
-
-using HyperDualNumbers: Hyper
 using StaticArrays, LinearAlgebra, LoopVectorization
 export CYlmBasis, RYlmBasis, CRlmBasis, RRlmBasis 
 
@@ -12,9 +10,6 @@ export CYlmBasis, RYlmBasis, CRlmBasis, RRlmBasis
 
 spher2cart(S::SphericalCoords) = S.r * SVector(S.cosφ*S.sinθ, S.sinφ*S.sinθ, S.cosθ)
 
-## ---------- HyperDualNumbers utils ---------
-Base.atan(y::Hyper{T}, x::Hyper{T}) where {T} = atan(y/x)*(x != 0) + (1-2*(y<0))*(pi*(x<0) + 1/2*pi*(x==0))
-
 function cart2spher(R::AbstractVector) # ; SH = true)
 	@assert length(R) == 3
 	r = norm(R)
diff --git a/src/utils/hyper.jl b/src/utils/hyper.jl
new file mode 100644
index 0000000..2991619
--- /dev/null
+++ b/src/utils/hyper.jl
@@ -0,0 +1,5 @@
+
+using HyperDualNumbers: Hyper
+
+## ---------- HyperDualNumbers utils ---------
+Base.atan(y::Hyper{T}, x::Hyper{T}) where {T} = atan(y/x)*(x != 0) + (1-2*(y<0))*(pi*(x<0) + 1/2*pi*(x==0))
diff --git a/src/utils/utils.jl b/src/utils/utils.jl
index 6d3aba8..bccf1af 100644
--- a/src/utils/utils.jl
+++ b/src/utils/utils.jl
@@ -1,5 +1,5 @@
 module Utils 
 
 include("sparse.jl")
-
+include("hyper.jl")
 end
\ No newline at end of file

From 5a8b85424f2431ccc8c51bd9285aae1cda445127 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Fri, 2 Jun 2023 01:25:19 -0700
Subject: [PATCH 37/54] general interface rrule, abstract subtypes and fix avx
 in spherical harmonics.

---
 Project.toml                                  |  4 +-
 src/ace/sparseprodpool.jl                     |  6 +--
 .../atomicorbitalsradials.jl                  | 30 ++++++++-------
 src/interface.jl                              | 23 +++++++++++
 src/lux.jl                                    |  7 +++-
 src/monomials.jl                              |  2 +-
 src/orthopolybasis.jl                         | 38 +++++++++----------
 src/sparseproduct.jl                          | 23 +++++------
 src/sphericalharmonics/alp.jl                 |  2 +-
 src/sphericalharmonics/crlm.jl                |  2 +-
 src/sphericalharmonics/cylm.jl                |  2 +-
 src/sphericalharmonics/rrlm.jl                |  2 +-
 src/sphericalharmonics/rylm.jl                |  8 ++--
 temp/hypers.jl                                |  4 +-
 test/sphericalharmonics/test_rylm.jl          | 11 +++++-
 test/test_atorbrad.jl                         | 12 +++---
 test/test_op1d3t.jl                           | 23 ++++++++++-
 test/test_sparseproduct.jl                    |  1 +
 18 files changed, 129 insertions(+), 71 deletions(-)

diff --git a/Project.toml b/Project.toml
index 0037541..e65d2fb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,7 +5,6 @@ version = "0.1.3"
 
 [deps]
 ACEbase = "14bae519-eb20-449c-a949-9c58ed33163e"
-ACEcore = "44c1e890-45d1-48ea-94d6-c2ea5b573f71"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -47,6 +46,7 @@ julia = "1.8"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+ACEcore = "44c1e890-45d1-48ea-94d6-c2ea5b573f71"
 
 [targets]
-test = ["Test", "Lux", "Printf"]
+test = ["Test", "Lux", "Printf", "ACEcore"]
diff --git a/src/ace/sparseprodpool.jl b/src/ace/sparseprodpool.jl
index ef15d18..3bc94dd 100644
--- a/src/ace/sparseprodpool.jl
+++ b/src/ace/sparseprodpool.jl
@@ -32,9 +32,9 @@ end
 
 # ----------------------- evaluation interfaces 
 
-const TupVec = Tuple{Vararg{<: AbstractVector}}
-const TupMat = Tuple{Vararg{<: AbstractMatrix}}
-const TupVecMat = Union{TupVec, TupMat}
+# const TupVec = Tuple{Vararg{<: AbstractVector}}
+# const TupMat = Tuple{Vararg{<: AbstractMatrix}}
+# const TupVecMat = Union{TupVec, TupMat}
 
 _valtype(basis::AbstractPoly4MLBasis, BB::Tuple) = 
       mapreduce(eltype, promote_type, BB)
diff --git a/src/atomicorbitalsradials/atomicorbitalsradials.jl b/src/atomicorbitalsradials/atomicorbitalsradials.jl
index d742a12..461d0e9 100644
--- a/src/atomicorbitalsradials/atomicorbitalsradials.jl
+++ b/src/atomicorbitalsradials/atomicorbitalsradials.jl
@@ -5,7 +5,7 @@ using ChainRulesCore: NoTangent
 const NLM{T} = NamedTuple{(:n1, :n2, :l, :m), Tuple{T, T, T, T}}
 const NL{T} = NamedTuple{(:n1, :n2, :l), Tuple{T, T, T}}
 
-struct AtomicOrbitalsRadials{TP, TD, TI, TZ}  <: AbstractPoly4MLBasis
+struct AtomicOrbitalsRadials{TP, TD, TI, TZ}  <: ScalarPoly4MLBasis
    Pn::TP
    Dn::TD
    spec::Vector{NL{TI}}
@@ -94,19 +94,21 @@ function evaluate_ed2!(Rnl, dRnl, ddRnl, basis::AtomicOrbitalsRadials, R)
 end
 
 # not test
-function ChainRulesCore.rrule(::typeof(evaluate), basis::AtomicOrbitalsRadials, R::AbstractVector{<: Real})
-   A  = evaluate(basis, R)
-   ∂R = similar(R)
-   dR = evaluate_ed(basis, R)[2]
-   function pb(∂A)
-        @assert size(∂A) == (length(R), length(basis))
-        for i = 1:length(R)
-            ∂R[i] = dot(@view(∂A[i, :]), @view(dR[i, :]))
-        end
-        return NoTangent(), NoTangent(), ∂R
-   end
-   return A, pb
-end
+# function ChainRulesCore.rrule(::typeof(evaluate), basis::AtomicOrbitalsRadials, R::AbstractVector{<: Real})
+#    # A  = evaluate(basis, R)
+#    # ∂R = similar(R)
+#    # dR = evaluate_ed(basis, R)[2]
+#    A, dR = evaluate_ed(basis, R)
+#    ∂R = similar(R)
+#    function pb(∂A)
+#         @assert size(∂A) == (length(R), length(basis))
+#         for i = 1:length(R)
+#             ∂R[i] = dot(@view(∂A[i, :]), @view(dR[i, :]))
+#         end
+#         return NoTangent(), NoTangent(), ∂R
+#    end
+#    return A, pb
+# end
 
 include("gaussian.jl")
 include("slater.jl")
diff --git a/src/interface.jl b/src/interface.jl
index a561bd0..d7a53fb 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -1,7 +1,12 @@
 using StaticArrays: StaticArray, SVector, StaticVector, similar_type
+using ChainRulesCore
 
 abstract type AbstractPoly4MLBasis end
 
+abstract type ScalarPoly4MLBasis <: AbstractPoly4MLBasis end   # supertype for bases of a scalar variable (e.g. MonoBasis, OrthPolyBasis1D3T); the generic `evaluate` rrule dispatches on it
+
+abstract type SVecPoly4MLBasis <: AbstractPoly4MLBasis end   # supertype for the spherical/solid harmonics bases (CYlm, RYlm, CRlm, RRlm), whose input is normally an SVector{3}
+
 # ---------------------------------------
 # some helpers to deal with the three required arrays: 
 
@@ -51,6 +56,9 @@ end
 const SINGLE = Union{Number, StaticArray, SphericalCoords}
 const BATCH = AbstractVector{<: SINGLE}
 
+const TupVec = Tuple{Vararg{AbstractVector}}   # tuple of any number of vectors (types may differ); avoids the deprecated `Vararg{<: T}` form
+const TupMat = Tuple{Vararg{AbstractMatrix}}   # tuple of any number of matrices (types may differ)
+const TupVecMat = Union{TupVec, TupMat}        # either of the above
 # ---------------------------------------
 # managing defaults for input-output types
 
@@ -225,4 +233,19 @@ function evaluate_ed2!(flex_B::FlexArray,
    B, dB, ddB = _alloc_ed2(flex_B, flex_dB, flex_ddB, basis, x)
    evaluate_ed2!(B, dB, ddB, basis, x)
    return B, dB, ddB
+end
+
+# --------------------------------------- 
+# general rrule and frule interface for ChainRulesCore (here: reverse rule of batched `evaluate` for scalar-input bases)
+function ChainRulesCore.rrule(::typeof(evaluate), basis::ScalarPoly4MLBasis, R::AbstractVector{<: Real})
+   A, dR = evaluate_ed(basis, R)   # values and derivatives; dR is indexed [input i, basis function j]
+   function pb(∂A)
+        @assert size(∂A) == (length(R), length(basis))
+        ∂R = similar(R)   # allocated per call, so repeated pullback calls never overwrite earlier results
+        for i = 1:length(R)
+            ∂R[i] = dot(@view(∂A[i, :]), @view(dR[i, :]))   # ∂R[i] = ∑_j ∂A[i, j] * dR[i, j]
+        end
+        return NoTangent(), NoTangent(), ∂R
+   end
+   return A, pb
 end
\ No newline at end of file
diff --git a/src/lux.jl b/src/lux.jl
index 6cc5073..3548750 100644
--- a/src/lux.jl
+++ b/src/lux.jl
@@ -56,7 +56,12 @@ initialstates(rng::AbstractRNG, l::PolyLuxLayer) = _init_luxstate(rng, l.basis)
 
 (l::PolyLuxLayer)(args...) = evaluate(l, args...)
 
-function evaluate(l::PolyLuxLayer, X, ps, st) 
+function evaluate(l::PolyLuxLayer, X, ps, st)
+   
+   # TODO: once we are sure we want to migrate to HyperDualNumbers in all cases, we can wrap this call in ChainRulesCore.ignore_derivatives
+   #B = ChainRulesCore.ignore_derivatives() do 
+   #   evaluate(l.basis, X)
+   #end
    B = evaluate(l.basis, X)
    return B, st 
 end 
diff --git a/src/monomials.jl b/src/monomials.jl
index 221386a..77ed14d 100644
--- a/src/monomials.jl
+++ b/src/monomials.jl
@@ -6,7 +6,7 @@ export MonoBasis
 """
 Standard Monomials basis. This should very rarely be used. Possibly useful in combination with a transformation of the inputs, e.g. exponential.
 """
-struct MonoBasis <: AbstractPoly4MLBasis
+struct MonoBasis <: ScalarPoly4MLBasis
    N::Int
    pool::POOL
    # ----------------- metadata 
diff --git a/src/orthopolybasis.jl b/src/orthopolybasis.jl
index f517e21..bf471ca 100644
--- a/src/orthopolybasis.jl
+++ b/src/orthopolybasis.jl
@@ -17,7 +17,7 @@ can be either continuous or discrete but must have a density function. See also
 * `chebyshev_basis`
 * `jacobi_basis`
 """
-struct OrthPolyBasis1D3T{T} <: AbstractPoly4MLBasis
+struct OrthPolyBasis1D3T{T} <: ScalarPoly4MLBasis
    # ----------------- the recursion coefficients
    A::Vector{T}
    B::Vector{T}
@@ -225,21 +225,21 @@ end
 #                 = ∑_ij ∂P_ij * ∂_xa ( P_ij )
 #                 = ∑_ij ∂P_ij * dP_ij δ_ia
 #
-function ChainRulesCore.rrule(::typeof(evaluate), basis::OrthPolyBasis1D3T, x::AbstractVector)
-   P = _alloc(basis, x)
-   nX = length(x) 
-   dP = similar(P)
-   evaluate_ed!(P, dP, basis, x)
-
-   function pb(∂P)
-      ∂X = zeros(nX)
-      for j = 1:length(basis) 
-         for i = 1:nX 
-            ∂X[i] += ∂P[i, j] * dP[i, j]
-         end
-      end
-      return NoTangent(), NoTangent(), ∂X 
-   end
-
-   return P, pb
-end
+# function ChainRulesCore.rrule(::typeof(evaluate), basis::OrthPolyBasis1D3T, x::AbstractVector)
+#    #P = _alloc(basis, x)
+#    nX = length(x) 
+#    #dP = similar(P)
+#    P, dP = evaluate_ed(basis, x)
+
+#    function pb(∂P)
+#       ∂X = zeros(eltype(x), nX)
+#       for j = 1:length(basis) 
+#          for i = 1:nX 
+#             ∂X[i] += ∂P[i, j] * dP[i, j]
+#          end
+#       end
+#       return NoTangent(), NoTangent(), ∂X 
+#    end
+
+#    return P, pb
+# end
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index dfdd8c8..90207fc 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -4,7 +4,7 @@ using ChainRulesCore: NoTangent
 struct SparseProduct{NB} <: AbstractPoly4MLBasis
    spec::Vector{NTuple{NB, Int}}
    # ---- temporaries & caches
-   @reqfields()   
+   @reqfields()
 end
 
 function SparseProduct()
@@ -70,22 +70,17 @@ function _frule_frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMa
 end
 
 # ----------------------- overiding alloc functions
-const TupVec = Tuple{Vararg{<: AbstractVector}}
-const TupMat = Tuple{Vararg{<: AbstractMatrix}}
-const TupVecMat = Union{TupVec, TupMat}
+# const TupVec = Tuple{Vararg{<: AbstractVector}}
+# const TupMat = Tuple{Vararg{<: AbstractMatrix}}
+# const TupVecMat = Union{TupVec, TupMat}
 
 # specifically for SparseProduct/PooledSparseProduct
-_outsym(x::NTuple{NB, TupVec}) where {NB} = :out
-_outsym(X::NTuple{NB, TupMat}) where {NB} = :outb
+_outsym(x::TupVec) = :out
+_outsym(X::TupMat) = :outb
 
-_outsym(x::Tuple{AbstractVector, AbstractVector}) = :out
-_outsym(X::Tuple{AbstractMatrix, AbstractMatrix}) = :outb
-
-# _alloc(basis::SparseProduct, BB::TupVec) = 
-#       acquire!(basis.pool, :out, (length(basis), ), _valtype(basis, BB) )
-
-# _alloc(basis::SparseProduct, BB::TupMat) = 
-#       acquire!(basis.pool, :outb, (size(BB[1], 1), length(basis) ), _valtype(basis, BB) )
+# TODO: generalize it
+#_outsym(x::Tuple{AbstractVector, AbstractVector}) = :out
+#_outsym(X::Tuple{AbstractMatrix, AbstractMatrix}) = :outb
 
 _out_size(basis::SparseProduct, BB::TupVec) = (length(basis), )
 _out_size(basis::SparseProduct, BB::TupMat) = (size(BB[1],1), length(basis))
diff --git a/src/sphericalharmonics/alp.jl b/src/sphericalharmonics/alp.jl
index 0b47dd5..337e7b2 100644
--- a/src/sphericalharmonics/alp.jl
+++ b/src/sphericalharmonics/alp.jl
@@ -13,7 +13,7 @@ Important Note: `evaluate_ed!`` does NOT return derivatives, but rather
 produces rescaled derivatives for better numerical stability near the poles. 
 See comments in code for details on how to use the ALP derivatives correctly. 
 """
-struct ALPolynomials{T} <: AbstractPoly4MLBasis
+struct ALPolynomials{T} <: ScalarPoly4MLBasis
 	L::Int
 	A::Vector{T}
 	B::Vector{T}
diff --git a/src/sphericalharmonics/crlm.jl b/src/sphericalharmonics/crlm.jl
index 78597d5..c357586 100644
--- a/src/sphericalharmonics/crlm.jl
+++ b/src/sphericalharmonics/crlm.jl
@@ -9,7 +9,7 @@ solid harmonics:
 
 γₗᵐ(r, θ, φ) = rˡYₗᵐ(θ, φ)
 """
-struct CRlmBasis{T} <: AbstractPoly4MLBasis
+struct CRlmBasis{T} <: SVecPoly4MLBasis
     alp::ALPolynomials{T}
 	 @reqfields
 end
diff --git a/src/sphericalharmonics/cylm.jl b/src/sphericalharmonics/cylm.jl
index d7dc830..1d2c9ee 100644
--- a/src/sphericalharmonics/cylm.jl
+++ b/src/sphericalharmonics/cylm.jl
@@ -10,7 +10,7 @@ The input variable is normally an `rr::SVector{3, T}`. This `rr` need not be nor
 * `maxL` : maximum degree of the spherical harmonics
 * `T` : type used to store the coefficients for the associated legendre functions
 """
-struct CYlmBasis{T} <: AbstractPoly4MLBasis
+struct CYlmBasis{T} <: SVecPoly4MLBasis
 	alp::ALPolynomials{T}
    @reqfields
 end
diff --git a/src/sphericalharmonics/rrlm.jl b/src/sphericalharmonics/rrlm.jl
index c0209d8..d2fea8c 100644
--- a/src/sphericalharmonics/rrlm.jl
+++ b/src/sphericalharmonics/rrlm.jl
@@ -11,7 +11,7 @@ Sₗ⁰ = √(4π/2l+1) rˡP̄ₗ⁰/√2
 Sₗᵐ = (-1)ᵐ√(8π/2l+1) rˡ Re(P̄ₗᵐ(cosθ)/√2 exp(imφ))
 Sₗ⁻ᵐ = (-1)ᵐ√(8π/2l+1) rˡIm(P̄ₗᵐ(cosθ)/√2 exp(imφ))
 """
-struct RRlmBasis{T} <: AbstractPoly4MLBasis
+struct RRlmBasis{T} <: SVecPoly4MLBasis
     alp::ALPolynomials{T}
 	 @reqfields
 end
diff --git a/src/sphericalharmonics/rylm.jl b/src/sphericalharmonics/rylm.jl
index 1d15b09..3964336 100644
--- a/src/sphericalharmonics/rylm.jl
+++ b/src/sphericalharmonics/rylm.jl
@@ -13,7 +13,7 @@ The input variable is normally an `rr::SVector{3, T}`. This `rr` need not be nor
 * `maxL` : maximum degree of the spherical harmonics
 * `T` : type used to store the coefficients for the associated legendre functions
 """
-struct RYlmBasis{T} <: AbstractPoly4MLBasis
+struct RYlmBasis{T} <: SVecPoly4MLBasis
 	alp::ALPolynomials{T}
 	@reqfields
 end
@@ -163,13 +163,13 @@ function rYlm!(Y::AbstractMatrix, L, S::AbstractVector{SphericalCoords{T}},
       for l = 0:L
         i_yl0 = index_y(l, 0)
         i_pl0 = index_p(l, 0)
-        @avx for i = 1:nX
+        @simd ivdep for i = 1:nX
            Y[i, i_yl0] = P[i, i_pl0] * oort2
         end
       end
 
       for m in 1:L
-         @avx for i = 1:nX
+         @simd ivdep for i = 1:nX
             cmi = cosmφ[i]
             smi = sinmφ[i]
             cosmφ[i] = cmi * cosφ[i] - smi * sinφ[i]
@@ -180,7 +180,7 @@ function rYlm!(Y::AbstractMatrix, L, S::AbstractVector{SphericalCoords{T}},
             i_plm = index_p(l, m)
             i_ylm⁺ = index_y(l, m)
             i_ylm⁻ = index_y(l, -m)
-            @avx for i = 1:nX
+            @simd ivdep for i = 1:nX
                p = P[i, i_plm]
                Y[i, i_ylm⁺] =  p * cosmφ[i]
                Y[i, i_ylm⁻] = -p * sinmφ[i]
diff --git a/temp/hypers.jl b/temp/hypers.jl
index 5b06c58..0cf4e63 100644
--- a/temp/hypers.jl
+++ b/temp/hypers.jl
@@ -223,6 +223,8 @@ end
 
 Δ1 = laplacian(g_ch3, xx)
 
+@btime $g_ch3($xx)
+@btime $laplacian($g_ch3, $xx)
 # test the correctness of the implementation 
 
 using LinearAlgebra
@@ -249,7 +251,7 @@ end
 using ForwardDiff: hessian
 
 laplace_fwd(gfun, xx) = 
-         [ tr( hessian(xx -> g_ch3(xx).dot.W[i], xx) ) 
+         [ tr( hessian(xx -> gfun(xx).dot.W[i], xx) ) 
             for i = 1:length(Δ1.dot.W) ]
 
 Δ3 = laplace_fwd(g_ch3, xx)
diff --git a/test/sphericalharmonics/test_rylm.jl b/test/sphericalharmonics/test_rylm.jl
index 279cf26..91d3d9c 100644
--- a/test/sphericalharmonics/test_rylm.jl
+++ b/test/sphericalharmonics/test_rylm.jl
@@ -7,6 +7,9 @@ using Polynomials4ML: SphericalCoords, index_y,
 using Polynomials4ML: evaluate, evaluate_d, evaluate_ed 
 using Polynomials4ML.Testing: print_tf, println_slim 
 using ACEbase.Testing: fdtest
+using HyperDualNumbers: Hyper
+
+
 verbose = false
 
 ##
@@ -96,6 +99,12 @@ println()
 
 rSH = RYlmBasis(10)
 X = [ rand_sphere() for i = 1:21 ]
+
+x2dualwrtj(x, j) = SVector{3}(ntuple(i -> Hyper(x[i], i == j, i == j, 0), 3))   # seed both first-order dual parts of component j
+
+hX = [x2dualwrtj(x, 1) for x in X]
+
+
 Y0 = evaluate(rSH, X)
 Y1, dY1 = evaluate_ed(rSH, X)
 Y2 = similar(Y1); dY2 = similar(dY1)
@@ -169,4 +178,4 @@ for ntest = 1:30
     end
     print_tf(@test fdtest(F, dF, 0.0; verbose = false))
 end
-println()
+println()
\ No newline at end of file
diff --git a/test/test_atorbrad.jl b/test/test_atorbrad.jl
index 126ce4b..46f6480 100644
--- a/test/test_atorbrad.jl
+++ b/test/test_atorbrad.jl
@@ -85,11 +85,11 @@ println_slim(@test  ddRnl2 ≈ fddRnl )
 @info("Test rrule")
 using LinearAlgebra: dot 
 
-for ntest = 1:30
-    local rr
-    local uu
-    local Rnl
-    local u
+#for ntest = 1:30
+#    local rr
+#    local uu
+#    local Rnl
+#    local u
     
     rr = 2 .* randn(10) .- 1
     uu = 2 .* randn(10) .- 1
@@ -103,5 +103,5 @@ for ntest = 1:30
         return sum( dot(∂BB[i], uu[i]) for i = 1:length(uu) )
     end
     print_tf(@test fdtest(F, dF, 0.0; verbose = false))
-end
+# end
 println()
diff --git a/test/test_op1d3t.jl b/test/test_op1d3t.jl
index daba736..0612638 100644
--- a/test/test_op1d3t.jl
+++ b/test/test_op1d3t.jl
@@ -1,9 +1,12 @@
 
 using Polynomials4ML, Test
 using Polynomials4ML: evaluate, evaluate_d, evaluate_dd
-using Polynomials4ML.Testing: println_slim, test_derivatives
+using Polynomials4ML.Testing: println_slim, test_derivatives, print_tf
 using LinearAlgebra: I, norm 
 using QuadGK
+using ACEbase.Testing: fdtest
+using Printf
+using Zygote
 
 @info("Testing OrthPolyBasis1D3T")
 
@@ -86,3 +89,21 @@ println_slim(@test all([
 @info("     derivatives")
 test_derivatives(cheb, () -> 2*rand()-1)
 
+@info("Testing rrule")
+using LinearAlgebra: dot 
+N = 10
+for ntest = 1:30   # finite-difference check of the `evaluate` pullback along a random direction bUU
+   bBB = randn(N)
+   bUU = randn(N)
+   _BB(t) = bBB + t * bUU   # line through bBB in direction bUU
+   bA2 = cheb(bBB)
+   u = randn(size(bA2))   # random cotangent with the shape of the basis output
+   F(t) = dot(u, Polynomials4ML.evaluate(cheb, _BB(t)))
+   dF(t) = begin
+       val, pb = Zygote.pullback(evaluate, cheb, _BB(t))
+       ∂BB = pb(u)[2] # pb(u)[1] is the tangent slot of the (non-differentiated) basis argument
+       return sum( dot(∂BB[i], bUU[i]) for i = 1:length(bUU) )
+   end
+   print_tf(@test fdtest(F, dF, 0.0; verbose=false))
+end
+println()
\ No newline at end of file
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 070918e..54a720b 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -232,6 +232,7 @@ for ntest = 1:30
     local bBB
     local bUU
     local bA2
+    local u
     bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
     bUU = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
     _BB(t) = ( bBB[1] + t * bUU[1], bBB[2] + t * bUU[2], bBB[3] + t * bUU[3] )

From 06647293a0c6a2b96d640b79a5c1831e4f44c3b4 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Fri, 2 Jun 2023 01:50:38 -0700
Subject: [PATCH 38/54] minor fix

---
 test/test_atorbrad.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/test_atorbrad.jl b/test/test_atorbrad.jl
index 46f6480..126ce4b 100644
--- a/test/test_atorbrad.jl
+++ b/test/test_atorbrad.jl
@@ -85,11 +85,11 @@ println_slim(@test  ddRnl2 ≈ fddRnl )
 @info("Test rrule")
 using LinearAlgebra: dot 
 
-#for ntest = 1:30
-#    local rr
-#    local uu
-#    local Rnl
-#    local u
+for ntest = 1:30
+    local rr
+    local uu
+    local Rnl
+    local u
     
     rr = 2 .* randn(10) .- 1
     uu = 2 .* randn(10) .- 1
@@ -103,5 +103,5 @@ using LinearAlgebra: dot
         return sum( dot(∂BB[i], uu[i]) for i = 1:length(uu) )
     end
     print_tf(@test fdtest(F, dF, 0.0; verbose = false))
-# end
+end
 println()

From 274123baff4b3a7e064fe5a47bbf45eb364df3ea Mon Sep 17 00:00:00 2001
From: vai-bhav-m <vaibhav.mahapatra@outlook.com>
Date: Fri, 2 Jun 2023 02:07:29 -0700
Subject: [PATCH 39/54] added documentation

---
 README.md          |   1 +
 docs/src/api.md    |   3 +
 docs/src/backup.md |   1 +
 src/chebbasis.jl   | 134 ++++++++++++++++++++++-----------------------
 4 files changed, 72 insertions(+), 67 deletions(-)

diff --git a/README.md b/README.md
index f6355de..9051d61 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ This package implements a few polynomial basis types, convenient methods for eva
 * `RYlmBasis` : real spherical  harmonics 
 * `CRlmBasis` : complex solid harmonics 
 * `RRlmBasis` : real solid harmonics 
+* `ChebBasis` : Chebyshev polynomials of the first kind
 * several radial bases for atomic orbitals: Slater, STO, STO-NG
 
 
diff --git a/docs/src/api.md b/docs/src/api.md
index 56d49d9..38faa76 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -16,6 +16,9 @@ This page documents the public API, i.e. the list of bases and functions that ar
    - Real spherical harmonics [`RYlmBasis`](@ref)
    - Complex solid harmonics [`CRlmBasis`](@ref)
    - Real solid harmonics [`RRlmBasis`](@ref)
+* Chebyshev polynomials of the first kind [`ChebBasis`](@ref)
+   - this implementation computes the basis on the fly at evaluation time, using the Chebyshev recurrence directly
+   - unlike the general orthogonal polynomials basis, it does not store any recursion coefficients
 * Various quantum chemistry related radial basis functions. (experimental)
    
 ## In-place Evaluation  
diff --git a/docs/src/backup.md b/docs/src/backup.md
index 8036b2e..5b29753 100644
--- a/docs/src/backup.md
+++ b/docs/src/backup.md
@@ -10,6 +10,7 @@ Polynomials4ML.CTrigBasis
 Polynomials4ML.RTrigBasis
 Polynomials4ML.CYlmBasis
 Polynomials4ML.RYlmBasis
+Polynomials4ML.ChebBasis
 ```
 
 
diff --git a/src/chebbasis.jl b/src/chebbasis.jl
index 643703d..3291cdd 100644
--- a/src/chebbasis.jl
+++ b/src/chebbasis.jl
@@ -5,7 +5,7 @@ export ChebBasis
 
 Chebyshev polynomials up to degree `N-1` (inclusive). i.e  basis with length N. The basis is ordered as 
 ```
-[1, x, 2x^2-1, 4x^3-3x....2Tn-1(x)-Tn-2(x)]
+[1, x, 2x^2-1, 4x^3-3x, ..., T_{N-1}(x) = 2x*T_{N-2}(x) - T_{N-3}(x)]
 ```
 where `x` is input variable. 
 """
@@ -20,107 +20,107 @@ Base.length(basis::ChebBasis) = basis.N
 
 natural_indices(basis::ChebBasis) = 0:length(basis)-1
 
-_valtype(basis::ChebBasis, T::Type{<: Real}) = T
+_valtype(basis::ChebBasis, T::Type{<:Real}) = T
 
 
 function evaluate!(P::AbstractVector, basis::ChebBasis, x::Real)
-   N = basis.N 
-   @assert N  >= 2
+   N = basis.N
+   @assert N >= 2
    @assert length(P) >= length(basis) # N
 
    P[1] = 1
    P[2] = x
-   for k = 3:N 
-      @inbounds P[k] = 2*x*P[k-1] - P[k-2]
+   for k = 3:N
+      @inbounds P[k] = 2 * x * P[k-1] - P[k-2]
    end
-   return P 
-end 
+   return P
+end
 
 
 
-function evaluate!(P::AbstractMatrix, basis::ChebBasis, 
-                   x::AbstractVector{<: Real})
-   N = basis.N 
+function evaluate!(P::AbstractMatrix, basis::ChebBasis,
+   x::AbstractVector{<:Real})
+   N = basis.N
    nX = length(x)
-   @assert N  >= 2
+   @assert N >= 2
    @assert size(P, 2) >= length(basis) # N
    @assert size(P, 1) >= nX
 
-   @inbounds begin 
-      @simd ivdep for i = 1:nX 
+   @inbounds begin
+      @simd ivdep for i = 1:nX
          P[i, 1] = 1
          P[i, 2] = x[i]
       end
 
-      for k = 3:N 
-         @simd ivdep for i = 1:nX 
-            P[i,k] = 2*x[i]*P[i,k-1]-P[i,k-2]
+      for k = 3:N
+         @simd ivdep for i = 1:nX
+            P[i, k] = 2 * x[i] * P[i, k-1] - P[i, k-2]
          end
       end
    end
-   return P 
-end 
+   return P
+end
 
-function evaluate_ed!(P::AbstractVector, dP::AbstractVector, 
-                      basis::ChebBasis, x::Real)
-   N = basis.N 
+function evaluate_ed!(P::AbstractVector, dP::AbstractVector,
+   basis::ChebBasis, x::Real)
+   N = basis.N
    nX = length(x)
-   @assert N  >= 2
-   @assert length(P) >= length(basis)  
-   @assert length(dP) >= length(basis) 
+   @assert N >= 2
+   @assert length(P) >= length(basis)
+   @assert length(dP) >= length(basis)
 
-   @inbounds begin 
+   @inbounds begin
       P[1] = 1
       dP[1] = 0
       P[2] = x
       dP[2] = 1
-      for k = 3:N 
-        P[k] = 2*x*P[k-1] - P[k-2]
-        dP[k] = 2*P[k-1] + 2*x*dP[k-1] - dP[k-2] 
+      for k = 3:N
+         P[k] = 2 * x * P[k-1] - P[k-2]
+         dP[k] = 2 * P[k-1] + 2 * x * dP[k-1] - dP[k-2]
       end
    end
-   return P, dP 
-end 
+   return P, dP
+end
 
 
-function evaluate_ed!(P::AbstractMatrix, dP::AbstractMatrix, basis::ChebBasis, 
-                      x::AbstractVector{<: Real})
-   N = basis.N 
+function evaluate_ed!(P::AbstractMatrix, dP::AbstractMatrix, basis::ChebBasis,
+   x::AbstractVector{<:Real})
+   N = basis.N
    nX = length(x)
-   @assert N  >= 2
+   @assert N >= 2
    @assert size(P, 2) >= length(basis) # N
    @assert size(P, 1) >= nX
    @assert size(dP, 2) >= length(basis) # N
    @assert size(dP, 1) >= nX
 
-   @inbounds begin 
-      @simd ivdep for i = 1:nX 
+   @inbounds begin
+      @simd ivdep for i = 1:nX
          P[i, 1] = 1
          dP[i, 1] = 0
          P[i, 2] = x[i]
          dP[i, 2] = 1
       end
 
-      for k = 3:N 
-         @simd ivdep for i = 1:nX 
-            P[i,k] = 2*x[i]*P[i,k-1]-P[i,k-2]
-            dP[i,k] = 2*P[i,k-1] + 2*x[i]*dP[i,k-1] - dP[i,k-2]
+      for k = 3:N
+         @simd ivdep for i = 1:nX
+            P[i, k] = 2 * x[i] * P[i, k-1] - P[i, k-2]
+            dP[i, k] = 2 * P[i, k-1] + 2 * x[i] * dP[i, k-1] - dP[i, k-2]
          end
       end
    end
-   return P, dP 
-end 
+   return P, dP
+end
 
 
 function evaluate_ed2!(P::AbstractVector, dP::AbstractVector, ddP::AbstractVector,
-                       basis::ChebBasis, x::Real)
-   N = basis.N 
-   @assert N  >= 2
+   basis::ChebBasis, x::Real)
+   N = basis.N
+   @assert N >= 2
    @assert length(P) >= length(basis) # N
    @assert length(dP) >= length(basis) # N
    @assert length(ddP) >= length(basis) # N
 
-   @inbounds begin 
+   @inbounds begin
       P[1] = 1
       P[2] = x
       dP[1] = 0
@@ -128,22 +128,22 @@ function evaluate_ed2!(P::AbstractVector, dP::AbstractVector, ddP::AbstractVecto
       ddP[1] = 0
       ddP[2] = 0
 
-      for k = 3:N 
-        P[k] = 2*x*P[k-1] - P[k-2]
-        dP[k] = 2*P[k-1] + 2*x*dP[k-1] - dP[k-2]
-        ddP[k] = 2*dP[k-1] + 2*dP[k-1] + 2*x*ddP[k-1] - ddP[k-2]
+      for k = 3:N
+         P[k] = 2 * x * P[k-1] - P[k-2]
+         dP[k] = 2 * P[k-1] + 2 * x * dP[k-1] - dP[k-2]
+         ddP[k] = 2 * dP[k-1] + 2 * dP[k-1] + 2 * x * ddP[k-1] - ddP[k-2]
       end
    end
-   return P, dP, ddP 
-end 
+   return P, dP, ddP
+end
 
 
 
-function evaluate_ed2!(P::AbstractMatrix, dP::AbstractMatrix, ddP::AbstractMatrix, basis::ChebBasis, 
-                      x::AbstractVector{<: Real})
-   N = basis.N 
+function evaluate_ed2!(P::AbstractMatrix, dP::AbstractMatrix, ddP::AbstractMatrix, basis::ChebBasis,
+   x::AbstractVector{<:Real})
+   N = basis.N
    nX = length(x)
-   @assert N  >= 2
+   @assert N >= 2
    @assert size(P, 2) >= length(basis) # N
    @assert size(P, 1) >= nX
    @assert size(dP, 2) >= length(basis) # N
@@ -151,23 +151,23 @@ function evaluate_ed2!(P::AbstractMatrix, dP::AbstractMatrix, ddP::AbstractMatri
    @assert size(ddP, 2) >= length(basis) # N
    @assert size(ddP, 1) >= nX
 
-   @inbounds begin 
-      @simd ivdep for i = 1:nX 
+   @inbounds begin
+      @simd ivdep for i = 1:nX
          P[i, 1] = 1
          P[i, 2] = x[i]
          dP[i, 1] = 0
          dP[i, 2] = 1
          ddP[i, 1] = 0
-         ddP[i, 2] = 0     
+         ddP[i, 2] = 0
       end
 
-      for k = 3:N 
-         @simd ivdep for i = 1:nX 
-            P[i,k] = 2*x[i]*P[i,k-1] - P[i,k-2]
-            dP[i,k] = 2*P[i,k-1] + 2*x[i]*dP[i,k-1] - dP[i,k-2]
-            ddP[i,k] = 2*dP[i,k-1] + 2*dP[i,k-1] + 2*x[i]*ddP[i,k-1] - ddP[i,k-2]
+      for k = 3:N
+         @simd ivdep for i = 1:nX
+            P[i, k] = 2 * x[i] * P[i, k-1] - P[i, k-2]
+            dP[i, k] = 2 * P[i, k-1] + 2 * x[i] * dP[i, k-1] - dP[i, k-2]
+            ddP[i, k] = 2 * dP[i, k-1] + 2 * dP[i, k-1] + 2 * x[i] * ddP[i, k-1] - ddP[i, k-2]
          end
       end
    end
-   return P, dP, ddP 
-end 
+   return P, dP, ddP
+end

From 44d86a6373b227d36e2f24b413b8f5ae505b90c4 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Fri, 2 Jun 2023 02:22:14 -0700
Subject: [PATCH 40/54] try to fix test

---
 test/ace/test_sparseprodpool.jl | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/test/ace/test_sparseprodpool.jl b/test/ace/test_sparseprodpool.jl
index e6b3ec7..cdcc989 100644
--- a/test/ace/test_sparseprodpool.jl
+++ b/test/ace/test_sparseprodpool.jl
@@ -31,12 +31,24 @@ basis = PooledSparseProduct(spec)
 
 @info("Test evaluation with a single input (no pooling)")
 
-BB = (B1, B2, B3)
-
-A1 = test_evaluate(basis, BB)
-A2 = evaluate(basis, BB)
-
-println_slim(@test A1 ≈ A2 )
+for _ = 1:30
+   local B1
+   local B2
+   local B3
+   local BB
+   local A1
+   local A2
+   
+   B1 = randn(N1)
+   B2 = randn(N2)
+   B3 = randn(N3)
+   BB = (B1, B2, B3)
+
+   A1 = test_evaluate(basis, BB)
+   A2 = evaluate(basis, BB)
+   print_tf(@test A1 ≈ A2 )
+end
+println()
 
 ## 
 

From 8d90533947b9aa7caf254a2b91889583875f78b2 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Fri, 2 Jun 2023 02:53:25 -0700
Subject: [PATCH 41/54] fix PooledSparseProduct

---
 src/ace/sparseprodpool.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ace/sparseprodpool.jl b/src/ace/sparseprodpool.jl
index 3bc94dd..a40e06a 100644
--- a/src/ace/sparseprodpool.jl
+++ b/src/ace/sparseprodpool.jl
@@ -89,6 +89,7 @@ function evaluate!(A, basis::PooledSparseProduct{NB}, BB::TupVec) where {NB}
    @assert length(BB) == NB
    # evaluate the 1p product basis functions and add/write into _A
    spec = basis.spec
+   fill!(A, 0)
    for (iA, ϕ) in enumerate(spec)
       @inbounds A[iA] += BB_prod(ϕ, BB)
    end
@@ -120,7 +121,8 @@ function evaluate!(A, basis::PooledSparseProduct{NB}, BB::TupMat,
                    nX = size(BB[1], 1)) where {NB}
    @assert all(B->size(B, 1) >= nX, BB)
    spec = basis.spec
-
+   fill!(A, 0)
+   
    @inbounds for (iA, ϕ) in enumerate(spec)
       a = zero(eltype(A))
       @simd ivdep for j = 1:nX

From 96ecd7c3f244eb622b9efc4d35e57c2b3c34cf23 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Fri, 2 Jun 2023 08:38:38 -0700
Subject: [PATCH 42/54] add rrules for sphericalharmonics

---
 src/sphericalharmonics/rylm.jl               | 28 ++++----
 src/sphericalharmonics/sphericalharmonics.jl | 16 +++++
 test/ace/test_sparseprodpool.jl              | 74 +++++++++++---------
 test/sphericalharmonics/test_crlm.jl         | 27 ++++++-
 test/sphericalharmonics/test_cylm.jl         | 25 +++++++
 test/sphericalharmonics/test_rrlm.jl         | 26 ++++++-
 6 files changed, 147 insertions(+), 49 deletions(-)

diff --git a/src/sphericalharmonics/rylm.jl b/src/sphericalharmonics/rylm.jl
index 3964336..73d3370 100644
--- a/src/sphericalharmonics/rylm.jl
+++ b/src/sphericalharmonics/rylm.jl
@@ -340,17 +340,17 @@ function eval_grad_laplace(basis::RYlmBasis, X)
 	return Y, dY, ΔY
 end
 
-# Placeholder for now
-function ChainRulesCore.rrule(::typeof(evaluate), basis::RYlmBasis, X)
-	A  = evaluate(basis, X)
-	∂X = similar(X)
-   	dX = evaluate_ed(basis, X)[2]
-	function pb(∂A)
-		@assert size(∂A) == (length(X), length(basis))
-		for i = 1:length(X)
-            ∂X[i] = sum([∂A[i,j] * dX[i,j] for j = 1:length(dX[i,:])])
-        end
-		return NoTangent(), NoTangent(), ∂X
-	end
-	return A, pb
-end
\ No newline at end of file
+# # Placeholder for now
+# function ChainRulesCore.rrule(::typeof(evaluate), basis::RYlmBasis, X)
+# 	A  = evaluate(basis, X)
+# 	∂X = similar(X)
+#    	dX = evaluate_ed(basis, X)[2]
+# 	function pb(∂A)
+# 		@assert size(∂A) == (length(X), length(basis))
+# 		for i = 1:length(X)
+#             ∂X[i] = sum([∂A[i,j] * dX[i,j] for j = 1:length(dX[i,:])])
+#         end
+# 		return NoTangent(), NoTangent(), ∂X
+# 	end
+# 	return A, pb
+# end
\ No newline at end of file
diff --git a/src/sphericalharmonics/sphericalharmonics.jl b/src/sphericalharmonics/sphericalharmonics.jl
index 7392ded..dc64ff5 100644
--- a/src/sphericalharmonics/sphericalharmonics.jl
+++ b/src/sphericalharmonics/sphericalharmonics.jl
@@ -143,6 +143,22 @@ _acqu_dP!( basis::XlmBasis, S) = _acqu_alp!(:alpdP,  basis, S)
 _acqu_ddP!(basis::XlmBasis, S) = _acqu_alp!(:alpddP, basis, S)
 
 
+# ---------------------------- Connection with ChainRulesCore: reverse rule of batched `evaluate` for all XlmBasis types
+function ChainRulesCore.rrule(::typeof(evaluate), basis::XlmBasis, X)
+   A, dX = evaluate_ed(basis, X)   # values and derivatives; dX is indexed [input i, basis function j]
+   function pb(∂A)
+      @assert size(∂A) == (length(X), length(basis))
+      ∂X = similar(X, SVector{3, _valtype(basis, X)})   # allocated per call, so repeated pullback calls never share a buffer
+      for i = 1:length(X)
+         ∂X[i] = sum(∂A[i, j] * dX[i, j] for j = 1:size(dX, 2))   # ∑_j ∂A[i, j] * dX[i, j], without temporary arrays
+      end
+      return NoTangent(), NoTangent(), ∂X
+   end
+   return A, pb
+end
+
+
+
 # ---------------------------- Auxiliary functions 
 
 function rand_sphere() 
diff --git a/test/ace/test_sparseprodpool.jl b/test/ace/test_sparseprodpool.jl
index cdcc989..b1268d9 100644
--- a/test/ace/test_sparseprodpool.jl
+++ b/test/ace/test_sparseprodpool.jl
@@ -1,29 +1,29 @@
 
 using BenchmarkTools, Test, Polynomials4ML
-using Polynomials4ML:  PooledSparseProduct, evaluate, evaluate! 
+using Polynomials4ML: PooledSparseProduct, evaluate, evaluate!
 using ACEbase.Testing: fdtest, println_slim, print_tf
-       
-test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{<: AbstractVector}}) = 
-       [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
-            for i = 1:length(basis) ]
 
-test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{<: AbstractMatrix}}) = 
-      sum( test_evaluate(basis, ntuple(i -> BB[i][j, :], length(BB)))
-         for j = 1:size(BB[1], 1) )            
+test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{AbstractVector}}) =   # naive reference: A[i] = ∏_j BB[j][spec[i][j]]
+   [prod(BB[j][basis.spec[i][j]] for j = 1:length(BB))
+    for i = 1:length(basis)]
+
+test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) =   # pooled reference: sum of the vector version over the rows j of the BB[i]
+   sum(test_evaluate(basis, ntuple(i -> BB[i][j, :], length(BB)))
+       for j = 1:size(BB[1], 1))
 
 P4ML = Polynomials4ML
 
 ##
 
-N1 = 10 
-N2 = 20 
-N3 = 50 
+N1 = 10
+N2 = 20
+N3 = 50
 
 B1 = randn(N1)
 B2 = randn(N2)
 B3 = randn(N3)
 
-spec = sort([ (rand(1:N1), rand(1:N2), rand(1:N3)) for i = 1:100 ])
+spec = sort([(rand(1:N1), rand(1:N2), rand(1:N3)) for i = 1:100])
 
 basis = PooledSparseProduct(spec)
 
@@ -38,7 +38,7 @@ for _ = 1:30
    local BB
    local A1
    local A2
-   
+
    B1 = randn(N1)
    B2 = randn(N2)
    B3 = randn(N3)
@@ -46,24 +46,32 @@ for _ = 1:30
 
    A1 = test_evaluate(basis, BB)
    A2 = evaluate(basis, BB)
-   print_tf(@test A1 ≈ A2 )
+   print_tf(@test A1 ≈ A2)
 end
 println()
 
 ## 
 
 @info("Test pooling of multiple inputs")
-nX = 64 
-bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
+nX = 64
+
+for _ = 1:30
+   local bBB
+   local bA1
+   local bA2
+   local bA3
+   
+   bBB = (randn(nX, N1), randn(nX, N2), randn(nX, N3))
 
-# using the naive evaluation code 
-bA1 = test_evaluate(basis, bBB)
-bA2 = evaluate(basis, bBB)
+   # using the naive evaluation code 
+   bA1 = test_evaluate(basis, bBB)
+   bA2 = evaluate(basis, bBB)
 
-bA3 = copy(bA2)
-evaluate!(bA3, basis, bBB)
+   bA3 = copy(bA2)
+   evaluate!(bA3, basis, bBB)
 
-println_slim(@test bA1 ≈ bA2 ≈ bA3 )
+   println_slim(@test bA1 ≈ bA2 ≈ bA3)
+end
 
 
 ##
@@ -74,10 +82,10 @@ using StaticArrays, ForwardDiff
 
 prodgrad = P4ML._prod_grad
 
-for N = 1:5 
+for N = 1:5
    for ntest = 1:10
-      local v1, g 
-      b = rand(SVector{N, Float64})
+      local v1, g
+      b = rand(SVector{N,Float64})
       g = prodgrad(b.data, Val(N))
       g1 = ForwardDiff.gradient(prod, b)
       print_tf(@test g1 ≈ SVector(g...))
@@ -88,22 +96,22 @@ println()
 ##
 
 @info("Testing _rrule_evalpool")
-using LinearAlgebra: dot 
+using LinearAlgebra: dot
 
-for ntest = 1:30 
-   local bBB, bA2 
+for ntest = 1:30
+   local bBB, bA2
    local u
-   bBB = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
-   bUU = ( randn(nX, N1), randn(nX, N2), randn(nX, N3) )
-   _BB(t) = ( bBB[1] + t * bUU[1], bBB[2] + t * bUU[2], bBB[3] + t * bUU[3] )
+   bBB = (randn(nX, N1), randn(nX, N2), randn(nX, N3))
+   bUU = (randn(nX, N1), randn(nX, N2), randn(nX, N3))
+   _BB(t) = (bBB[1] + t * bUU[1], bBB[2] + t * bUU[2], bBB[3] + t * bUU[3])
    bA2 = evaluate(basis, bBB)
    u = randn(size(bA2))
    F(t) = dot(u, evaluate(basis, _BB(t)))
    dF(t) = begin
       val, pb = P4ML._rrule_evaluate(basis, _BB(t))
       ∂BB = pb(u)
-      return sum( dot(∂BB[i], bUU[i]) for i = 1:length(bUU) )
+      return sum(dot(∂BB[i], bUU[i]) for i = 1:length(bUU))
    end
    print_tf(@test fdtest(F, dF, 0.0; verbose=false))
 end
-println() 
\ No newline at end of file
+println()
\ No newline at end of file
diff --git a/test/sphericalharmonics/test_crlm.jl b/test/sphericalharmonics/test_crlm.jl
index 99f95b5..0f03834 100644
--- a/test/sphericalharmonics/test_crlm.jl
+++ b/test/sphericalharmonics/test_crlm.jl
@@ -4,6 +4,7 @@ using Polynomials4ML: SphericalCoords,
                       dspher_to_dcart, cart2spher, spher2cart, index_y
 using Polynomials4ML: evaluate, evaluate_d, evaluate_ed 
 using Polynomials4ML.Testing: print_tf, println_slim 
+using ACEbase.Testing: fdtest
 
 verbose = false
 
@@ -184,4 +185,28 @@ end
 println_slim(@test Yb ≈ Ys ≈ Ys2 ≈ Yb1) 
 println_slim(@test dYb1 ≈ dYs2)
 
-##
\ No newline at end of file
+##
+
+using Zygote
+@info("Test rrule")
+using LinearAlgebra: dot 
+rSH = CRlmBasis(10)
+#for ntest = 1:30
+   local X
+   local Y
+   local Rnl
+   local u
+    
+   X = [ rand_sphere() for i = 1:21 ]
+   Y = X = [ rand_sphere() for i = 1:21 ]
+   _x(t) = X + t * Y
+   A = evaluate(rSH, X)
+   u = randn(size(A))
+   F(t) = dot(u, evaluate(rSH, _x(t)))
+   dF(t) = begin
+       val, pb = Zygote.pullback(rSH, _x(t))
+       ∂BB = pb(u)[1] # pb(u)[1] returns NoTangent() for basis argument
+       return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
+   end
+   print_tf(@test fdtest(F, dF, 0.0; verbose = true))
+#end
\ No newline at end of file
diff --git a/test/sphericalharmonics/test_cylm.jl b/test/sphericalharmonics/test_cylm.jl
index 2474f5e..8fae38f 100644
--- a/test/sphericalharmonics/test_cylm.jl
+++ b/test/sphericalharmonics/test_cylm.jl
@@ -6,6 +6,7 @@ using Polynomials4ML: SphericalCoords,
                       dspher_to_dcart, cart2spher, spher2cart
 using Polynomials4ML: evaluate, evaluate_d, evaluate_ed 
 using Polynomials4ML.Testing: print_tf, println_slim 
+using ACEbase.Testing: fdtest
 
 verbose = false
 
@@ -230,3 +231,27 @@ Y1 = evaluate(bYlm, X1)
 X2 = X1[1:10]
 Y2 = evaluate(bYlm, X2)
 
+using Zygote
+@info("Test rrule")
+using LinearAlgebra: dot 
+rSH = CYlmBasis(10)
+for ntest = 1:30
+   local X
+   local Y
+   local Rnl
+   local u
+    
+   X = [ rand_sphere() for i = 1:21 ]
+   Y = X = [ rand_sphere() for i = 1:21 ]
+   _x(t) = X + t * Y
+   A = evaluate(rSH, X)
+   u = randn(size(A))
+   F(t) = dot(u, evaluate(rSH, _x(t)))
+   dF(t) = begin
+       val, pb = Zygote.pullback(rSH, _x(t))
+       ∂BB = pb(u)[1] # pb(u)[1] returns NoTangent() for basis argument
+       return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
+   end
+   print_tf(@test fdtest(F, dF, 0.0; verbose = false))
+end
+println()
\ No newline at end of file
diff --git a/test/sphericalharmonics/test_rrlm.jl b/test/sphericalharmonics/test_rrlm.jl
index d001fb8..c5739e5 100644
--- a/test/sphericalharmonics/test_rrlm.jl
+++ b/test/sphericalharmonics/test_rrlm.jl
@@ -4,6 +4,7 @@ using Polynomials4ML: SphericalCoords, index_y,
                       dspher_to_dcart, cart2spher, spher2cart, rand_sphere
 using Polynomials4ML: evaluate, evaluate_d, evaluate_ed 
 using Polynomials4ML.Testing: print_tf, println_slim 
+using ACEbase.Testing: fdtest
 
 verbose = false
 
@@ -189,7 +190,30 @@ println_slim(@test Y1 ≈ Y2)
 println_slim(@test dY1 ≈ dY2)
 println_slim(@test ΔY1 ≈ ΔY2)
 
-
+using Zygote
+@info("Test rrule")
+using LinearAlgebra: dot 
+rSH = RRlmBasis(10)
+for ntest = 1:30
+    local X
+    local Y
+    local Rnl
+    local u
+    
+    X = [ rand_sphere() for i = 1:21 ]
+    Y = X = [ rand_sphere() for i = 1:21 ]
+    _x(t) = X + t * Y
+    A = evaluate(rSH, X)
+    u = randn(size(A))
+    F(t) = dot(u, evaluate(rSH, _x(t)))
+    dF(t) = begin
+        val, pb = Zygote.pullback(rSH, _x(t))
+        ∂BB = pb(u)[1] # pb(u)[1] returns NoTangent() for basis argument
+        return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
+    end
+    print_tf(@test fdtest(F, dF, 0.0; verbose = false))
+end
+println()
 # ## quick performance test 
 # this needs to move to a benchmarksuite 
 

From 3897ff4cacfa97a22d1912f884bfafd1734cba73 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Fri, 2 Jun 2023 21:17:06 -0700
Subject: [PATCH 43/54] minor fix

---
 src/sphericalharmonics/rrlm.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/sphericalharmonics/rrlm.jl b/src/sphericalharmonics/rrlm.jl
index d2fea8c..3def422 100644
--- a/src/sphericalharmonics/rrlm.jl
+++ b/src/sphericalharmonics/rrlm.jl
@@ -25,6 +25,9 @@ RRlmBasis(alp::ALPolynomials{T}) where {T} =
 _valtype(sh::RRlmBasis{T}, ::Type{<: StaticVector{3, S}}) where {T <: Real, S <: Real} = 
 		promote_type(T, S)
 
+_valtype(sh::RRlmBasis{T}, ::Type{<: StaticVector{3, Hyper{S}}}) where {T <: Real, S <: Real} = 
+		promote_type(T, Hyper{S})
+
 Base.show(io::IO, basis::RRlmBasis) = 
       print(io, "RRlmBasis(L=$(maxL(basis)))")
 

From dd39978e59fce4a74b7c1a203d8bf17774264092 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Fri, 2 Jun 2023 21:17:46 -0700
Subject: [PATCH 44/54] fix test

---
 test/sphericalharmonics/test_crlm.jl | 13 +++++------
 test/sphericalharmonics/test_cylm.jl | 13 +++++------
 test/sphericalharmonics/test_rrlm.jl | 33 ++++++++++++++--------------
 test/sphericalharmonics/test_rylm.jl |  2 +-
 4 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/test/sphericalharmonics/test_crlm.jl b/test/sphericalharmonics/test_crlm.jl
index 0f03834..ebac27c 100644
--- a/test/sphericalharmonics/test_crlm.jl
+++ b/test/sphericalharmonics/test_crlm.jl
@@ -190,21 +190,20 @@ println_slim(@test dYb1 ≈ dYs2)
 using Zygote
 @info("Test rrule")
 using LinearAlgebra: dot 
-rSH = CRlmBasis(10)
-#for ntest = 1:30
+basis = CRlmBasis(5)
+for ntest = 1:30
    local X
    local Y
-   local Rnl
    local u
     
    X = [ rand_sphere() for i = 1:21 ]
-   Y = X = [ rand_sphere() for i = 1:21 ]
+   Y = [ rand_sphere() for i = 1:21 ]
    _x(t) = X + t * Y
-   A = evaluate(rSH, X)
+   A = evaluate(basis, X)
    u = randn(size(A))
-   F(t) = dot(u, evaluate(rSH, _x(t)))
+   F(t) = dot(u, evaluate(basis, _x(t)))
    dF(t) = begin
-       val, pb = Zygote.pullback(rSH, _x(t))
+       val, pb = Zygote.pullback(basis, _x(t))
        ∂BB = pb(u)[1] # pb(u)[1] returns NoTangent() for basis argument
        return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
    end
diff --git a/test/sphericalharmonics/test_cylm.jl b/test/sphericalharmonics/test_cylm.jl
index 8fae38f..e62bfd5 100644
--- a/test/sphericalharmonics/test_cylm.jl
+++ b/test/sphericalharmonics/test_cylm.jl
@@ -68,7 +68,7 @@ println()
 ##
 
 using Polynomials4ML: SphericalCoords, ALPolynomials
-verbose=false
+verbose = false
 @info("Test: check derivatives of associated legendre polynomials")
 for nsamples = 1:30
    θ = rand() * π
@@ -234,21 +234,20 @@ Y2 = evaluate(bYlm, X2)
 using Zygote
 @info("Test rrule")
 using LinearAlgebra: dot 
-rSH = CYlmBasis(10)
+cSH = CYlmBasis(5)
 for ntest = 1:30
    local X
    local Y
-   local Rnl
    local u
     
    X = [ rand_sphere() for i = 1:21 ]
-   Y = X = [ rand_sphere() for i = 1:21 ]
+   Y = [ rand_sphere() for i = 1:21 ]
    _x(t) = X + t * Y
-   A = evaluate(rSH, X)
+   A = evaluate(cSH, X)
    u = randn(size(A))
-   F(t) = dot(u, evaluate(rSH, _x(t)))
+   F(t) = dot(u, evaluate(cSH, _x(t)))
    dF(t) = begin
-       val, pb = Zygote.pullback(rSH, _x(t))
+       val, pb = Zygote.pullback(cSH, _x(t))
        ∂BB = pb(u)[1] # pb(u)[1] returns NoTangent() for basis argument
        return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
    end
diff --git a/test/sphericalharmonics/test_rrlm.jl b/test/sphericalharmonics/test_rrlm.jl
index c5739e5..b43274a 100644
--- a/test/sphericalharmonics/test_rrlm.jl
+++ b/test/sphericalharmonics/test_rrlm.jl
@@ -195,23 +195,22 @@ using Zygote
 using LinearAlgebra: dot 
 rSH = RRlmBasis(10)
 for ntest = 1:30
-    local X
-    local Y
-    local Rnl
-    local u
-    
-    X = [ rand_sphere() for i = 1:21 ]
-    Y = X = [ rand_sphere() for i = 1:21 ]
-    _x(t) = X + t * Y
-    A = evaluate(rSH, X)
-    u = randn(size(A))
-    F(t) = dot(u, evaluate(rSH, _x(t)))
-    dF(t) = begin
-        val, pb = Zygote.pullback(rSH, _x(t))
-        ∂BB = pb(u)[1] # pb(u)[1] returns NoTangent() for basis argument
-        return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
-    end
-    print_tf(@test fdtest(F, dF, 0.0; verbose = false))
+   local X
+   local Y
+   local u
+   
+   X = [ rand_sphere() for i = 1:21 ]
+   Y = [ rand_sphere() for i = 1:21 ]
+   _x(t) = X + t * Y
+   A = evaluate(rSH, X)
+   u = randn(size(A))
+   F(t) = dot(u, evaluate(rSH, _x(t)))
+   dF(t) = begin
+       val, pb = Zygote.pullback(rSH, _x(t))
+       ∂BB = pb(u)[1] # pb(u)[1] returns NoTangent() for basis argument
+       return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
+   end
+   print_tf(@test fdtest(F, dF, 0.0; verbose = false))
 end
 println()
 # ## quick performance test 
diff --git a/test/sphericalharmonics/test_rylm.jl b/test/sphericalharmonics/test_rylm.jl
index 91d3d9c..e378c36 100644
--- a/test/sphericalharmonics/test_rylm.jl
+++ b/test/sphericalharmonics/test_rylm.jl
@@ -166,7 +166,7 @@ for ntest = 1:30
     local u
     
     X = [ rand_sphere() for i = 1:21 ]
-    Y = X = [ rand_sphere() for i = 1:21 ]
+    Y = [ rand_sphere() for i = 1:21 ]
     _x(t) = X + t * Y
     A = evaluate(rSH, X)
     u = randn(size(A))

From 4ee25dcb94373c717066017d488ea210d716bd69 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Sun, 4 Jun 2023 20:16:32 -0700
Subject: [PATCH 45/54] clean up and add test to make sure ChebBasis ./
 chebyshev_basis = const

---
 docs/src/api.md           |  2 --
 src/ace/sparseprodpool.jl |  9 +++++----
 src/chebbasis.jl          | 10 +++++++---
 src/interface.jl          |  3 +++
 src/sparseproduct.jl      | 21 ++++++++-------------
 test/test_cheb.jl         | 10 ++++++++--
 6 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/docs/src/api.md b/docs/src/api.md
index 38faa76..e220298 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -17,8 +17,6 @@ This page documents the public API, i.e. the list of bases and functions that ar
    - Complex solid harmonics [`CRlmBasis`](@ref)
    - Real solid harmonics [`RRlmBasis`](@ref)
 * Chebyshev polynomials of the first kind [`ChebBasis`](@ref)
-   - this approach computes the basis on the go when it is compiled
-   - it does not store the recursion coefficients like what is done in the orthogonal polynomials
 * Various quantum chemistry related radial basis functions. (experimental)
    
 ## In-place Evaluation  
diff --git a/src/ace/sparseprodpool.jl b/src/ace/sparseprodpool.jl
index ef15d18..3970e3d 100644
--- a/src/ace/sparseprodpool.jl
+++ b/src/ace/sparseprodpool.jl
@@ -32,9 +32,9 @@ end
 
 # ----------------------- evaluation interfaces 
 
-const TupVec = Tuple{Vararg{<: AbstractVector}}
-const TupMat = Tuple{Vararg{<: AbstractMatrix}}
-const TupVecMat = Union{TupVec, TupMat}
+# const TupVec = Tuple{Vararg{<: AbstractVector}}
+# const TupMat = Tuple{Vararg{<: AbstractMatrix}}
+# const TupVecMat = Union{TupVec, TupMat}
 
 _valtype(basis::AbstractPoly4MLBasis, BB::Tuple) = 
       mapreduce(eltype, promote_type, BB)
@@ -89,6 +89,7 @@ function evaluate!(A, basis::PooledSparseProduct{NB}, BB::TupVec) where {NB}
    @assert length(BB) == NB
    # evaluate the 1p product basis functions and add/write into _A
    spec = basis.spec
+   fill!(A, 0)
    for (iA, ϕ) in enumerate(spec)
       @inbounds A[iA] += BB_prod(ϕ, BB)
    end
@@ -120,7 +121,7 @@ function evaluate!(A, basis::PooledSparseProduct{NB}, BB::TupMat,
                    nX = size(BB[1], 1)) where {NB}
    @assert all(B->size(B, 1) >= nX, BB)
    spec = basis.spec
-
+   fill!(A, 0)
    @inbounds for (iA, ϕ) in enumerate(spec)
       a = zero(eltype(A))
       @simd ivdep for j = 1:nX
diff --git a/src/chebbasis.jl b/src/chebbasis.jl
index 3291cdd..caba767 100644
--- a/src/chebbasis.jl
+++ b/src/chebbasis.jl
@@ -3,11 +3,15 @@ export ChebBasis
 """
 `ChebBasis(N)`: 
 
-Chebyshev polynomials up to degree `N-1` (inclusive). i.e  basis with length N. The basis is ordered as 
-```
-[1, x, 2x^2-1, 4x^3-3x....2xTn-1(x)-Tn-2(x)]
+Chebyshev polynomials up to degree `N-1` (inclusive). i.e  basis with length `N`. The basis is ordered as 
+```math
+[1, x, 2x^2-1, 4x^3-3x, ..., 2xT_{N-1}(x)-T_{N-2}(x)]
 ```
 where `x` is input variable. 
+
+The differences between `ChebBasis` and `chebyshev_basis` is that `ChebBasis` computes the basis on the go when it is compiled and it does not store the recursion coefficients as in `chebyshev_basis`.
+
+Warning: `ChebBasis` and `chebyshev_basis` have different normalization.
 """
 struct ChebBasis <: AbstractPoly4MLBasis
    N::Int
diff --git a/src/interface.jl b/src/interface.jl
index a561bd0..ad91b24 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -51,6 +51,9 @@ end
 const SINGLE = Union{Number, StaticArray, SphericalCoords}
 const BATCH = AbstractVector{<: SINGLE}
 
+const TupVec = Tuple{Vararg{<: AbstractVector}}
+const TupMat = Tuple{Vararg{<: AbstractMatrix}}
+const TupVecMat = Union{TupVec, TupMat}
 # ---------------------------------------
 # managing defaults for input-output types
 
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index dfdd8c8..c329fcd 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -70,22 +70,17 @@ function _frule_frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMa
 end
 
 # ----------------------- overiding alloc functions
-const TupVec = Tuple{Vararg{<: AbstractVector}}
-const TupMat = Tuple{Vararg{<: AbstractMatrix}}
-const TupVecMat = Union{TupVec, TupMat}
+# const TupVec = Tuple{Vararg{<: AbstractVector}}
+# const TupMat = Tuple{Vararg{<: AbstractMatrix}}
+# const TupVecMat = Union{TupVec, TupMat}
 
 # specifically for SparseProduct/PooledSparseProduct
-_outsym(x::NTuple{NB, TupVec}) where {NB} = :out
-_outsym(X::NTuple{NB, TupMat}) where {NB} = :outb
+_outsym(x::TupVec) = :out
+_outsym(X::TupMat) = :outb
 
-_outsym(x::Tuple{AbstractVector, AbstractVector}) = :out
-_outsym(X::Tuple{AbstractMatrix, AbstractMatrix}) = :outb
-
-# _alloc(basis::SparseProduct, BB::TupVec) = 
-#       acquire!(basis.pool, :out, (length(basis), ), _valtype(basis, BB) )
-
-# _alloc(basis::SparseProduct, BB::TupMat) = 
-#       acquire!(basis.pool, :outb, (size(BB[1], 1), length(basis) ), _valtype(basis, BB) )
+# TODO: generalize it
+#_outsym(x::Tuple{AbstractVector, AbstractVector}) = :out
+#_outsym(X::Tuple{AbstractMatrix, AbstractMatrix}) = :outb
 
 _out_size(basis::SparseProduct, BB::TupVec) = (length(basis), )
 _out_size(basis::SparseProduct, BB::TupMat) = (size(BB[1],1), length(basis))
diff --git a/test/test_cheb.jl b/test/test_cheb.jl
index bff88bf..77829a4 100644
--- a/test/test_cheb.jl
+++ b/test/test_cheb.jl
@@ -8,21 +8,27 @@ using Polynomials4ML.Testing: println_slim, print_tf, test_derivatives
 @info("Testing Real Chebyshev Polynomials (ChebBasis)")
 N = 10
 basis = ChebBasis(N) 
+basis2 = chebyshev_basis(N; normalize=false)
 
 @info("      correctness")
 mm = natural_indices(basis)
 print_tf(@test mm == 0:N-1)
 
+θ = 2*π * rand()
+x = cos(θ)
+r = basis(x) ./ basis2(x)
 for ntest = 1:30
+   local θ
+   local x
    θ = 2*π * rand()
    x = cos(θ)
    P = basis(x)
    P2 = [ cos(m*θ) for m in mm ]
-   print_tf(@test P ≈ P2)
+   P3 = basis2(x)
+   print_tf(@test P ≈ P2 && (P ./ P3 ≈ r))
 end
 println() 
 
-
 ##
 
 @info("      test derivatives")

From 1635fc86528a43aa98d6a9fedb71250cb7f71129 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Mon, 5 Jun 2023 00:55:21 -0700
Subject: [PATCH 46/54] remove product basis and clean up

---
 Project.toml                         |  7 ++----
 src/Polynomials4ML.jl                |  1 -
 src/interface.jl                     | 33 +++++-----------------------
 src/orthopolybasis.jl                | 28 +----------------------
 {src => temp}/productbasis.jl        | 25 +++++++++++++++++++++
 test/sphericalharmonics/test_crlm.jl | 25 +--------------------
 test/sphericalharmonics/test_cylm.jl | 26 +---------------------
 test/test_acemodel.jl                |  5 +----
 test/test_op1d3t.jl                  |  2 +-
 9 files changed, 37 insertions(+), 115 deletions(-)
 rename {src => temp}/productbasis.jl (66%)

diff --git a/Project.toml b/Project.toml
index e65d2fb..c48ac8d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,7 +13,6 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
-Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
 NamedTupleTools = "d9ec5142-1e00-5aa0-9d6a-321866360f50"
 ObjectPools = "658cac36-ff0f-48ad-967c-110375d98c9d"
@@ -23,8 +22,6 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StrideArrays = "d1fa6d79-ef01-42a6-86c9-f7c551f8593b"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
 ACEbase = "0.4.2"
@@ -46,7 +43,7 @@ julia = "1.8"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-ACEcore = "44c1e890-45d1-48ea-94d6-c2ea5b573f71"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Test", "Lux", "Printf", "ACEcore"]
+test = ["Test", "Lux", "Printf", "Zygote"]
diff --git a/src/Polynomials4ML.jl b/src/Polynomials4ML.jl
index 0fedaba..d4da670 100644
--- a/src/Polynomials4ML.jl
+++ b/src/Polynomials4ML.jl
@@ -62,7 +62,6 @@ include("atomicorbitalsradials/atomicorbitalsradials.jl")
 # generating product bases (generalisation of tensor products)
 include("staticprod.jl")
 include("sparseproduct.jl")
-include("productbasis.jl")
 
 # generic machinery for wrapping poly4ml bases into lux layers 
 include("lux.jl")
diff --git a/src/interface.jl b/src/interface.jl
index d7a53fb..5079556 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -134,33 +134,6 @@ _alloc_ed(basis::AbstractPoly4MLBasis, x) =
 _alloc_ed2(basis::AbstractPoly4MLBasis, x) = 
       _alloc(basis, x), _alloc_d(basis, x), _alloc_dd(basis, x)
 
-
-# special functions for SparseProduct
-# function _alloc_d(basis::AbstractPoly4MLBasis, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
-#       BBs_size = [size(bb) for bb in BBs]
-#       return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
-# end
-
-# function _alloc_dd(basis::AbstractPoly4MLBasis, BBs::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T}
-#       BBs_size = [size(bb) for bb in BBs]
-#       return [Tuple([acquire!(basis.pool, _outsym(BBs), (BBsize), _valtype(basis, BBs)) for BBsize in BBs_size]) for _ = 1:length(basis)]
-# end
-
-# _alloc_ed(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x)
-# _alloc_ed2(basis::AbstractPoly4MLBasis, x::NTuple{NB, AbstractVecOrMat{T}}) where {NB, T} = _alloc(basis, x), _alloc_d(basis, x), _alloc_dd(basis, x)
-
-
-# OLD ARRAY BASED INTERFACE 
-
-# _alloc(basis::AbstractPoly4MLBasis, X) = 
-#       Array{ _valtype(basis, X) }(undef, _out_size(basis, X))
-
-# _alloc_d(basis::AbstractPoly4MLBasis, X) = 
-#       Array{ _gradtype(basis, X) }(undef, _out_size(basis, X))
-
-# _alloc_dd(basis::AbstractPoly4MLBasis, X) = 
-#       Array{ _hesstype(basis, X) }(undef, _out_size(basis, X))
-
 # --------------------------------------- 
 # evaluation interface 
 
@@ -236,7 +209,11 @@ function evaluate_ed2!(flex_B::FlexArray,
 end
 
 # --------------------------------------- 
-# general rrule and frule interface for ChainRulesCore
+# general rrules and frules interface for ChainRulesCore
+
+# ∂_xa ( ∂P : P ) = ∑_ij ∂_xa ( ∂P_ij * P_ij ) 
+#                 = ∑_ij ∂P_ij * ∂_xa ( P_ij )
+#                 = ∑_ij ∂P_ij * dP_ij δ_ia
 function ChainRulesCore.rrule(::typeof(evaluate), basis::ScalarPoly4MLBasis, R::AbstractVector{<: Real})
    A, dR = evaluate_ed(basis, R)
    ∂R = similar(R)
diff --git a/src/orthopolybasis.jl b/src/orthopolybasis.jl
index bf471ca..6e5b026 100644
--- a/src/orthopolybasis.jl
+++ b/src/orthopolybasis.jl
@@ -216,30 +216,4 @@ function evaluate_ed2!(P::AbstractArray, dP::AbstractArray, ddP::AbstractArray,
       end
    end
    return P, dP, ddP 
-end
-
-
-# ------------------   rrules 
-# 
-# ∂_xa ( ∂P : P ) = ∑_ij ∂_xa ( ∂P_ij * P_ij ) 
-#                 = ∑_ij ∂P_ij * ∂_xa ( P_ij )
-#                 = ∑_ij ∂P_ij * dP_ij δ_ia
-#
-# function ChainRulesCore.rrule(::typeof(evaluate), basis::OrthPolyBasis1D3T, x::AbstractVector)
-#    #P = _alloc(basis, x)
-#    nX = length(x) 
-#    #dP = similar(P)
-#    P, dP = evaluate_ed(basis, x)
-
-#    function pb(∂P)
-#       ∂X = zeros(eltype(x), nX)
-#       for j = 1:length(basis) 
-#          for i = 1:nX 
-#             ∂X[i] += ∂P[i, j] * dP[i, j]
-#          end
-#       end
-#       return NoTangent(), NoTangent(), ∂X 
-#    end
-
-#    return P, pb
-# end
+end
\ No newline at end of file
diff --git a/src/productbasis.jl b/temp/productbasis.jl
similarity index 66%
rename from src/productbasis.jl
rename to temp/productbasis.jl
index 64e8265..4f583e0 100644
--- a/src/productbasis.jl
+++ b/temp/productbasis.jl
@@ -1,6 +1,9 @@
 # Jerry: This is just a specific case of a general ProductBasis
 # I will do that later expanding this to a general case, but it is unclear
 # to me how to allow the basis to distinguish whether to use norm(x) or x efficiently
+using Lux: WrappedFunction
+using Lux
+
 struct ProductBasis{NB, TR, TY, TS} <: AbstractPoly4MLBasis
    spec1::Vector{TS}
    bRnl::TR
@@ -58,4 +61,26 @@ function evaluate(basis::ProductBasis, X::AbstractVector{<: AbstractVector})
    return ϕnlm
 end
 
+function ProductBasisLayer(spec1, bRnl, bYlm)
+   spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1))
+   spec_Rnl = natural_indices(bRnl); inv_Rnl = _invmap(spec_Rnl)
+   spec_Ylm = natural_indices(bYlm); inv_Ylm = _invmap(spec_Ylm)
+
+   spec1idx = Vector{Tuple{Int, Int}}(undef, length(spec1))
+   for (i, b) in enumerate(spec1)
+      spec1idx[i] = (inv_Rnl[dropnames(b,(:m,))], inv_Ylm[(l=b.l, m=b.m)])
+   end
+   sparsebasis = SparseProduct(spec1idx)
+
+   # wrap into lux layers
+   l_Rn = Polynomials4ML.lux(bRnl)
+   l_Ylm = Polynomials4ML.lux(bYlm)
+   l_ϕnlm = Polynomials4ML.lux(sparsebasis)
+   
+   # formming model with Lux Chain
+   _norm(x) = norm.(x)
 
+   l_xnx = Lux.Parallel(nothing; normx = WrappedFunction(_norm), x = WrappedFunction(identity))
+   l_embed = Lux.Parallel(nothing; Rn = l_Rn, Ylm = l_Ylm)
+   return Chain(; xnx = l_xnx, embed = l_embed, ϕnlms = l_ϕnlm)
+end
\ No newline at end of file
diff --git a/test/sphericalharmonics/test_crlm.jl b/test/sphericalharmonics/test_crlm.jl
index ebac27c..ce0164a 100644
--- a/test/sphericalharmonics/test_crlm.jl
+++ b/test/sphericalharmonics/test_crlm.jl
@@ -185,27 +185,4 @@ end
 println_slim(@test Yb ≈ Ys ≈ Ys2 ≈ Yb1) 
 println_slim(@test dYb1 ≈ dYs2)
 
-##
-
-using Zygote
-@info("Test rrule")
-using LinearAlgebra: dot 
-basis = CRlmBasis(5)
-for ntest = 1:30
-   local X
-   local Y
-   local u
-    
-   X = [ rand_sphere() for i = 1:21 ]
-   Y = [ rand_sphere() for i = 1:21 ]
-   _x(t) = X + t * Y
-   A = evaluate(basis, X)
-   u = randn(size(A))
-   F(t) = dot(u, evaluate(basis, _x(t)))
-   dF(t) = begin
-       val, pb = Zygote.pullback(basis, _x(t))
-       ∂BB = pb(u)[1] # pb(u)[1] returns NoTangent() for basis argument
-       return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
-   end
-   print_tf(@test fdtest(F, dF, 0.0; verbose = true))
-#end
\ No newline at end of file
+##
\ No newline at end of file
diff --git a/test/sphericalharmonics/test_cylm.jl b/test/sphericalharmonics/test_cylm.jl
index e62bfd5..7f59520 100644
--- a/test/sphericalharmonics/test_cylm.jl
+++ b/test/sphericalharmonics/test_cylm.jl
@@ -229,28 +229,4 @@ bYlm = CYlmBasis(5)
 X1 = randn(SVector{3, Float64}, 100)
 Y1 = evaluate(bYlm, X1)
 X2 = X1[1:10]
-Y2 = evaluate(bYlm, X2)
-
-using Zygote
-@info("Test rrule")
-using LinearAlgebra: dot 
-cSH = CYlmBasis(5)
-for ntest = 1:30
-   local X
-   local Y
-   local u
-    
-   X = [ rand_sphere() for i = 1:21 ]
-   Y = [ rand_sphere() for i = 1:21 ]
-   _x(t) = X + t * Y
-   A = evaluate(cSH, X)
-   u = randn(size(A))
-   F(t) = dot(u, evaluate(cSH, _x(t)))
-   dF(t) = begin
-       val, pb = Zygote.pullback(cSH, _x(t))
-       ∂BB = pb(u)[1] # pb(u)[1] returns NoTangent() for basis argument
-       return sum( dot(∂BB[i], Y[i]) for i = 1:length(Y) )
-   end
-   print_tf(@test fdtest(F, dF, 0.0; verbose = false))
-end
-println()
\ No newline at end of file
+Y2 = evaluate(bYlm, X2)
\ No newline at end of file
diff --git a/test/test_acemodel.jl b/test/test_acemodel.jl
index fdd6cc7..9acb44b 100644
--- a/test/test_acemodel.jl
+++ b/test/test_acemodel.jl
@@ -72,9 +72,6 @@ simpleacemodel(bX, ps, st)
 
 F(X) = simpleacemodel(X, ps, st)[1]
 dF(X) = Zygote.gradient(x -> Lux.apply(simpleacemodel, x, ps, st)[1], X)[1]
-#(l, st_), pb = pullback(x -> Lux.apply(simpleacemodel, x, ps, st), bX)
-# gs = pb((l, nothing))[1]
-
 
+@info("Testing ∇U w.r.t X")
 fdtest(F, dF, bX, verbose = true)
-
diff --git a/test/test_op1d3t.jl b/test/test_op1d3t.jl
index 0612638..7151a5f 100644
--- a/test/test_op1d3t.jl
+++ b/test/test_op1d3t.jl
@@ -104,6 +104,6 @@ for ntest = 1:30
        ∂BB = pb(u)[2] # pb(u)[1] returns NoTangent() for basis argument
        return sum( dot(∂BB[i], bUU[i]) for i = 1:length(bUU) )
    end
-   print_tf(@test fdtest(F, dF, 0.0; verbose=false))
+   print_tf(@test fdtest(F, dF, 0.0; verbose = false))
 end
 println()
\ No newline at end of file

From e41f0f4aabad19b80b8b41fb6628a7d33b3c7d73 Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Mon, 5 Jun 2023 01:11:07 -0700
Subject: [PATCH 47/54] minor fix on deps

---
 Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Project.toml b/Project.toml
index c48ac8d..bece86d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -22,6 +22,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StrideArrays = "d1fa6d79-ef01-42a6-86c9-f7c551f8593b"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
 ACEbase = "0.4.2"

From a5c2c92fd070cd7f8b726190f1789dbfe20d48ed Mon Sep 17 00:00:00 2001
From: cheukhinhojerry <cheukhinhojerry@gmail.com>
Date: Mon, 5 Jun 2023 18:40:59 -0700
Subject: [PATCH 48/54] try to fix Vararg warning and small clean up

---
 src/ace/sparseprodpool.jl       | 5 -----
 src/interface.jl                | 4 ++--
 src/sparseproduct.jl            | 4 ----
 test/ace/test_sparseprodpool.jl | 4 ++--
 test/test_sparseproduct.jl      | 4 ++--
 5 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/src/ace/sparseprodpool.jl b/src/ace/sparseprodpool.jl
index 3970e3d..fe08a4c 100644
--- a/src/ace/sparseprodpool.jl
+++ b/src/ace/sparseprodpool.jl
@@ -31,11 +31,6 @@ end
 
 
 # ----------------------- evaluation interfaces 
-
-# const TupVec = Tuple{Vararg{<: AbstractVector}}
-# const TupMat = Tuple{Vararg{<: AbstractMatrix}}
-# const TupVecMat = Union{TupVec, TupMat}
-
 _valtype(basis::AbstractPoly4MLBasis, BB::Tuple) = 
       mapreduce(eltype, promote_type, BB)
 
diff --git a/src/interface.jl b/src/interface.jl
index 5079556..7099211 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -56,8 +56,8 @@ end
 const SINGLE = Union{Number, StaticArray, SphericalCoords}
 const BATCH = AbstractVector{<: SINGLE}
 
-const TupVec = Tuple{Vararg{<: AbstractVector}}
-const TupMat = Tuple{Vararg{<: AbstractMatrix}}
+const TupVec = Tuple{Vararg{AbstractVector}}
+const TupMat = Tuple{Vararg{AbstractMatrix}}
 const TupVecMat = Union{TupVec, TupMat}
 # ---------------------------------------
 # managing defaults for input-output types
diff --git a/src/sparseproduct.jl b/src/sparseproduct.jl
index 90207fc..c12fb8c 100644
--- a/src/sparseproduct.jl
+++ b/src/sparseproduct.jl
@@ -70,10 +70,6 @@ function _frule_frule_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMa
 end
 
 # ----------------------- overiding alloc functions
-# const TupVec = Tuple{Vararg{<: AbstractVector}}
-# const TupMat = Tuple{Vararg{<: AbstractMatrix}}
-# const TupVecMat = Union{TupVec, TupMat}
-
 # specifically for SparseProduct/PooledSparseProduct
 _outsym(x::TupVec) = :out
 _outsym(X::TupMat) = :outb
diff --git a/test/ace/test_sparseprodpool.jl b/test/ace/test_sparseprodpool.jl
index b1268d9..cc680a4 100644
--- a/test/ace/test_sparseprodpool.jl
+++ b/test/ace/test_sparseprodpool.jl
@@ -3,11 +3,11 @@ using BenchmarkTools, Test, Polynomials4ML
 using Polynomials4ML: PooledSparseProduct, evaluate, evaluate!
 using ACEbase.Testing: fdtest, println_slim, print_tf
 
-test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{<:AbstractVector}}) =
+test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{AbstractVector}}) =
    [prod(BB[j][basis.spec[i][j]] for j = 1:length(BB))
     for i = 1:length(basis)]
 
-test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{<:AbstractMatrix}}) =
+test_evaluate(basis::PooledSparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) =
    sum(test_evaluate(basis, ntuple(i -> BB[i][j, :], length(BB)))
        for j = 1:size(BB[1], 1))
 
diff --git a/test/test_sparseproduct.jl b/test/test_sparseproduct.jl
index 54a720b..8c3e90d 100644
--- a/test/test_sparseproduct.jl
+++ b/test/test_sparseproduct.jl
@@ -8,11 +8,11 @@ using ACEbase.Testing: fdtest
 using Zygote
 using HyperDualNumbers: Hyper
 
-test_evaluate(basis::SparseProduct, BB::Tuple{Vararg{<: AbstractVector}}) = 
+test_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractVector}}) = 
        [ prod(BB[j][basis.spec[i][j]] for j = 1:length(BB)) 
             for i = 1:length(basis) ]
 
-# test_evaluate(basis::SparseProduct, BB::Tuple{Vararg{<: AbstractMatrix}}) = 
+# test_evaluate(basis::SparseProduct, BB::Tuple{Vararg{AbstractMatrix}}) = 
 #         [ test_evaluate(basis, ntuple(i -> BB[i][j, :], length(BB)))
 #          for j = 1:size(BB[1], 1) )            
 

From 689322958905f23789a769d6dc6b3205eeb00726 Mon Sep 17 00:00:00 2001
From: cortner <christohortner@gmail.com>
Date: Mon, 5 Jun 2023 21:13:13 -0700
Subject: [PATCH 49/54] fixed chebyshev_basis implementation + tests

---
 src/jacobiweights.jl | 21 +++++++++++++--
 test/test_op1d3t.jl  | 61 ++++++++++++++++++++------------------------
 2 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/src/jacobiweights.jl b/src/jacobiweights.jl
index f708bed..4b0b608 100644
--- a/src/jacobiweights.jl
+++ b/src/jacobiweights.jl
@@ -68,8 +68,25 @@ Constructs an `OrthPolyBasis1D3T` object representing a possibly rescaled versio
 
 Careful: the normalisation may be non-standard. 
 """
-chebyshev_basis(N::Integer; normalize=false) = 
-      orthpolybasis(N, chebyshev_weights(normalize))
+function chebyshev_basis(N::Integer; normalize=false) 
+   cheb = orthpolybasis(N, chebyshev_weights(normalize))   # A, B and C[3:end] are overwritten with closed-form values below
+   if normalize 
+      cheb.A[1] = sqrt(1/π)
+      cheb.A[2] = sqrt(2/π)
+      cheb.C[3] = - sqrt(2)    # NOTE(review): entry 3 is indexed unconditionally; presumably requires N >= 3 - confirm
+      cheb.A[3:end] .= 2 
+      cheb.B[:] .= 0 
+      cheb.C[4:end] .= -1 
+   else 
+      cheb.A[1] = 1            # presumably encodes T_0 = 1, T_1 = x, T_n = 2x T_{n-1} - T_{n-2} - confirm against OrthPolyBasis1D3T
+      cheb.A[2] = 1
+      cheb.A[3:end] .= 2 
+      cheb.B[:] .= 0 
+      cheb.C[3:end] .= -1 
+   end
+   return cheb 
+end 
+      
 
 """
 `legendre_basis(N::Integer)`: 
diff --git a/test/test_op1d3t.jl b/test/test_op1d3t.jl
index 7151a5f..19ccb2e 100644
--- a/test/test_op1d3t.jl
+++ b/test/test_op1d3t.jl
@@ -51,45 +51,40 @@ end
 ##
 
 
-# @warn("turn off Chebyshev test - coeffs seem poorly normalized?!?")
-# cheb = chebyshev_basis(N, normalize=true)
-# @show abs(cheb.A[1] - sqrt(1/π))
-# @show abs(cheb.A[2] - sqrt(2/π))
-# @show abs(cheb.C[3] + sqrt(2))
-# @show norm(cheb.A[3:end] .- 2, Inf)
-# @show norm(cheb.B, Inf)
-# @show norm(cheb.C[4:end] .+ 1)
-
-# TODO: add standard chebyshev and add it to the test suite
-# @info("Check correctness of Chebyshev Basis (normalize=false)")
-# cheb = chebyshev_basis(N, normalize=false)
-# @info("     recursion coefficients")
-# println_slim(@test all([ 
-#          cheb.A[1] ≈ 1, 
-#          all(cheb.B[:] .== 0), 
-#          cheb.A[2] ≈ 1, 
-#          all(cheb.A[3:end] .≈ 2), 
-#          cheb.C[3] ≈ - sqrt(2), 
-#          all(cheb.C[4:end] .≈ -1), ]))
-# @info("     derivatives")
-
-
-##
-
-@info("Test Chebyshev Basis (normalize=true)")
+@info("Test normalized cheb basis") 
+@info("   coeffs")
 cheb = chebyshev_basis(N, normalize=true)
-@info("     recursion coefficients")
 println_slim(@test all([ 
-         abs(cheb.A[1] - sqrt(1/π)) < 1e-7,
-         abs(cheb.A[2] - sqrt(2/π)) < 1e-7,
-         abs(cheb.C[3] + sqrt(2)) < 1e-7,
-         norm(cheb.A[3:end] .- 2, Inf) < 1e-7,
-         norm(cheb.B, Inf) < 1e-7,
-         norm(cheb.C[4:end] .+ 1) < 1e-7, ]))
+   cheb.A[1] ≈ sqrt(1/π), 
+   cheb.A[2] ≈ sqrt(2/π), 
+   cheb.C[3] ≈ -sqrt(2), 
+   norm(cheb.A[3:end] .- 2, Inf) < 1e-12, 
+   norm(cheb.B, Inf) == 0, 
+   norm(cheb.C[4:end] .+ 1) < 1e-12, ] ))
+@info("   orthogonality")
+G = quadgk(x -> (1-x)^(-0.5) * (x+1)^(-0.5) * cheb(x) * cheb(x)', -1, 1)[1]
+println_slim(@test round.(G, digits=6) ≈ I)
 @info("     derivatives")
 test_derivatives(cheb, () -> 2*rand()-1)
 
+
+@info("Check correctness of Chebyshev Basis (normalize=false)")
+cheb = chebyshev_basis(N, normalize=false)
+@info("     recursion coefficients")
+println_slim(@test all([ 
+         cheb.A[1] ≈ 1, 
+         all(cheb.B[:] .== 0), 
+         cheb.A[2] ≈ 1, 
+         all(cheb.A[3:end] .≈ 2), 
+         all(cheb.C[3:end] .≈ -1), ]))
+
+@info("    consistency with ChebBasis")
+cheb2 = ChebBasis(N)
+println_slim(@test all( (x = 2*rand()-1; cheb(x) ≈ cheb2(x)) for _=1:30 ))
+
+##
 @info("Testing rrule")
+
 using LinearAlgebra: dot 
 N = 10
 for ntest = 1:30

From edb10d0f10296fec4d409ed4c3421ac8da3b89de Mon Sep 17 00:00:00 2001
From: cortner <christohortner@gmail.com>
Date: Mon, 5 Jun 2023 21:28:58 -0700
Subject: [PATCH 50/54] cleanup

---
 Project.toml                    |  1 +
 test/ace/test_sparseprodpool.jl |  2 +-
 test/test_atorbrad.jl           | 13 +++++++++++++
 test/test_discreteweights.jl    |  9 ---------
 4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/Project.toml b/Project.toml
index bece86d..e1392f5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -33,6 +33,7 @@ Combinatorics = "1"
 ForwardDiff = "0.10"
 LoopVectorization = "0.12"
 LuxCore = "0.1.3"
+NamedTupleTools = "0.14.3"
 ObjectPools = "0.2.1"
 QuadGK = "2"
 SpecialFunctions = "2.2"
diff --git a/test/ace/test_sparseprodpool.jl b/test/ace/test_sparseprodpool.jl
index cc680a4..8693531 100644
--- a/test/ace/test_sparseprodpool.jl
+++ b/test/ace/test_sparseprodpool.jl
@@ -70,7 +70,7 @@ for _ = 1:30
    bA3 = copy(bA2)
    evaluate!(bA3, basis, bBB)
 
-   println_slim(@test bA1 ≈ bA2 ≈ bA3)
+   print_tf(@test bA1 ≈ bA2 ≈ bA3)
 end
 
 
diff --git a/test/test_atorbrad.jl b/test/test_atorbrad.jl
index 126ce4b..e15400c 100644
--- a/test/test_atorbrad.jl
+++ b/test/test_atorbrad.jl
@@ -6,6 +6,8 @@ using ForwardDiff
 using ACEbase.Testing: fdtest
 using Zygote
 
+P4ML = Polynomials4ML
+
 ##
 
 @info("Testing GaussianBasis")
@@ -31,6 +33,9 @@ println_slim(@test  Rnl ≈ Rnl1 ≈ Rnl2 )
 println_slim(@test  dRnl1 ≈ dRnl2 ≈ fdRnl )
 println_slim(@test  ddRnl2 ≈ fddRnl )
 
+# why does this fail? 
+# P4ML.Testing.test_derivatives(bRnl, () -> 2 * rand() - 1)
+
 ##
 
 @info("Testing SlaterBasis")
@@ -56,6 +61,9 @@ println_slim(@test  Rnl ≈ Rnl1 ≈ Rnl2  )
 println_slim(@test  dRnl1 ≈ dRnl2 ≈ fdRnl )
 println_slim(@test  ddRnl2 ≈ fddRnl )
 
+# why does this fail? 
+# P4ML.Testing.test_derivatives(bRnl, () -> 2 * rand() - 1)
+
 ##
 
 @info("Testing STOBasis")
@@ -81,6 +89,11 @@ println_slim(@test  Rnl ≈ Rnl1 ≈ Rnl2  )
 println_slim(@test  dRnl1 ≈ dRnl2 ≈ fdRnl )
 println_slim(@test  ddRnl2 ≈ fddRnl )
 
+# why does this fail? 
+# P4ML.Testing.test_derivatives(bRnl, () -> 2 * rand() - 1)
+
+
+##
 
 @info("Test rrule")
 using LinearAlgebra: dot 
diff --git a/test/test_discreteweights.jl b/test/test_discreteweights.jl
index 8d3d3d2..bfb7d52 100644
--- a/test/test_discreteweights.jl
+++ b/test/test_discreteweights.jl
@@ -29,12 +29,3 @@ for ntest = 1:30
    print_tf( @test G ≈ I )
 end
 println() 
-
-## 
-
-@info("check that they are really polynomials")
-@info("  ... TODO ... ")
-
-
-
-

From e0ac1742c117d96c96f56922ff5c53308a3ed0b8 Mon Sep 17 00:00:00 2001
From: cortner <christohortner@gmail.com>
Date: Mon, 5 Jun 2023 22:10:44 -0700
Subject: [PATCH 51/54] toy example for double-pullback

---
 temp/double_pb.jl | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 temp/double_pb.jl

diff --git a/temp/double_pb.jl b/temp/double_pb.jl
new file mode 100644
index 0000000..326932d
--- /dev/null
+++ b/temp/double_pb.jl
@@ -0,0 +1,42 @@
+using Zygote, ChainRules, LinearAlgebra, ChainRulesCore
+
+import ChainRules: rrule
+
+function f(x, W) 
+   return W * sin.(x)
+end
+
+function g(y)
+   return sum(abs2, y)
+end
+
+function gf(x, W) 
+   return g(f(x, W))
+end
+
+
+x = rand(10)
+W = rand(5, 10)
+gf(x, W)
+
+∇gf = (x, W) -> Zygote.gradient(x -> gf(x, W), x)[1]
+∇gf(x, W)
+
+L(W) = sum(abs2, ∇gf(x, W))
+L(W)
+
+Zygote.gradient(W -> L(W), W)[1]
+
+
+##
+
+# also works if we add our custom rrules. 
+
+function rrule(::typeof(g), y)
+   return g(y), Δ -> (NoTangent(), 2 * Δ * y)
+end
+
+function rrule(::typeof(f), x, W)
+   y = f(x, W)
+   return y, Δ -> (NoTangent(), cos.(x) .* (W' * Δ), Δ * sin.(x)')
+end

From 2ad027a6808ad35609b23e8c71fcf62ff6c7a811 Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Tue, 6 Jun 2023 14:21:11 +0800
Subject: [PATCH 52/54] fixed general interface of slater and gaussian

---
 .../atomicorbitalsradials.jl                  | 74 ++++++++-----------
 src/atomicorbitalsradials/gaussian.jl         | 44 +++++------
 src/atomicorbitalsradials/slater.jl           | 41 +++++-----
 test/test_atorbrad.jl                         | 19 +++--
 4 files changed, 77 insertions(+), 101 deletions(-)

diff --git a/src/atomicorbitalsradials/atomicorbitalsradials.jl b/src/atomicorbitalsradials/atomicorbitalsradials.jl
index 461d0e9..0e93f2f 100644
--- a/src/atomicorbitalsradials/atomicorbitalsradials.jl
+++ b/src/atomicorbitalsradials/atomicorbitalsradials.jl
@@ -5,17 +5,16 @@ using ChainRulesCore: NoTangent
 const NLM{T} = NamedTuple{(:n1, :n2, :l, :m), Tuple{T, T, T, T}}
 const NL{T} = NamedTuple{(:n1, :n2, :l), Tuple{T, T, T}}
 
-struct AtomicOrbitalsRadials{TP, TD, TI, TZ}  <: ScalarPoly4MLBasis
+struct AtomicOrbitalsRadials{TP, TD, TI}  <: ScalarPoly4MLBasis
    Pn::TP
    Dn::TD
    spec::Vector{NL{TI}}
-   ζ::Vector{TZ}   # later : this does into a parameters named-tuple 
    # ----------------- metadata 
    @reqfields
 end
 
-AtomicOrbitalsRadials(Pn, Dn, spec, ζ) = 
-        AtomicOrbitalsRadials(Pn, Dn, spec, ζ, _make_reqfields()...)
+AtomicOrbitalsRadials(Pn, Dn, spec) = 
+        AtomicOrbitalsRadials(Pn, Dn, spec, _make_reqfields()...)
 
 Base.length(basis::AtomicOrbitalsRadials) = length(basis.spec)
 
@@ -26,17 +25,10 @@ _valtype(basis::AtomicOrbitalsRadials, T::Type{<: Real}) = T
 # TODO: (Jerry?) this kind of construction could be used for all  bases? 
 #       file an issue on this.
 
-function evaluate!(Rnl, basis::AtomicOrbitalsRadials, r::Number)
-    Rnl_ = reshape(Rnl, (1, length(basis)))
-    evaluate!(Rnl_, basis, [r,])
-    return Rnl 
-end
-
-
 function evaluate!(Rnl, basis::AtomicOrbitalsRadials, R::AbstractVector{<: Real})
     nR = length(R)
     Pn = evaluate(basis.Pn, R)           # Pn(r)
-    Dn = evaluate(basis.Dn, basis.ζ, R)  # Dn(r)  (ζ are the parameters -> reorganize the Lux way)
+    Dn = evaluate(basis.Dn, R)           # Dn(r)  (ζ is now stored inside basis.Dn; later -> reorganize the Lux way)
 
     fill!(Rnl, 0)
     
@@ -51,10 +43,10 @@ function evaluate!(Rnl, basis::AtomicOrbitalsRadials, R::AbstractVector{<: Real}
     return Rnl 
 end
 
-function evaluate_ed!(Rnl, dRnl, basis::AtomicOrbitalsRadials, R)
+function evaluate_ed!(Rnl, dRnl, basis::AtomicOrbitalsRadials, R::AbstractVector{<: Real})
     nR = length(R)
     Pn, dPn = evaluate_ed(basis.Pn, R)
-    Dn, dDn = evaluate_ed(basis.Dn, basis.ζ, R)
+    Dn, dDn = evaluate_ed(basis.Dn, R)
 
     fill!(Rnl, 0); fill!(dRnl, 0); 
 
@@ -72,10 +64,10 @@ function evaluate_ed!(Rnl, dRnl, basis::AtomicOrbitalsRadials, R)
 end
 
 
-function evaluate_ed2!(Rnl, dRnl, ddRnl, basis::AtomicOrbitalsRadials, R)
+function evaluate_ed2!(Rnl, dRnl, ddRnl, basis::AtomicOrbitalsRadials, R::AbstractVector{<: Real})
     nR = length(R)
     Pn, dPn, ddPn = evaluate_ed2(basis.Pn, R)
-    Dn, dDn, ddDn = evaluate_ed2(basis.Dn, basis.ζ, R)
+    Dn, dDn, ddDn = evaluate_ed2(basis.Dn, R)
 
     fill!(Rnl, 0); fill!(dRnl, 0); fill!(ddRnl, 0)
 
@@ -93,22 +85,8 @@ function evaluate_ed2!(Rnl, dRnl, ddRnl, basis::AtomicOrbitalsRadials, R)
     return Rnl, dRnl, ddRnl
 end
 
-# not test
-# function ChainRulesCore.rrule(::typeof(evaluate), basis::AtomicOrbitalsRadials, R::AbstractVector{<: Real})
-#    # A  = evaluate(basis, R)
-#    # ∂R = similar(R)
-#    # dR = evaluate_ed(basis, R)[2]
-#    A, dR = evaluate_ed(basis, R)
-#    ∂R = similar(R)
-#    function pb(∂A)
-#         @assert size(∂A) == (length(R), length(basis))
-#         for i = 1:length(R)
-#             ∂R[i] = dot(@view(∂A[i, :]), @view(dR[i, :]))
-#         end
-#         return NoTangent(), NoTangent(), ∂R
-#    end
-#    return A, pb
-# end
+natural_indices(basis::AtomicOrbitalsRadials) = copy(basis.spec)
+degree(basis::AtomicOrbitalsRadials, b::NamedTuple) = b.n1
 
 include("gaussian.jl")
 include("slater.jl")
@@ -116,15 +94,25 @@ include("sto_ng.jl")
 
 const ExponentialType = Union{GaussianBasis, SlaterBasis, STO_NG}
 
-evaluate(basis::ExponentialType, ζ::Number, r::Number) = evaluate(basis, [ζ,], [r,])[:]
-evaluate(basis::ExponentialType, ζ::Number, r::Vector) = evaluate(basis, [ζ,], r)
-evaluate(basis::ExponentialType, ζ::Vector, r::Number) = evaluate(basis, ζ, [r,])
-evaluate_ed(basis::ExponentialType, ζ::Number, r::Number) = evaluate_ed(basis, [ζ,], [r,])[:]
-evaluate_ed(basis::ExponentialType, ζ::Number, r::Vector) = evaluate_ed(basis, [ζ,], r)
-evaluate_ed(basis::ExponentialType, ζ::Vector, r::Number) = evaluate_ed(basis, ζ, [r,])
-evaluate_ed2(basis::ExponentialType, ζ::Number, r::Number) = evaluate_ed2(basis, [ζ,], [r,])[:]
-evaluate_ed2(basis::ExponentialType, ζ::Number, r::Vector) = evaluate_ed2(basis, [ζ,], r)
-evaluate_ed2(basis::ExponentialType, ζ::Vector, r::Number) = evaluate_ed2(basis, ζ, [r,])
 
-natural_indices(basis::AtomicOrbitalsRadials) = copy(basis.spec)
-degree(basis::AtomicOrbitalsRadials, b::NamedTuple) = b.n1
\ No newline at end of file
+function evaluate!(Rnl, basis::Union{AtomicOrbitalsRadials, ExponentialType}, r::Number)
+    Rnl_ = reshape(Rnl, (1, length(basis)))   # reshape the output to 1 × length(basis) for the batched method
+    evaluate!(Rnl_, basis, [r,])              # scalar input is evaluated as a batch of one
+    return Rnl                                # return the caller's array, not the reshaped one
+end
+
+function evaluate_ed!(Rnl, dRnl, basis::Union{AtomicOrbitalsRadials, ExponentialType}, r::Number)
+    Rnl_ = reshape(Rnl, (1, length(basis)))    # reshape outputs to 1 × length(basis) for the batched method
+    dRnl_ = reshape(dRnl, (1, length(basis)))
+    evaluate_ed!(Rnl_, dRnl_, basis, [r,])     # scalar input is evaluated as a batch of one
+    return Rnl, dRnl                           # (values, derivatives), matching the batched methods' return
+end
+
+function evaluate_ed2!(Rnl, dRnl, ddRnl, basis::Union{AtomicOrbitalsRadials, ExponentialType}, r::Number)
+    Rnl_ = reshape(Rnl, (1, length(basis)))    # reshape outputs to 1 × length(basis) for the batched method
+    dRnl_ = reshape(dRnl, (1, length(basis)))
+    ddRnl_ = reshape(ddRnl, (1, length(basis)))
+    evaluate_ed2!(Rnl_, dRnl_, ddRnl_, basis, [r,])   # scalar input is evaluated as a batch of one
+    return Rnl, dRnl, ddRnl                    # (values, 1st, 2nd derivatives), matching the batched methods' return
+end
+
diff --git a/src/atomicorbitalsradials/gaussian.jl b/src/atomicorbitalsradials/gaussian.jl
index 6e36d22..2546cf5 100644
--- a/src/atomicorbitalsradials/gaussian.jl
+++ b/src/atomicorbitalsradials/gaussian.jl
@@ -1,22 +1,23 @@
-struct GaussianBasis <: AbstractPoly4MLBasis
+struct GaussianBasis <: ScalarPoly4MLBasis
+    ζ::AbstractVector
     # ----------------- metadata 
     @reqfields
 end
  
-GaussianBasis() = GaussianBasis(_make_reqfields()...)
+GaussianBasis(ζ) = GaussianBasis(ζ, _make_reqfields()...)
+
+Base.length(basis::GaussianBasis) = length(basis.ζ)
 
 _valtype(::GaussianBasis, T::Type{<: Real}) = T
 
-function evaluate(basis::GaussianBasis, ζ::AbstractVector{<: Number}, x::AbstractVector{<: Number}) 
-    N = length(ζ)
+function evaluate!(P, basis::GaussianBasis, x::AbstractVector{<: Real}) 
+    N = size(P, 2)
     nX = length(x)
-    P = acquire!(basis.pool, :P, (nX, N), eltype(x))
-    fill!(P, 0)
 
     @inbounds begin 
         for n = 1:N
             @simd ivdep for i = 1:nX 
-                P[i,n] = exp(-ζ[n] * x[i]^2)
+                P[i,n] = exp(-basis.ζ[n] * x[i]^2)
             end
         end
     end
@@ -24,42 +25,31 @@ function evaluate(basis::GaussianBasis, ζ::AbstractVector{<: Number}, x::Abstra
     return P 
 end
 
-function evaluate_ed(basis::GaussianBasis, ζ::AbstractVector{<: Number}, x::AbstractVector{<: Number})
-    N = length(ζ)
+function evaluate_ed!(P, dP, basis::GaussianBasis, x::AbstractVector)   # vector-only: avoids dispatch ambiguity with the r::Number method
+    N = length(basis.ζ)    # one column of P / dP per exponent ζ[n]
     nX = length(x)
-    P = acquire!(basis.pool, :P, (nX, N), eltype(x))
-    dP = acquire!(basis.pool, :dP, (nX, N), eltype(x))
-    fill!(P, 0)
-    fill!(dP, 0)
 
     @inbounds begin 
         for n = 1:N
             @simd ivdep for i = 1:nX 
-                P[i,n] = exp(-ζ[n] * x[i]^2)
-                dP[i,n] = -2 * ζ[n] * x[i] * P[i, n]
+                P[i,n] = exp(-basis.ζ[n] * x[i]^2)
+                dP[i,n] = -2 * basis.ζ[n] * x[i] * P[i, n]
             end
         end
     end
     return P, dP 
 end 
 
-function evaluate_ed2(basis::GaussianBasis, ζ::AbstractVector{<: Number}, x::AbstractVector{<: Number})
-    N = length(ζ)
+function evaluate_ed2!(P, dP, ddP, basis::GaussianBasis, x::AbstractVector)   # vector-only: avoids dispatch ambiguity with the r::Number method
+    N = length(basis.ζ)    # one column of P / dP / ddP per exponent ζ[n]
     nX = length(x)
 
-    P = acquire!(basis.pool, :P, (nX, N), eltype(x))
-    dP = acquire!(basis.pool, :dP, (nX, N), eltype(x))
-    ddP = acquire!(basis.pool, :ddP, (nX, N), eltype(x))
-    fill!(P, 0)
-    fill!(dP, 0)
-    fill!(ddP, 0)
-
     @inbounds begin 
         for n = 1:N
             @simd ivdep for i = 1:nX 
-                P[i, n] = exp(-ζ[n] * x[i]^2)
-                dP[i, n] = -2 * ζ[n] * x[i] * P[i, n]
-                ddP[i, n] = -2 * ζ[n] * P[i, n] -2 * ζ[n] * x[i] * dP[i, n]
+                P[i, n] = exp(-basis.ζ[n] * x[i]^2)
+                dP[i, n] = -2 * basis.ζ[n] * x[i] * P[i, n]
+                ddP[i, n] = -2 * basis.ζ[n] * P[i, n] -2 * basis.ζ[n] * x[i] * dP[i, n]
             end
         end
     end
diff --git a/src/atomicorbitalsradials/slater.jl b/src/atomicorbitalsradials/slater.jl
index ded9a3a..5ba2339 100644
--- a/src/atomicorbitalsradials/slater.jl
+++ b/src/atomicorbitalsradials/slater.jl
@@ -1,36 +1,38 @@
-struct SlaterBasis
+struct SlaterBasis <: ScalarPoly4MLBasis
+    ζ::AbstractVector
     # ----------------- metadata 
-    meta::Dict{String, Any}
+    @reqfields
 end
 
-SlaterBasis(; meta = Dict{String, Any}()) = SlaterBasis(meta)
+SlaterBasis(ζ) = SlaterBasis(ζ, _make_reqfields()...)
 
-function evaluate(basis::SlaterBasis, ζ::AbstractVector{<: Number}, x::AbstractVector{<: Number}) 
-    N = length(ζ)
+Base.length(basis::SlaterBasis) = length(basis.ζ)
+
+_valtype(::SlaterBasis, T::Type{<: Real}) = T
+
+function evaluate!(P, basis::SlaterBasis, x::AbstractVector{<: Real}) 
+    N = size(P, 2)
     nX = length(x)
-    P = zeros(eltype(x), nX, N)
 
     @inbounds begin 
         for n = 1:N
             @simd ivdep for i = 1:nX 
-                P[i,n] = exp(-ζ[n] * x[i])
+                P[i,n] = exp(-basis.ζ[n] * x[i])
             end
         end
     end
     return P 
 end
 
-function evaluate_ed(basis::SlaterBasis, ζ::AbstractVector{<: Number}, x::AbstractVector{<: Number})
-    N = length(ζ)
+function evaluate_ed!(P, dP, basis::SlaterBasis, x::AbstractVector)   # vector-only: avoids dispatch ambiguity with the r::Number method
+    N = size(P, 2)    # number of columns to fill; column n uses basis.ζ[n]
     nX = length(x)
-    P = zeros(eltype(x), nX, N)
-    dP = zeros(eltype(x), nX, N)
 
     @inbounds begin 
         for n = 1:N
             @simd ivdep for i = 1:nX 
-                P[i, n] = exp(-ζ[n] * x[i])
-                dP[i, n] = -ζ[n] * P[i,n]
+                P[i, n] = exp(-basis.ζ[n] * x[i])
+                dP[i, n] = -basis.ζ[n] * P[i,n]
             end
         end
     end
@@ -38,19 +40,16 @@ function evaluate_ed(basis::SlaterBasis, ζ::AbstractVector{<: Number}, x::Abstr
    return P, dP 
 end 
 
-function evaluate_ed2(basis::SlaterBasis, ζ::AbstractVector{<: Number}, x::AbstractVector{<: Number})
-    N = length(ζ)
+function evaluate_ed2!(P, dP, ddP, basis::SlaterBasis, x::AbstractVector)   # vector-only: avoids dispatch ambiguity with the r::Number method
+    N = size(P, 2)    # number of columns to fill; column n uses basis.ζ[n]
     nX = length(x)
-    P = zeros(eltype(x), nX, N)
-    dP = zeros(eltype(x), nX, N)
-    ddP = zeros(eltype(x), nX, N)
     
     @inbounds begin 
         for n = 1:N
             @simd ivdep for i = 1:nX 
-                P[i, n] = exp(-ζ[n] * x[i])
-                dP[i, n] = -ζ[n] * P[i, n]
-                ddP[i, n] = -ζ[n] * dP[i, n]
+                P[i, n] = exp(-basis.ζ[n] * x[i])
+                dP[i, n] = -basis.ζ[n] * P[i, n]
+                ddP[i, n] = -basis.ζ[n] * dP[i, n]
             end
         end
     end
diff --git a/test/test_atorbrad.jl b/test/test_atorbrad.jl
index e15400c..db356f6 100644
--- a/test/test_atorbrad.jl
+++ b/test/test_atorbrad.jl
@@ -13,11 +13,11 @@ P4ML = Polynomials4ML
 @info("Testing GaussianBasis")
 n1 = 5 # degree
 n2 = 3 
-Pn = Polynomials4ML.legendre_basis(n1+1)
+Pn = P4ML.legendre_basis(n1+1)
 spec = [(n1 = n1, n2 = n2, l = l) for n1 = 1:n1 for n2 = 1:n2 for l = 0:n1-1] 
-Dn = GaussianBasis()
 ζ = rand(length(spec))
-bRnl = AtomicOrbitalsRadials(Pn, Dn, spec, ζ) 
+Dn = GaussianBasis(ζ)
+bRnl = AtomicOrbitalsRadials(Pn, Dn, spec) 
 rr = 2 * rand(10) .- 1
 Rnl = evaluate(bRnl, rr)
 Rnl1, dRnl1 = evaluate_ed(bRnl, rr)
@@ -32,9 +32,8 @@ fddRnl = vcat([ ForwardDiff.derivative(r -> evaluate_ed(bRnl, [r,])[2], r)
 println_slim(@test  Rnl ≈ Rnl1 ≈ Rnl2 )
 println_slim(@test  dRnl1 ≈ dRnl2 ≈ fdRnl )
 println_slim(@test  ddRnl2 ≈ fddRnl )
-
-# why does this fail? 
-# P4ML.Testing.test_derivatives(bRnl, () -> 2 * rand() - 1)
+ 
+P4ML.Testing.test_derivatives(bRnl, () -> 2 * rand() - 1)
 
 ##
 
@@ -43,9 +42,10 @@ n1 = 5 # degree
 n2 = 3 
 Pn = Polynomials4ML.legendre_basis(n1+1)
 spec = [(n1 = n1, n2 = n2, l = l) for n1 = 1:n1 for n2 = 1:n2 for l = 0:n1-1] 
-Dn = SlaterBasis()
 ζ = rand(length(spec))
-bRnl = AtomicOrbitalsRadials(Pn, Dn, spec, ζ) 
+
+Dn = SlaterBasis(ζ)
+bRnl = AtomicOrbitalsRadials(Pn, Dn, spec) 
 rr = 2 * rand(10) .- 1
 Rnl = evaluate(bRnl, rr)
 Rnl1, dRnl1 = evaluate_ed(bRnl, rr)
@@ -61,8 +61,7 @@ println_slim(@test  Rnl ≈ Rnl1 ≈ Rnl2  )
 println_slim(@test  dRnl1 ≈ dRnl2 ≈ fdRnl )
 println_slim(@test  ddRnl2 ≈ fddRnl )
 
-# why does this fail? 
-# P4ML.Testing.test_derivatives(bRnl, () -> 2 * rand() - 1)
+P4ML.Testing.test_derivatives(bRnl, () -> 2 * rand() - 1)
 
 ##
 

From dd507d1823941194f6ba95ad229cb5e3f2d3b45e Mon Sep 17 00:00:00 2001
From: DexuanZhou <hbnis@icloud.com>
Date: Tue, 6 Jun 2023 14:34:52 +0800
Subject: [PATCH 53/54] fix general interface for STO_ng

---
 src/atomicorbitalsradials/sto_ng.jl | 38 +++++++++++++++--------------
 test/test_atorbrad.jl               |  9 +++----
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/atomicorbitalsradials/sto_ng.jl b/src/atomicorbitalsradials/sto_ng.jl
index d8260d1..12ddb01 100644
--- a/src/atomicorbitalsradials/sto_ng.jl
+++ b/src/atomicorbitalsradials/sto_ng.jl
@@ -1,16 +1,20 @@
-struct STO_NG
+struct STO_NG <: ScalarPoly4MLBasis
+    ζ::Tuple
     # ----------------- metadata 
-    meta::Dict{String, Any}
+    @reqfields
 end
 
-STO_NG(; meta = Dict{String, Any}()) = STO_NG(meta)
+STO_NG(ζ) = STO_NG(ζ, _make_reqfields()...)
 
-function evaluate(basis::STO_NG, ξ::Vector{Matrix{Float64}}, x::AbstractVector{<: Number}) 
-    ζ, D = ξ[1], ξ[2]
+Base.length(basis::STO_NG) = length(basis.ζ[1])
+
+_valtype(::STO_NG, T::Type{<: Real}) = T
+
+function evaluate!(P, basis::STO_NG, x::AbstractVector{<: Real}) 
+    ζ, D = basis.ζ[1], basis.ζ[2]
     N, M = size(ζ)
     nX = length(x)
-    P = zeros(eltype(x), nX, N)
-
+    fill!(P, 0)
     @inbounds begin 
         for n = 1:N
             for m = 1:M
@@ -24,13 +28,12 @@ function evaluate(basis::STO_NG, ξ::Vector{Matrix{Float64}}, x::AbstractVector{
     return P # D[n,m] * exp(-[n, m] * x[i]^2)
 end
 
-function evaluate_ed(basis::STO_NG, ξ::Vector{Matrix{Float64}}, x::AbstractVector{<: Number})
-    ζ, D = ξ[1], ξ[2]
+function evaluate_ed!(P, dP, basis::STO_NG, x::AbstractVector{<: Real})
+    ζ, D = basis.ζ[1], basis.ζ[2]
     N, M = size(ζ)
     nX = length(x)
-    P = zeros(eltype(x), nX, N)
-    dP = zeros(eltype(x), nX, N)
-
+    fill!(P, 0)
+    fill!(dP, 0)
     @inbounds begin 
         for n = 1:N
             for m = 1:M
@@ -46,14 +49,13 @@ function evaluate_ed(basis::STO_NG, ξ::Vector{Matrix{Float64}}, x::AbstractVect
     return P, dP 
 end 
 
-function evaluate_ed2(basis::STO_NG, ξ::Vector{Matrix{Float64}}, x::AbstractVector{<: Number})
-    ζ, D = ξ[1], ξ[2]
+function evaluate_ed2!(P, dP, ddP, basis::STO_NG, x::AbstractVector{<: Real})
+    ζ, D = basis.ζ[1], basis.ζ[2]
     N, M = size(ζ)
     nX = length(x)
-    P = zeros(eltype(x), nX, N)
-    dP = zeros(eltype(x), nX, N)
-    ddP = zeros(eltype(x), nX, N)
-
+    fill!(P, 0)
+    fill!(dP, 0)
+    fill!(ddP, 0)
     @inbounds begin 
         for n = 1:N
             for m = 1:M
diff --git a/test/test_atorbrad.jl b/test/test_atorbrad.jl
index db356f6..270a934 100644
--- a/test/test_atorbrad.jl
+++ b/test/test_atorbrad.jl
@@ -71,9 +71,9 @@ n2 = 1
 Pn = Polynomials4ML.legendre_basis(n1+1)
 spec = [(n1 = n1, n2 = n2, l = l) for n1 = 1:n1 for n2 = 1:1 for l = 0:n1-1] 
 M = 3
-ζ = [rand(length(spec), M),rand(length(spec), M)]
-Dn = STO_NG()
-bRnl = AtomicOrbitalsRadials(Pn, Dn, spec, ζ) 
+ζ = (rand(length(spec), M),rand(length(spec), M))
+Dn = STO_NG(ζ)
+bRnl = AtomicOrbitalsRadials(Pn, Dn, spec) 
 rr = 2 * rand(10) .- 1
 Rnl = evaluate(bRnl, rr)
 Rnl1, dRnl1 = evaluate_ed(bRnl, rr)
@@ -88,8 +88,7 @@ println_slim(@test  Rnl ≈ Rnl1 ≈ Rnl2  )
 println_slim(@test  dRnl1 ≈ dRnl2 ≈ fdRnl )
 println_slim(@test  ddRnl2 ≈ fddRnl )
 
-# why does this fail? 
-# P4ML.Testing.test_derivatives(bRnl, () -> 2 * rand() - 1)
+P4ML.Testing.test_derivatives(bRnl, () -> 2 * rand() - 1)
 
 
 ##

From 0902a84d166190832e22a13167596dca92f99bb1 Mon Sep 17 00:00:00 2001
From: cortner <christohortner@gmail.com>
Date: Tue, 6 Jun 2023 20:34:46 -0700
Subject: [PATCH 54/54] cleanup

---
 src/chebbasis.jl                     |  2 +-
 test/runtests.jl                     |  1 +
 test/sphericalharmonics/test_crlm.jl | 10 ++++------
 test/sphericalharmonics/test_cylm.jl |  3 +++
 test/sphericalharmonics/test_rrlm.jl |  2 +-
 test/test_flex.jl                    |  2 +-
 test/test_lux.jl                     |  2 +-
 test/test_rtrig.jl                   |  1 +
 test/test_trig.jl                    |  1 +
 9 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/chebbasis.jl b/src/chebbasis.jl
index caba767..aa20f66 100644
--- a/src/chebbasis.jl
+++ b/src/chebbasis.jl
@@ -13,7 +13,7 @@ The differences between `ChebBasis` and `chebyshev_basis` is that `ChebBasis` co
 
 Warning: `ChebBasis` and `chebyshev_basis` have different normalization.
 """
-struct ChebBasis <: AbstractPoly4MLBasis
+struct ChebBasis <: ScalarPoly4MLBasis
    N::Int
    @reqfields
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 08f65d4..0656a24 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,7 @@ using Test
     # 1D Polynomials 
     @testset "OrthPolyBasis1D3T" begin include("test_op1d3t.jl"); end
     @testset "DiscreteWeights" begin include("test_discreteweights.jl"); end
+    @testset "Chebyshev" begin include("test_cheb.jl"); end 
 
     # 2D Harmonics 
     @testset "TrigonometricPolynomials" begin include("test_trig.jl"); end
diff --git a/test/sphericalharmonics/test_crlm.jl b/test/sphericalharmonics/test_crlm.jl
index ce0164a..6a4466f 100644
--- a/test/sphericalharmonics/test_crlm.jl
+++ b/test/sphericalharmonics/test_crlm.jl
@@ -35,7 +35,7 @@ end
 @info("Test: check complex solid harmonics against explicit expressions")
 nsamples = 30
 for n = 1:nsamples
-    local X
+    local X, θ, r
     l = rand(collect(1:10))
     m = rand(collect(1:l))
     θ = rand() * π
@@ -53,7 +53,7 @@ println()
 @info("      ... same near pole")
 nsamples = 30
 for n = 1:nsamples
-    local X
+    local X, θ, r
     l = rand(collect(1:10))
     m = rand(collect(1:l))
     θ = rand() * 1e-9
@@ -73,8 +73,7 @@ println()
 @info("Test: check complex solid harmonics against spherical harmonics times r^l")
 nsamples = 30
 for n = 1:nsamples
-    local X
-    local Y2
+    local X, Y2, θ, r 
     l = rand(collect(1:10))
     m = rand(collect(1:l))
     θ = rand() * π
@@ -93,8 +92,7 @@ println()
 @info("      ... same near pole")
 nsamples = 30
 for n = 1:nsamples
-    local X
-    local Y2
+    local X, Y2, θ, r
     l = rand(collect(1:10))
     m = rand(collect(1:l))
     θ = rand() * 1e-9
diff --git a/test/sphericalharmonics/test_cylm.jl b/test/sphericalharmonics/test_cylm.jl
index 7f59520..4e07bac 100644
--- a/test/sphericalharmonics/test_cylm.jl
+++ b/test/sphericalharmonics/test_cylm.jl
@@ -36,6 +36,7 @@ end
 @info("Test: check complex spherical harmonics against explicit expressions")
 nsamples = 30
 for n = 1:nsamples
+   local θ, r 
    θ = rand() * π
    φ = (rand()-0.5) * 2*π
    r = 0.1+rand()
@@ -51,6 +52,7 @@ println()
 @info("      ... same near pole")
 nsamples = 30
 for n = 1:nsamples
+   local θ, r
    θ = rand() * 1e-9
    if θ < 1e-10
       θ = 0.0
@@ -71,6 +73,7 @@ using Polynomials4ML: SphericalCoords, ALPolynomials
 verbose = false
 @info("Test: check derivatives of associated legendre polynomials")
 for nsamples = 1:30
+   local θ
    θ = rand() * π
    φ = (rand()-0.5) * 2*π
    S = SphericalCoords(φ, θ)
diff --git a/test/sphericalharmonics/test_rrlm.jl b/test/sphericalharmonics/test_rrlm.jl
index b43274a..d177319 100644
--- a/test/sphericalharmonics/test_rrlm.jl
+++ b/test/sphericalharmonics/test_rrlm.jl
@@ -44,7 +44,7 @@ end
 @info("Test: check real solid harmonics against explicit expressions")
 nsamples = 30
 for n = 1:nsamples
-   local X
+   local X, θ, r
    θ = rand() * π
    φ = (rand()-0.5) * 2*π
    r = 0.1+rand()
diff --git a/test/test_flex.jl b/test/test_flex.jl
index 2e36c60..0ecb2a2 100644
--- a/test/test_flex.jl
+++ b/test/test_flex.jl
@@ -15,7 +15,7 @@ tests = [
 
 for (basis, rnd) in tests   
    for ntest = 1:5 
-      local B1, B2 
+      local B1, B2, x
       x = rnd()
       B0 = zeros(Polynomials4ML._valtype(basis, x), length(basis))
       evaluate!(B0, basis, x)
diff --git a/test/test_lux.jl b/test/test_lux.jl
index 930ffbb..ca63fb1 100644
--- a/test/test_lux.jl
+++ b/test/test_lux.jl
@@ -17,7 +17,7 @@ test_bases = [ (chebyshev_basis(10), () -> rand()),
           (RYlmBasis(5), () -> randn(SVector{3, Float64})) ]
 
 for (basis, rnd) in test_bases 
-   local B1, B2 
+   local B1, B2, x
    x = rnd() 
    B1 = evaluate(basis, x)
    l = lux(basis)
diff --git a/test/test_rtrig.jl b/test/test_rtrig.jl
index c97dd0a..c20d912 100644
--- a/test/test_rtrig.jl
+++ b/test/test_rtrig.jl
@@ -12,6 +12,7 @@ basis = RTrigBasis(N)
 @info("      correctness")
 mm = natural_indices(basis)
 for ntest = 1:30
+   local x 
    x = 2*π * rand()
    P = basis(x)
    P2 = [ (m >= 0 ? cos(m*x) : sin(abs(m)*x)) for m in mm ]
diff --git a/test/test_trig.jl b/test/test_trig.jl
index 70c59ce..2f765f3 100644
--- a/test/test_trig.jl
+++ b/test/test_trig.jl
@@ -14,6 +14,7 @@ basis = CTrigBasis(N)
 @info("      correctness")
 mm = natural_indices(basis)
 for ntest = 1:10
+   local x 
    x = 2*π * rand()
    P = basis(x)
    print_tf(@test all(