From b85b098a5dcda1c2e391e311ab115c05cfbedc60 Mon Sep 17 00:00:00 2001
From: Christoph Ortner <christophortner0@gmail.com>
Date: Mon, 6 Nov 2023 22:57:52 -0800
Subject: [PATCH] Julia Implementation (#79)

---------

Co-authored-by: Christoph Ortner <christohortner@gmail.com>
Co-authored-by: frostedoyster <bigi.f@libero.it>
---
 .github/workflows/tests.yml           |  15 ++
 .gitignore                            |   2 +
 LICENSE => LICENSE-APACHE             |   0
 LICENSE-MIT                           |  21 ++
 README.md                             |  10 +
 julia/Project.toml                    |  19 ++
 julia/README.md                       |  65 ++++++
 julia/benchmarks/benchmark.jl         | 104 +++++++++
 julia/src/SpheriCart.jl               |  19 ++
 julia/src/api.jl                      | 179 +++++++++++++++
 julia/src/batched_kernels.jl          | 318 ++++++++++++++++++++++++++
 julia/src/generated_kernels.jl        | 202 ++++++++++++++++
 julia/src/indexing.jl                 |  40 ++++
 julia/src/normalisations.jl           |  30 +++
 julia/src/spherical.jl                | 142 ++++++++++++
 julia/test/python/test_python.py      |  59 +++++
 julia/test/runtests.jl                |   7 +
 julia/test/test_solidharmonics.jl     | 175 ++++++++++++++
 julia/test/test_sphericalharmonics.jl |  79 +++++++
 pyproject.toml                        |   2 +-
 20 files changed, 1487 insertions(+), 1 deletion(-)
 rename LICENSE => LICENSE-APACHE (100%)
 create mode 100644 LICENSE-MIT
 create mode 100644 julia/Project.toml
 create mode 100644 julia/README.md
 create mode 100644 julia/benchmarks/benchmark.jl
 create mode 100644 julia/src/SpheriCart.jl
 create mode 100644 julia/src/api.jl
 create mode 100644 julia/src/batched_kernels.jl
 create mode 100644 julia/src/generated_kernels.jl
 create mode 100644 julia/src/indexing.jl
 create mode 100644 julia/src/normalisations.jl
 create mode 100644 julia/src/spherical.jl
 create mode 100644 julia/test/python/test_python.py
 create mode 100644 julia/test/runtests.jl
 create mode 100644 julia/test/test_solidharmonics.jl
 create mode 100644 julia/test/test_sphericalharmonics.jl

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9d3f7e963..c5ce56a4d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -23,6 +23,11 @@ jobs:
         with:
           python-version: "3.10"
 
+      - name: setup Julia
+        uses: julia-actions/setup-julia@v1
+        with:
+          version: '1.9.2'
+
       - name: install tests dependencies
         run: |
           python -m pip install --upgrade pip
@@ -42,6 +47,16 @@ jobs:
           # Use the CPU only version of torch when building/running the code
           PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
 
+      - name: run Julia tests
+        run: |
+          cd julia/test/
+          julia --project=.. -e 'using Pkg; Pkg.instantiate(); Pkg.precompile();'
+          julia --project=.. runtests.jl
+          cd python/
+          julia -e 'using Pkg; Pkg.add("PyCall"); Pkg.instantiate();'
+          python -m pip install numpy pytest sphericart julia
+          python -m pytest .
+
   # check that we can build Python wheels on any Python version
   python-build:
     runs-on: ubuntu-20.04
diff --git a/.gitignore b/.gitignore
index 013d835f2..95b10615b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@ dist/
 *.egg-info
 .tox/
 *.pyc
+.vscode
+Manifest.toml
diff --git a/LICENSE b/LICENSE-APACHE
similarity index 100%
rename from LICENSE
rename to LICENSE-APACHE
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 000000000..3f9bfb9aa
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 sphericart contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index ede0c317e..bfbc4391b 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,8 @@ If you are using sphericart for your academic work, you can cite it as
 }
 ```
 
+This library is dual-licensed under the Apache License 2.0 and the MIT license. You can use to use it under either of the two licenses.
+
 ## Installation
 
 ### Python
@@ -55,6 +57,14 @@ pip install --extra-index-url https://download.pytorch.org/whl/cpu .[torch]
 Building from source is also necessary to use sphericart's PyTorch GPU 
 functionalities, and it requires a CUDA compiler.
 
+### Julia 
+
+A native Julia implementation of `sphericart` is provided, called `SpheriCart`. 
+Install the package by opening a REPL, switch to the package manager by 
+typing `]` and then `add SpheriCart`. 
+See [julia/README.md](julia/README.md) for usage.
+
+
 ### C and C++
 
 From source
diff --git a/julia/Project.toml b/julia/Project.toml
new file mode 100644
index 000000000..32553ee30
--- /dev/null
+++ b/julia/Project.toml
@@ -0,0 +1,19 @@
+name = "SpheriCart"
+uuid = "5caf2b29-02d9-47a3-9434-5931c85ba645"
+authors = ["Christoph Ortner <christohortner@gmail.com> and contributors"]
+version = "0.0.2-dev"
+
+[deps]
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+ObjectPools = "658cac36-ff0f-48ad-967c-110375d98c9d"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+
+[extras]
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test", "LinearAlgebra", "Random"]
diff --git a/julia/README.md b/julia/README.md
new file mode 100644
index 000000000..d7f5a51b7
--- /dev/null
+++ b/julia/README.md
@@ -0,0 +1,65 @@
+# SpheriCart 
+
+A Julia implementation of real solid and spherical harmonics, following
+```quote
+Fast evaluation of spherical harmonics with sphericart, 
+Filippo Bigi, Guillaume Fraux, Nicholas J. Browning and Michele Ceriotti; 
+J. Chem. Phys. 159, 064802 (2023); arXiv:2302.08381
+```
+
+
+`SpheriCart.jl` is released under MIT license and under Apache 2.0 license.
+
+## Installation 
+
+Install the package by opening a REPL, switch to the package manager by typing `]` and then `add SpheriCart`.
+
+## Basic Usage
+
+There are two implementations of real solid harmonics and real spherical harmonics
+- a generated  implementation for a single `𝐫::SVector{3, T}` input, returning the spherical harmonics as an `SVector{T}`. 
+- a generic implementation that is optimized for evaluating over batches of inputs, exploiting SIMD vectorization. 
+
+For large enough batches (system dependent) the second implementation is comparable to or faster than broadcasting over the generated implementation. For single inputs, the generated implementation is far superior in performance. 
+
+
+```julia
+using SpheriCart, StaticArrays 
+
+# generate the basis object 
+L = 5
+basis = SolidHarmonics(L)
+# Replace this with 
+#  basis = SphericalHarmonics(L) 
+# to evaluate the spherical instead of solid harmonics 
+
+# evaluate for a single input 
+𝐫 = @SVector randn(3) 
+# Z : SVector of length (L+1)²
+Z = basis(𝐫)  
+Z = compute(basis, 𝐫)
+# ∇Z : SVector of length (L+1)², each ∇Z[i] is an SVector{3, T}
+Z, ∇Z = compute_with_gradients(basis, 𝐫)
+
+# evaluate for many inputs 
+nX = 32
+Rs = [ @SVector randn(3)  for _ = 1:nX ]
+# Z : Matrix of size nX × (L+1)² of scalar 
+# dZ : Matrix of size nX × (L+1)² of SVector{3, T}
+Z = basis(Rs)  
+Z = compute(basis, Rs)
+Z, ∇Z = compute_with_gradients(basis, Rs)
+
+# in-place evaluation to avoid the allocation 
+compute!(Z, basis, Rs)
+compute_with_gradients!(Z, ∇Z, basis, Rs)
+```
+
+Note that Julia uses column-major indexing, which means that for batched output the loop over inputs is contiguous in memory. 
+
+<!-- ## Advanced Usage
+
+TODO:  
+- different normalizations
+- enforce static versus dynamic 
+- wrapping outputs into zvec for easier indexing  -->
\ No newline at end of file
diff --git a/julia/benchmarks/benchmark.jl b/julia/benchmarks/benchmark.jl
new file mode 100644
index 000000000..dbb50bfa4
--- /dev/null
+++ b/julia/benchmarks/benchmark.jl
@@ -0,0 +1,104 @@
+# to run this benchmark add `BenchmarkTools` to the standard julia 
+# environment (e.g. `julia` and `] add BenchmarkTools`)
+# then run this scrict via from the current folder via 
+# `julia --project=.. -O3 benchmark.jl`
+
+# This is just a preliminary benchmark for initial testing. 
+# We need to write a proper benchmark suite to check performance 
+# regressions and compare against the C++ implementation on the same 
+# system. 
+
+using StaticArrays, BenchmarkTools, SpheriCart
+using SpheriCart: SolidHarmonics, compute, compute!, 
+                  static_solid_harmonics, 
+                  compute_with_gradients, compute_with_gradients!
+
+##
+
+@info("static_solid_harmonics")
+𝐫 = @SVector randn(3)
+for L = 1:12
+   @show L
+   basis = SolidHarmonics(L; static=true)
+   basis(𝐫) # warmup 
+   @btime ($basis)($𝐫)
+   # these two are equivalent - just keeping them here for testing 
+   # since there is an odd effect that in some environments there there 
+   # is an unexplained overhead in the `compute` interface. 
+   # @btime static_solid_harmonics($(Val(L)), $𝐫)
+   # @btime compute($basis, $𝐫)
+end
+
+
+##
+
+@info("batched evaluation vs broadcast")
+@info("nX = 32 (try a nice number)")
+@info("broadcast! cost is almost exactly single * nX")
+
+Rs = [ (@SVector randn(3)) for _ = 1:32 ]
+
+for L = 3:3:15
+   @show L
+   basis = SolidHarmonics(L)
+   Zs = static_solid_harmonics.(Val(L), Rs)
+   print("    single: "); @btime static_solid_harmonics($(Val(L)), $(Rs[1]))
+   print("broadcast!: "); @btime broadcast!(𝐫 -> static_solid_harmonics($(Val(L)), 𝐫), $Zs, $Rs)
+   Zb = compute(basis, Rs)
+   print("   batched: "); (@btime compute!($Zb, $basis, $Rs));
+end
+
+##
+
+@info("static vs generic basis for single input (nX = 1)")
+@info("  this shouws that the generic single-input implementation needs work")
+
+for L = 3:3:30 
+   local 𝐫
+   @show L 
+   𝐫 = @SVector randn(3)
+   basis_st = SolidHarmonics(L; static=true)
+   basis_dy = SolidHarmonics(L; static=false)
+   print("  static: "); @btime compute($basis_st, $𝐫)
+   print(" generic: "); @btime compute($basis_dy, $𝐫)
+end 
+
+##
+
+
+@info("compute! vs compute_with_gradients! (using 32 inputs)")
+
+for L = 1:2:15
+   @show L 
+   nX = 32
+   basis = SolidHarmonics(L)
+   Rs = [ (@SVector randn(3)) for _ = 1:nX ]
+   Z, dZ = compute_with_gradients(basis, Rs)
+   print("        compute!: "); @btime compute!($Z, $basis, $Rs)
+   print(" _with_gradient!: "); @btime compute_with_gradients!($Z, $dZ, $basis, $Rs)
+end 
+
+
+##
+
+
+@info("compute vs compute_with_gradients for code-generated basis")
+for L = 1:10
+   @show L 
+   basis = SolidHarmonics(L; static=true)
+   𝐫 = @SVector randn(3)
+   Z1 = compute(basis, 𝐫)
+   Z, dZ = compute_with_gradients(basis, 𝐫)
+   print("        compute: "); @btime compute($basis, $𝐫)
+   print(" _with_gradient: "); @btime compute_with_gradients($basis, $𝐫)
+end 
+
+
+
+## -------------- 
+
+# @profview let Z = Z, uf_Zlm = uf_Zlm, XX = XX
+#    for n = 1:3_000_000
+#       evaluate!(Z, uf_Zlm, XX)
+#    end
+# end
\ No newline at end of file
diff --git a/julia/src/SpheriCart.jl b/julia/src/SpheriCart.jl
new file mode 100644
index 000000000..838f1bc1d
--- /dev/null
+++ b/julia/src/SpheriCart.jl
@@ -0,0 +1,19 @@
+module SpheriCart
+
+using StaticArrays, OffsetArrays, ObjectPools
+
+export SolidHarmonics, 
+       compute, 
+       compute!, 
+       compute_with_gradients,
+       compute_with_gradients!
+
+include("indexing.jl")
+include("normalisations.jl")
+include("api.jl")
+include("generated_kernels.jl")
+include("batched_kernels.jl")
+include("spherical.jl")
+
+end
+
diff --git a/julia/src/api.jl b/julia/src/api.jl
new file mode 100644
index 000000000..f4cbcc654
--- /dev/null
+++ b/julia/src/api.jl
@@ -0,0 +1,179 @@
+"""
+`struct SolidHarmonics` : datatype representing a solid harmonics basis. 
+
+### Constructor to generate the basis object
+```julia
+basis = SolidHarmonic(L::Integer; kwargs...)
+```
+
+### Keyword arguments:
+* `normalisation = :L2` : choose the normalisation of the basis, default is to 
+   make it orthonoormal on the unit sphere. 
+* `static = (L<=15)` : decide whether to use a generated code that outputs an 
+`SVector` but has a larger compiler and stack footprint
+* `T = Float64` : datatype in which basis parameters are stored. The output type 
+is inferred at runtime, but the rule of thumb is to use `T = FloatX` for 
+`FloatX` output.
+
+### Usage example: 
+```julia
+using StaticArrays, SpheriCart
+basis = SolidHarmonics(4)
+# evaluate basis with single input 
+𝐫 = @SVector randn(3)
+Z = basis(𝐫)
+Z = compute(basis, 𝐫)
+# evaluate basis with multiple inputs (batching)
+R = [ @SVector randn(3) for _ = 1:32 ]
+Z = basis(Rs)
+Z = compute(basis, Rs)
+
+# to be implented: 
+# Z, ∇Z = compute_and_gradients(basis, 𝐫)
+# Z, ∇Z, ∇²Z = compute_and_hessian(basis, 𝐫)
+```
+See documentation for more details.
+"""
+struct SolidHarmonics{L, NORM, STATIC, T1}
+   Flm::OffsetMatrix{T1, Matrix{T1}}
+   cache::ArrayPool{FlexArrayCache}
+end
+
+function SolidHarmonics(L::Integer; 
+                        normalisation = :L2, 
+                        static = (L <= 15), 
+                        T = Float64) 
+   Flm = generate_Flms(L; normalisation = normalisation, T = T)
+   @assert eltype(Flm) == T   
+   SolidHarmonics{L, normalisation, static, T}(Flm, ArrayPool(FlexArrayCache))
+end
+
+@inline (basis::SolidHarmonics)(args...) = compute(basis, args...)
+
+
+@inline function compute(basis::SolidHarmonics{L, NORM, true}, 𝐫::SVector{3}
+                 ) where {L, NORM} 
+   return static_solid_harmonics(Val{L}(), 𝐫, Val{NORM}())
+end 
+
+function compute(basis::SolidHarmonics{L, NORM, false, T1}, 𝐫::SVector{3, T2}
+         ) where {L, NORM, T1, T2}
+   T = promote_type(T1, T2)
+   Z = zeros(T, sizeY(L))
+   Zmat = reshape(Z, 1, :)   # this is a view, not a copy!
+   compute!(Zmat, basis, SA[𝐫,])
+   return Z 
+end 
+
+function compute(basis::SolidHarmonics{L, NORM, STATIC, T1}, 
+                  Rs::AbstractVector{SVector{3, T2}}
+                  ) where {L, NORM, STATIC, T1, T2}
+   T = promote_type(T1, T2)
+   Z = zeros(T, length(Rs), sizeY(L)) # we could make this cached as well 
+   compute!(Z, basis, Rs)
+   return Z
+end
+
+function compute!(Z::AbstractMatrix, 
+                  basis::SolidHarmonics{L, NORM, STATIC, T1}, 
+                  Rs::AbstractVector{SVector{3, T2}}
+                  ) where {L, NORM, STATIC, T1, T2}
+
+   nX = length(Rs)
+   T = promote_type(T1, T2)
+
+   # allocate temporary arrays from an array cache 
+   temps = (x = acquire!(basis.cache, :x,  (nX, ),    T),
+            y = acquire!(basis.cache, :y,  (nX, ),    T),
+            z = acquire!(basis.cache, :z,  (nX, ),    T), 
+           r² = acquire!(basis.cache, :r2, (nX, ),    T),
+            s = acquire!(basis.cache, :s,  (nX, L+1), T),
+            c = acquire!(basis.cache, :c,  (nX, L+1), T),
+            Q = acquire!(basis.cache, :Q,  (nX, sizeY(L)), T), 
+            Flm = basis.Flm )
+
+   # the actual evaluation kernel 
+   solid_harmonics!(Z, Val{L}(), Rs, temps)
+
+   # release the temporary arrays back into the cache
+   # (don't release Flm!!)
+   release!(temps.x)
+   release!(temps.y)
+   release!(temps.z)
+   release!(temps.r²)
+   release!(temps.s)
+   release!(temps.c)
+   release!(temps.Q)
+
+   return Z 
+end 
+
+
+# ---------- gradients 
+
+function compute_with_gradients(basis::SolidHarmonics{L, NORM, false, T1}, 
+                                𝐫::SVector{3, T2}
+                               ) where {L, NORM, T1, T2}
+   T = promote_type(T1, T2)
+   Z = zeros(T, sizeY(L))
+   dZ = zeros(SVector{3, T}, sizeY(L))
+   Zmat = reshape(Z, 1, :)   # this is a view, not a copy!
+   dZmat = reshape(dZ, 1, :)
+   compute_with_gradients!(Zmat, dZmat, basis, SA[𝐫,])
+   return Z, dZ 
+end 
+
+function compute_with_gradients(basis::SolidHarmonics{L, NORM, true, T1}, 
+                                𝐫::SVector{3, T2}
+                               ) where {L, NORM, T1, T2}
+   return static_solid_harmonics_with_grads(Val{L}(), 𝐫, Val{NORM}())
+end 
+
+
+function compute_with_gradients(basis::SolidHarmonics{L, NORM, STATIC, T1}, 
+                                Rs::AbstractVector{SVector{3, T2}}
+                                ) where {L, NORM, STATIC, T1, T2}
+   T = promote_type(T1, T2)
+   Z = zeros(T, length(Rs), sizeY(L)) # we could make this cached as well 
+   dZ = zeros(SVector{3, T}, length(Rs), sizeY(L)) 
+   compute_with_gradients!(Z, dZ, basis, Rs)
+   return Z, dZ 
+end
+
+
+
+function compute_with_gradients!(
+            Z::AbstractMatrix, 
+            dZ::AbstractMatrix,
+            basis::SolidHarmonics{L, NORM, STATIC, T1}, 
+            Rs::AbstractVector{SVector{3, T2}}
+            ) where {L, NORM, STATIC, T1, T2}
+
+   nX = length(Rs)
+   T = promote_type(T1, T2)
+
+   # allocate temporary arrays from an array cache 
+   temps = (x = acquire!(basis.cache, :x,  (nX, ),    T),
+            y = acquire!(basis.cache, :y,  (nX, ),    T),
+            z = acquire!(basis.cache, :z,  (nX, ),    T), 
+           r² = acquire!(basis.cache, :r2, (nX, ),    T),
+            s = acquire!(basis.cache, :s,  (nX, L+1), T),
+            c = acquire!(basis.cache, :c,  (nX, L+1), T),
+            Q = acquire!(basis.cache, :Q,  (nX, sizeY(L)), T),
+            Flm = basis.Flm )
+
+   # the actual evaluation kernel 
+   solid_harmonics_with_grad!(Z, dZ, Val{L}(), Rs, temps)
+
+   # release the temporary arrays back into the cache
+   # (don't release Flm!!)
+   release!(temps.x)
+   release!(temps.y)
+   release!(temps.z)
+   release!(temps.r²)
+   release!(temps.s)
+   release!(temps.c)
+   release!(temps.Q)
+
+   return Z 
+end 
diff --git a/julia/src/batched_kernels.jl b/julia/src/batched_kernels.jl
new file mode 100644
index 000000000..8fbfea634
--- /dev/null
+++ b/julia/src/batched_kernels.jl
@@ -0,0 +1,318 @@
+
+# SIMD vectorized computational kernel for moderately many inputs. 
+# (for MANY inputs we should in addition multi-thread it)
+# 
+
+function solid_harmonics!(Z::AbstractMatrix, ::Val{L}, 
+                          Rs::AbstractVector{SVector{3, T}}, 
+                          temps::NamedTuple 
+                           ) where {L, T}
+   nX = length(Rs)
+   len = sizeY(L)
+
+   x = temps.x
+   y = temps.y
+   z = temps.z
+   r² = temps.r²
+   s = temps.s
+   c = temps.c
+   Q = temps.Q
+   Flm = temps.Flm
+
+   # size checks to make sure the inbounds macro can be used safely. 
+   @assert length(y) == length(z) == nX 
+   @assert length(r²) >= nX
+   @assert size(Z, 1) >= nX && size(s, 1) >= nX  && size(c, 1) >= nX && size(Q, 1) >= nX
+   @assert size(Z, 2) >= len && size(Q, 2) >= len 
+   @assert size(s, 2) >= L+1 && size(c, 2) >= L+1
+
+   rt2 = sqrt(T(2)) 
+   
+   @inbounds @simd ivdep for j = 1:nX
+      𝐫 = Rs[j] 
+      xj, yj, zj = 𝐫[1], 𝐫[2], 𝐫[3]
+      x[j] = xj
+      y[j] = yj
+      z[j] = zj
+      r²[j] = xj^2 + yj^2 + zj^2
+      # c_m and s_m, m = 0 
+      s[j, 1] = zero(T)    # 0 -> 1
+      c[j, 1] = one(T)     # 0 -> 1
+   end
+
+   # c_m and s_m continued 
+   @inbounds for m = 1:L 
+      @simd ivdep for j = 1:nX
+         # m -> m+1 and  m-1 -> m
+         s[j, m+1] = s[j, m] * x[j] + c[j, m] * y[j]
+         c[j, m+1] = c[j, m] * x[j] - s[j, m] * y[j]
+      end
+   end
+
+   # change c[0] to 1/rt2 to avoid a special case l-1=m=0 later 
+   i00 = lm2idx(0, 0)
+
+   @inbounds @simd ivdep for j = 1:nX
+      c[j, 1] = one(T)/rt2
+
+      # fill Q_0^0 and Z_0^0 
+      Q[j, i00] = one(T)
+      Z[j, i00] = (Flm[0,0]/rt2) * Q[j, i00]
+   end
+
+   @inbounds for l = 1:L 
+      ill = lm2idx(l, l)
+      il⁻l = lm2idx(l, -l)
+      ill⁻¹ = lm2idx(l, l-1)
+      il⁻¹l⁻¹ = lm2idx(l-1, l-1)
+      il⁻l⁺¹ = lm2idx(l, -l+1)
+      F_l_l = Flm[l,l]
+      F_l_l⁻¹ = Flm[l,l-1]
+      @simd ivdep for j = 1:nX 
+         # Q_l^l and Y_l^l
+         # m = l 
+         Q[j, ill]   = - (2*l-1) * Q[j, il⁻¹l⁻¹]
+         Z[j, ill]   = F_l_l * Q[j, ill] * c[j, l+1]  # l -> l+1
+         Z[j, il⁻l] = F_l_l * Q[j, ill] * s[j, l+1]  # l -> l+1
+         # Q_l^l-1 and Y_l^l-1
+         # m = l-1 
+         Q[j, ill⁻¹]  = (2*l-1) * z[j] * Q[j, il⁻¹l⁻¹]
+         Z[j, il⁻l⁺¹] = F_l_l⁻¹ * Q[j, ill⁻¹] * s[j, l]  # l-1 -> l
+         Z[j, ill⁻¹]  = F_l_l⁻¹ * Q[j, ill⁻¹] * c[j, l]  # l-1 -> l
+         # overwrite if m = 0 -> ok 
+      end
+
+      # now we can go to the second recursion 
+      for m = l-2:-1:0 
+         ilm = lm2idx(l, m)
+         il⁻m = lm2idx(l, -m)
+         il⁻¹m = lm2idx(l-1, m)
+         il⁻²m = lm2idx(l-2, m)
+         F_l_m = Flm[l,m]
+         @simd ivdep for j = 1:nX 
+            Q[j, ilm] = ((2*l-1) * z[j] * Q[j, il⁻¹m] - (l+m-1) * r²[j] * Q[j, il⁻²m]) / (l-m)
+            Z[j, il⁻m] = F_l_m * Q[j, ilm] * s[j, m+1]   # m -> m+1
+            Z[j, ilm] = F_l_m * Q[j, ilm] * c[j, m+1]    # m -> m+1
+         end
+      end
+   end
+
+   return Z 
+end
+
+
+
+# notes on gradients - identities this code uses 
+# 
+#  ∂x c^m = m c^{m-1}  
+#  ∂y c^m = - m s^{m-1}
+#  ∂x s^m = m s^{m-1}
+#  ∂y s^m = m c^{m-1}
+#
+#  ∂x Q_l^m = x Q_{l-1}^{m+1} 
+#  ∂y Q_l^m = y Q_{l-1}^{m+1} 
+#  ∂z Q_l^m = (l+m) Q_{l-1}^m
+#
+# cf. Eq. (10) in the sphericart publication.
+# these identities are the reason we don't need additional 
+# temporary arrays 
+
+function solid_harmonics_with_grad!(
+               Z::AbstractMatrix, 
+               dZ::AbstractMatrix,
+               ::Val{L}, 
+               Rs::AbstractVector{SVector{3, T}}, 
+               temps::NamedTuple 
+               ) where {L, T}
+   nX = length(Rs)
+   len = sizeY(L)
+
+   x = temps.x
+   y = temps.y
+   z = temps.z
+   r² = temps.r²
+   s = temps.s
+   c = temps.c
+   Q = temps.Q
+   Flm = temps.Flm
+
+   # size checks to make sure the inbounds macro can be used safely. 
+   @assert length(y) == length(z) == nX 
+   @assert length(r²) >= nX
+   @assert size(Z, 1) >= nX && size(s, 1) >= nX  && size(c, 1) >= nX && size(Q, 1) >= nX
+   @assert size(Z, 2) >= len && size(Q, 2) >= len 
+   @assert size(s, 2) >= L+1 && size(c, 2) >= L+1
+   @assert size(dZ, 1) >= nX
+   @assert size(dZ, 2) >= len
+
+   rt2 = sqrt(T(2)) 
+   _0 = zero(T)
+   _1 = one(T)
+   
+   @inbounds @simd ivdep for j = 1:nX
+      𝐫 = Rs[j] 
+      xj, yj, zj = 𝐫[1], 𝐫[2], 𝐫[3]
+      x[j] = xj
+      y[j] = yj
+      z[j] = zj
+      r²[j] = xj^2 + yj^2 + zj^2
+      # c_m and s_m, m = 0 
+      s[j, 1] = zero(T)    # 0 -> 1
+      c[j, 1] = one(T)     # 0 -> 1
+   end
+
+   # c_m and s_m continued 
+   @inbounds for m = 1:L 
+      @simd ivdep for j = 1:nX
+         # m -> m+1 and  m-1 -> m
+         s[j, m+1] = s[j, m] * x[j] + c[j, m] * y[j]
+         c[j, m+1] = c[j, m] * x[j] - s[j, m] * y[j]
+      end
+   end
+
+   i00 = lm2idx(0, 0)
+   @inbounds @simd ivdep for j = 1:nX
+      # fill Q_0^0 and Z_0^0 
+      Q[j, i00] = one(T)
+      Z[j, i00] = (Flm[0,0]/rt2) * Q[j, i00]
+
+      # gradients 
+      dZ[j, i00] = zero(SVector{3, T})
+   end
+
+   # need to treat l = 1 separately 
+   i11 = lm2idx(1, 1)
+   i1⁻1 = lm2idx(1, -1)
+   i00 = lm2idx(0, 0)
+   i10 = lm2idx(1, 0)
+   F_1_1 = Flm[1,1]
+   F_1_0 = Flm[1,0]
+
+   @inbounds @simd ivdep for j = 1:nX 
+      # Q_1^1, Y_1^1, Y_1^-1
+      # Q_j_00 = _1
+      Q[j, i11] = - _1
+      Z[j, i11] = - F_1_1 * c[j, 2]    # 2 => l = 1
+      Z[j, i1⁻1] = - F_1_1 * s[j, 2]
+
+      # Q_l^l-1 and Y_l^l-1
+      # m = l-1 
+      Q[j, i10]  = Q_j_10 = z[j]
+      Z[j, i10]  = F_1_0 * Q_j_10 * c[j, 1] / rt2    # l-1 -> l
+
+      # gradients 
+      dZ[j, i11]  = SA[- F_1_1, _0, _0]
+      dZ[j, i1⁻1] = SA[_0, - F_1_1, _0]
+      dZ[j, i10]  = SA[_0, _0,  F_1_0 / rt2 ]                                   
+   end
+
+   # now from l = 2 onwards 
+   @inbounds for l = 2:L 
+      ill = lm2idx(l, l)
+      il⁻l = lm2idx(l, -l)
+      ill⁻¹ = lm2idx(l, l-1)
+      il⁻¹l⁻¹ = lm2idx(l-1, l-1)
+      il⁻l⁺¹ = lm2idx(l, -l+1)
+      il⁻¹l = lm2idx(l-1, l)
+      
+      F_l_l = Flm[l,l]
+      F_l_l⁻¹ = Flm[l,l-1]
+
+      @simd ivdep for j = 1:nX 
+         # Q_l^l and Y_l^l
+         # m = l 
+         Q_j_l⁻¹l⁻¹ = Q[j, il⁻¹l⁻¹]
+         Q[j, ill] = Q_j_ll = - (2*l-1) * Q_j_l⁻¹l⁻¹
+         Z[j, ill] = F_l_l * Q_j_ll * c[j, l+1]  # l -> l+1
+         Z[j, il⁻l] = F_l_l * Q_j_ll * s[j, l+1]  # l -> l+1
+
+         # Q_l^l-1 and Y_l^l-1
+         # m = l-1 
+         Q_j_il⁻¹l⁻¹ = Q[j, il⁻¹l⁻¹]
+         Q[j, ill⁻¹]  = Q_j_ll⁻¹ = (2*l-1) * z[j] * Q_j_l⁻¹l⁻¹
+         Z[j, il⁻l⁺¹] = F_l_l⁻¹ * Q_j_ll⁻¹ * s[j, l]  # l-1 -> l
+         Z[j, ill⁻¹]  = F_l_l⁻¹ * Q_j_ll⁻¹ * c[j, l]  # l-1 -> l
+
+         # gradients 
+
+         # l = m 
+         # Q_j_ll = const => ∇Q_j_ll = 0
+         dZ[j, ill]  = F_l_l * Q_j_ll * SA[l * c[j, l], -l * s[j, l], _0]
+         dZ[j, il⁻l] = F_l_l * Q_j_ll * SA[l * s[j, l],  l * c[j, l], _0]
+
+         # m = l-1
+         # Q_j_l⁻¹l⁻¹ = const => ∇_{xy}Q_j_l⁻¹l⁻¹ = 0
+         dZ[j, il⁻l⁺¹] = F_l_l⁻¹ * SA[Q_j_ll⁻¹ * (l-1) * s[j, l-1], 
+                                      Q_j_ll⁻¹ * (l-1) * c[j, l-1], 
+                                      (2*l-1) * Q_j_l⁻¹l⁻¹ * s[j, l] ]
+         dZ[j, ill⁻¹]  = F_l_l⁻¹ * SA[Q_j_ll⁻¹ * (l-1) * c[j, l-1],
+                                      Q_j_ll⁻¹ * (-l+1) * s[j, l-1], 
+                                      (2*l-1) * Q_j_l⁻¹l⁻¹ * c[j, l] ]
+      end
+
+      # now we can go to the second recursion 
+      # unfortunately we have to treat m = 0 separately again 
+      for m = l-2:-1:1 
+         ilm = lm2idx(l, m)
+         il⁻m = lm2idx(l, -m)
+         il⁻¹m = lm2idx(l-1, m)
+         il⁻²m = lm2idx(l-2, m)
+         il⁻¹m⁺¹ = lm2idx(l-1, m+1)
+         _f = (m == 0) ? _1/rt2 : _1
+
+         F_l_m = Flm[l,m]
+         F_l_m_f = F_l_m * _f
+
+         @simd ivdep for j = 1:nX 
+            cj = c[j, m+1]; sj = s[j, m+1]   # m -> m+1
+            Q[j, ilm] = Q_lm = ((2*l-1) * z[j] * Q[j, il⁻¹m] - (l+m-1) * r²[j] * Q[j, il⁻²m]) / (l-m)
+            Z[j, il⁻m] = F_l_m * Q_lm * sj   
+            Z[j, ilm] = F_l_m_f * Q_lm * cj 
+
+            # gradients
+            Q_lm_x = x[j] * Q[j, il⁻¹m⁺¹]
+            Q_lm_y = y[j] * Q[j, il⁻¹m⁺¹]
+            Q_lm_z = (l+m) * Q[j, il⁻¹m]
+            s_x = m * s[j, m]
+            s_y = m * c[j, m]
+            c_x = m * c[j, m]
+            c_y = -m * s[j, m]
+
+            dZ[j, il⁻m] = F_l_m * SA[Q_lm * s_x + Q_lm_x * sj, 
+                                     Q_lm * s_y + Q_lm_y * sj, 
+                                                  Q_lm_z * sj ]
+            dZ[j, ilm] = F_l_m_f * SA[Q_lm * c_x + Q_lm_x * cj,
+                                    Q_lm * c_y + Q_lm_y * cj, 
+                                                 Q_lm_z * cj ]                                                 
+         end
+      end
+
+      # special case m = 0: only if l = 2 or larger. 
+      # for l = 1 it is already taken care of above. 
+      if l >= 2 
+         # m = 0 
+         il0 = lm2idx(l, 0)
+         il⁻¹0 = lm2idx(l-1, 0)
+         il⁻²0 = lm2idx(l-2, 0)
+         il⁻¹1 = lm2idx(l-1, 1)
+
+         F_l_0_f = Flm[l,0] / rt2
+
+         @simd ivdep for j = 1:nX 
+            cj = c[j, 1]; sj = s[j, 1]   # 1 => m = 0
+            Q[j, il0] = Q_l0 = ((2*l-1) * z[j] * Q[j, il⁻¹0] - (l-1) * r²[j] * Q[j, il⁻²0]) / l
+            Z[j, il0] = F_l_0_f * Q_l0 * cj
+
+            # gradients
+            Q_l0_x = x[j] * Q[j, il⁻¹1]
+            Q_l0_y = y[j] * Q[j, il⁻¹1]
+            Q_l0_z = l * Q[j, il⁻¹0]
+
+            dZ[j, il0] = F_l_0_f * cj * SA[Q_l0_x, Q_l0_y, Q_l0_z ]                                                 
+         end
+      end
+
+   end
+
+   return nothing 
+end
diff --git a/julia/src/generated_kernels.jl b/julia/src/generated_kernels.jl
new file mode 100644
index 000000000..94058ee11
--- /dev/null
+++ b/julia/src/generated_kernels.jl
@@ -0,0 +1,202 @@
+
+#  This is a generated code for best possible performance with a single input
+
+import ForwardDiff
+import ForwardDiff: Dual
+
+function _codegen_Zlm(L, T, normalisation) 
+   Flm = generate_Flms(L; normalisation = normalisation, T = T)
+   len = sizeY(L)
+   rt2 = sqrt(T(2)) 
+
+   code = Expr[] 
+   push!(code, :(r² = x^2 + y^2 + z^2))
+
+   # c_m and s_m 
+   push!(code, :(s_0 = zero($T)))
+   push!(code, :(c_0 = one($T)))
+   for m = 1:L 
+      push!(code, Meta.parse("s_$m = s_$(m-1) * x + c_$(m-1) * y"))
+      push!(code, Meta.parse("c_$m = c_$(m-1) * x - s_$(m-1) * y"))
+   end
+
+   # redefine c_0 = 1/√2 => this allows us to avoid special casing m=0
+   push!(code, Meta.parse("c_0 = one($T)/$rt2"))
+
+   # Q_0^0 and Y_0^0
+   push!(code, Meta.parse("Q_0_0 = one($T)"))
+   push!(code, Meta.parse("Z_1 = $(Flm[0,0]/rt2) * Q_0_0"))
+
+   for l = 1:L 
+      # Q_l^l and Y_l^l
+      # m = l 
+      push!(code, Meta.parse("Q_$(l)_$(l)  = - $(2*l-1) * Q_$(l-1)_$(l-1)"))
+      push!(code, Meta.parse("Z_$(lm2idx(l, l))  = $(Flm[l,l]) * Q_$(l)_$(l) * c_$(l)"))
+      push!(code, Meta.parse("Z_$(lm2idx(l, -l)) = $(Flm[l, l]) * Q_$(l)_$(l) * s_$(l)"))
+      # Q_l^l-1 and Y_l^l-1
+      # m = l-1 
+      push!(code, Meta.parse("Q_$(l)_$(l-1)  = $(2*l-1) * z * Q_$(l-1)_$(l-1)"))
+      push!(code, Meta.parse("Z_$(lm2idx(l, -l+1)) = $(Flm[l, l-1]) * Q_$(l)_$(l-1) * s_$(l-1)"))
+      push!(code, Meta.parse("Z_$(lm2idx(l, l-1) ) = $(Flm[l, l-1]) * Q_$(l)_$(l-1) * c_$(l-1)" )) # overwrite if m = 0 -> ok 
+      # now we can go to the second recursion 
+      for m = l-2:-1:0
+         push!(code, Meta.parse("Q_$(l)_$(m)  = $((2*l-1)/(l-m)) * z * Q_$(l-1)_$m - $((l+m-1)/(l-m)) * r² * Q_$(l-2)_$(m)"))
+         push!(code, Meta.parse("Z_$(lm2idx(l,-m)) = $(Flm[l, m]) * Q_$(l)_$(m) * s_$(m)"))
+         push!(code, Meta.parse("Z_$(lm2idx(l,m) ) = $(Flm[l, m]) * Q_$(l)_$(m) * c_$(m)"))
+      end
+   end
+
+   # finally generate an svector output
+   push!(code, Meta.parse("return SVector{$len, $T}(" * 
+                join( ["Z_$i, " for i = 1:len], ) * ")"))
+end
+
+
+function _codegen_Zlm_grads(L, T, normalisation) 
+   Flm = generate_Flms(L; normalisation = normalisation, T = T)
+   len = sizeY(L)
+   rt2 = sqrt(T(2)) 
+
+   code = Expr[] 
+   push!(code, :(r² = x^2 + y^2 + z^2))
+
+   # c_m and s_m 
+   push!(code, :(s_0 = zero($T)))
+   push!(code, :(c_0 = one($T)))
+   for m = 1:L 
+      push!(code, Meta.parse("s_$m = s_$(m-1) * x + c_$(m-1) * y"))
+      push!(code, Meta.parse("c_$m = c_$(m-1) * x - s_$(m-1) * y"))
+   end
+
+   # l = 0 
+   # Q_0^0 and Y_0^0
+   push!(code, Meta.parse("Q_0_0 = one($T)"))
+   push!(code, Meta.parse("Z_1 = $(Flm[0,0]/rt2) * Q_0_0"))
+
+   # gradients
+   push!(code, Meta.parse("dZ_1 = zero(SVector{3, $T})"))
+
+
+   # l = 1 special case 
+   # Q_1^1 => Y_1^1, Y_1^-1
+   push!(code, Meta.parse("Q_1_1  = - Q_0_0"))
+   push!(code, Meta.parse("Z_$(lm2idx(1,  1)) = $(-Flm[1,1]) * c_1"))
+   push!(code, Meta.parse("Z_$(lm2idx(1, -1)) = $(-Flm[1,1]) * s_1"))
+   # Q_1^0 and Y_1^0
+   push!(code, Meta.parse("Q_1_0  = z"))
+   push!(code, Meta.parse("Z_$(lm2idx(1, 0)) = $(Flm[1, 0]/rt2) * Q_1_0 * c_0"))
+
+   # gradients       
+   push!(code, Meta.parse("dZ_$(lm2idx(1,  1)) = SA[ $(-Flm[1,1]), zero($T), zero($T) ]"))
+   push!(code, Meta.parse("dZ_$(lm2idx(1, -1)) = SA[ zero($T), $(-Flm[1,1]), zero($T) ]"))
+   push!(code, Meta.parse("dZ_$(lm2idx(1,  0)) = SA[ zero($T), zero($T), $(Flm[1, 0]/rt2) ]"))
+
+   for l = 2:L 
+      # Q_l^l => Y_l^l, Y_l^-l
+      push!(code, Meta.parse("Q_$(l)_$(l)  = - $(2*l-1) * Q_$(l-1)_$(l-1)"))
+      push!(code, Meta.parse("Z_$(lm2idx(l, l))  = $(Flm[l,l]) * Q_$(l)_$(l) * c_$(l)"))
+      push!(code, Meta.parse("Z_$(lm2idx(l, -l)) = $(Flm[l, l]) * Q_$(l)_$(l) * s_$(l)"))
+      # Q_l^l-1 => Y_l^l-1, Y_l^-l+1
+      push!(code, Meta.parse("Q_$(l)_$(l-1)  = $(2*l-1) * z * Q_$(l-1)_$(l-1)"))
+      push!(code, Meta.parse("Z_$(lm2idx(l, -l+1)) = $(Flm[l, l-1]) * Q_$(l)_$(l-1) * s_$(l-1)"))
+      push!(code, Meta.parse("Z_$(lm2idx(l, l-1) ) = $(Flm[l, l-1]) * Q_$(l)_$(l-1) * c_$(l-1)" )) # overwrite if m = 0 -> ok 
+
+      # gradients 
+      push!(code, Meta.parse("dZ_$(lm2idx(l, l)) = $(Flm[l,l]) * Q_$(l)_$(l) * SA[ $l * c_$(l-1), - $l * s_$(l-1), zero($T) ]"))
+      push!(code, Meta.parse("dZ_$(lm2idx(l, -l)) = $(Flm[l,l]) * Q_$(l)_$(l) * SA[ $l * s_$(l-1),  $l * c_$(l-1), zero($T) ]"))
+      push!(code, Meta.parse("""dZ_$(lm2idx(l, -l+1)) = $(Flm[l, l-1]) * SA[ Q_$(l)_$(l-1) * $(l-1) * s_$(l-2), 
+                                                                             Q_$(l)_$(l-1) * $(l-1) * c_$(l-2), 
+                                                                             $(2*l-1) * Q_$(l-1)_$(l-1) * s_$(l-1) ]"""))
+      push!(code, Meta.parse("""dZ_$(lm2idx(l,  l-1)) = $(Flm[l, l-1]) * SA[ Q_$(l)_$(l-1) * $(l-1) * c_$(l-2), 
+                                                                             Q_$(l)_$(l-1) * $(-l+1) * s_$(l-2), 
+                                                                             $(2*l-1) * Q_$(l-1)_$(l-1) * c_$(l-1) ]"""))
+
+      # now we can go to the second recursion 
+      for m = l-2:-1:1
+         push!(code, Meta.parse("Q_$(l)_$(m)  = $((2*l-1)/(l-m)) * z * Q_$(l-1)_$m - $((l+m-1)/(l-m)) * r² * Q_$(l-2)_$(m)"))
+         push!(code, Meta.parse("Z_$(lm2idx(l,-m)) = $(Flm[l, m]) * Q_$(l)_$(m) * s_$(m)"))
+         push!(code, Meta.parse("Z_$(lm2idx(l,m) ) = $(Flm[l, m]) * Q_$(l)_$(m) * c_$(m)"))
+
+         # gradients 
+         push!(code, Meta.parse(""" 
+            dZ_$(lm2idx(l, -m)) = $(Flm[l, m]) * SA[ Q_$(l)_$(m) * $(m) * s_$(m-1) + x * Q_$(l-1)_$(m+1) * s_$(m), 
+                                                     Q_$(l)_$(m) * $(m) * c_$(m-1) + y * Q_$(l-1)_$(m+1) * s_$(m), 
+                                                     $(l+m) * Q_$(l-1)_$(m) * s_$m ]"""))
+         push!(code, Meta.parse("""
+            dZ_$(lm2idx(l, m)) = $(Flm[l, m]) * SA[ Q_$(l)_$(m) * $(m) * c_$(m-1) + x * Q_$(l-1)_$(m+1) * c_$(m), 
+                                                    Q_$(l)_$(m) * $(-m) * s_$(m-1) + y * Q_$(l-1)_$(m+1) * c_$(m), 
+                                                    $(l+m) * Q_$(l-1)_$(m) * c_$m ]"""))
+      end
+
+      # special-case m = 0 
+      if l >= 2 
+         push!(code, Meta.parse("Q_$(l)_0  = $((2*l-1)/l) * z * Q_$(l-1)_0 - $((l-1)/(l)) * r² * Q_$(l-2)_0"))
+         push!(code, Meta.parse("Z_$(lm2idx(l,0) ) = $(Flm[l, 0] / rt2) * Q_$(l)_0"))
+
+         # gradients
+         # dZ[j, il0] = F_l_0_f * cj * SA[Q_l0_x, Q_l0_y, Q_l0_z ]                                                 
+         push!(code, Meta.parse("dZ_$(lm2idx(l, 0)) = $(Flm[l, 0] / rt2) * SA[ Q_$(l-1)_1 * x, Q_$(l-1)_1 * y, $(l) * Q_$(l-1)_0 ]"))
+      end
+   end
+
+   # finally generate an svector output
+   push!(code, Meta.parse("Z = SVector{$len, $T}(" * 
+                join( ["Z_$i, " for i = 1:len], ) * ")"))
+
+   push!(code, Meta.parse("dZ = SVector{$len, SVector{3, $T}}(" * 
+                join( ["dZ_$i, " for i = 1:len] ) * ")"))
+
+   push!(code, Meta.parse("return Z, dZ"))
+end
+
+
+
+"""
+`static_solid_harmonics`: evaluate the solid harmonics basis for a single 
+input point. The code is fully generated and unrolled. The return value 
+is an `SVector{LEN, T}` where `LEN` is the length of the basis and `T` the 
+element type of the input point. 
+
+Usage: e.g. for `L = 4`
+```julia 
+valL = Val{4}() 
+𝐫 = @SVector randn(3) 
+Z = static_solid_harmonics(valL, 𝐫) 
+x, y, z = tuple(𝐫...)
+Z = static_solid_harmonics(valL, x, y, z) 
+```
+
+Once can also specify the normalisation convention, e.g., 
+```julia
+Z = static_solid_harmonics(valL, 𝐫, Val{:L2}()) 
+```
+which would be the default behaviour.
+"""
+static_solid_harmonics(valL::Val{L}, 𝐫::SVector{3}, 
+                     valNorm = Val{:sphericart}()) where {L} = 
+      static_solid_harmonics(valL, 𝐫[1], 𝐫[2], 𝐫[3], valNorm)
+
+@generated function static_solid_harmonics(::Val{L}, x::T, y::T, z::T, 
+                     valNorm::Val{NORM} = Val{:sphericart}()
+                     ) where {L, T, NORM}
+   code = _codegen_Zlm(L, T, NORM)
+   return quote
+      $(Expr(:block, code...))
+   end
+end
+
+
+static_solid_harmonics_with_grads(valL::Val{L}, 𝐫::SVector{3}, 
+                     valNorm = Val{:sphericart}()) where {L} = 
+         static_solid_harmonics_with_grads(valL, 𝐫[1], 𝐫[2], 𝐫[3], valNorm)
+
+
+@generated function static_solid_harmonics_with_grads(::Val{L}, x::T, y::T, z::T, 
+                     valNorm::Val{NORM} = Val{:sphericart}()
+                     ) where {L, T, NORM}
+   code = _codegen_Zlm_grads(L, T, NORM)
+   return quote
+      $(Expr(:block, code...))
+   end
+end
+
diff --git a/julia/src/indexing.jl b/julia/src/indexing.jl
new file mode 100644
index 000000000..8d27ba39a
--- /dev/null
+++ b/julia/src/indexing.jl
@@ -0,0 +1,40 @@
+
+"""
+`sizeY(maxL):`
+Return the size of the set of spherical harmonics ``Y_l^m`` of
+degree less than or equal to the given maximum degree `maxL`
+"""
+sizeY(maxL) = (maxL + 1)^2
+
+"""
+`lm2idx(l,m):`
+Return the index into a flat array of real spherical harmonics ``Y_l^m``
+for the given indices `(l,m)`. ``Y_l^m`` are stored in l-major order i.e.
+```
+	[Y(0,0), Y(1,-1), Y(1,0), Y(1,1), Y(2,-2), ...]
+```
+"""
+lm2idx(l::Integer, m::Integer) = m + l + (l*l) + 1
+
+"""
+Inverse of `lm2idx`: given an index into a vector of Ylm values, return the 
+`(l, m)` indices.
+"""
+function idx2lm(i::Integer) 
+	l = floor(Int, sqrt(i-1) + 1e-10)
+	m = i - (l + (l*l) + 1)
+	return l, m 
+end 
+
+
+"""
+Convenience wrapper around solid/spherical harmonics outputs. 
+Store the basis as a vector but access via `(l, m)` pairs. 
+"""
+struct ZVec{TP} 
+   parent::TP 
+end 
+
+getindex(Z::ZVec, i::Integer) = Z.parent[i]
+
+getindex(Z::ZVec, l::Integer, m::Integer) = Z.parent[lm2idx(l, m)]
diff --git a/julia/src/normalisations.jl b/julia/src/normalisations.jl
new file mode 100644
index 000000000..1c24be7e9
--- /dev/null
+++ b/julia/src/normalisations.jl
@@ -0,0 +1,30 @@
+
+
+
+# Generates the `F[l, m]` values exactly as described in the 
+# `sphericart` publication. 
+function _generate_Flms(L::Integer, ::Union{Val{:sphericart}, Val{:L2}}, T=Float64)
+   Flm = OffsetMatrix(zeros(L+1, L+1), (-1, -1))
+   for l = 0:L
+      Flm[l, 0] = sqrt((2*l+1)/(2 * π))
+      for m = 1:l 
+         Flm[l, m] = - Flm[l, m-1] / sqrt((l+m) * (l+1-m))
+      end
+   end
+   return Flm
+end
+
+
+
+"""
+```
+generate_Flms(L; normalisation = :L2, T = Float64)
+```
+generate the `F[l,m]` prefactors in the definitions of the solid harmonics; 
+see `sphericart` publication for details. The default normalisation generates 
+a basis that is L2-orthonormal on the unit sphere. Other normalisations: 
+- `:sphericart` the same as `:L2`
+- `:p4ml` same normalisation as used in `Polynomials4ML.jl`
+"""
+generate_Flms(L::Integer; normalisation = :L2, T = Float64) = 
+      _generate_Flms(L, Val(normalisation), T)
diff --git a/julia/src/spherical.jl b/julia/src/spherical.jl
new file mode 100644
index 000000000..2ccfaeed6
--- /dev/null
+++ b/julia/src/spherical.jl
@@ -0,0 +1,142 @@
+
+export SphericalHarmonics
+
+using LinearAlgebra: norm, dot 
+
+"""
+`struct SphericalHarmonics` : datatype representing a real spherical harmonics basis. 
+
+### Constructor to generate the basis object
+```julia
+basis = SphericalHarmonics(L::Integer; kwargs...)
+```
+
+### Keyword arguments:
+* `normalisation = :L2` : choose the normalisation of the basis, default is to 
+   make it orthonoormal on the unit sphere. 
+* `static = (L<=15)` : decide whether to use a generated code that outputs an 
+`SVector` but has a larger compiler and stack footprint
+* `T = Float64` : datatype in which basis parameters are stored. The output type 
+is inferred at runtime, but the rule of thumb is to use `T = FloatX` for 
+`FloatX` output.
+
+### Usage example: 
+```julia
+using StaticArrays, SpheriCart
+basis = SphericalHarmonics(4)
+# evaluate basis with single input 
+𝐫 = @SVector randn(3)
+Z = basis(𝐫)
+Z = compute(basis, 𝐫)
+# evaluate basis with multiple inputs (batching)
+R = [ @SVector randn(3) for _ = 1:32 ]
+Z = basis(Rs)
+Z = compute(basis, Rs)
+
+# to be implented: 
+# Z, ∇Z = compute_and_gradients(basis, 𝐫)
+# Z, ∇Z, ∇²Z = compute_and_hessian(basis, 𝐫)
+```
+See documentation for more details.
+"""
+struct SphericalHarmonics{L, NORM, STATIC, T1}
+   solids::SolidHarmonics{L, NORM, STATIC, T1}
+   # Flm::OffsetMatrix{T1, Matrix{T1}}
+   cache::ArrayPool{FlexArrayCache}
+end
+
+SphericalHarmonics(L::Integer; kwargs...) = 
+      SphericalHarmonics(SolidHarmonics(L, kwargs...), 
+                         ArrayPool(FlexArrayCache))
+
+
+@inline (basis::SphericalHarmonics)(args...) = compute(basis, args...)
+
+@inline function compute(basis::SphericalHarmonics, 𝐫::SVector{3})
+   𝐫̂ = 𝐫 / norm(𝐫)                        
+   return compute(basis.solids, 𝐫̂)
+end 
+
+@inline function compute_with_gradients(basis::SphericalHarmonics, 𝐫::SVector{3})
+   r = norm(𝐫)
+   𝐫̂ = 𝐫 / r
+   Y, ∇Z = compute_with_gradients(basis.solids, 𝐫̂)
+
+   ∇Y = map(dZ -> (dz = dZ / r; dz - dot(𝐫̂, dz) * 𝐫̂), ∇Z)
+
+   # @inbounds @simd ivdep for i = 1:length(Y)
+   #    dz = ∇Y[i] / r
+   #    ∇Y[i] = dz - dot(𝐫̂, dz) * 𝐫̂
+   # end
+
+   return Y, ∇Y
+end 
+
+# --------------------- 
+#  batched api 
+
+function _normalise_Rs!(basis::SphericalHarmonics, 
+                        Rs::AbstractVector{SVector{3, T1}}) where {T1}
+   nX = length(Rs) 
+   rs = acquire!(basis.cache, :rs, (nX, ), T1)
+   Rs_norm = acquire!(basis.cache, :Rs_norm, (nX, ), SVector{3, T1})
+   @inbounds @simd ivdep for i = 1:nX
+      rs[i] = norm(Rs[i])
+      Rs_norm[i] = Rs[i] / rs[i]
+   end
+   return rs, Rs_norm 
+end
+
+function _rescale_∇Z2∇Y!(∇Z::AbstractMatrix, Rs_norm, rs)
+   nX = length(rs)
+   @inbounds for i = 1:size(∇Z, 2)
+      @simd ivdep for j = 1:nX
+         dzj = ∇Z[j, i] / rs[j]
+         𝐫̂j = Rs_norm[j]
+         ∇Z[j, i] = dzj - dot(𝐫̂j, dzj) * 𝐫̂j
+      end
+   end
+end
+
+function compute(basis::SphericalHarmonics, 
+                         Rs::AbstractVector{<: SVector{3}})
+   rs, Rs_norm = _normalise_Rs!(basis, Rs)
+   Y = compute(basis.solids, Rs_norm)
+   release!(Rs_norm)
+   release!(rs)
+   return Y
+end
+
+function compute!(Y, basis::SphericalHarmonics, 
+                     Rs::AbstractVector{<: SVector{3}})
+   rs, Rs_norm = _normalise_Rs!(basis, Rs)
+   compute!(Y, basis.solids, Rs_norm)
+   release!(Rs_norm)
+   release!(rs)
+   return Y
+end
+
+
+function compute_with_gradients(basis::SphericalHarmonics, 
+                           Rs::AbstractVector{<: SVector{3}})
+   rs, Rs_norm = _normalise_Rs!(basis, Rs)
+   Y, ∇Z = compute_with_gradients(basis.solids, Rs_norm)
+   _rescale_∇Z2∇Y!(∇Z, Rs_norm, rs)
+
+   release!(Rs_norm)
+   release!(rs)
+
+   return Y, ∇Z
+end 
+
+function compute_with_gradients!(Y, ∇Y, basis::SphericalHarmonics, 
+                                 Rs::AbstractVector{<: SVector{3}})
+   rs, Rs_norm = _normalise_Rs!(basis, Rs)
+   compute_with_gradients!(Y, ∇Y, basis.solids, Rs_norm)
+   _rescale_∇Z2∇Y!(∇Y, Rs_norm, rs)
+
+   release!(Rs_norm)
+   release!(rs)
+
+   return Y, ∇Y
+end 
diff --git a/julia/test/python/test_python.py b/julia/test/python/test_python.py
new file mode 100644
index 000000000..f17fd03a9
--- /dev/null
+++ b/julia/test/python/test_python.py
@@ -0,0 +1,59 @@
+import numpy as np
+from julia import Julia
+import sphericart
+
+
+L_MAX = 25
+N_SAMPLES = 100
+
+
+def test_consistency_solid_harmonics():
+
+    jl = Julia(compiled_modules=False)
+    jl_code = """
+    using Pkg
+    Pkg.activate("../../")
+    Pkg.instantiate()
+    using SpheriCart
+    using StaticArrays
+
+    function compute_solid_harmonics_from_array(xyz::Array{Float64,2}, L_MAX)
+        xyz_svectors = [SVector{3}(x) for x in eachrow(xyz)]  # Convert to an array of StaticArrays.SVector{3}
+        basis = SpheriCart.SolidHarmonics(L_MAX)
+        results = SpheriCart.compute(basis, xyz_svectors)
+        return results
+    end
+    """
+    xyz = np.random.rand(N_SAMPLES, 3)
+    jl.eval(jl_code)
+    compute_solid_harmonics_from_array = jl.eval("compute_solid_harmonics_from_array")
+    julia_results = compute_solid_harmonics_from_array(xyz, L_MAX)
+
+    python_results = sphericart.SphericalHarmonics(L_MAX, normalized=False).compute(xyz)
+    assert np.allclose(julia_results, python_results)
+
+
+def test_consistency_spherical_harmonics():
+
+    jl = Julia(compiled_modules=False)
+    jl_code = """
+    using Pkg
+    Pkg.activate("../../")
+    Pkg.instantiate()
+    using SpheriCart
+    using StaticArrays
+
+    function compute_solid_harmonics_from_array(xyz::Array{Float64,2}, L_MAX)
+        xyz_svectors = [SVector{3}(x) for x in eachrow(xyz)]  # Convert to an array of StaticArrays.SVector{3}
+        basis = SpheriCart.SphericalHarmonics(L_MAX)
+        results = SpheriCart.compute(basis, xyz_svectors)
+        return results
+    end
+    """
+    xyz = np.random.rand(N_SAMPLES, 3)
+    jl.eval(jl_code)
+    compute_solid_harmonics_from_array = jl.eval("compute_solid_harmonics_from_array")
+    julia_results = compute_solid_harmonics_from_array(xyz, L_MAX)
+
+    python_results = sphericart.SphericalHarmonics(L_MAX, normalized=True).compute(xyz)
+    assert np.allclose(julia_results, python_results)
diff --git a/julia/test/runtests.jl b/julia/test/runtests.jl
new file mode 100644
index 000000000..fd9dee525
--- /dev/null
+++ b/julia/test/runtests.jl
@@ -0,0 +1,7 @@
+using SpheriCart
+using Test
+
+@testset "SpheriCart.jl" begin
+   @testset "Solid Harmonics" begin include("test_solidharmonics.jl"); end 
+   @testset "Spherical Harmonics" begin include("test_sphericalharmonics.jl"); end
+end
diff --git a/julia/test/test_solidharmonics.jl b/julia/test/test_solidharmonics.jl
new file mode 100644
index 000000000..9a446dfe3
--- /dev/null
+++ b/julia/test/test_solidharmonics.jl
@@ -0,0 +1,175 @@
+
+using Test, StaticArrays, LinearAlgebra, Random, SpheriCart
+using ForwardDiff
+using SpheriCart: compute, compute!, SolidHarmonics, sizeY, 
+                  static_solid_harmonics, 
+                  compute_with_gradients
+
+@info("============= Testset Solid Harmonics =============")
+##
+
+# This is an implementation that ignores any normalisation factors. 
+# Correctness of the implementation will be tested UP TO normalisation. 
+# The normalisation will then separately be tested by computing the gramian 
+# and confirming that the basis if L2-orthonormal on the sphere. 
+
+# TODO: can we replace this against a generated code? (sympy or similar?)
+
+function symbolic_zlm_4(𝐫)
+   x, y, z = tuple(𝐫...)
+   r = norm(𝐫)   
+   return [ 
+      1.0,  # l = 0
+      y,    # l = 1
+      z, 
+      x, 
+      x * y,  # l = 2 
+      y * z, 
+      3 * z^2 - r^2,
+      x * z, 
+      x^2 - y^2, 
+      (3 * x^2 - y^2) * y,   # l = 3
+      x * y * z, 
+      (5 * z^2 - r^2) * y, 
+      (5 * z^2 - 3 * r^2) * z,
+      (5 * z^2 - r^2) * x,
+      (x^2 - y^2) * z, 
+      (x^2 - 3 * y^2) * x, 
+      x * y * (x^2 - y^2),    # l = 4 
+      y * z * (3 * x^2 - y^2), 
+      x * y * (7 * z^2 - r^2), 
+      y * z * (7 * z^2 - 3 * r^2), 
+      (35 * z^4 - 30 * r^2 * z^2 + 3 * r^4),
+      x * z * (7 * z^2 - 3 * r^2),
+      (x^2 - y^2) * (7 * z^2 - r^2),
+      x * z * (x^2 - 3 * y^2),
+      x^2 * (x^2 - 3 * y^2) - y^2 * (3 * x^2 - y^2),
+   ]
+end
+
+# the code to be tested against the symbolic code above 
+# all other implementations will be tested against this. 
+zlm_4 = SolidHarmonics(4; static=true)
+
+𝐫0 = @SVector randn(3)
+Z1 = zlm_4(𝐫0)
+Z2 = symbolic_zlm_4(𝐫0)
+F = Z1 ./ Z2
+
+# check that zlm_4 evaluates the right thing. 
+@test static_solid_harmonics(Val(4), 𝐫0) == zlm_4(𝐫0)
+
+for ntest = 1:30 
+   local 𝐫, Z1, Z2 
+   𝐫 = @SVector randn(3)
+   Z1 = zlm_4(𝐫)
+   Z2 = symbolic_zlm_4(𝐫)
+   @test Z1 ≈ Z2 .* F
+end
+
+##
+
+@info("confirm that the two implementations are consistent with one another")
+for L = 2:10, ntest = 1:10
+   local 𝐫, Z1, Z2, basis 
+   basis = SolidHarmonics(L)
+   basis2 = SolidHarmonics(L; static=false)
+   𝐫 = @SVector randn(3)
+   Z1 = basis(𝐫)
+   Z2 = basis([𝐫,])[:]
+   Z3 = basis2(𝐫)
+   @test Z1 ≈ Z2 ≈ Z3 
+end
+
+##
+
+@info("test the orthogonality on the sphere: G ≈ I")
+
+Random.seed!(0)
+L = 3
+basis = SolidHarmonics(L)
+rand_sphere() = ( (𝐫 = @SVector randn(3)); 𝐫/norm(𝐫) )
+
+for ntest = 1:10
+   local Z
+   Rs = [ rand_sphere() for _ = 1:10_000 ] 
+   Z = compute(basis, Rs)
+   G = (Z' * Z) / length(Rs) * 4 * π
+   @test norm(G - I) < 0.33
+   @test cond(G) < 1.5
+end
+
+
+##
+
+@info("confirm batched evaluation is consistent with single")
+for L = 2:10, ntest = 1:10
+   local basis, Z1, Z2
+   basis = SolidHarmonics(L)
+   nbatch = rand(8:20)
+   Rs = [ @SVector randn(3) for _=1:nbatch ]
+   Z1 = reinterpret(reshape, Float64, 
+                     static_solid_harmonics.(Val(L), Rs), )'
+   Z2 = compute(basis, Rs)
+
+   @test Z1 ≈ Z2
+end
+
+##
+
+@info("test gradients")
+
+zlm_4 = SolidHarmonics(12; static=false)
+
+function fwd_grad(basis, 𝐫)
+   Z = basis(𝐫)
+   dZ = ForwardDiff.jacobian(basis, 𝐫)'
+   return Z, [ SVector{3, eltype(𝐫)}(dZ[:, i]...) for i = 1:length(Z) ]
+end
+
+for ntest = 1:30
+   local 𝐫0, Z0, Z1, Z2, dZ1, dZ2
+   𝐫0 = @SVector randn(3)
+   Z0 = zlm_4(𝐫0)
+   Z1, dZ1 = compute_with_gradients(zlm_4, 𝐫0)
+   Z2, dZ2 = fwd_grad(zlm_4, 𝐫0)
+
+   @test Z0 ≈ Z1 ≈ Z2
+   @test dZ1 ≈ dZ2
+end
+
+##
+
+@info("test batched gradients")
+
+basis = SolidHarmonics(12; static=false)
+
+for ntest = 1:30
+   local nX, Rs, Z0, Z1, dZ1, dZ2
+   nX = rand(2:37)
+   Rs = [ @SVector randn(3) for _ = 1:nX ]
+   Z0 = compute(basis, Rs)
+   Z1, dZ1 = compute_with_gradients(basis, Rs)
+   dZ2 = vcat([ reshape(compute_with_gradients(basis, 𝐫)[2], 1, :) for 𝐫 in Rs ]...)
+   
+   @test Z0 ≈ Z1
+   @test dZ1 ≈ dZ2
+end
+
+
+##
+
+@info("test generated gradients")
+
+basis_st = SolidHarmonics(12; static=true)
+basis_dy = SolidHarmonics(12; static=false)
+for ntest = 1:30 
+   local 𝐫, Z0, Z1, Z2, dZ1, dZ2
+   𝐫 = @SVector randn(3)
+   Z0 = basis_st(𝐫)
+   Z1, dZ1 = compute_with_gradients(basis_st, 𝐫)
+   Z2, dZ2 = compute_with_gradients(basis_dy, 𝐫)
+
+   @test Z1 ≈ Z0 ≈ Z2
+   @test dZ1 ≈ dZ2
+end
diff --git a/julia/test/test_sphericalharmonics.jl b/julia/test/test_sphericalharmonics.jl
new file mode 100644
index 000000000..cfb092b38
--- /dev/null
+++ b/julia/test/test_sphericalharmonics.jl
@@ -0,0 +1,79 @@
+
+
+using SpheriCart, StaticArrays, LinearAlgebra, Test, ForwardDiff
+using SpheriCart: idx2lm
+
+@info("============= Testset Spherical Harmonics =============")
+
+##
+
+@info("test consistency of spherical and solid harmonics")
+
+L = 12
+spheri = SphericalHarmonics(L)
+solids = SolidHarmonics(L)
+
+for ntest = 1:30 
+   local 𝐫, r, 𝐫̂, Z1, Z2, ll, rˡ
+   𝐫 = @SVector randn(3)
+   r = norm(𝐫)
+   𝐫̂ = 𝐫 / r 
+
+   Z1 = spheri(𝐫)
+   Z2 = solids(𝐫̂)
+   @test Z1 ≈ Z2 
+
+   ll = [ idx2lm(i)[1] for i = 1:length(Z1) ]
+   rˡ = [ r^l for l = ll ]
+   @test Z1 .* rˡ ≈ solids(𝐫)
+end 
+
+
+##
+
+@info("test consistency of batched spherical harmonics")
+
+for ntest = 1:30 
+   local Rs, Z1, Z2
+   Rs = [ @SVector randn(3) for _ = 1:rand(2:37) ]
+   Z1 = spheri(Rs)
+   Z2 = 0 * copy(Z1)
+   compute!(Z2, spheri, Rs)
+   Z3 = vcat([ reshape(spheri(𝐫), 1, :) for 𝐫 in Rs ]...)
+   @test Z1 ≈ Z2 ≈ Z3
+end
+
+##
+
+@info("test gradients")
+
+function fwd_grad_1(basis, 𝐫)
+   Z = basis(𝐫)
+   dZ = ForwardDiff.jacobian(basis, 𝐫)'
+   return Z, [ SVector{3, eltype(𝐫)}(dZ[:, i]...) for i = 1:length(Z) ]
+end
+
+for ntest = 1:30 
+   local 𝐫, Y1, Y2, ∇Y1, ∇Y2
+   𝐫 = @SVector randn(3)
+   Y1, ∇Y1 = compute_with_gradients(spheri, 𝐫)
+   Y2, ∇Y2 = fwd_grad_1(spheri, 𝐫)
+   @test Y1 ≈ Y2
+   @test ∇Y1 ≈ ∇Y2
+end 
+
+##
+
+@info("test batched gradients")
+
+for ntest = 1:30 
+   local nX, Rs, Y1, Y2, ∇Y1, ∇Y2
+   nX = rand(2:37)
+   Rs = [ @SVector randn(3) for _ = 1:nX ]
+   Y1, ∇Y1 = compute_with_gradients(spheri, Rs)
+   Y2 = copy(Y1); ∇Y2 = copy(∇Y1)
+   compute_with_gradients!(Y2, ∇Y2, spheri, Rs)
+   ∇Y3 = vcat([ reshape(compute_with_gradients(spheri, 𝐫)[2], 1, :) for 𝐫 in Rs ]...)
+   @test Y1 ≈ Y2
+   @test ∇Y1 ≈ ∇Y2 ≈ ∇Y3
+end
diff --git a/pyproject.toml b/pyproject.toml
index 350e14189..2d672f02e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires-python = ">=3.7"
 dependencies = ["numpy"]
 
 readme = "README.md"
-license = {text = "Apache-2.0"}
+license = {text = "Apache-2.0 or MIT"}
 description = "Fast calculation of spherical harmonics"
 authors = [
     {name = "Filippo Bigi"},