From 4c05d88ba08d6bcecf9a416309d0040ece153b26 Mon Sep 17 00:00:00 2001
From: Ethan Meitz <54505069+ejmeitz@users.noreply.github.com>
Date: Tue, 17 Oct 2023 14:03:51 -0400
Subject: [PATCH] stuff

---
 Project.toml           |   2 +-
 src/autodiff_test.jl   |  60 +++++--
 src/finite_diff_ifc.jl |  58 ++-----
 src/fourth_order.jl    |  16 +-
 src/mcc.jl             | 104 +++++++-----
 src/old.jl             |   6 +
 src/third_order.jl     | 360 ++++++++++++++++++++++++-----------------
 src/types.jl           |  59 ++++---
 test/runtests.jl       |  35 ++--
 9 files changed, 407 insertions(+), 293 deletions(-)

diff --git a/Project.toml b/Project.toml
index 2abfeb6..ad037fa 100644
--- a/Project.toml
+++ b/Project.toml
@@ -20,7 +20,7 @@ Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
 cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [compat]
-SimpleCrystals = "0.1.6"
+SimpleCrystals = "0.1.7"
 TensorOperations = "4.0.0"
 julia = "1"
 
diff --git a/src/autodiff_test.jl b/src/autodiff_test.jl
index c74f6b4..15821f8 100644
--- a/src/autodiff_test.jl
+++ b/src/autodiff_test.jl
@@ -97,16 +97,8 @@ function third_order_AD_test(sys::SuperCellSystem{D}, pot::PairPotential, tol) w
         end
     end
 
-    #Acoustic Sum Rule #&TODO
-    # Threads.@threads for i in range(1, N_atoms) # index of block matrix
-    #     for α in range(1,D)
-    #         for β in range(1,D)
-    #             ii = D*(i-1) + α
-    #             jj = D*(i-1) + β # i == j because we're on diagonal
-    #             IFC2[ii,jj] = -1*sum(IFC2[ii, β:D:end])
-    #         end
-    #     end
-    # end
+    #Acoustic Sum Rule
+    ASR!(IFC3, N_atoms, D)
 
     IFC3 = apply_tols!(IFC3,tol)
 
@@ -114,7 +106,7 @@ function third_order_AD_test(sys::SuperCellSystem{D}, pot::PairPotential, tol) w
 
 end
 
-function fourth_order_AD_test(sys::SuperCellSystem{D}, pot::PairPotential, tol; float_type = Float32) where D
+function fourth_order_AD_test(sys::SuperCellSystem{D}, pot::PairPotential, tol) where D
     vars = make_variables(:r, D)
     r_norm = sqrt(sum(x -> x^2, vars))
     pot_symbolic = potential_nounits(pot, r_norm)
@@ -124,7 +116,7 @@ function fourth_order_AD_test(sys::SuperCellSystem{D}, pot::PairPotential, tol;
     FO_exec = make_function(fourth_order_symbolic, vars)
 
     N_atoms = n_atoms(sys)
-    IFC4 = FC_val{float_type,4}[]
+    IFC4 = FC_val{Float64,4}[]
 
     for i in range(1, N_atoms)
         for j in range(i + 1, N_atoms)
@@ -143,10 +135,10 @@ function fourth_order_AD_test(sys::SuperCellSystem{D}, pot::PairPotential, tol;
                                 #iiij
                                 ii = D*(i-1) + α; jj = D*(i-1) + β; kk = D*(i-1) + γ; ll = D*(j-1) + δ
                                 #& THERE WILL BE DUPLICATES SINCE THIS USES PUSH INSTEAD OF JUST OVERWRITTING DATA!!!
-                                set_terms_fourth_order!(χ, ii, jj, kk, ll, float_type(iiij_block[α,β,γ,δ]))
+                                set_terms_fourth_order!(χ, ii, jj, kk, ll, iiij_block[α,β,γ,δ])
                                 #iijj
                                 ii = D*(i-1) + α; jj = D*(i-1) + β; kk = D*(j-1) + γ; ll = D*(j-1) + δ
-                                set_terms_fourth_order!(χ, ii, jj, kk, ll, float_type(iiij_block[α,β,γ,δ]))
+                                set_terms_fourth_order!(χ, ii, jj, kk, ll, iiij_block[α,β,γ,δ])
                             end
                         end
                     end
@@ -167,7 +159,7 @@ function fourth_order_AD_test(sys::SuperCellSystem{D}, pot::PairPotential, tol;
                             if abs(iiij_block[α,β,γ,δ]) > tol
                                 #ijjj
                                 ii = D*(i-1) + α; jj = D*(j-1) + β; kk = D*(j-1) + γ; ll = D*(j-1) + δ
-                                set_terms_fourth_order!(χ, ii, jj, kk, ll, float_type(ijjj_block[α,β,γ,δ]))
+                                set_terms_fourth_order!(χ, ii, jj, kk, ll, ijjj_block[α,β,γ,δ])
                             end
                         end
                     end
@@ -219,4 +211,40 @@ function energy_loop_mcc(sys::SuperCellSystem{D}, pot::PairPotential, q, phi, ma
 
     return U_total
 
-end
\ No newline at end of file
+end
+
+#q_vars = make_variable(:q, N_modes)
+#r_vars = zeros(eltype(q_vars), length(q_vars))
+#for i in 1:N_modes
+#   r_vars[i] = dot(q_vars, phi[i,:])
+#end
+
+# divide r_vars by mass_sqrt
+
+# pass r_vars into energy_loop2
+
+# function energy_loop2(pot::PairPotential, posns, r_cut, box_sizes, N_atoms)
+
+#     U_total = 0.0
+#     box_sizes = ustrip.(box_sizes)
+
+#     for i in range(1,N_atoms)
+#         for j in range(i+1, N_atoms)             
+#             r = posns[3*(i-1) + 1: 3*i] .- posns[3*(j-1) + 1: 3*j ]
+#             nearest_mirror_AD!(r, box_sizes)
+#             dist = sqrt(sum(x-> x*x, r))
+#             # if dist < r_cut
+#             U_total += potential_nounits(pot, dist)
+#             # end
+#         end
+#     end
+
+#     return U_total
+
+# end
+
+# #& not quite right but I just wanna test the AD
+# function nearest_mirror_AD!(r_ij, box_sizes)
+#     r_ij .+= sign.(r_ij .- box_sizes./2) .* box_sizes
+#     return r_ij
+# end
\ No newline at end of file
diff --git a/src/finite_diff_ifc.jl b/src/finite_diff_ifc.jl
index a1423db..bfc6771 100644
--- a/src/finite_diff_ifc.jl
+++ b/src/finite_diff_ifc.jl
@@ -15,8 +15,7 @@ function second_order_finite_diff(sys_eq::SuperCellSystem{3}, pot::PairPotential
 
    energy_unit = zero(potential(pot,1u"Å")) 
 
-   #Make mutable #& change SimpleCrystals to not use SVector
-   posns = [Vector(a) for a in positions(sys_eq)]
+   posns = positions(sys_eq)
 
    if atom_idxs[1] != atom_idxs[2]
 
@@ -61,9 +60,8 @@ function third_order_finite_diff(sys_eq::SuperCellSystem{3}, pot::PairPotential,
 
    @assert length(atom_idxs) == 3
    @assert length(cartesian_idxs) == 3
-   @assert !all(atom_idxs .== atom_idxs[1]) "Cannot check self terms with finite difference"
 
-   N_atoms = n_atoms(sys)
+   N_atoms = n_atoms(sys_eq)
 
    z = zero(pot.σ)
    combos = [[z,h,-h],[z,h,h],[z,-h,h],[z,-h,-h]]
@@ -71,15 +69,14 @@ function third_order_finite_diff(sys_eq::SuperCellSystem{3}, pot::PairPotential,
    force_unit = zero(force(pot,1u"Å")) 
    forces = zeros(4)*force_unit
 
-   #Make mutable #& change SimpleCrystals to not use SVector
-   posns = [Vector(a) for a in positions(sys)]
+   posns = positions(sys_eq)
 
    for (c,combo) in enumerate(combos)
 
        posns[atom_idxs[2]][cartesian_idxs[2]] += combo[2]
        posns[atom_idxs[3]][cartesian_idxs[3]] += combo[3]
 
-       forces[c] = force_loop_j(pot, posns, force_unit, r_cut, sys.box_sizes_SC, N_atoms, atom_idxs[1], cartesian_idxs[1])
+       forces[c] = force_loop_j(pot, posns, force_unit, r_cut, sys_eq.box_sizes_SC, N_atoms, atom_idxs[1], cartesian_idxs[1])
 
        posns[atom_idxs[2]][cartesian_idxs[2]] -= combo[2]
        posns[atom_idxs[3]][cartesian_idxs[3]] -= combo[3]
@@ -98,16 +95,15 @@ function fourth_order_finite_diff(sys_eq::SuperCellSystem{3}, pot::PairPotential
    @assert length(cartesian_idxs) == 4
    @assert !all(atom_idxs .== atom_idxs[1]) "Cannot check self terms with finite difference"
 
-   N_atoms = n_atoms(sys)
+   N_atoms = n_atoms(sys_eq)
 
    combos = [[h,h,h],[h,h,-h],[h,-h,h],[h,-h,-h],[-h,h,h],[-h,h,-h],[-h,-h,h],[-h,-h,-h]]
    
    force_unit = zero(force(pot,1u"Å")) 
    forces = zeros(8)*force_unit
 
-   #Make mutable #& change SimpleCrystals to not use SVector
-   posns = [Vector(a) for a in positions(sys)]
-
+   posns = positions(sys_eq)
+   
    for (c,combo) in enumerate(combos)
 
        posns[atom_idxs[1]][cartesian_idxs[1]] += combo[1]
@@ -120,7 +116,7 @@ function fourth_order_finite_diff(sys_eq::SuperCellSystem{3}, pot::PairPotential
        for i in range(1,N_atoms)
            if i != l               
                r = posns[i] .- posns[l]
-               nearest_mirror!(r, sys.box_sizes_SC)
+               nearest_mirror!(r, sys_eq.box_sizes_SC)
                dist = norm(r)
                
                #Make sure mirrored particle is in cuttoff
@@ -148,48 +144,28 @@ function fourth_order_finite_diff(sys_eq::SuperCellSystem{3}, pot::PairPotential
    return (1/(8*(h^3)))*(forces[1] - forces[2] - forces[3] + forces[4] - forces[5] + forces[6] + forces[7] - forces[8])
 end
 
+#& CURRENTLY DIFFERS by -1 FROM MINE
 function check_ifc(sys::SuperCellSystem{D}, ifc::DenseForceConstants{O}, pot::PairPotential, n_points;
     r_cut = pot.r_cut, h = 0.04*0.5291772109u"Å") where {D,O}
 
     N_atoms = n_atoms(sys)
-    ri = rand(1:N_atoms, (n_points,O)) #random indexes
+    ri = rand(1:N_atoms, (n_points,O)) #random atom indexes
     rci = rand(1:D, (n_points, O)) #random cartesian indexes
 
     finite_diff_funcs = [second_order_finite_diff, third_order_finite_diff, fourth_order_finite_diff]
     finite_diff_func = finite_diff_funcs[O-1]
 
-    ifc_fd = zeros(n_points)*ifc.units
-    #& ONLY WORKS FOR SPECIFIC DIMENSION
-    ifc_actual = [ifc[D*(ri[i,1] - 1) + rci[i,1], D*(ri[i,2] - 1) + rci[i,2]] for i in 1:n_points]
-    # ifc_actual = [ifc[D*(ri[i,1] - 1) + rci[i,1], D*(ri[i,2] - 1) + rci[i,2], D*(ri[i,3] - 1) + rci[i,3]]
-    #                  for i in 1:n_points]
+    idxs = (ri_row, rci_row) -> [D*(ri_row[o] - 1) + rci_row[o] for o in 1:O]
+    ifc_actual = @views [ifc[idxs(ri[i,:], rci[i,:])...] for i in 1:n_points]
  
+    ifc_fd = zeros(n_points)*ifc.units
     for i in 1:n_points
-        ifc_fd[i] = finite_diff_func(sys, pot, ri[i,:], rci[i,:]; r_cut = r_cut, h = h)
+        val = finite_diff_func(sys, pot, ri[i,:], rci[i,:]; r_cut = r_cut, h = h)
+        if ustrip.(abs(val)) > ifc.tol
+            ifc_fd[i] = val
+        end
     end
 
     return ifc_fd, (ifc_actual.*ifc.units)
 
-end
-
-function check_ifc(sys::SuperCellSystem{D}, ifc::SparseForceConstants{O}, pot::PairPotential, n_points;
-    r_cut = pot.r_cut, h = 0.04*0.5291772109u"Å") where {D,O}
-
-    random_idxs = rand(1:length(ifc), (n_points,)) 
-
-    finite_diff_funcs = [second_order_finite_diff, third_order_finite_diff, fourth_order_finite_diff]
-    finite_diff_func = finite_diff_funcs[O-1]
-
-    ifc_fd = zeros(n_points)*ifc.units
-    ifc_actual = value.(ifc[random_idxs])
- 
-    for p in 1:n_points
-        idxs = idx(ifc[random_idxs[p]])
-        cart_idxs = ((idxs .- 1) .% D) .+ 1
-        atom_idxs = ((idxs .- cart_idxs) ./ D) .- 1
-        ifc_fd[p] = finite_diff_func(sys, pot, atom_idxs, cart_idxs; r_cut = r_cut, h = h)
-    end
-
-    return ifc_fd,  (ifc_actual.*ifc.units)
-
 end
\ No newline at end of file
diff --git a/src/fourth_order.jl b/src/fourth_order.jl
index 8fa394e..e788700 100644
--- a/src/fourth_order.jl
+++ b/src/fourth_order.jl
@@ -7,7 +7,7 @@ i = j and k = l (e.g. (i,i,j,j)) and terms where i = j = k (e.g. (i,i,i,j)). All
 more than 2 unique indices will be 0.
 """
 function fourth_order_sparse(sys::SuperCellSystem{D}, pot::PairPotential,
-         tol; float_type = Float32, r_cut = pot.r_cut, nthreads::Integer = Threads.nthreads()) where D
+         tol; r_cut = pot.r_cut, nthreads::Integer = Threads.nthreads()) where D
 
     @assert all(r_cut .< sys.box_sizes_SC) "Cutoff larger than L/2"
     
@@ -21,7 +21,7 @@ function fourth_order_sparse(sys::SuperCellSystem{D}, pot::PairPotential,
     #Create storage for each thread
     χ_arrays = MultiVectorStorage(FC_val{float_type,4}, nthreads)
 
-    atoms_per_thd = cld(N_atoms,nthreads) #* DONT LIKE THIS, BAD BALANCING
+    atoms_per_thd = cld(N_atoms,nthreads) #* DONT LIKE THIS, BAD BALANCING, USE ITERATORS.PARTITION
 
     Base.@sync for thd in 1:nthreads
         Base.Threads.@spawn begin
@@ -43,7 +43,7 @@ function fourth_order_sparse(sys::SuperCellSystem{D}, pot::PairPotential,
                             for β in range(1,D)
                                 for γ in range(1,D)
                                     for δ in range(1,D)
-                                        val = float_type(ustrip(ϕ₄(pot, dist, r, α, β, γ, δ)))
+                                        val = ustrip(ϕ₄(pot, dist, r, α, β, γ, δ))
                                         
                                         if abs(val) > tol
                                             # i,i,j,j terms
@@ -73,7 +73,7 @@ function fourth_order_sparse(sys::SuperCellSystem{D}, pot::PairPotential,
                             for β in range(1,D)
                                 for γ in range(1,D)
                                     for δ in range(1,D)
-                                        val = -float_type(ustrip(ϕ₄(pot, dist, r, α, β, γ, δ)))
+                                        val = -ustrip(ϕ₄(pot, dist, r, α, β, γ, δ))
                                         
                                         if abs(val) > tol
                                             # j,j,j,i terms
@@ -97,16 +97,16 @@ function fourth_order_sparse(sys::SuperCellSystem{D}, pot::PairPotential,
     end
 
     #Concat arrays calculated by separate threads
-    χ = convert(Vector{FC_val{float_type, 4}}, χ_arrays)
+    χ = convert(Vector{FC_val{Float64, 4}}, χ_arrays)
 
-    #Acoustic Sum Rule (i,i,i,i terms) #& how to parallelize this part?
-    for i in range(1,N_atoms)
+    #Acoustic Sum Rule (i,i,i,i terms)
+    Threads.@threads for i in range(1,N_atoms)
         for α in range(1,D)
             for β in range(1,D)
                 for γ in range(1,D)
                     for δ in range(1,D)
                         ii = D*(i-1) + α; jj = D*(i-1) + β; kk = D*(i-1) + γ; ll = D*(i-1) + δ
-                        val = float_type(ϕ₄_self(sys, pot, i, α, β, γ, δ, r_cut))
+                        val = ϕ₄_self(sys, pot, i, α, β, γ, δ, r_cut)
                         if abs(val) > tol
                             push!(χ,FC_val(val, ii, jj, kk, ll))
                         end
diff --git a/src/mcc.jl b/src/mcc.jl
index 49f2c4b..82faf7e 100644
--- a/src/mcc.jl
+++ b/src/mcc.jl
@@ -73,46 +73,64 @@ function mcc3_blocked!(K3::CuArray{Float32, 3}, Ψ::CuArray{Float32, 3}, phi::Cu
 end
 
 
-const BLOCK_SIZE = 128
-
-#Each thread calculates fiber of K_nml
-function mcc3_custom_kernel(num_psi_idxs::Int32, mcc::CuDeviceArray{Float32,1}, phi::CuDeviceArray{Float32,2},
-         Ψ_vals::CuDeviceArray, is::CuDeviceArray{Int32,1}, js::CuDeviceArray{Int32,1}, ks::CuDeviceArray{Int32,1}) 
-
-    n = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
-    m = (blockIdx().y-1i32) * blockDim().y + threadIdx().y
-    linearThreadIdx = (blockDim().x * threadIdx().y) + threadIdx().x #& OFF BY 1 iSSUES?
-
-    #To store rows of phi matrix needed by current values of Psi
-    phi_i = CuStaticSharedArray(Float32, BLOCK_SIZE)
-    phi_j = CuStaticSharedArray(Float32, BLOCK_SIZE)
-    phi_k = CuStaticSharedArray(Float32, BLOCK_SIZE)
-
-
-    # Treat thread index as index into Ψ now and accumulate values into MCC
-    for i in 1:BLOCK_SIZE:num_psi_idxs
-        psi_idx = i + linearThreadIdx
-
-        if psi_idx <= num_psi_idxs
-            #Move Phi Data Into Shared Memory
-            phi_i[linearThreadIdx] = phi[is[psi_idx]]
-            phi_j[linearThreadIdx] = phi[js[psi_idx]]
-            phi_k[linearThreadIdx] = phi[ks[psi_idx]] #& This might be the only one that needs to be in shared mem
-            sync_threads()
-
-            #Calculate fiber of MCC
-            for l in 1:N_modes
-                mcc[n,m,l] += Ψ_vals[psi_idx]*phi_i[n]*phi_j[m]*phi_k[l]
-            end
-            sync_threads()
-
-        end
-    end
-
-end
-
-# const BLOCK_SIDE = 32
-# const GRID_SIDE = ceil(sqrt(N)/BLOCK_SIDE)
-# @cuda blocks = (GRID_SIDE,GRID_SIDE) threads = (BLOCK_SIDE,BLOCK_SIDE) mcc3_custom_kernel(
-
-# )
\ No newline at end of file
+# using CUDA
+# import CUDA: i32
+# N = 10000000
+# N_modes = 768;
+# rand_nums = CUDA.rand(N);
+# rand_idxs = rand(1:N_modes, (N,3));
+# is = CuArray{Int32}(rand_idxs[:,1])
+# js = CuArray{Int32}(rand_idxs[:,2])
+# ks = CuArray{Int32}(rand_idxs[:,3])
+
+# phi = CUDA.rand(N_modes, N_modes)
+# mcc_all = zeros(Float32, (N_modes, N_modes, N_modes))
+
+# #256 x 256 x N_modes
+# mcc_chunk_size = 256
+# mcc_chunk = CUDA.zeros(Float32, (mcc_chunk_size, mcc_chunk_size, N_modes))
+
+
+# #CUDA THREAD DIMENSIONS
+# const BLOCK_SIDE::Int32 = 32
+# const BLOCK_SIZE::Int32 = BLOCK_SIDE*BLOCK_SIDE
+
+# const GRID_SIDE::Int64 = Int64(ceil(mcc_chunk_size/BLOCK_SIDE))
+# @cuda blocks = (GRID_SIDE,GRID_SIDE) threads = (BLOCK_SIDE,BLOCK_SIDE) mcc3_custom_kernel(Int32(N), Int32(0), Int32(N_modes), mcc_chunk, phi,rand_nums,is,js,ks)
+
+# #Each thread calculates fiber of K_nml
+# function mcc3_custom_kernel(num_psi_idxs::Int32, mcc_offset::Int32, N_modes::Int32, mcc_block::CuDeviceArray{Float32,3}, phi::CuDeviceArray{Float32,2},
+#     Ψ_vals::CuDeviceArray{Float32,1}, is::CuDeviceArray{Int32,1}, js::CuDeviceArray{Int32,1}, ks::CuDeviceArray{Int32,1}) 
+
+#     n::Int32 = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+#     m::Int32 = (blockIdx().y-1i32) * blockDim().y + threadIdx().y
+#     @cuprintln "$n $m"
+#     linearThreadIdx::Int32 = (blockDim().x * (threadIdx().y-1i32)) + threadIdx().x #& OFF BY 1 iSSUES?
+
+#     #To store rows of phi matrix needed by current values of Psi
+#     phi_i = CuStaticSharedArray(Float32, BLOCK_SIZE)
+#     phi_j = CuStaticSharedArray(Float32, BLOCK_SIZE)
+#     phi_k = CuStaticSharedArray(Float32, BLOCK_SIZE)
+
+#     # Treat thread index as index into Ψ now and accumulate values into MCC
+#     for i in 1:BLOCK_SIZE:num_psi_idxs
+#         psi_idx::Int32 = i + linearThreadIdx
+
+#         if psi_idx <= num_psi_idxs
+#             #Move Phi Data Into Shared Memory
+#             phi_i[linearThreadIdx] = phi[is[psi_idx],linearThreadIdx] #* this read is not in order of memory
+#             phi_j[linearThreadIdx] = phi[js[psi_idx],linearThreadIdx]
+#             phi_k[linearThreadIdx] = phi[ks[psi_idx],linearThreadIdx] #& This might be the only one that needs to be in shared mem
+#             CUDA.sync_threads()
+
+#             #Calculate fiber of MCC
+#             # if m >= n #* this could make it slower??
+#             for l in 1:N_modes
+#                 mcc_block[n,m,l] += Ψ_vals[psi_idx]*phi_i[n + mcc_offset]*phi_j[m + mcc_offset]*phi_k[l]
+#             end
+#             # end
+#             CUDA.sync_threads()
+#         end
+#     end
+#     return
+# end
diff --git a/src/old.jl b/src/old.jl
index 41aab2b..c61f2a9 100644
--- a/src/old.jl
+++ b/src/old.jl
@@ -19,6 +19,12 @@
 #    return mapreduce(f, +, cuF3_sparse_mw)
 # end
 
+# function test(cuF3_sparse_mw::CuArray{F3_val}, q::CuArray{Float32,1})
+#     f = (f3_data) -> f3_data.val * q[f3_data.i] * q[f3_data.j] * q[f3_data.k]
+#    return mapreduce(f, +, cuF3_sparse_mw)
+# end
+
+
 # """
 # Takes in sparse third order data that has already been mass weighted
 
diff --git a/src/third_order.jl b/src/third_order.jl
index 13fb230..788efb2 100644
--- a/src/third_order.jl
+++ b/src/third_order.jl
@@ -1,72 +1,175 @@
-export third_order_IFC, third_order_test, mass_weight_sparsify_third_order, mass_weight_third_order!, F3_val
+export third_order_IFC, third_order_sparse, mass_weight_sparsify_third_order, mass_weight_third_order!, F3_val
 
-"""
-Calculates analytical third order force constants for a pair potential (e.g. Lennard Jones).
-For pair potentials the only non-zero terms will be self-terms (e.g. i,i,i) and terms where
-i = j or j = k (e.g. 1,1,2).
-"""
 function third_order_IFC(sys::SuperCellSystem{D}, pot::PairPotential, tol) where D
     @assert all(pot.r_cut .< sys.box_sizes_SC) "Cutoff larger than L/2"
     N_atoms = n_atoms(sys)
-    Ψ = zeros(D*N_atoms,D*N_atoms,D*N_atoms)
-
-    unique_indices = with_replacement_combinations((1:N_atoms), 3)
-    is_not_self_term = (i,j,k) -> !(i == j && i == k)
-    not_all_unique = (i,j,k) -> (i == j || i == k || j == k)
-
-    r = zeros(D)*unit(sys.atoms.position[1][1])
-    for idx in unique_indices #TODO PARALLELIZE THIS, HOW TO STILL USE ITERATOR? just for i <= j <= k
-        i,j,k = idx
-        if is_not_self_term(i,j,k) && not_all_unique(i,j,k)
-            # println(i," ",j," ",k)
-
-            if i == j
-                r .= sys.atoms.position[i] .- sys.atoms.position[k]
-            elseif i == k
-                r .= sys.atoms.position[i] .- sys.atoms.position[j]
-            elseif j == k
-                r .= sys.atoms.position[j] .- sys.atoms.position[i]
-            end
+    Ψ = zeros(D*N_atoms, D*N_atoms, D*N_atoms)
+
+
+    # pot is pair potential only loop atomic pairs
+    Threads.@threads for i in range(1,N_atoms)
+        r = zeros(D)*unit(sys.atoms.position[1][1]) #pre-allocate per thread
+        for j in range(i+1,N_atoms)
+
+            #i,i,j terms
+            r .= sys.atoms.position[i] .- sys.atoms.position[j]
+            nearest_mirror!(r, sys.box_sizes_SC)
+            dist = norm(r)
+
             
-            r .= nearest_mirror(r, sys.box_sizes_SC)
+            if dist < pot.r_cut
+                for α in range(1,D)
+                    for β in range(1,D)
+                        for γ in range(1,D)
+                            val = -ustrip(ϕ₃(pot, dist, r, α, β, γ)) #* re-calculating derivatives ununecessarily in here
+                            ii = D*(i-1) + α; jj = D*(i-1) + β; kk = D*(j-1) + γ
+                            Ψ[ii,jj,kk] = val; Ψ[ii,kk,jj] = val
+                            Ψ[jj,kk,ii] = val; Ψ[jj,ii,kk] = val
+                            Ψ[kk,ii,jj] = val; Ψ[kk,jj,ii] = val 
+                        end
+                    end
+                end            
+
+            end
+
+            # #j,j,i term
+            r .= sys.atoms.position[j] .- sys.atoms.position[i]
+            nearest_mirror!(r, sys.box_sizes_SC)
             dist = norm(r)
 
             if dist < pot.r_cut
                 for α in range(1,D)
                     for β in range(1,D)
                         for γ in range(1,D)
-                            ii = D*(i-1) + α; jj = D*(j-1) + β; kk = D*(k-1) + γ
-    
-                            val = -ustrip(ϕ₃(pot, dist, r, α, β, γ))
+                            val = -ustrip(ϕ₃(pot, dist, r, α, β, γ)) 
+                            ii = D*(i-1) + α; jj = D*(j-1) + β; kk = D*(j-1) + γ
                             Ψ[ii,jj,kk] = val; Ψ[ii,kk,jj] = val
                             Ψ[jj,kk,ii] = val; Ψ[jj,ii,kk] = val
-                            Ψ[kk,ii,jj] = val; Ψ[kk,jj,ii] = val
+                            Ψ[kk,ii,jj] = val; Ψ[kk,jj,ii] = val 
                         end
                     end
+                end            
+
+            end
+        end
+    end
+
+    #Self terms
+    Ψ = ASR!(Ψ, N_atoms, D)
+
+    #Apply tolerances
+    Ψ = apply_tols!(Ψ, tol)
+
+    #Give proper units
+    Ψ_unit = unit(pot.ϵ / pot.σ^3)
+
+    return DenseForceConstants(Ψ, Ψ_unit, tol)
+    
+end
+
+#* untested
+function third_order_sparse(sys::SuperCellSystem{D}, pot::PairPotential,
+    tol; r_cut = pot.r_cut, nthreads::Integer = Threads.nthreads()) where D
+
+    throw(error("Not yet implemented"))
+
+    @assert all(r_cut .< sys.box_sizes_SC) "Cutoff larger than L/2"
+    
+    N_atoms = n_atoms(sys)
+
+    if (N_atoms < nthreads)
+        @warn "More threads than atoms, using N_atoms as number of threads"
+        nthreads = N_atoms
+    end
+
+    #Create storage for each thread
+    Ψ_arrays = MultiVectorStorage(FC_val{Float64,3}, nthreads)
+    atoms_per_thd = cld(N_atoms,nthreads) #* DONT LIKE THIS, BAD BALANCING, USE ITERATORS.PARTITION
+
+    Base.@sync for thd in 1:nthreads
+        Base.Threads.@spawn begin
+            Ψ_thd = Ψ_arrays.data[thd] #Get local storage for force constants
+            start_idx = (thd-1)*atoms_per_thd + 1
+            end_idx = min(thd*atoms_per_thd, N_atoms)
+
+            r = zeros(D)*unit(sys.atoms.position[1][1]) #pre-allocate
+            for i in range(start_idx, end_idx)
+                for j in range(i+1,N_atoms)
+
+                    r .= sys.atoms.position[i] .- sys.atoms.position[j]
+                    nearest_mirror!(r, sys.box_sizes_SC)
+                    dist = norm(r)
+        
+                    
+                    if dist < pot.r_cut
+                        for α in range(1,D)
+                            for β in range(1,D)
+                                for γ in range(1,D)
+                                    val = -ustrip(ϕ₃(pot, dist, r, α, β, γ)) #* re-calculating derivatives ununecessarily in here
+                                    ii = D*(i-1) + α; jj = D*(i-1) + β; kk = D*(j-1) + γ
+                                    push!(Ψ_thd, FC_val(val,ii,jj,kk))
+                                    push!(Ψ_thd, FC_val(val,ii,kk,jj))
+                                    push!(Ψ_thd, FC_val(val,jj,kk,ii))
+                                    push!(Ψ_thd, FC_val(val,jj,ii,kk))
+                                    push!(Ψ_thd, FC_val(val,kk,ii,jj))
+                                    push!(Ψ_thd, FC_val(val,kk,jj,ii))
+                                end
+                            end
+                        end            
+        
+                    end
+        
+                    # #j,j,i term
+                    r .= sys.atoms.position[j] .- sys.atoms.position[i]
+                    nearest_mirror!(r, sys.box_sizes_SC)
+                    dist = norm(r)
+        
+                    if dist < pot.r_cut
+                        for α in range(1,D)
+                            for β in range(1,D)
+                                for γ in range(1,D)
+                                    val = -ustrip(ϕ₃(pot, dist, r, α, β, γ)) 
+                                    ii = D*(i-1) + α; jj = D*(j-1) + β; kk = D*(j-1) + γ
+                                    push!(Ψ_thd, FC_val(val,ii,jj,kk))
+                                    push!(Ψ_thd, FC_val(val,ii,kk,jj))
+                                    push!(Ψ_thd, FC_val(val,jj,kk,ii))
+                                    push!(Ψ_thd, FC_val(val,jj,ii,kk))
+                                    push!(Ψ_thd, FC_val(val,kk,ii,jj))
+                                    push!(Ψ_thd, FC_val(val,kk,jj,ii))
+                                end
+                            end
+                        end            
+        
+                    end
                 end
             end
         end
     end
 
-    #Acoustic Sum Rule (technically re-calculating the whole loop above here)
-    Threads.@threads for i in range(1,N_atoms)
+    Ψ = convert(Vector{FC_val{Float64, 3}}, Ψ_arrays)
+
+    #Acoustic Sum Rule (i,i,i,i terms) #& how to parallelize this part?
+    #* Can keep running track of these terms when generating others
+    for i in range(1,N_atoms)
         for α in range(1,D)
             for β in range(1,D)
                 for γ in range(1,D)
                     ii = D*(i-1) + α; jj = D*(i-1) + β; kk = D*(i-1) + γ
-                    Ψ[ii,jj,kk] = ϕ₃_self(sys, pot, i, α, β, γ)
+                    val = ϕ₃_self(sys, pot, i, α, β, γ)
+                    if abs(val) > tol
+                        push!(Ψ ,FC_val(val, ii, jj, kk))
+                    end
                 end
             end
         end
     end
 
-    #Apply tolerances
-    Ψ = apply_tols!(Ψ, tol)
 
     #Give proper units
     Ψ_unit = unit(pot.ϵ / pot.σ^3)
 
-    return DenseForceConstants(Ψ, Ψ_unit, tol)
+    return SparseForceConstants(Ψ, Ψ_unit, tol)
+
 end
 
 function mass_weight_third_order!(Ψ::ThirdOrderForceConstants, masses::AbstractVector)
@@ -133,118 +236,6 @@ function ϕ₃_self(sys::SuperCellSystem, pot::PairPotential, i, α, β, γ)
 end
 
 
-"""
-Mass weights the force constant matrix such that element i,j,k is divided by
-sqrt(m_i * m_j * m_k). This is useful when converting to modal coupling constants.
-The force constants are also returned in sparse format as a vector of values and indices.
-"""
-function mass_weight_sparsify_third_order(Ψ::ThirdOrderForceConstants, masses::AbstractVector)
-
-    N_modes = size(Ψ)[1]
-    D = Int(N_modes/length(masses))
-    N_atoms = length(masses)
-
-    @assert D ∈ [1,2,3] 
-
-    mass_unit = unit(masses[1])
-    masses = ustrip.(masses)
-
-    num_nonzero = sum(Ψ.values .!= 0.0)
-    Ψ_non_zero_mw = Vector{F3_val}(undef,(num_nonzero,))
-
-    count = 1
-    for i in 1:N_atoms
-        for j in 1:N_atoms
-            for k in 1:N_atoms
-                for α in 1:D
-                    for β in 1:D
-                        for γ in 1:D
-                            ii = D*(i-1) + α; jj = D*(j-1) + β; kk = D*(k-1) + γ
-                            if Ψ[ii,jj,kk] != 0
-                                Ψ_non_zero_mw[count] = F3_val(ii, jj, kk, Ψ[ii,jj,kk]/sqrt(masses[i]*masses[j]*masses[k]))
-                                count += 1
-                            end
-                        end
-                    end
-                end
-            end
-        end
-    end
-    @assert (count - 1) == num_nonzero
-
-
-    return SparseForceConstants(Ψ_non_zero_mw, Ψ.units/mass_unit, Ψ.tol)
-end
-
-
-function third_order_test(sys::SuperCellSystem{D}, pot::PairPotential, tol) where D
-    @assert all(pot.r_cut .< sys.box_sizes_SC) "Cutoff larger than L/2"
-    N_atoms = n_atoms(sys)
-    Ψ = zeros(D*N_atoms, D*N_atoms, D*N_atoms)
-
-
-    # pot is pair potential only loop atomic pairs
-    Threads.@threads for i in range(1,N_atoms)
-        r = zeros(D)*unit(sys.atoms.position[1][1]) #pre-allocate per thread
-        for j in range(i+1,N_atoms)
-
-            #i,i,j terms
-            r .= sys.atoms.position[i] .- sys.atoms.position[j]
-            nearest_mirror!(r, sys.box_sizes_SC)
-            dist = norm(r)
-
-            
-            if dist < pot.r_cut
-                for α in range(1,D)
-                    for β in range(1,D)
-                        for γ in range(1,D)
-                            val = -ustrip(ϕ₃(pot, dist, r, α, β, γ)) #* re-calculating derivatives ununecessarily in here
-                            ii = D*(i-1) + α; jj = D*(i-1) + β; kk = D*(j-1) + γ
-                            Ψ[ii,jj,kk] = val; Ψ[ii,kk,jj] = val
-                            Ψ[jj,kk,ii] = val; Ψ[jj,ii,kk] = val
-                            Ψ[kk,ii,jj] = val; Ψ[kk,jj,ii] = val 
-                        end
-                    end
-                end            
-
-            end
-
-            # #j,j,i term
-            r .= sys.atoms.position[j] .- sys.atoms.position[i]
-            nearest_mirror!(r, sys.box_sizes_SC)
-            dist = norm(r)
-
-            if dist < pot.r_cut
-                for α in range(1,D)
-                    for β in range(1,D)
-                        for γ in range(1,D)
-                            val = -ustrip(ϕ₃(pot, dist, r, α, β, γ)) 
-                            ii = D*(i-1) + α; jj = D*(j-1) + β; kk = D*(j-1) + γ
-                            Ψ[ii,jj,kk] = val; Ψ[ii,kk,jj] = val
-                            Ψ[jj,kk,ii] = val; Ψ[jj,ii,kk] = val
-                            Ψ[kk,ii,jj] = val; Ψ[kk,jj,ii] = val 
-                        end
-                    end
-                end            
-
-            end
-        end
-    end
-
-    #Self terms
-    Ψ = ASR!(Ψ, N_atoms, D)
-
-    #Apply tolerances
-    Ψ = apply_tols!(Ψ, tol)
-
-    #Give proper units
-    Ψ_unit = unit(pot.ϵ / pot.σ^3)
-
-    return DenseForceConstants(Ψ, Ψ_unit, tol)
-    
-end
-
-
 function ASR!(ifc3::Array{T,3}, N_atoms, D) where T
     #Loop all self terms
     Threads.@threads for i in range(1,N_atoms)
@@ -267,4 +258,75 @@ function ASR!(ifc3::Array{T,3}, N_atoms, D) where T
     end
 
     return ifc3
-end
\ No newline at end of file
+end
+
+
+#Old slower version
+# """
+# Calculates analytical third order force constants for a pair potential (e.g. Lennard Jones).
+# For pair potentials the only non-zero terms will be self-terms (e.g. i,i,i) and terms where
+# i = j or j = k (e.g. 1,1,2).
+# """
+# function third_order_IFC(sys::SuperCellSystem{D}, pot::PairPotential, tol) where D
+#     @assert all(pot.r_cut .< sys.box_sizes_SC) "Cutoff larger than L/2"
+#     N_atoms = n_atoms(sys)
+#     Ψ = zeros(D*N_atoms,D*N_atoms,D*N_atoms)
+
+#     unique_indices = with_replacement_combinations((1:N_atoms), 3)
+#     is_not_self_term = (i,j,k) -> !(i == j && i == k)
+#     not_all_unique = (i,j,k) -> (i == j || i == k || j == k)
+
+#     r = zeros(D)*unit(sys.atoms.position[1][1])
+#     for idx in unique_indices #TODO PARALLELIZE THIS, HOW TO STILL USE ITERATOR? just for i <= j <= k
+#         i,j,k = idx
+#         if is_not_self_term(i,j,k) && not_all_unique(i,j,k)
+#             # println(i," ",j," ",k)
+
+#             if i == j
+#                 r .= sys.atoms.position[i] .- sys.atoms.position[k]
+#             elseif i == k
+#                 r .= sys.atoms.position[i] .- sys.atoms.position[j]
+#             elseif j == k
+#                 r .= sys.atoms.position[j] .- sys.atoms.position[i]
+#             end
+            
+#             r .= nearest_mirror(r, sys.box_sizes_SC)
+#             dist = norm(r)
+
+#             if dist < pot.r_cut
+#                 for α in range(1,D)
+#                     for β in range(1,D)
+#                         for γ in range(1,D)
+#                             ii = D*(i-1) + α; jj = D*(j-1) + β; kk = D*(k-1) + γ
+    
+#                             val = -ustrip(ϕ₃(pot, dist, r, α, β, γ))
+#                             Ψ[ii,jj,kk] = val; Ψ[ii,kk,jj] = val
+#                             Ψ[jj,kk,ii] = val; Ψ[jj,ii,kk] = val
+#                             Ψ[kk,ii,jj] = val; Ψ[kk,jj,ii] = val
+#                         end
+#                     end
+#                 end
+#             end
+#         end
+#     end
+
+#     #Acoustic Sum Rule (technically re-calculating the whole loop above here)
+#     Threads.@threads for i in range(1,N_atoms)
+#         for α in range(1,D)
+#             for β in range(1,D)
+#                 for γ in range(1,D)
+#                     ii = D*(i-1) + α; jj = D*(i-1) + β; kk = D*(i-1) + γ
+#                     Ψ[ii,jj,kk] = ϕ₃_self(sys, pot, i, α, β, γ)
+#                 end
+#             end
+#         end
+#     end
+
+#     #Apply tolerances
+#     Ψ = apply_tols!(Ψ, tol)
+
+#     #Give proper units
+#     Ψ_unit = unit(pot.ϵ / pot.σ^3)
+
+#     return DenseForceConstants(Ψ, Ψ_unit, tol)
+# end
\ No newline at end of file
diff --git a/src/types.jl b/src/types.jl
index 7882684..20066b2 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -1,5 +1,6 @@
 export SuperCellSystem, Potential, PairPotential, ThreeBodyPotential,
-    masses, mass, position, positions, charges, charge, n_atoms, n_atoms_per_uc
+    masses, mass, position, positions, charges, charge, n_atoms, n_atoms_per_uc,
+    SparseForceConstants, DenseForceConstants
 
 struct Atom{P,C,M}
     position::P
@@ -75,6 +76,8 @@ end
 
 Base.size(fc::DenseForceConstants) = size(fc.values)
 Base.getindex(fc::DenseForceConstants{O}, idxs::Vararg{Integer, O}) where O = fc.values[idxs...]
+Base.getindex(fc::DenseForceConstants{O}, idxs::CartesianIndex{O}) where O = fc.values[idxs]
+
 n_modes(fc::DenseForceConstants) = size(fc)[1]
 
 #type aliases
@@ -93,42 +96,58 @@ function FC_val(val, idxs::Vararg{Integer, O}) where O
     return FC_val{typeof(val), O}(val, Vector{Int32}(idxs))
 end
 
+function FC_val(val, idxs::AbstractVector{<:Integer})
+    return FC_val{typeof(val), length(idxs)}(val, Int32.(idxs))
+end
+
+function FC_val(val, idxs::Tuple)
+    return FC_val{typeof(val), length(idxs)}(val, collect(idxs))
+end
+
 
 struct SparseForceConstants{O,V,U,T} <: AbstractForceConstants{O}
-    values::AbstractVector{FC_val{V,O}} #& MAKE THIS A STRUCT ARRAY???
+    values::StructArray{FC_val{V,O}} #1 entry per DoF
     units::U
     tol::T
 end
 
-# struct SparseForceConstantsTest{O,V,U,T} <: AbstractForceConstant{O}
-#     values::StructArray{FC_val{V,O}} #& MAKE THIS A STRUCT ARRAY???
-#     units::U
-#     tol::T
-# end
-
-Base.length(fc::SparseForceConstants) = length(fc.values)
+Base.length(fc::SparseForceConstants) = sum(length.(fc.values))
 
 #type aliases
 const SparseSecondOrder{V,U,T} = SparseForceConstants{2,V,U,T}
 const SparseThirdOrder{V,U,T} = SparseForceConstants{3,V,U,T}
 const SparseFourthOrder{V,U,T} = SparseForceConstants{4,V,U,T}
 
+function SparseForceConstants(vals::Vector{FC_val{V, O}}, units, tol) where {V,O}
+    return SparseForceConstants{O, V, typeof(units), typeof(tol)}(StructArray(vals), units, tol)
+end
 
 # #Construct SparseForceConstants object from dense ForceConstants object 
-function SparseForceConstants(fc::ForceConstants{O,V,U,T}) where {O,V,U,T}
-    N_modes = size(fc)[1]
-    
-    idxs = with_replacement_combinations(1:N_modes, O)
-    values = FC_Val{V,O}[]
-    for idx in idxs
-        if fc[idx...] != 0.0
-            for perm in permutations(idx)
-                push!(values, FC_Val(fc[perm...], perm...))
-            end
+function SparseForceConstants(fc::DenseForceConstants{O,V,U,T}; nthreads::Integer = Threads.nthreads()) where {O,V,U,T}
+   
+    num_nonzero = sum(fc.values .!= 0.0)
+    sa = StructArray{FC_val{V,O}}(undef, (num_nonzero,))
+
+    count = 1
+    for idx in CartesianIndices(fc.values)
+        if fc[idx] != 0.0
+            sa[count] = FC_val(fc[idx],Tuple(idx))
+            count += 1
         end
     end
 
-    return SparseForceConstants{O,V,U,T}(values, fc.units, fc.tol)
+    return SparseForceConstants{O,V,U,T}(sa, fc.units, fc.tol)
+end
+
+#&NEED TO TEST THIS
+function sort_by_mode(sfc::SparseForceConstants{O,V}, n_dof::Integer) where {O,V}
+    data = Vector{StructArray{FC_val{V,O}}}(undef, n_dof)
+
+    for fc in sfc.values
+        data[fc.idxs[1]] = fc
+    end
+
+    return data
 end
 
 # Storage is multiple vectors, acts as single vector
diff --git a/test/runtests.jl b/test/runtests.jl
index 6136b48..b0211e2 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -18,11 +18,14 @@ else
     @warn "The GPU tests will not be run as a CUDA-enabled device is not available"
 end
 
-pot = LJ(3.4u"Å", 0.24037u"kcal * mol^-1", 8.5u"Å")
-fcc_crystal = FCC(5.2468u"Å", :Ar, SVector(4,4,4))
-sys = SuperCellSystem(fcc_crystal)
-dynmat = dynamicalMatrix(sys, pot, 1e-12);
-freqs_sq, phi = get_modes(dynmat_, 3)
+#& Check that calculating sparse 3rd equals generating it from the dense
+#& Various U_TEP tests, per mode vs total vs parallel vs serial
+
+# pot = LJ(3.4u"Å", 0.24037u"kcal * mol^-1", 8.5u"Å")
+# fcc_crystal = FCC(5.2468u"Å", :Ar, SVector(4,4,4))
+# sys = SuperCellSystem(fcc_crystal)
+# dynmat = dynamicalMatrix(sys, pot, 1e-12);
+# freqs_sq, phi = get_modes(dynmat, 3)
 
 # @testset "Supercell vs Unitcell" begin
     
@@ -54,19 +57,21 @@ freqs_sq, phi = get_modes(dynmat_, 3)
 @testset "MCC3, MCC3 Blocked" begin
     
     #Load test dataset
-    m = 39.95
-    phi, dynmat, F3, K3_actual, freqs_sq = load("./test_data/TEP.jld2", "phi", "dynmat", "F3", "K3", "freqs_sq")
-    N_modes = length(freqs_sq) #should be 96
+    if run_gpu_tests
+        m = 39.95
+        phi, dynmat, F3, K3_actual, freqs_sq = load("./test_data/TEP.jld2", "phi", "dynmat", "F3", "K3", "freqs_sq")
+        N_modes = length(freqs_sq) #should be 96
 
-    block_size = 32
-    F3 ./= sqrt(m^3)
-    K3_full = mcc3(CuArray{Float32}(F3), CuArray{Float32}(phi))
-    K3_blocked = mcc3(CuArray{Float32}(F3), CuArray{Float32}(phi), block_size)
+        block_size = 32
+        F3 ./= sqrt(m^3)
+        K3_full = mcc3(CuArray{Float32}(F3), CuArray{Float32}(phi))
+        K3_blocked = mcc3(CuArray{Float32}(F3), CuArray{Float32}(phi), block_size)
 
-    max_idx = argmax(K3_actual)
+        max_idx = argmax(K3_actual)
 
-    @test isapprox(K3_actual, K3_full, atol = 1e-6)
-    @test isapprox(K3_actual, K3_blocked, atol = 1e-6)
+        @test isapprox(K3_actual, K3_full, atol = 1e-6)
+        @test isapprox(K3_actual, K3_blocked, atol = 1e-6)
+    end
     
 end