diff --git a/src/grads.jl b/src/grads.jl
index af52de6..ead83ac 100644
--- a/src/grads.jl
+++ b/src/grads.jl
@@ -10,10 +10,6 @@ function sumabs2(x::AbstractArray{T}; dims = 1) where {T}
     sum(abs2, x; dims)
 end
 
-function _sumabs2_no_rrule(x::AbstractArray{T}; dims = 1) where {T}
-    sum(abs2, x; dims)
-end
-
 function ChainRulesCore.rrule(::typeof(sumabs2), x; dims = 1)
     function sumabs2_pullback(_Δ)
         Δ = unthunk(_Δ)
@@ -27,10 +23,6 @@ function L2norm(x::AbstractArray{T}; dims = 1, eps = 1f-7) where {T}
     sqrt.(sumabs2(x; dims) .+ eps )
 end
 
-function _L2norm_no_rrule(x::AbstractArray{T}; dims = 1, eps = 1f-7) where {T}
-    sqrt.(sum(abs2, x; dims) .+ eps )
-end
-
 function ChainRulesCore.rrule(::typeof(L2norm), x::AbstractArray{T}; dims = 1, eps = 1f-7) where {T}
     normx = L2norm(x; dims, eps)
     function L2norm_pullback(_Δ)
@@ -44,91 +36,39 @@ function pair_diff(A::AbstractArray{T}, B::AbstractArray{T}; dims = 4) where {T}
     return Flux.unsqueeze(A, dims = dims + 1) .- Flux.unsqueeze(B, dims = dims)
 end
 
-function _pair_diff_no_rrule(A::AbstractArray{T}, B::AbstractArray{T}; dims = 4) where {T}
-    return Flux.unsqueeze(A, dims = dims + 1) .- Flux.unsqueeze(B, dims = dims)
-end
-
-function ChainRulesCore.rrule(::typeof(pair_diff), A::AbstractArray{T}, B::AbstractArray{T}; dims = 4) where {T}
+function ChainRulesCore.rrule(::typeof(pair_diff), A::AbstractArray{T}, B::AbstractArray{T}; dims=4) where {T}
     y = pair_diff(A, B; dims)
     function pair_diff_pullback(_Δ)
         Δ = unthunk(_Δ)
-        return (NoTangent(), @thunk(sumdrop(Δ; dims = dims + 1)), @thunk(-sumdrop(Δ; dims = dims)))
+        return (NoTangent(), @thunk(sumdrop(Δ; dims=dims+1)), @thunk(-sumdrop(Δ; dims=dims)))
     end
     return y, pair_diff_pullback
 end
 
-function ChainRulesCore.rrule(::typeof(T_R3), A, R, t; dims = 1)
-    function T_R3_pullback(_Δ)
-        Δ = unthunk(_Δ)
-        ΔA = @thunk begin
-            batch_size = size(A)[3:end]
-            R2 = reshape(R, size(R,1), size(R,2), :)
-            Δ2 = reshape(Δ, size(Δ,1), size(Δ,2), :)
-            ΔA = batched_mul(batched_adjoint(R2), Δ2)
-            reshape(ΔA, size(ΔA, 1), size(ΔA, 2), batch_size...)
-        end
-        ΔR = @thunk begin
-            batch_size = size(R)[3:end]
-            A2 = reshape(A, size(A,1), size(A,2), :)
-            Δ2 = reshape(Δ, size(Δ,1), size(Δ,2), :)
-            ΔR = batched_mul(Δ2, batched_adjoint(A2))
-            reshape(ΔR, size(ΔR, 1), size(ΔR, 2), batch_size...)
-        end
-        Δt = @thunk begin 
-            # Case for broadcasting t along dim = 2.
-            size(t,2) == 1 ? tmp = sum(Δ, dims = 2) : tmp = Δ
-            tmp
-        end
-        return (NoTangent(), ΔA, ΔR, Δt)
+function ChainRulesCore.rrule(::typeof(T_R3), x::AbstractArray{T,N}, R::AbstractArray{T,N}, t::AbstractArray{T,N}) where {T,N}
+    function T_R3_pullback(_Δy)
+        Δy = unthunk(_Δy)
+        Δx = @thunk(batched_mul(_batched_transpose(R), Δy))
+        ΔR = @thunk(batched_mul(Δy, _batched_transpose(x)))
+        Δt = @thunk(sum(Δy, dims=2))
+        return (NoTangent(), Δx, ΔR, Δt)
     end
-    return T_R3(A, R, t), T_R3_pullback
-end
-
-function _T_R3_no_rrule(mat, rot,trans)
-    size_mat = size(mat)
-    rotc = reshape(rot, 3,3,:)  
-    trans = reshape(trans, 3,1,:)
-    matc = reshape(mat,3,size(mat,2),:) 
-    rotated_mat = batched_mul(rotc,matc) .+ trans
-    return reshape(rotated_mat,size_mat)
-end 
-
-function ChainRulesCore.rrule(::typeof(T_R3_inv), A, R, t; dims = 1)
-    function T_R3_inv_pullback(_Δ)
-        Δ = unthunk(_Δ)
-        ΔA = @thunk begin
-            batch_size = size(A)[3:end]
-            R2 = reshape(R, size(R,1), size(R,2), :)
-            Δ2 = reshape(Δ, size(Δ,1), size(Δ,2), :)
-            ΔA = batched_mul(R2, Δ2)
-            reshape(ΔA, size(ΔA, 1), size(ΔA, 2), batch_size...)
-        end
-        
-        ΔR = @thunk begin
-            batch_size = size(R)[3:end]
-            A2 = reshape(A, size(A,1), size(A,2), :)
-            Δ2 = reshape(Δ, size(Δ,1), size(Δ,2), :)
-            ΔR = batched_mul(A2, batched_adjoint(Δ2))
-            reshape(ΔR, size(ΔR, 1), size(ΔR, 2), batch_size...)
-        end
-        Δt = @thunk begin 
-            # Case for broadcasting t along dim = 2.
-            size(t,2) == 1 ? tmp = sum(Δ, dims = 2) : tmp = Δ
-            tmp
-        end
-        return (NoTangent(), ΔA, ΔR, Δt)
+    return T_R3(x, R, t), T_R3_pullback
+end
+
+function ChainRulesCore.rrule(::typeof(T_R3_inv), x::AbstractArray{T,N}, R::AbstractArray{T,N}, t::AbstractArray{T,N}) where {T,N}
+    z = x .- t
+    y = batched_mul(_batched_transpose(R), z)
+    function T_R3_inv_pullback(_Δy)
+        Δy = unthunk(_Δy)
+        Δx = @thunk(batched_mul(R, Δy))
+        ΔR = @thunk(batched_mul(z, _batched_transpose(Δy)))
+        Δt = @thunk(-sum(Δx, dims=2)) # t is in the same position as x, but negated and broadcasted
+        return (NoTangent(), Δx, ΔR, Δt)
     end
-    return T_R3_inv(A, R, t), T_R3_inv_pullback
+    return T_R3_inv(x, R, t), T_R3_inv_pullback
 end
 
-function _T_R3_inv_no_rrule(mat, rot,trans)
-    size_mat = size(mat)
-    rotc = batched_transpose(reshape(rot, 3,3,:))
-    matc = reshape(mat,3,size(mat,2),:)
-    trans = reshape(trans, 3,1,:)
-    rotated_mat = batched_mul(rotc,matc .- trans)
-    return reshape(rotated_mat,size_mat)
-end 
 #=
 function diff_sum_glob(T, q, k)
     bs = size(q) 
@@ -143,7 +83,7 @@ function _diff_sum_glob_no_rrule(T,q,k)
     bs = size(q)
     qresh = reshape(q, size(q,1), size(q,2)*size(q,3), size(q,4),size(q,5))
     kresh = reshape(k, size(k,1), size(k,2)*size(k,3), size(k,4),size(k,5))
-    Tq, Tk = _T_R3_no_rrule(qresh,T[1],T[2]),_T_R3_no_rrule(kresh,T[1],T[2])
+    Tq, Tk = T_R3_no_rrule(qresh,T[1],T[2]),T_R3_no_rrule(kresh,T[1],T[2])
     Tq, Tk = reshape(Tq, bs...), reshape(Tk, bs...)
     diffs = _sumabs2_no_rrule(_pair_diff_no_rrule(Tq, Tk, dims = 4),dims=[1,3])
 end=#
diff --git a/src/layers.jl b/src/layers.jl
index 72b2cdb..f947ef4 100644
--- a/src/layers.jl
+++ b/src/layers.jl
@@ -190,9 +190,9 @@ function (ipa::Union{IPCrossA, IPA})(
                             ,(3,1,2,4))
     
     # Applying our transformations to the queries, keys, and values to put them in the global frame.
-    Tqhp = reshape(_T_R3_no_rrule(qhp, rot_TiR,translate_TiR),3,N_head,N_query_points,N_frames_R,:) 
-    Tkhp = reshape(_T_R3_no_rrule(khp, rot_TiL,translate_TiL),3,N_head,N_query_points,N_frames_L,:)
-    Tvhp = _T_R3_no_rrule(vhp, rot_TiL, translate_TiL)
+    Tqhp = reshape(T_R3(qhp, rot_TiR,translate_TiR),3,N_head,N_query_points,N_frames_R,:) 
+    Tkhp = reshape(T_R3(khp, rot_TiL,translate_TiL),3,N_head,N_query_points,N_frames_L,:)
+    Tvhp = T_R3(vhp, rot_TiL, translate_TiL)
 
     diffs_glob = Flux.unsqueeze(Tqhp, dims = 5) .- Flux.unsqueeze(Tkhp, dims = 4)
     sum_norms_glob = reshape(sum(abs2, diffs_glob, dims = [1,3]),N_head,N_frames_R,N_frames_L,:) #Sum over points for each head
@@ -233,7 +233,7 @@ function (ipa::Union{IPCrossA, IPA})(
     end
 
     #ohp_r were in the global frame, so we put those ba ck in the recipient local
-    ohp = _T_R3_inv_no_rrule(ohp_r, rot_TiR, translate_TiR) 
+    ohp = T_R3_inv(ohp_r, rot_TiR, translate_TiR) 
     normed_ohp = sqrt.(sum(abs2, ohp,dims = 1) .+ Typ(0.000001f0)) #Adding eps
 
     catty = vcat(
@@ -350,7 +350,7 @@ function ipa_customgrad(ipa::Union{IPCrossA, IPA}, Ti::Tuple{AbstractArray,Abstr
         ohp_r = reshape(sum(broadcast_att_ohp.*broadcast_tvhp,dims=5),3,N_head*N_point_values,N_frames_R,:)
     end
     #ohp_r were in the global frame, so we put those back in the recipient local
-    ohp = _T_R3_inv_no_rrule(ohp_r, rot_TiR, translate_TiR) 
+    ohp = T_R3_inv(ohp_r, rot_TiR, translate_TiR) 
     normed_ohp = sqrt.(sumabs2(ohp, dims = 1) .+ Typ(0.000001f0)) #Adding eps
     catty = vcat(
         reshape(oh, N_head*c, N_frames_R,:),
@@ -510,11 +510,11 @@ function expand(
     rot_TiR, translate_TiR = TiR
     ΔTqhp = reshape(T_R3(Δqhp, (rot_TiR[:,:,R+1:R+ΔR,:]), (translate_TiR[:,:,R+1:R+ΔR,:])), (3, N_head, N_query_points, ΔR, B))
     Tkhp = reshape(
-        T_R3(reshape(khp, (3, N_head * N_query_points, (L + ΔL) * B)), (rot_TiL[:,:,1:L+ΔL,:]), (translate_TiL[:,:,1:L+ΔL,:])),
+        T_R3(reshape(khp, (3, N_head * N_query_points, (L + ΔL), B)), (rot_TiL[:,:,1:L+ΔL,:]), (translate_TiL[:,:,1:L+ΔL,:])),
         (3, N_head, N_query_points, L + ΔL, B)
     )
     Tvhp = reshape(
-        T_R3(reshape(vhp, (3, N_head * N_point_values, (L + ΔL) * B)), (rot_TiL[:,:,1:L+ΔL,:]), (translate_TiL[:,:,1:L+ΔL,:])),
+        T_R3(reshape(vhp, (3, N_head * N_point_values, (L + ΔL), B)), (rot_TiL[:,:,1:L+ΔL,:]), (translate_TiL[:,:,1:L+ΔL,:])),
         (3, N_head, N_point_values, L + ΔL, B)
     )
 
@@ -572,7 +572,7 @@ function expand(
                 )  .+
                     reshape(translate_TiR[:,:,R+1:R+ΔR,:],  (3, 1,      1,               ΔR, B)) .*
                     reshape(1 .- sum(Δatt, dims = 3),       (1, N_head, 1,               ΔR, B)),
-            (3, N_head * N_point_values, ΔR * B)
+            (3, N_head * N_point_values, ΔR, B)
         )
     else
         ohp_pre = reshape(
@@ -582,14 +582,13 @@ function expand(
                 reshape(Tvhp, (3, N_head, N_point_values,  1, L + ΔL, B)),
                 dims = 5,
             ),
-            (3, N_head * N_point_values, ΔR * B)
+            (3, N_head * N_point_values, ΔR, B)
         )
     end
 
     ohp = reshape(
         T_R3_inv(
-            ohp_pre
-            ,
+            ohp_pre,
             (rot_TiR[:,:,R+1:R+ΔR,:]),
             (translate_TiR[:,:,R+1:R+ΔR,:])
         ),
diff --git a/src/rotational_utils.jl b/src/rotational_utils.jl
index 8e316be..85325cb 100644
--- a/src/rotational_utils.jl
+++ b/src/rotational_utils.jl
@@ -52,31 +52,28 @@ get_rotation(dims...; T::Type{<:Real}=Float32) = get_rotation(T, dims...)
 Generates random translations of given size.
 """
 get_translation(T::Type{<:Real}, dims...) = randn(T, 3, 1, dims...)
-get_translation(dims...; T::Type{<:Real}=Float32) = get_translation(T, dims...) 
+get_translation(dims...; T::Type{<:Real}=Float32) = get_translation(T, dims...)
+
+function _batched_transpose(data::A) where {T,N,A<:AbstractArray{T,N}}
+    perm = (2,1,3:N...)
+    PermutedDimsArray{T,N,perm,perm,A}(data)
+end
 
 """
 Applies the SE3 transformations T = (rot,trans) ∈ SE(3)^N
 to N batches of m points in R3, i.e., mat ∈ R^(3 x m x N) ↦ T(mat) ∈ R^(3 x m x N).
 Note here that rotations here are represented in matrix form. 
 """
-function T_R3(mat, rot, trans)
-    rotc = reshape(rot, 3, 3, :)  
-    trans = reshape(trans, 3, 1, :)
-    matc = reshape(mat, 3, size(mat, 2), :) 
-    rotated_mat = batched_mul(rotc, matc) .+ trans
-    return reshape(rotated_mat, size(mat))
+function T_R3(x::AbstractArray{T,N}, R::AbstractArray{T,N}, t::AbstractArray{T,N}) where {T,N}
+    return batched_mul(R, x) .+ t
 end
 
 """ 
 Applies the group inverse of the SE3 transformations T = (R,t) ∈ SE(3)^N to N batches of m points in R3,
 such that T^-1(T*x) = T^-1(Rx+t) =  R^T(Rx+t-t) = x.
 """
-function T_R3_inv(mat, rot, trans)
-    rotc = batched_transpose(reshape(rot, 3, 3, :))
-    matc = reshape(mat, 3, size(mat, 2), :)
-    trans = reshape(trans, 3,1,:)
-    rotated_mat = batched_mul(rotc, matc .- trans)
-    return reshape(rotated_mat, size(mat))
+function T_R3_inv(x::AbstractArray{T,N}, R::AbstractArray{T,N}, t::AbstractArray{T,N}) where {T,N}
+    return batched_mul(_batched_transpose(R), x .- t)
 end
 
 """
@@ -106,7 +103,7 @@ end
 
 unzip(a) = map(x->getfield.(a, x), fieldnames(eltype(a)))
 
-calculate_residue_centroid(residue_xyz::AbstractMatrix) = reshape(mean(residue_xyz[:, 1:3], dims = 2), 3)
+centroid(coords::AbstractMatrix) = vec(sum(coords; dims=2)) / size(coords, 2)
 
 """
 Get frame from residue
@@ -117,7 +114,7 @@ function calculate_residue_rotation_and_translation(residue_xyz::AbstractMatrix)
     Ca = residue_xyz[:, 2] # We use the centroid instead of the Ca - not 100% sure if this is correct
     C = residue_xyz[:, 3]
 
-    t = calculate_residue_centroid(residue_xyz)
+    t = centroid(residue_xyz)
 
     v1 = C - t
     v2 = N - t
diff --git a/test/runtests.jl b/test/runtests.jl
index 512546e..5a1e1a7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,10 +2,11 @@ using InvariantPointAttention
 using Test
 
 import InvariantPointAttention: get_rotation, get_translation, softmax1
-import Zygote: gradient, withgradient
-import Flux: params
-import InvariantPointAttention: T_R3, T_R3_inv, _T_R3_no_rrule, _T_R3_inv_no_rrule, diff_sum_glob, _diff_sum_glob_no_rrule, pair_diff, _pair_diff_no_rrule
-import InvariantPointAttention: L2norm, _L2norm_no_rrule, sumabs2, _sumabs2_no_rrule
+import InvariantPointAttention: T_R3, T_R3_inv, pair_diff
+import InvariantPointAttention: L2norm, sumabs2
+import Flux
+
+using ChainRulesTestUtils
 
 @testset "InvariantPointAttention.jl" begin
     # Write your tests here.
@@ -13,74 +14,46 @@ import InvariantPointAttention: L2norm, _L2norm_no_rrule, sumabs2, _sumabs2_no_r
         #Check if softmax1 is consistent with softmax, when adding an additional zero logit
         x = randn(4,3)
         xaug = hcat(x, zeros(4,1))
-        @test InvariantPointAttention.softmax1(x, dims = 2) ≈ InvariantPointAttention.Flux.softmax(xaug, dims = 2)[:,1:end-1]
+        @test softmax1(x, dims = 2) ≈ Flux.softmax(xaug, dims = 2)[:,1:end-1]
     end
-    @testset "Softmax1 custom grad" begin
-        x = randn(3,10,41,13)
- 
-        function softmax1_no_rrule(x::AbstractArray{T}; dims = 1) where {T}
-            _zero = T(0)
-            max_ = max.(maximum(x; dims), _zero)
-            @fastmath out = exp.(x .- max_)
-            tmp = sum(out, dims = dims)
-            out ./ (tmp + exp.(-max_))
-        end
 
-        for k in 1:4
-            f(x; dims = k) = sum(softmax1(x; dims))
-            g(x; dims = k) = sum(softmax1_no_rrule(x; dims))
-            @test gradient(f, x)[1] ≈ gradient(g, x)[1]
-        end
-    end  
+    @testset "softmax1 rrule" begin
+        x = randn(2,3,4)
 
-    @testset "T_R3 custom grad" begin 
-        x = randn(3,5,10,15)
-        rot = get_rotation(10,15) 
-        trans = get_translation(10,15)
+        foreach(i -> test_rrule(softmax1, x; fkwargs=(; dims=i)), 1:3)
+    end  
 
-        @test gradient(sum ∘ T_R3, x, rot, trans)[1] ≈ gradient(sum ∘ _T_R3_no_rrule, x, rot, trans)[1]
+    @testset "T_R3 rrule" begin 
+        x = randn(Float64, 3, 2, 1, 2)
+        R = get_rotation(Float64, 1, 2)
+        t = get_translation(Float64, 1, 2)
+        test_rrule(T_R3, x, R, t)
     end
 
-    @testset "T_R3_inv custom grad" begin 
-        x = randn(3,5,10,15)
-        rot = get_rotation(10,15) 
-        trans = get_translation(10,15)
-        @test gradient(sum ∘ T_R3_inv, x, rot, trans)[1] ≈ gradient(sum ∘ _T_R3_inv_no_rrule, x, rot, trans)[1]
+    @testset "T_R3_inv rrule" begin 
+        x = randn(Float64, 3, 2, 1, 2)
+        R = get_rotation(Float64, 1, 2) 
+        t = get_translation(Float64, 1, 2)
+        test_rrule(T_R3_inv, x, R, t)
     end
 
-    @testset "sumabs2 custom grad" begin 
-        x = randn(3,10,41,13)
-
-        for k in 1:4
-            f(x; dims = k) = sum(sumabs2(x; dims))
-            g(x; dims = k) = sum(sum(abs2, x; dims))
-            cval, cgs = withgradient(f, x)
-            val, gs = withgradient(g, x)
-            @test cval ≈ val
-            @test keys(cgs) ≈ keys(gs)
-        end
+    @testset "sumabs2 rrule" begin
+        x = rand(2,3,4)
+        foreach(i -> test_rrule(sumabs2, x; fkwargs=(; dims=i)), 1:3)
     end
 
-    @testset "L2norm custom grad" begin 
-        x = randn(3,10,41,13)
-
-        for k in 1:4
-            f(x; dims = k) = sum(L2norm(x; dims, eps = 0.1f0))
-            g(x; dims = k) = sum(_L2norm_no_rrule(x; dims, eps = 0.1f0))
-            cval, cgs = withgradient(f, x)
-            val, gs = withgradient(g, x)
-            @test cval ≈ val
-            @test keys(cgs)[1] ≈ keys(gs)[1]
-        end
+    @testset "L2norm rrule" begin 
+        x = randn(2,3,4,5)
+        foreach(i -> test_rrule(L2norm, x; fkwargs=(; dims=i)), 1:3)
     end
 
     @testset "pair_diff custom grad" begin 
-        x = randn(3,5,5,5,5)
-        y = randn(3,5,5,15,5)
-        @test gradient(sum ∘ pair_diff, x, y)[1] ≈ gradient(sum ∘ _pair_diff_no_rrule, x, y)[1]
+        x = randn(1,4,2)
+        y = randn(1,3,2)
+        test_rrule(pair_diff, x, y; fkwargs=(; dims=2))
     end
 
-    @testset "ipa_customgrad" begin
+    #=@testset "ipa_customgrad" begin
         batch_size = 3
         framesL = 10
         framesR = 10
@@ -89,20 +62,20 @@ import InvariantPointAttention: L2norm, _L2norm_no_rrule, sumabs2, _sumabs2_no_r
         siL = randn(Float32, dim, framesL, batch_size) 
         siR = siL
         # Use CLOPS.jl shape notation
-        TiL = (get_rotation(Float32, framesL, batch_size), randn(Float32, 3, framesL, batch_size)) 
+        TiL = (get_rotation(Float32, framesL, batch_size), get_translation(Float32, framesL, batch_size)) 
         TiR = TiL 
         zij = randn(Float32, 16, framesR, framesL, batch_size) 
 
         ipa = IPCrossA(IPA_settings(dim; use_softmax1 = true, c_z = 16, Typ = Float32))  
         # Batching on mask
         mask = right_to_left_mask(framesL)[:, :, ones(Int, batch_size)]
-        ps = params(ipa)
+        ps = Flux.params(ipa)
         
-        lz,gs = withgradient(ps) do 
+        lz,gs = Flux.withgradient(ps) do 
             sum(ipa(TiL, siL, TiR, siR; zij, mask, customgrad = true))
         end
         
-        lz2, zygotegs = withgradient(ps) do 
+        lz2, zygotegs = Flux.withgradient(ps) do 
             sum(ipa(TiL, siL, TiR, siR; zij, mask, customgrad = false))
         end
         
@@ -111,7 +84,8 @@ import InvariantPointAttention: L2norm, _L2norm_no_rrule, sumabs2, _sumabs2_no_r
         end
         #@show lz, lz2
         @test abs.(lz - lz2) < 1f-5
-    end
+    end=#
+
     @testset "IPAsoftmax_invariance" begin
         batch_size = 3
         framesL = 100