From 121faaf3737ced55ab0d7344c4c2a2edaa970e09 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 27 Jun 2020 09:08:45 -0700
Subject: [PATCH 1/4] whitespace changes

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>
---
 src/AtomicMacro.hh | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/src/AtomicMacro.hh b/src/AtomicMacro.hh
index 4c31c853..2785c64a 100644
--- a/src/AtomicMacro.hh
+++ b/src/AtomicMacro.hh
@@ -16,18 +16,20 @@
 
         //Currently not atomic here. But its only used when it does not necissarially need to be atomic.
         #define ATOMIC_WRITE( x, v ) \
-            x = v;          
+            x = v;
 
         #define ATOMIC_ADD( x, v ) \
             atomicAdd( &x, v );
-        
+
         #define ATOMIC_UPDATE( x ) \
             atomicAdd( &x, 1 );
 
         #define ATOMIC_CAPTURE( x, v, p ) \
             p = atomicAdd( &x, v );
+
     //If in a CPU OpenMP section use the OpenMP atomics
     #elif defined (USE_OPENMP_ATOMICS)
+
         #define ATOMIC_WRITE( x, v ) \
             _Pragma("omp atomic write") \
             x = v;
@@ -46,6 +48,7 @@
 
     //If in a serial section, no need to use atomics
     #else
+
         #define ATOMIC_WRITE( x, v ) \
             x = v;
 
@@ -62,6 +65,7 @@
 
 //If in a OpenMP section use the OpenMP atomics
 #elif defined (USE_OPENMP_ATOMICS)
+
     #define ATOMIC_WRITE( x, v ) \
         _Pragma("omp atomic write") \
         x = v;
@@ -74,12 +78,13 @@
         _Pragma("omp atomic update") \
         x++;
 
-        #define ATOMIC_CAPTURE( x, v, p ) \
-            _Pragma("omp atomic capture") \
-            {p = x; x = x + v;}
+    #define ATOMIC_CAPTURE( x, v, p ) \
+        _Pragma("omp atomic capture") \
+        {p = x; x = x + v;}
 
 //If in a serial section, no need to use atomics
 #else
+
     #define ATOMIC_WRITE( x, v ) \
         x = v;
 
@@ -91,4 +96,5 @@
 
     #define ATOMIC_CAPTURE( x, v, p ) \
         {p = x; x = x + v;}
+
 #endif

From 5bf8b84d5403f853eefef4c63ff932aad2be2eae Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 27 Jun 2020 09:12:01 -0700
Subject: [PATCH 2/4] new implementation of atomics

New version uses functions not macros.

The use of template functions allows for enforcement of type-safety,
which is implemented using static_assert.

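The old macro implementation is preserved for posterity in the #else
branch of USE_MACRO_FUNCTIONS. The header defines USE_MACRO_FUNCTIONS
unconditionally, so the old macros are only compiled if that define is
removed.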

A header guard was added.

I found the old macro names confusing, so I used new names, but I map
the old names in the source onto them so the application source does not
change.

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>
---
 src/AtomicMacro.hh | 158 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 157 insertions(+), 1 deletion(-)

diff --git a/src/AtomicMacro.hh b/src/AtomicMacro.hh
index 2785c64a..b438665e 100644
--- a/src/AtomicMacro.hh
+++ b/src/AtomicMacro.hh
@@ -1,3 +1,8 @@
+#ifndef AtomicMacro_HH_
+#define AtomicMacro_HH_
+
+#define USE_MACRO_FUNCTIONS 1
+
 //Determine which atomics to use based on platform being compiled for
 //
 //If compiling with CUDA
@@ -8,6 +13,153 @@
     #define USE_OPENMP_ATOMICS
 #endif
 
+// --------------------------------------------------
+// Original Names            -> Inline function names
+// --------------------------------------------------
+// ATOMIC_WRITE( x, v )      -> ATOMIC_WRITE
+// ATOMIC_UPDATE( x )        -> ATOMIC_INCREMENT
+// ATOMIC_ADD( x, v )        -> ATOMIC_ADD
+// ATOMIC_CAPTURE( x, v, p ) -> ATOMIC_FETCH_ADD
+// --------------------------------------------------
+
+#if defined (USE_MACRO_FUNCTIONS)
+
+#define ATOMIC_CAPTURE( x, v, p )  ATOMIC_FETCH_ADD((x),(v),(p))
+#define ATOMIC_UPDATE( x )         ATOMIC_INCREMENT((x))
+
+#if defined(HAVE_CUDA) && defined(__CUDA_ARCH__)
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    x = v;
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {
+    atomicAdd( &x, 1 );
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    atomicAdd( &x, v );
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    atomicAdd( &x, v );
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    p = atomicAdd( &x, v );
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    p = atomicAdd( &x, v );
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+    p = atomicAdd( &x, v );
+}
+
+#elif defined(USE_OPENMP_ATOMICS)
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    _Pragma("omp atomic write")
+    x = v;
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {
+    _Pragma("omp atomic update")
+    x++;
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    _Pragma("omp atomic")
+    x += v;
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    _Pragma("omp atomic")
+    x += v;
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    _Pragma("omp atomic capture")
+    {p = x; x = x + v;}
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    _Pragma("omp atomic capture")
+    {p = x; x = x + v;}
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+    _Pragma("omp atomic capture")
+    {p = x; x = x + v;}
+}
+
+#else // SEQUENTIAL
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    x = v;
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {
+    x++;
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    x += v;
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    x += v;
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    {p = x; x = x + v;}
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    {p = x; x = x + v;}
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+    {p = x; x = x + v;}
+}
+
+#endif // BACKENDS
+
+#else // ! USE_MACRO_FUNCTIONS
 
 #if defined (HAVE_CUDA)
 
@@ -97,4 +249,8 @@
     #define ATOMIC_CAPTURE( x, v, p ) \
         {p = x; x = x + v;}
 
-#endif
+#endif // BACKENDS
+
+#endif // USE_MACRO_FUNCTIONS
+
+#endif // AtomicMacro_HH_

From 9040882ecfe88e4a395aa993358931b6a1b05332 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jehammond@nvidia.com>
Date: Fri, 4 Feb 2022 00:45:49 -0800
Subject: [PATCH 3/4] remove unnecessary comments

---
 src/AtomicMacro.hh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/AtomicMacro.hh b/src/AtomicMacro.hh
index b438665e..0b0ceb89 100644
--- a/src/AtomicMacro.hh
+++ b/src/AtomicMacro.hh
@@ -3,10 +3,6 @@
 
 #define USE_MACRO_FUNCTIONS 1
 
-//Determine which atomics to use based on platform being compiled for
-//
-//If compiling with CUDA
-
 #ifdef HAVE_OPENMP
     #define USE_OPENMP_ATOMICS
 #elif HAVE_OPENMP_TARGET

From 01337a49323f4f3e8a6f26b7a601b1dc1d588045 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jehammond@nvidia.com>
Date: Fri, 4 Feb 2022 01:03:34 -0800
Subject: [PATCH 4/4] C++20 atomics

Signed-off-by: Jeff Hammond <jehammond@nvidia.com>
---
 src/AtomicMacro.hh | 73 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 72 insertions(+), 1 deletion(-)

diff --git a/src/AtomicMacro.hh b/src/AtomicMacro.hh
index 0b0ceb89..0bb9094c 100644
--- a/src/AtomicMacro.hh
+++ b/src/AtomicMacro.hh
@@ -23,7 +23,76 @@
 #define ATOMIC_CAPTURE( x, v, p )  ATOMIC_FETCH_ADD((x),(v),(p))
 #define ATOMIC_UPDATE( x )         ATOMIC_INCREMENT((x))
 
-#if defined(HAVE_CUDA) && defined(__CUDA_ARCH__)
+#if defined(USE_CXX20_ATOMICS)
+
+    #if (__cplusplus > 201703L)
+
+        #include <atomic>
+
+        #if defined(__cpp_lib_atomic_float) && defined(__cpp_lib_atomic_ref)
+
+            template <typename T>
+            inline void ATOMIC_WRITE(T & x, T v) {
+                // Atomically store v into x through a temporary atomic_ref view of x.
+                std::atomic_ref<T> target{x};
+                target.store(v);
+            }
+
+            template <typename T>
+            inline void ATOMIC_INCREMENT(T& x) {
+                // Atomically add 1 to x; fetch_add is used because atomic_ref<floating-point> has no operator++.
+                std::atomic_ref<T> r{x};
+                r.fetch_add(1);
+            }
+
+            template <typename T>
+            inline void ATOMIC_ADD(T& x, T v) {
+                // Atomically perform x += v; the value x held before the add is discarded.
+                std::atomic_ref<T> target{x};
+                target.fetch_add(v);
+            }
+
+            template <typename T1, typename T2>
+            inline void ATOMIC_ADD(T1& x, T2 v) {
+                // Mixed-type x += v: rejected at compile time when the addend type is wider than the target.
+                static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+                std::atomic_ref<T1> target{x};
+                target.fetch_add(v);
+            }
+
+            template <typename T>
+            inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+                // Atomically perform x += v and hand the value x held before the add back in p.
+                std::atomic_ref<T> target{x};
+                p = target.fetch_add(v);
+            }
+
+            template <typename T1, typename T2>
+            inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+                // Mixed-type fetch-and-add: p receives the value x held before v was added.
+                static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+                std::atomic_ref<T1> target{x};
+                p = target.fetch_add(v);
+            }
+
+            template <typename T1, typename T2, typename T3>
+            inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+                // Fully mixed fetch-and-add: both the addend and the capture variable are size-checked.
+                static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+                static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+                std::atomic_ref<T1> target{x};
+                p = target.fetch_add(v);
+            }
+
+        #else
+            #error Your supposedly C++20 compiler does not support atomic_ref<double>.
+        #endif
+
+    #else
+        #error Sorry, you need C++20.
+    #endif
+
+#elif defined(HAVE_CUDA) && defined(__CUDA_ARCH__)
 
 template <typename T>
 inline void ATOMIC_WRITE(T & x, T v) {
@@ -66,6 +135,8 @@ inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
 
 #elif defined(USE_OPENMP_ATOMICS)
 
+// OpenMP backend: used when USE_OPENMP_ATOMICS is defined and no earlier backend was selected.
+
 template <typename T>
 inline void ATOMIC_WRITE(T & x, T v) {
     _Pragma("omp atomic write")