From d2032ec9e5df56cccf4a6ab9137cef36cd99b450 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Sun, 17 Nov 2024 14:49:30 -0600
Subject: [PATCH 01/12] improve benchmarking

---
 src/base/globals.nim        | 10 +++++++
 src/base/profile.nim        | 57 ++++++++-----------------------------
 src/bench/benchLinalg.nim   | 12 ++++++--
 src/bench/benchStagProp.nim |  4 +--
 src/experimental/stagag.nim |  5 +---
 src/physics/qcdTypes.nim    |  1 -
 tests/base/tshift.nim       | 26 +++++++++++------
 7 files changed, 53 insertions(+), 62 deletions(-)

diff --git a/src/base/globals.nim b/src/base/globals.nim
index 49325700..428bf6a3 100644
--- a/src/base/globals.nim
+++ b/src/base/globals.nim
@@ -44,3 +44,13 @@ macro setDefaultNc*(n: static[int]): untyped =
   result = newEmptyNode()
 macro getDefaultNc*(): untyped =
   return newLit(defaultNc)
+
+template getDefPrecStr:string =
+  const defPrec {.strdefine.} = "D"
+  defPrec
+
+var defPrec* {.compiletime.} = getDefPrecStr()
+macro setDefaultSingle* =
+  defPrec = "S"
+  echo "Default precision: ", defPrec
+static: echo "Default precision: ", defPrec
diff --git a/src/base/profile.nim b/src/base/profile.nim
index f577bece..506568ca 100644
--- a/src/base/profile.nim
+++ b/src/base/profile.nim
@@ -1,6 +1,6 @@
 import threading
 export threading
-import comms/comms, stdUtils, base/basicOps
+import comms/comms, stdUtils, base/[basicOps,params]
 import os, strutils, sequtils, std/monotimes, std/tables, std/algorithm, strformat
 export monotimes
 getOptimPragmas()
@@ -20,7 +20,7 @@ var
 
 ##[
 
-Each Tic starts a local timer.  Each Toc records the time difference
+Each tic() starts a local timer.  Each toc() records the time difference
 of the current time with the one in the local timer visible in the
 scope, and then update the timer with the current time.
 
@@ -516,49 +516,10 @@ template tocI(f: SomeNumber; s:SString = ""; n = -1) =
       localCode {.global.} = newList[CodePoint]()
       thisCode = CodePoint(-1)
   if threadNum==0:
-    when false:
-      #echo "==== begin toc ",s," ",ii
-      #echo "     rtiStack: ",indent($rtiStack,5)
-      #echo "     cpHeap: ",indent($cpHeap,5)
-      if unlikely VerboseTimer: echoToc(s,ii)
-      if prevRTI.int32 >= 0:
-        if restartTimer:
-          thawTimers()
-          restartTimer = false
-        if not timersFrozen():
-          let theTime = getTics()
-          when not cname:
-            for c in items(localCode):
-              if cpHeap[c.int].name.equal(s):
-                thisCode = c
-                break
-          if thisCode.isNil:
-            thisCode = newCodePoint(ii.addr, s)
-            when not cname:
-              localCode.add thisCode
-          let
-            ns = theTime-localTimer
-            thisRTI = record(localTic, prevRTI, thisCode, ns, float(f))
-          var oh = rtiStack[thisRTI.int].childrenOverhead
-          let c = rtiStack[thisRTI.int].children
-          for i in 0..<c.len:
-            if toDropTimer(c[i].prev):
-              oh -= c[i].childrenOverhead
-          if oh.float / ns.float > DropWasteTimerRatio:
-            # Signal stop if the overhead is too large.
-            dropTimer(prevRTI)
-          if toDropTimer(thisCode):
-            freezeTimers()
-            restartTimer = true
-          localTimer = getTics()
-          rtiStack[thisRTI.int].overhead = nsec(localTimer-theTime)
-          prevRTI = thisRTI
-      #echo "==== end toc ",s," ",ii
+    when cname:
+      tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,false)
     else:
-      when cname:
-        tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,false)
-      else:
-        tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,addr localCode)
+      tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,addr localCode)
 
 when noTicToc:
   template toc*() = discard
@@ -822,7 +783,7 @@ proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] =
           t.children.add ri.children[i]
     do: # loc not found
       hs[loc] = ri
-    #let tot = makeHotSpotTable(List[RTInfoObj](ri.children))
+    let tot = makeHotSpotTable(List[RTInfoObj](ri.children))
   return (nstot, ohtot)
 
 proc echoHotspots* =
@@ -879,6 +840,12 @@ proc echoHotspots* =
         let tsnspct = 100.0 * tsns / nstot
         echo &"{pct:6.3f} {tsnspct:7.3f} {count} {mf} {nc} S {lc} {nm}"
 
+proc echoProf*(def = 0) =
+  case intParam("prof",def)
+  of 1: echoHotspots()
+  of 2: echoTimers()
+  else: discard
+
 when isMainModule:
   import os
   proc test =
diff --git a/src/bench/benchLinalg.nim b/src/bench/benchLinalg.nim
index 68d61447..928fae78 100644
--- a/src/bench/benchLinalg.nim
+++ b/src/bench/benchLinalg.nim
@@ -45,6 +45,7 @@ proc checkMem =
 # fps: flops per site
 # bps: bytes moved (load+store) per site
 # mm: memory footprint (bytes) per site
+var minNrep = int.high
 template bench(fps,bps,mm,eqn: untyped) {.dirty.} =
   block:
     let vol = lo.nSites.float
@@ -73,12 +74,19 @@ template bench(fps,bps,mm,eqn: untyped) {.dirty.} =
     inc nbench
     echo "bench: ",nbench|(-6), "secs: ", dt|(6,3), "  mf: ", mf|7, "  mb: ", mb|7, "  mem: ", mem, "  nrep: ", nrep
     echo exp2string(eqn), "\n"
+    minNrep = min(minNrep, nrep)
 template bench(fps,bps,eqn: untyped) =
   bench(fps,bps,0,eqn)
 
 proc test(lat:auto) =
-  let maxl = intParam("maxl",16)
-  if lat[0] > maxl: return
+  let minl = intParam("minl",0)
+  if lat[0] < minl: return
+  let maxl = intParam("maxl",0)
+  if maxl>0:
+    if lat[0] > maxl: return
+  else:
+    if minNrep == 1: return
+  minNrep = int.high
   var lo = newLayout(lat)
   template newCV: untyped = lo.ColorVector()
   template newCM: untyped = lo.ColorMatrix()
diff --git a/src/bench/benchStagProp.nim b/src/bench/benchStagProp.nim
index 2ac4f8f5..164758cf 100644
--- a/src/bench/benchStagProp.nim
+++ b/src/bench/benchStagProp.nim
@@ -63,7 +63,7 @@ threads:
   threadBarrier()
   echo r.norm2
 #echo v2
-echoTimers()
+echoProf()
 
 var g3:array[8,type(g[0])]
 for i in 0..3:
@@ -76,6 +76,6 @@ var s3 = newStag3(g3)
 s3.solve(v2, v1, mass, sp)
 resetTimers()
 s3.solve(v2, v1, mass, sp)
-echoTimers()
+echoProf()
 
 qexFinalize()
diff --git a/src/experimental/stagag.nim b/src/experimental/stagag.nim
index e29f9341..4574ce98 100644
--- a/src/experimental/stagag.nim
+++ b/src/experimental/stagag.nim
@@ -2103,8 +2103,5 @@ if outfn != "":
   echo "Saving gauge field to file: ", outfn
   let err = g.saveGauge outfn
 
-case intParam("prof",0)
-of 1: echoTimers()
-of 2: echoHotspots()
-else: discard
+echoProf()
 qexFinalize()
diff --git a/src/physics/qcdTypes.nim b/src/physics/qcdTypes.nim
index 007ca9f3..2f0bcef2 100644
--- a/src/physics/qcdTypes.nim
+++ b/src/physics/qcdTypes.nim
@@ -416,7 +416,6 @@ macro makeConstructors(x: untyped): untyped =
   result = newStmtList()
   result.add getAst mp(ident(f&"S"), ident("S"&r&"V"), ident"result")
   result.add getAst mp(ident(f&"D"), ident("D"&r&"V"), ident"result")
-  const defPrec {.strdefine.} = "D"
   result.add getAst mp(ident(f), ident(defPrec&r&"V"), ident"result")
   # non-Simd versions
   result.add getAst mp(ident(f&"S1"), ident("S"&r), ident"result")
diff --git a/tests/base/tshift.nim b/tests/base/tshift.nim
index 75d39e5d..4ddc04f1 100644
--- a/tests/base/tshift.nim
+++ b/tests/base/tshift.nim
@@ -126,18 +126,20 @@ template testS16(Smd: typedesc) =
 qexInit()
 
 template makeSimdArrayX(T,N,B: untyped) {.dirty.} =
-  makeSimdArray(`T X`, N, B)
-  type T = Simd[`T X`]
-  template toDoubleImpl(x: `T X`): untyped = x  # always already double
+  #makeSimdArray(`T X`, N, B)
+  #type T = Simd[`T X`]
+  #template toDoubleImpl(x: `T X`): untyped = x  # always already double
+  type T = Simd[SimdArrayObj[N,B]]
 
 #testS1(float)
+
 makeSimdArrayX(SD1, 1, float)
 testS1(SD1)
 when declared(SimdD1):
   testS1(SimdD1)
 
-#makeSimdArrayX(SS1, 1, float32)
-#testS1(SS1)
+makeSimdArrayX(SS1, 1, float32)
+testS1(SS1)
 when declared(SimdS1):
   testS1(SimdS1)
 
@@ -146,8 +148,8 @@ testS2(SD2)
 when declared(SimdD2):
   testS2(SimdD2)
 
-#makeSimdArrayX(SS2, 2, float32)
-#testS2(SS2)
+makeSimdArrayX(SS2, 2, float32)
+testS2(SS2)
 when declared(SimdS2):
   testS2(SimdS2)
 
@@ -155,6 +157,9 @@ makeSimdArrayX(SD4, 4, float)
 testS4(SD4)
 when declared(SimdD4):
   testS4(SimdD4)
+
+makeSimdArrayX(SS4, 4, float32)
+testS4(SS4)
 when declared(SimdS4):
   testS4(SimdS4)
 
@@ -162,6 +167,9 @@ makeSimdArrayX(SD8, 8, float)
 testS8(SD8)
 when declared(SimdD8):
   testS8(SimdD8)
+
+makeSimdArrayX(SS8, 8, float32)
+testS8(SS8)
 when declared(SimdS8):
   testS8(SimdS8)
 
@@ -169,8 +177,10 @@ makeSimdArrayX(SD16, 16, float)
 testS16(SD16)
 when declared(SimdD16):
   testS16(SimdD16)
+
+makeSimdArrayX(SS16, 16, float32)
+testS16(SS16)
 when declared(SimdS16):
   testS16(SimdS16)
 
-
 qexFinalize()

From 5d6b7b47a61651bc116484fb35a91f7966f5a268 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Tue, 19 Nov 2024 15:19:52 -0600
Subject: [PATCH 02/12] fix threading issue fix issue when doing multiple
 hotspot outputs add single precision versions of some benchmarks

---
 src/base/backtrace.nim       |   3 +-
 src/base/omp.nim             |  11 ++
 src/base/profile.nim         |   1 +
 src/base/stdUtils.nim        |  13 ++-
 src/base/threading.nim       | 218 +++++++++++++++++++++++++++--------
 src/bench/benchLinalgS.nim   |   3 +
 src/bench/benchStagPropS.nim |   3 +
 src/comms/commsUtils.nim     |  30 ++++-
 src/examples/staghmc_sh.nim  |   3 +-
 src/layout/shifts.nim        |   3 +-
 10 files changed, 222 insertions(+), 66 deletions(-)
 create mode 100644 src/bench/benchLinalgS.nim
 create mode 100644 src/bench/benchStagPropS.nim

diff --git a/src/base/backtrace.nim b/src/base/backtrace.nim
index 0a75dbe9..d393cd82 100644
--- a/src/base/backtrace.nim
+++ b/src/base/backtrace.nim
@@ -8,7 +8,7 @@ proc backtrace_symbols(buffer: ptr pointer, size: cint): ptr UncheckedArray[cstr
 
 proc print_trace =
   #void *array[10];
-  const nmax = 10
+  const nmax = 20
   var arr: array[nmax, pointer]
   #char **strings;
 
@@ -22,6 +22,7 @@ proc print_trace =
 
 proc sigtrace(sig: cint) {.noconv.} =
   print_trace()
+  quit()
 
 proc setTrace* =
   c_signal(SIGSEGV, sigtrace)
diff --git a/src/base/omp.nim b/src/base/omp.nim
index 80b803b2..52b2c3fe 100644
--- a/src/base/omp.nim
+++ b/src/base/omp.nim
@@ -28,6 +28,11 @@ else:
     #forceOmpOn()
     #{. emit:["#pragma omp ", p] .}
     {. emit:["_Pragma(\"omp ", p, "\")"] .}
+  template ompPragma(p:string,body:typed) =
+    {. push stackTrace:off, lineTrace:off, line_dir:off .}
+    {. emit:["_Pragma(\"omp ", p, "\")"] .}
+    body
+    {. pop .}
   template ompBlock*(p:string; body:untyped) =
     #{. emit:"#pragma omp " & p .}
     #{. emit:"{ /* Inserted by ompBlock " & p & " */".}
@@ -38,6 +43,12 @@ else:
     #{. emit:"} /* End ompBlock " & p & " */".}
 
 template ompBarrier* = ompPragma("barrier")
+template ompFlush* = ompPragma("flush")
+template ompFlushAcquire* = ompPragma("flush acquire")
+template ompFlushRelease* = ompPragma("flush release")
+template ompFlushSeqCst* = ompPragma("flush seq_cst")
+template ompAtomicRead*(body) = ompPragma("atomic read acquire", body)
+template ompAtomicWrite*(body) = ompPragma("atomic write release", body)
 
 template ompParallel*(body:untyped) =
   ompBlock("parallel"):
diff --git a/src/base/profile.nim b/src/base/profile.nim
index 506568ca..160f0411 100644
--- a/src/base/profile.nim
+++ b/src/base/profile.nim
@@ -787,6 +787,7 @@ proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] =
   return (nstot, ohtot)
 
 proc echoHotspots* =
+  hs.clear
   let tot = makeHotspotTable(rtiStack)
   #let nstot = tot.ns
   let ohtot = tot.oh
diff --git a/src/base/stdUtils.nim b/src/base/stdUtils.nim
index e0430c3b..5da5f26d 100644
--- a/src/base/stdUtils.nim
+++ b/src/base/stdUtils.nim
@@ -8,10 +8,11 @@ type
 #template `[]`*(x: cArray): untyped = addr x[0]
 #template `&`*(x: ptr cArray): untyped = addr x[0]
 
-template ptrInt*(x:untyped):untyped = cast[ByteAddress](x)
-template addrInt*(x:untyped):untyped = cast[ByteAddress](addr(x))
-template unsafeAddrInt*(x:untyped):untyped = cast[ByteAddress](addr(x))
-template toHex*(x: ptr typed): untyped = toHex(cast[ByteAddress](x))
+template ptrInt*(x: auto): auto = cast[uint](x)
+template addrInt*(x: auto): auto = cast[uint](addr(x))
+template unsafeAddrInt*(x: auto): auto = cast[uint](addr(x))
+template toHex*(x: ptr auto): auto = toHex(cast[uint](x))
+template toHex*(x: pointer): auto = toHex(cast[uint](x))
 
 type
   ConstInt* {.importc:"const int".} = object
@@ -31,7 +32,7 @@ proc isInteger*(s: string):bool =
   var t:int
   parseInt(s, t) == s.len
 
-template `$&`*(x: untyped): string =
+template `$&`*(x: auto): string =
   toHex(unsafeAddrInt(x))
 
 proc `|`*(s: string, d: tuple[w:int,c:char]): string =
@@ -52,7 +53,7 @@ proc `|`*(f: SomeFloat, d: tuple[w,p: int]): string =
     formatFloat(f, ffDefault, d.p) | d.w
 proc `|`*(f: float, d: int): string =
   f | (d,d)
-template `|-`*(x:SomeNumber, y: int): untyped =
+template `|-`*(x:SomeNumber, y: int): auto =
   x | -y
 
 proc indexOf*[T](x: openArray[T], y: auto): int =
diff --git a/src/base/threading.nim b/src/base/threading.nim
index 73db56c1..8a01205b 100644
--- a/src/base/threading.nim
+++ b/src/base/threading.nim
@@ -4,6 +4,8 @@ import stdUtils
 import macros
 import omp
 import metaUtils
+import base/basicOps
+getOptimPragmas
 
 type
   ThreadShare* = object
@@ -18,19 +20,34 @@ var threadNum*{.threadvar.}:int
 var numThreads*{.threadvar.}:int
 var threadLocals*{.threadvar.}:ThreadObj
 var inited = false
+var ts: pointer = nil
+var nts = 0
 
-template initThreadLocals*(ts:seq[ThreadShare]):untyped =
+proc allocTs* {.alwaysInline.} =
+  if numThreads > nts and threadNum == 0:
+    if ts == nil:
+      ts = allocShared(numThreads*sizeof(ThreadShare))
+    else:
+      ts = reallocShared(ts, numThreads*sizeof(ThreadShare))
+    nts = numThreads
+
+#template initThreadLocals*(ts:seq[ThreadShare]) =
+template initThreadLocals* =
+  bind ts
   threadLocals.threadNum = threadNum
   threadLocals.numThreads = numThreads
-  threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr)
+  #threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr)
+  threadLocals.share = cast[ptr cArray[ThreadShare]](ts)
   threadLocals.share[threadNum].p = nil
   threadLocals.share[threadNum].counter = 0
 proc init =
   inited = true
   threadNum = 0
   numThreads = 1
-  var ts = newSeq[ThreadShare](numThreads)
-  initThreadLocals(ts)
+  #var ts = newSeq[ThreadShare](numThreads)
+  #initThreadLocals(ts)
+  allocTs()
+  initThreadLocals()
 template threadsInit* =
   if not inited:
     init()
@@ -57,51 +74,61 @@ template emitStackTrace: untyped =
 template threads*(body:untyped):untyped =
   checkInit()
   doAssert(numThreads==1)
-  let tidOld = threadNum
-  let nidOld = numThreads
-  let tlOld = threadLocals
+  #let tidOld = threadNum
+  #let nidOld = numThreads
+  #let tlOld = threadLocals
   #proc tproc2{.genSym,inline.} =
   #  body
   proc tproc{.genSym.} =
     emitStackTrace()
-    var ts:seq[ThreadShare]
+    #var ts:seq[ThreadShare]
     ompParallel:
       threadNum = ompGetThreadNum()
       numThreads = ompGetNumThreads()
-      if threadNum==0: ts.newSeq(numThreads)
+      #if threadNum==0: ts.newSeq(numThreads)
+      allocTs()
       threadBarrierO()
-      initThreadLocals(ts)
+      #initThreadLocals(ts)
+      initThreadLocals()
       threadBarrierO()
       #echoAll threadNum, " s: ", ptrInt(threadLocals.share)
       body
       #tproc2()
       threadBarrierO()
   tproc()
-  threadNum = tidOld
-  numThreads = nidOld
-  threadLocals = tlOld
+  #threadNum = tidOld
+  #numThreads = nidOld
+  #threadLocals = tlOld
+  threadNum = 0
+  numThreads = 1
+  initThreadLocals()
 template threads*(x0:untyped;body:untyped):untyped =
   checkInit()
-  let tidOld = threadNum
-  let nidOld = numThreads
-  let tlOld = threadLocals
+  #let tidOld = threadNum
+  #let nidOld = numThreads
+  #let tlOld = threadLocals
   proc tproc(xx:var type(x0)) {.genSym.} =
-    var ts:seq[ThreadShare]
+    #var ts:seq[ThreadShare]
     ompParallel:
       threadNum = ompGetThreadNum()
       numThreads = ompGetNumThreads()
-      if threadNum==0: ts.newSeq(numThreads)
+      #if threadNum==0: ts.newSeq(numThreads)
+      allocTs()
       threadBarrierO()
-      initThreadLocals(ts)
+      #initThreadLocals(ts)
+      initThreadLocals()
       threadBarrierO()
       #echoAll threadNum, " s: ", ptrInt(threadLocals.share)
       subst(x0,xx):
         body
       threadBarrierO()
   tproc(x0)
-  threadNum = tidOld
-  numThreads = nidOld
-  threadLocals = tlOld
+  #threadNum = tidOld
+  #numThreads = nidOld
+  #threadLocals = tlOld
+  threadNum = 0
+  numThreads = 1
+  initThreadLocals()
 
 template nothreads*(body: untyped): untyped =
   ## convenient way to turn off threading
@@ -114,13 +141,18 @@ template threadBarrierO* = ompBarrier
 template threadMaster*(x:untyped) = ompMaster(x)
 template threadSingle*(x:untyped) = ompSingle(x)
 template threadCritical*(x:untyped) = ompCritical(x)
+template threadFlush* = ompFlush
+template threadFlushRelease* = ompFlushRelease
+template threadFlushAcquire* = ompFlushAcquire
+template threadFlushSeqCst* = ompFlushSeqCst
+template threadAtomicRead*(body:typed) = ompAtomicRead(body)
+template threadAtomicWrite*(body:typed) = ompAtomicWrite(body)
 
 template threadDivideLow*(x,y: untyped): untyped =
   x + (threadNum*(y-x)) div numThreads
 template threadDivideHigh*(x,y: untyped): untyped =
   x + ((threadNum+1)*(y-x)) div numThreads
 
-
 proc tForX*(index,i0,i1,body:NimNode):NimNode =
   return quote do:
     let d = 1+`i1` - `i0`
@@ -154,39 +186,123 @@ iterator `.|`*[S, T](a: S, b: T): T {.inline.} =
     inc(res)
 """
 
-template t0waitO* = threadBarrier()
-template t0waitX* =
-  if threadNum==0:
-    inc threadLocals.share[0].counter
-    let tbar0 = threadLocals.share[0].counter
-    for b in 1..<numThreads:
-      let p{.volatile.} = threadLocals.share[b].counter.addr
+template t0waitB* = threadBarrier()
+#template t0waitO* = t0waitB()
+template t0waitA* =
+  if numThreads > 1:
+    if threadNum==0:
+      inc threadLocals.share[0].counter
+      let tbar0 = threadLocals.share[0].counter
+      for b in 1..<numThreads:
+        let p = threadLocals.share[b].counter.addr
+        while true:
+          #fence()
+          #ompAcquire
+          #if p[] >= tbar0: break
+          var t {.noInit.}: type(p[])
+          ompAtomicRead: t = p[]
+          if t >= tbar0: break
+    else:
+      #inc threadLocals.share[threadNum].counter
+      #fence()
+      #ompRelease
+      let t = threadLocals.share[threadNum].counter + 1
+      ompAtomicWrite:
+        threadLocals.share[threadNum].counter = t
+template t0wait* = t0waitA()
+#template t0wait* = t0waitB()
+
+template twait0B* = threadBarrier()
+template twait0A* =
+  if numThreads > 1:
+    if threadNum==0:
+      #inc threadLocals.share[0].counter
+      #fence()
+      #ompRelease
+      let t = threadLocals.share[0].counter + 1
+      ompAtomicWrite:
+        threadLocals.share[0].counter = t
+    else:
+      inc threadLocals.share[threadNum].counter
+      let tbar0 = threadLocals.share[threadNum].counter
+      let p = threadLocals.share[0].counter.addr
       while true:
-        fence()
-        if p[] >= tbar0: break
-  else:
-    inc threadLocals.share[threadNum].counter
-    fence()
-template t0wait* = t0waitO()
+        #fence()
+        #ompAcquire
+        #if p[] >= tbar0: break
+        var t {.noInit.}: type(p[])
+        ompAtomicRead: t = p[]
+        if t >= tbar0: break
+template twait0* = twait0A()
+#template twait0* = twait0B()
+
+template threadBarrierA* =
+  threadFlushRelease
+  t0waitA
+  twait0A
+  threadFlushAcquire
+template threadBarrier* = threadBarrierA
+#template threadBarrier* = ompBarrier
 
-template twait0O* = threadBarrier()
-template twait0X* =
+template threadSum01A*[T](a: T) =
+  ## sum value with result on thread 0, atomic version
   if threadNum==0:
-    inc threadLocals.share[0].counter
-    fence()
+    tic("threadSum01A")
+    t0wait()
+    toc("t0wait")
+    for i in 1..<numThreads:
+      var p{.noInit.}: pointer
+      threadAtomicRead:
+        p = threadLocals.share[i].p
+      a += cast[ptr T](p)[]
+    toc("sum")
+    twait0()
+    toc("twait0")
   else:
-    inc threadLocals.share[threadNum].counter
-    let tbar0 = threadLocals.share[threadNum].counter
-    let p{.volatile.} = threadLocals.share[0].counter.addr
-    while true:
-      fence()
-      if p[] >= tbar0: break
-template twait0* = twait0O()
+    threadAtomicWrite:
+      threadLocals.share[threadNum].p = a.addr
+    t0wait()
+    twait0()
+
+template threadSum01B*[T](a: T) =
+  ## sum value with result on thread 0, barrier version
+  block:
+    tic("threadSum01")
+    if threadNum!=0:
+      threadLocals.share[threadNum].p = a.addr
+    threadBarrier()
+    toc("threadBarrier first")
+    if threadNum==0:
+      for i in 1..<numThreads:
+        a += cast[ptr T](threadLocals.share[i].p)[]
+    toc("sum")
+    threadBarrier()
+    toc("threadBarrier last")
+
+template threadSum01*(a: auto) = threadSum01A(a)
+#template threadSum01*(a: auto) = threadSum01B(a)
+template threadSum0*(a: auto) = threadSum01(a)
 
-template threadBarrier* =
-  #t0waitX
-  #twait0X
-  ompBarrier
+# threadMax0 FIXME
+
+template threadBroadcast1A*[T](a: T) =
+  if threadNum==0:
+    tic("threadRankSum1")
+    threadAtomicWrite:
+      threadLocals.share[0].p = a.addr
+    twait0()
+    toc("twait0")
+    t0wait()
+    toc("t0wait")
+  else:
+    twait0()
+    var p{.noInit.}: pointer
+    threadAtomicRead:
+      p = threadLocals.share[0].p
+    a = cast[ptr T](p)[]
+    t0wait()
+template threadBroadcast1*(a: auto) = threadBroadcast1A(a)
+template threadBroadcast*(a: auto) = threadBroadcast1(a)
 
 macro threadSum*(a:varargs[untyped]):auto =
   #echo a.treeRepr
diff --git a/src/bench/benchLinalgS.nim b/src/bench/benchLinalgS.nim
new file mode 100644
index 00000000..048069b7
--- /dev/null
+++ b/src/bench/benchLinalgS.nim
@@ -0,0 +1,3 @@
+import base/globals
+setDefaultSingle()
+import benchLinalg
diff --git a/src/bench/benchStagPropS.nim b/src/bench/benchStagPropS.nim
new file mode 100644
index 00000000..e2358f69
--- /dev/null
+++ b/src/bench/benchStagPropS.nim
@@ -0,0 +1,3 @@
+import base/globals
+setDefaultSingle()
+import benchStagProp
diff --git a/src/comms/commsUtils.nim b/src/comms/commsUtils.nim
index bb9a423d..8729318a 100644
--- a/src/comms/commsUtils.nim
+++ b/src/comms/commsUtils.nim
@@ -1,9 +1,10 @@
-import base/[threading,metaUtils,stdUtils]
+import base/[threading,metaUtils,stdUtils,basicOps,profile]
 import times
 import os
 import macros
 #import strUtils
 import comms/comms
+getOptimPragmas()
 
 proc evalArgs*(call:var NimNode; args:NimNode):NimNode =
   result = newStmtList()
@@ -218,6 +219,7 @@ template rankSum*(a:varargs[untyped]) =
   let comm = getDefaultComm()
   comm.rankSum(a)
 
+#[
 #var count = 0
 template threadRankSum1*[T](comm: Comm, a: T) =
   mixin rankSum
@@ -257,7 +259,6 @@ template threadRankSum1*[T](comm: Comm, a: T) =
     toc("t0wait")
     for i in 1..<numThreads:
       a += cast[ptr type(a)](threadLocals.share[i].p)[]
-      #a += ta2[threadNum]
     toc("sum")
     rankSum(comm,a)
     toc("rankSum")
@@ -265,11 +266,25 @@ template threadRankSum1*[T](comm: Comm, a: T) =
     twait0()
     toc("twait0")
   else:
-    threadLocals.share[threadNum].p = a.addr
+    threadAtomicWrite:
+      threadLocals.share[threadNum].p = a.addr
     #ta2[threadNum] = a
     t0wait()
     twait0()
     a = ta
+]#
+
+#template threadRankSum1x*[T](comm: Comm, a: T) =
+proc threadRankSum1x*[T](comm: Comm, a: var T) =
+  mixin rankSum
+  tic("threadRankSum1x")
+  threadSum0(a)
+  toc("threadSum0")
+  if threadNum==0:
+    rankSum(comm,a)
+  toc("rankSum")
+  threadBroadcast(a)
+  toc("threadBroadcast")
 
 #[
 proc threadRankSumN*(comm: Comm, a: NimNode): auto =
@@ -336,11 +351,10 @@ template threadRankSum*(a:varargs[untyped]) =
 #template threadRankSum*(a:typed):auto =
 
 template threadRankSum*(c: Comm, a: typed) =
-  threadRankSum1(c, a)
+  threadRankSum1x(c, a)
 template threadRankSum*(a: typed) =
   let comm = getDefaultComm()
-  threadRankSum1(comm, a)
-
+  threadRankSum1x(comm, a)
 
 macro rankMax*(a:varargs[untyped]):auto =
   if a.len==1:
@@ -357,16 +371,20 @@ template threadRankMax1*(a:untyped):untyped =
   var ta{.global.}:type(a)
   if threadNum==0:
     t0wait()
+    #threadBarrier()
     for i in 1..<numThreads:
       let c = cast[ptr type(a)](threadLocals.share[i].p)[]
       if a < c: a = c
     rankMax(a)
     ta = a
     twait0()
+    #threadBarrier()
   else:
     threadLocals.share[threadNum].p = a.addr
     t0wait()
+    #threadBarrier()
     twait0()
+    #threadBarrier()
     a = ta
 
 macro threadRankMax*(a:varargs[untyped]):auto =
diff --git a/src/examples/staghmc_sh.nim b/src/examples/staghmc_sh.nim
index ac5c5d1f..58658e44 100644
--- a/src/examples/staghmc_sh.nim
+++ b/src/examples/staghmc_sh.nim
@@ -817,7 +817,8 @@ for n in inittraj+1..inittraj+trajs:
 
 toc("hmc")
 
-if showTimers: echoTimers(timerExpandRatio, timerEchoDropped)
+#if showTimers: echoTimers(timerExpandRatio, timerEchoDropped)
+echoProf()
 processSaveParams()
 writeParamFile()
 qexfinalize()
diff --git a/src/layout/shifts.nim b/src/layout/shifts.nim
index 1a62cebd..78d8e102 100644
--- a/src/layout/shifts.nim
+++ b/src/layout/shifts.nim
@@ -519,7 +519,8 @@ proc newShifters*[F](f: F, len: int, sub="all"): auto =
   r
 
 #template transporterApply*(x: Transporter, y: auto): auto =
-proc transporterApply*(x: Transporter, y: auto): auto {.alwaysInline.} =
+#proc transporterApply*(x: Transporter, y: auto): auto {.alwaysInline.} =
+proc transporterApply*(x: Transporter, y: auto): auto =
   mixin mul, load1, adj, `[]`
   var r = x.field
   when compiles(x.link):

From e0b23c4554c3bd2f81fd9e57473ae02768f9f304 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Wed, 20 Nov 2024 22:42:28 -0600
Subject: [PATCH 03/12] improve threading

---
 src/base/profile.nim        |  3 +-
 src/base/threading.nim      | 97 +++++++++++++++++++++++++++++++++++--
 src/bench/benchStagProp.nim |  2 +
 3 files changed, 97 insertions(+), 5 deletions(-)

diff --git a/src/base/profile.nim b/src/base/profile.nim
index 160f0411..6a861f6c 100644
--- a/src/base/profile.nim
+++ b/src/base/profile.nim
@@ -15,7 +15,8 @@ template ticDiffSecs*(x,y: TicType): float = 1e-9 * float(x.int64 - y.int64)
 template `-`*(x,y: TicType): TicType = TicType(x.int64 - y.int64)
 
 var
-  DropWasteTimerRatio* = 0.05  ## Drop children timers if the proportion of their overhead is larger than this.
+  ## Drop children timers if the proportion of their overhead is larger than this.
+  DropWasteTimerRatio* = floatParam("dropRatio", 0.05)
   VerboseTimer* = false  ## If true print out all the timers during execution.
 
 ##[
diff --git a/src/base/threading.nim b/src/base/threading.nim
index 8a01205b..e4acf106 100644
--- a/src/base/threading.nim
+++ b/src/base/threading.nim
@@ -5,15 +5,18 @@ import macros
 import omp
 import metaUtils
 import base/basicOps
+import bitops
 getOptimPragmas
 
 type
   ThreadShare* = object
     p*:pointer
     counter*:int
+    extra*:int
   ThreadObj* = object
     threadNum*:int
     numThreads*:int
+    #counter*: int
     share*:ptr cArray[ThreadShare]
 
 var threadNum*{.threadvar.}:int
@@ -209,8 +212,29 @@ template t0waitA* =
       let t = threadLocals.share[threadNum].counter + 1
       ompAtomicWrite:
         threadLocals.share[threadNum].counter = t
-template t0wait* = t0waitA()
+template t0waitC* =
+  if numThreads > 1:
+    if threadNum==0:
+      inc threadLocals.share[0].counter
+      let tbar0 = threadLocals.share[0].counter
+      var left = numThreads - 1
+      for i in 1..<numThreads: threadLocals.share[i].extra = 1
+      while left > 0:
+        for i in 1..<numThreads:
+          if threadLocals.share[i].extra > 0:
+            let p = threadLocals.share[i].counter.addr
+            var t {.noInit.}: type(p[])
+            ompAtomicRead: t = p[]
+            if t >= tbar0:
+              threadLocals.share[i].extra = 0
+              dec left
+    else:
+      let t = threadLocals.share[threadNum].counter + 1
+      ompAtomicWrite:
+        threadLocals.share[threadNum].counter = t
+#template t0wait* = t0waitA()
 #template t0wait* = t0waitB()
+template t0wait* = t0waitC()
 
 template twait0B* = threadBarrier()
 template twait0A* =
@@ -244,10 +268,10 @@ template threadBarrierA* =
 template threadBarrier* = threadBarrierA
 #template threadBarrier* = ompBarrier
 
-template threadSum01A*[T](a: T) =
+template threadSum01A0*[T](a: T) =
   ## sum value with result on thread 0, atomic version
   if threadNum==0:
-    tic("threadSum01A")
+    tic("threadSum01A0")
     t0wait()
     toc("t0wait")
     for i in 1..<numThreads:
@@ -264,6 +288,33 @@ template threadSum01A*[T](a: T) =
     t0wait()
     twait0()
 
+template threadSum01A*[T](a: T) =
+  ## sum value with result on thread 0, atomic version
+  if threadNum==0:
+    tic("threadSum01A")
+    inc threadLocals.share[0].counter
+    let tbar0 = threadLocals.share[0].counter
+    for b in 1..<numThreads:
+      let pc = threadLocals.share[b].counter.addr
+      while true:
+        var t {.noInit.}: type(pc[])
+        ompAtomicRead: t = pc[]
+        if t >= tbar0: break
+      var p{.noInit.}: pointer
+      threadAtomicRead:
+        p = threadLocals.share[b].p
+      a += cast[ptr T](p)[]
+    toc("sum")
+    twait0()
+    toc("twait0")
+  else:
+    threadAtomicWrite:
+      threadLocals.share[threadNum].p = a.addr
+    let t = threadLocals.share[threadNum].counter + 1
+    threadAtomicWrite:
+      threadLocals.share[threadNum].counter = t
+    twait0()
+
 template threadSum01B*[T](a: T) =
   ## sum value with result on thread 0, barrier version
   block:
@@ -279,15 +330,53 @@ template threadSum01B*[T](a: T) =
     threadBarrier()
     toc("threadBarrier last")
 
+template threadSum01T*[T](a: T) =
+  ## sum value with result on thread 0, tree version
+  threadAtomicWrite:
+    threadLocals.share[threadNum].p = a.addr
+  var c = threadLocals.share[threadNum].counter
+  var done = false
+  var b = 1
+  while b < numThreads:
+    inc c
+    if not done:
+      let o = bitxor(threadNum, b)
+      if bitand(threadNum, b) == 0:
+        if o < numThreads:
+          while true:
+            var d{.noInit.}: int
+            threadAtomicRead:
+              d = threadLocals.share[o].counter
+            if d >= c: break
+          var p{.noInit.}: pointer
+          threadAtomicRead:
+            p = threadLocals.share[o].p
+          a += cast[ptr T](p)[]
+        threadAtomicWrite:
+          threadLocals.share[threadNum].counter = c
+      else:
+        threadAtomicWrite:
+          threadLocals.share[threadNum].counter = c
+        while true:
+          var d{.noInit.}: int
+          threadAtomicRead:
+            d = threadLocals.share[o].counter
+          if d >= c: break
+        done = true
+    else:
+      threadLocals.share[threadNum].counter = c
+    b *= 2
+
 template threadSum01*(a: auto) = threadSum01A(a)
 #template threadSum01*(a: auto) = threadSum01B(a)
+#template threadSum01*(a: auto) = threadSum01T(a)
 template threadSum0*(a: auto) = threadSum01(a)
 
 # threadMax0 FIXME
 
 template threadBroadcast1A*[T](a: T) =
   if threadNum==0:
-    tic("threadRankSum1")
+    tic("threadBroadcast1A")
     threadAtomicWrite:
       threadLocals.share[0].p = a.addr
     twait0()
diff --git a/src/bench/benchStagProp.nim b/src/bench/benchStagProp.nim
index 164758cf..4a518367 100644
--- a/src/bench/benchStagProp.nim
+++ b/src/bench/benchStagProp.nim
@@ -71,6 +71,8 @@ for i in 0..3:
   g3[2*i+1] = lo.ColorMatrix()
   g3[2*i+1].randomSU rs
   g3[2*i+1] *= 0.1
+for i in 0..<g3.len:
+  echo &"g3[{i}]2: {g3[i].norm2}"
 var s3 = newStag3(g3)
 #s3.D(v2, v1, m)
 s3.solve(v2, v1, mass, sp)

From b5b6d0d1378b118d72b133038be16a35f92cd02e Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Thu, 21 Nov 2024 14:01:02 -0600
Subject: [PATCH 04/12] fix echo in profile

---
 src/base/profile.nim     |  2 +-
 src/comms/comms.nim      | 10 +----
 src/comms/commsEcho.nim  | 86 ++++++++++++++++++++++++++++++++++++++++
 src/comms/commsTypes.nim |  8 ++++
 src/comms/commsUtils.nim | 83 --------------------------------------
 5 files changed, 97 insertions(+), 92 deletions(-)
 create mode 100644 src/comms/commsEcho.nim

diff --git a/src/base/profile.nim b/src/base/profile.nim
index 6a861f6c..de7ca62e 100644
--- a/src/base/profile.nim
+++ b/src/base/profile.nim
@@ -1,6 +1,6 @@
 import threading
 export threading
-import comms/comms, stdUtils, base/[basicOps,params]
+import comms/commsEcho, stdUtils, base/[basicOps,params]
 import os, strutils, sequtils, std/monotimes, std/tables, std/algorithm, strformat
 export monotimes
 getOptimPragmas()
diff --git a/src/comms/comms.nim b/src/comms/comms.nim
index 9e4be9a8..28e3be03 100644
--- a/src/comms/comms.nim
+++ b/src/comms/comms.nim
@@ -1,14 +1,6 @@
 import commsTypes
 export commsTypes
 
-# globals
-
-var defaultComm*: Comm
-template getDefaultComm*(): Comm = defaultComm
-template getComm*(): Comm = getDefaultComm()  # temporary alias
-var myRank* = 0
-var nRanks* = 1
-
 # base methods
 
 method name*(c: Comm): string {.base.} = discard
@@ -157,6 +149,8 @@ commsNames.add "QMP"
 commsInits.add getQmpComm
 commsFinis.add commsFinalizeQmp
 
+import commsEcho
+export commsEcho
 import commsUtils
 export commsUtils
 
diff --git a/src/comms/commsEcho.nim b/src/comms/commsEcho.nim
new file mode 100644
index 00000000..b06c68ef
--- /dev/null
+++ b/src/comms/commsEcho.nim
@@ -0,0 +1,86 @@
+import macros
+import base/[threading,metaUtils]
+import commsTypes
+
+proc evalArgs*(call:var NimNode; args:NimNode):NimNode =
+  result = newStmtList()
+  for i in 0..<args.len:
+    let t = genSym()
+    let a = args[i]
+    result.add(quote do:
+      when `a` is openarray:
+        let `t` = $`a`
+      else:
+        let `t` = `a`
+      )
+    call.add(t)
+proc cprintf*(fmt:cstring){.importc:"printf",varargs,header:"<stdio.h>".}
+#proc printfOrdered(
+macro printf*(fmt:string; args:varargs[untyped]):auto =
+  var call = newCall(ident("cprintf"), fmt)
+  result = evalArgs(call, args)
+  result.add(quote do:
+    if myRank==0 and threadNum==0:
+      `call`
+    )
+proc echoRaw*(x: varargs[typed, `$`]) {.magic: "Echo".}
+macro echoAll*(args:varargs[untyped]):auto =
+  var call = newCall(bindSym"echoRaw")
+  result = evalArgs(call, args)
+  result.add(quote do:
+    `call`
+    )
+macro echoRank*(args:varargs[untyped]):auto =
+  var call = newCall(bindSym"echoRaw")
+  call.add ident"myRank"
+  call.add newLit"/"
+  call.add ident"nRanks"
+  call.add newLit": "
+  result = evalArgs(call, args)
+  template f(x:untyped):untyped =
+    if threadNum==0: x
+  result.add getAst(f(call))
+macro echo0*(args: varargs[untyped]): untyped =
+  var call = newCall(bindSym"echoRaw")
+  result = evalArgs(call, args)
+  result.add(quote do:
+    bind myRank
+    if myRank==0 and threadNum==0:
+      `call`
+    )
+  #echo result.repr
+macro makeEchos(n:static[int]): untyped =
+  template ech(x,y: untyped) =
+    template echo* =
+      when nimvm:
+        x
+      else:
+        y
+  result = newStmtList()
+  for i in 1..n:
+    var er = newCall(bindSym"echoRaw")
+    var e0 = newCall(bindSym"echo0")
+    var ea = newSeq[NimNode](0)
+    for j in 1..i:
+      let ai = ident("a" & $j)
+      er.add ai
+      e0.add ai
+      ea.add newNimNode(nnkIdentDefs).add(ai).add(ident"untyped").add(newEmptyNode())
+    var t = getAst(ech(er,e0)).peelStmt
+    #echo t.treerepr
+    for j in 0..<i: t[3].add ea[j]
+    result.add t
+  #echo result.treerepr
+makeEchos(64)
+#[
+template echo*(a1: untyped) =
+  when nimvm:
+    echoRaw(a1)
+  else:
+    echo0(a1)
+template echo*(a1,a2: untyped) =
+  when nimvm:
+    echoRaw(a1,a2)
+  else:
+    echo0(a1,a2)
+]#
diff --git a/src/comms/commsTypes.nim b/src/comms/commsTypes.nim
index bac49e09..b80d1193 100644
--- a/src/comms/commsTypes.nim
+++ b/src/comms/commsTypes.nim
@@ -1,2 +1,10 @@
 type
   Comm* = ref object of RootObj
+
+# globals
+
+var defaultComm*: Comm
+template getDefaultComm*(): Comm = defaultComm
+template getComm*(): Comm = getDefaultComm()  # temporary alias
+var myRank* = 0
+var nRanks* = 1
diff --git a/src/comms/commsUtils.nim b/src/comms/commsUtils.nim
index 8729318a..4b042752 100644
--- a/src/comms/commsUtils.nim
+++ b/src/comms/commsUtils.nim
@@ -6,89 +6,6 @@ import macros
 import comms/comms
 getOptimPragmas()
 
-proc evalArgs*(call:var NimNode; args:NimNode):NimNode =
-  result = newStmtList()
-  for i in 0..<args.len:
-    let t = genSym()
-    let a = args[i]
-    result.add(quote do:
-      when `a` is openarray:
-        let `t` = $`a`
-      else:
-        let `t` = `a`
-      )
-    call.add(t)
-proc cprintf*(fmt:cstring){.importc:"printf",varargs,header:"<stdio.h>".}
-#proc printfOrdered(
-macro printf*(fmt:string; args:varargs[untyped]):auto =
-  var call = newCall(ident("cprintf"), fmt)
-  result = evalArgs(call, args)
-  result.add(quote do:
-    if myRank==0 and threadNum==0:
-      `call`
-    )
-proc echoRaw*(x: varargs[typed, `$`]) {.magic: "Echo".}
-macro echoAll*(args:varargs[untyped]):auto =
-  var call = newCall(bindSym"echoRaw")
-  result = evalArgs(call, args)
-  result.add(quote do:
-    `call`
-    )
-macro echoRank*(args:varargs[untyped]):auto =
-  var call = newCall(bindSym"echoRaw")
-  call.add ident"myRank"
-  call.add newLit"/"
-  call.add ident"nRanks"
-  call.add newLit": "
-  result = evalArgs(call, args)
-  template f(x:untyped):untyped =
-    if threadNum==0: x
-  result.add getAst(f(call))
-macro echo0*(args: varargs[untyped]): untyped =
-  var call = newCall(bindSym"echoRaw")
-  result = evalArgs(call, args)
-  result.add(quote do:
-    bind myRank
-    if myRank==0 and threadNum==0:
-      `call`
-    )
-  #echo result.repr
-macro makeEchos(n:static[int]): untyped =
-  template ech(x,y: untyped) =
-    template echo* =
-      when nimvm:
-        x
-      else:
-        y
-  result = newStmtList()
-  for i in 1..n:
-    var er = newCall(bindSym"echoRaw")
-    var e0 = newCall(bindSym"echo0")
-    var ea = newSeq[NimNode](0)
-    for j in 1..i:
-      let ai = ident("a" & $j)
-      er.add ai
-      e0.add ai
-      ea.add newNimNode(nnkIdentDefs).add(ai).add(ident"untyped").add(newEmptyNode())
-    var t = getAst(ech(er,e0)).peelStmt
-    #echo t.treerepr
-    for j in 0..<i: t[3].add ea[j]
-    result.add t
-  #echo result.treerepr
-makeEchos(64)
-#[
-template echo*(a1: untyped) =
-  when nimvm:
-    echoRaw(a1)
-  else:
-    echo0(a1)
-template echo*(a1,a2: untyped) =
-  when nimvm:
-    echoRaw(a1,a2)
-  else:
-    echo0(a1,a2)
-]#
-
 template sum*(c:Comm, v:var SomeNumber) = c.allReduce(v)
 template sum*(c:Comm, v:ptr float32, n:int) = c.allReduce(v,n)
 template sum*(c:Comm, v:ptr float64, n:int) = c.allReduce(v,n)

From db028fae09a35fad1c4dbfb1483acb18294b1a0a Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Thu, 21 Nov 2024 14:02:12 -0600
Subject: [PATCH 05/12] add version info

---
 src/base/version.nim | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 src/base/version.nim

diff --git a/src/base/version.nim b/src/base/version.nim
new file mode 100644
index 00000000..66a47f8e
--- /dev/null
+++ b/src/base/version.nim
@@ -0,0 +1,22 @@
+import strutils, times
+
+const sep = '='.repeat(78)
+const head = "QEX Compilation information:"
+const comptime = "  Time: " & staticExec("date")
+const compmach = "  Host: " & staticExec("uname -a")
+const gitlog = "  Log: " & staticExec("git log -1").indent(7).strip
+const gitstat = "  Status: " & staticExec("git status -uno").indent(10).strip
+const buildInfo = [sep,head,comptime,compmach,gitlog,gitstat,sep].join("\n")
+const gitrev = staticExec("git rev-parse HEAD")
+
+static: echo buildInfo
+
+proc getBuildInfo*(): string =
+  buildInfo
+
+proc getVersion*(): string =
+  gitrev
+
+when isMainModule:
+  echo getBuildInfo()
+  echo getVersion()

From b46d18cb5f2b10fffe640a5db7bcdd9cb9f3055f Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Thu, 21 Nov 2024 14:04:44 -0600
Subject: [PATCH 06/12] allow setting of hwloc library

---
 src/hwloc/capi.nim | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/hwloc/capi.nim b/src/hwloc/capi.nim
index 51cfa6f2..f104dcbc 100644
--- a/src/hwloc/capi.nim
+++ b/src/hwloc/capi.nim
@@ -48,9 +48,10 @@
 ##
 
 when hostOs == "macosx":
-  {.pragma: hwloc, dynlib:"libhwloc.dylib".}
+  const hwloclib {.strDefine.} = "libhwloc.dylib"
 else:
-  {.pragma: hwloc, dynlib:"libhwloc.so".}
+  const hwloclib {.strDefine.} = "libhwloc.so"
+{.pragma: hwloc, dynlib:hwloclib.}
 
 type
   hwloc_bitmap_s {.bycopy.} = object

From e5ed8b0a89db86101904a8cc6d1a04d9aca1f90b Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Sat, 23 Nov 2024 16:14:14 -0600
Subject: [PATCH 07/12] small update to simd types

---
 src/simd/simdX86Ops.nim   |  8 ++++----
 src/simd/simdX86Types.nim | 39 +++++++++++++++++++++++++++++++++------
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/simd/simdX86Ops.nim b/src/simd/simdX86Ops.nim
index 4edd4375..5f6d1790 100644
--- a/src/simd/simdX86Ops.nim
+++ b/src/simd/simdX86Ops.nim
@@ -17,22 +17,22 @@ template int2mask*(T: typedesc[m512], i: SomeInteger): mmask16 = cvtu32_mask16(u
 proc `[]=`*(r:var m128; i:SomeInteger; x:SomeNumber) {.alwaysInline.} =
   mixin toArray
   var a = toArray(r)
-  a[i] := x
+  a[i] = float32 x
   assign(r, a)
 proc `[]=`*(r:var m128d; i:SomeInteger; x:SomeNumber) {.alwaysInline.} =
   mixin toArray
   var a = toArray(r)
-  a[i] := x
+  a[i] = float x
   assign(r, a)
 proc `[]=`*(r:var m256; i:SomeInteger; x:SomeNumber) {.alwaysInline.} =
   mixin toArray
   var a = toArray(r)
-  a[i] := x
+  a[i] = float32 x
   assign(r, a)
 proc `[]=`*(r:var m256d; i:SomeInteger; x:SomeNumber) {.alwaysInline.} =
   mixin toArray
   var a = toArray(r)
-  a[i] := x
+  a[i] = float x
   assign(r, a)
 #proc `[]=`*(r:var m256d; i:SomeInteger; x:SomeNumber) {.alwaysInline.} =
 #  var a {.noInit.}: m256d
diff --git a/src/simd/simdX86Types.nim b/src/simd/simdX86Types.nim
index bed968b4..f1b001e2 100644
--- a/src/simd/simdX86Types.nim
+++ b/src/simd/simdX86Types.nim
@@ -2,25 +2,52 @@ import simdWrap
 export simdWrap
 
 {.pragma: imm, header:"immintrin.h".}
+{.pragma: imms, header:"immintrin.h", incompleteStruct.}
+type
+  m64*   {.importc: "__m64"  , imms.} = object
+  m128*  {.importc: "__m128" , imms.} = object
+  m128d* {.importc: "__m128d", imms.} = object
+  m128i* {.importc: "__m128i", imms.} = object
+  m128h* = distinct int64
+  m256*  {.importc: "__m256" , imms.} = object
+  m256d* {.importc: "__m256d", imms.} = object
+  m256i* {.importc: "__m256i", imms.} = object
+  m256h* = distinct m128i
+  m512*  {.importc: "__m512" , imms.} = object
+  m512d* {.importc: "__m512d", imms.} = object
+  m512i* {.importc: "__m512i", imms.} = object
+  m512h* = distinct m256i
+  mmask8*  {.importc: "__mmask8" , imms.} = object
+  mmask16* {.importc: "__mmask16", imms.} = object
+  mmask32* {.importc: "__mmask32", imms.} = object
+  mmask64* {.importc: "__mmask64", imms.} = object
+#[
+{.pragma: imm, header:"immintrin.h", incompleteStruct.}
 type
   m64*   {.importc: "__m64"  , imm.} = object
-  m128*  {.importc: "__m128" , imm.} = object
-  m128d* {.importc: "__m128d", imm.} = object
+    a: array[2,float32]
+  m128*  {.importc: "__m128" , imm.} = distinct array[4,float32]
+  m128d* {.importc: "__m128d", imm.} = distinct array[2,float64]
   m128i* {.importc: "__m128i", imm.} = object
+    a: array[4,int32]
   m128h* = distinct int64
   m256*  {.importc: "__m256" , imm.} = object
-  m256d* {.importc: "__m256d", imm.} = object
+    a: array[8,float32]
+  m256d* {.importc: "__m256d", imm.} = distinct array[4,float64]
   m256i* {.importc: "__m256i", imm.} = object
+    a: array[8,int32]
   m256h* = distinct m128i
-  m512*  {.importc: "__m512" , imm.} = object
-  m512d* {.importc: "__m512d", imm.} = object
-  m512i* {.importc: "__m512i", imm.} = object
+  m512*  {.importc: "__m512" , imm.} = distinct array[16,float32]
+  m512d* {.importc: "__m512d", imm.} = distinct array[8,float64]
+  m512i* {.importc: "__m512i", imm.} = distinct array[16,int32]
   m512h* = distinct m256i
   mmask8*  {.importc: "__mmask8" , imm.} = object
   mmask16* {.importc: "__mmask16", imm.} = object
   mmask32* {.importc: "__mmask32", imm.} = object
   mmask64* {.importc: "__mmask64", imm.} = object
+]#
 
+type
   SimdX86S* = m64 | m128 | m256 | m512
   SimdX86D* = m128d | m256d | m512d
   SimdX86* = SimdX86S | SimdX86D

From 01128e27c9d309b2b37953794558d4b2e78d8e50 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Mon, 2 Dec 2024 23:14:33 -0600
Subject: [PATCH 08/12] include compile-time parameters in help small update to
 random generators

---
 src/base/params.nim         | 57 ++++++++++++++++++++++++++-----------
 src/base/stdUtils.nim       |  7 +++++
 src/base/threading.nim      |  2 +-
 src/bench/benchStagProp.nim |  3 ++
 src/layout/layoutX.nim      | 13 +++++++--
 src/rng/distribution.nim    |  2 +-
 src/rng/generator.nim       |  6 ++++
 src/rng/milcrng.nim         | 18 ++++++++----
 src/rng/mrg32k3a.nim        | 12 ++++++--
 9 files changed, 90 insertions(+), 30 deletions(-)

diff --git a/src/base/params.nim b/src/base/params.nim
index dd779a1b..7f896e3a 100644
--- a/src/base/params.nim
+++ b/src/base/params.nim
@@ -1,14 +1,12 @@
 import macros, os, strUtils, tables, algorithm, seqUtils
-# Note: may get imported before 'echo' is redefined
-#       uses of 'echo' should be in a template with 'mixin echo'
 
 # TODO
 # include CT in saveParams RT
-# include CT in help
 # check used
 # only write from rank 0
 # only read from rank 0
 # add way to represent empty seq/string (versus unset/unknown)
+# sort paramHelp output by file and option name
 
 type
   ParamListT = object  # used to store set parameters from cmd line or file
@@ -28,7 +26,8 @@ var loadParamsCommand {.compileTime.} = "loadParams"  # normally not changed
 var params: Table[string,ParamListT]  # store params during setup
 # param info generated at compile time, as setParam commands are compiled
 var paramInfoCT {.compileTime.} = newSeq[ParamObj](0)
-var paramInfoCTRT = newSeq[ParamObj](0)  # conversion of paramInfoCT to RT variable
+#var paramInfoCTRT = newSeq[ParamObj](0)  # conversion of paramInfoCT to RT variable
+var paramInfoCTRT: seq[ParamObj]  # conversion of paramInfoCT to RT variable
 # param info generated at run time, as setParam commands executed (using set values)
 var paramInfo = newSeq[ParamObj](0)
 
@@ -218,13 +217,16 @@ proc saveParams(x: seq[ParamObj], fn: string, loc: string, thisfile: string) =
 proc addParam(s,r: string, c: string = "", ii: IiType) =
   paramInfo.set(s, r, c, ii.filename, ii.line)
 
-proc addComment(s,c:string):string =
+let spcName = "                "
+let spcValue = "                                   #"
+let spcLoc = "                                            ## "
+proc addComment(s,c,spc:string):string =
   result = s
   if c.len>0:
     let
-      spc = "                                 ## "
       m = min(s.len, spc.len-8)
     result &= spc[m..^1] & c
+proc addComment(s,c:string):string = addComment(s,c,spcValue)
 
 proc echoParamsX*(warnUnknown=false):string =
   result = "Params:\n"
@@ -243,6 +245,8 @@ template echoParams*(warnUnknown=false) =
   echo echoParamsX(warnUnknown)
 
 proc paramHelp*(p:string = ""):string =
+  #echo paramInfo
+  #echo paramInfoCTRT
   result = "Usage:\n  " & getAppFileName()
   var t: ParamObj
   var i = paramInfo.find p
@@ -256,10 +260,21 @@ proc paramHelp*(p:string = ""):string =
     result &= addComment(" -" & p & ":" & t.value & " (current value)", t.comment)
   else:
     result &= " -OPTION:VALUE ...\nAvailable OPTIONs and current VALUEs:"
-    let spc = "                "
-    for t in paramInfo:
+    template p(t) =
       let nm = t.name
-      result &= "\n    " & (nm & spc[min(spc.len-1,nm.len)..^1] & " : " & t.value).addComment(t.comment)
+      var s = nm & spcName[min(spcName.len-1,nm.len)..^1] & " : " & t.value
+      let f = t.file.splitfile[1]
+      let c = f & "," & $t.line
+      s = s.addComment(c,spcValue)
+      s = s.addComment(t.comment,spcLoc)
+      result &= "\n  " & s
+    var paramnames = newSeq[string](0)
+    for t in paramInfo:
+      paramnames.add t.name
+      p(t)
+    for t in paramInfoCTRT:
+      if t.name notin paramnames:  # FIXME: should check location too
+        p(t)
 
 template cnvnone(x:typed):untyped = x
 template makeTypeParam(name,typ,deflt,cnvrt: untyped): untyped {.dirty.} =
@@ -375,15 +390,15 @@ template processSaveParams*(index = -1) =
     let ii = instantiationInfo(index, fullPaths=true)
     saveParams(paramInfo, saveValue, ii.filename & ":" & $ii.line, ii.filename)
 
+var makeParamInfoCTRT: proc()
 template processMakeParamInfoCTRT* =
-  proc makeParamInfoCTRT() {.inject.}
-  makeParamInfoCTRT()
+  if not isNil makeParamInfoCTRT: makeParamInfoCTRT()
 
 template installStandardParams*(idx= -3) =
   installSaveParams(index=idx)
   installLoadParams(index=idx)
   installHelpParam(index=idx)
-  processMakeParamInfoCTRT()
+  #processMakeParamInfoCTRT()
 
 # write param file at compile time
 macro writeParamFileX*(filename: static string, ii: static IiType) =
@@ -405,10 +420,14 @@ macro makeParamInfoCTRTX*(): auto =
     let l = newLit t.line
     result.add quote do:
       paramInfoCTRT.add ParamObj(name:`n`,value:`v`,comment:`c`,file:`f`,line:`l`)
+template finalizeParams* =
+  proc makeParamInfoCTRTImpl(): bool =
+    makeParamInfoCTRTX()
+    true
+  #makeParamInfoCTRT = makeParamInfoCTRTImpl
+  let dum {.global.} = makeParamInfoCTRTImpl()
 template writeParamFile*(filename: static string = "") =
   writeParamFileX(filename, instantiationInfo(fullPaths=true))
-  proc makeParamInfoCTRT() {.inject.} =
-    makeParamInfoCTRTX()
 
 template assertParam*(p:auto, f:auto) =
   if not f p:
@@ -439,8 +458,10 @@ template CLIset*(p:typed, n:untyped, prefix = "") =
     discard
 
 when isMainModule:
-  import qex, paramtest
+  import qex
   qexInit()
+  installHelpParam("h")
+  processHelpParam()
 
   letParam:
     bf = false
@@ -462,11 +483,15 @@ when isMainModule:
     fa1 = @[1.0,1,1,1]
     fax = if true: @[2.0,2,2,2] else: @[3.0,3,3,3]
 
-  installHelpParam("h")
   echoParams()
 
   defaultSetup()
+  proc paramTest* =
+    var i = intParam("i", 0, "Test intParam")
+    var f = floatParam("f", 0.0, "Test floatParam")
+    echo i, f
   paramTest()
 
   writeParamFile("")
   qexFinalize()
+  finalizeParams()
diff --git a/src/base/stdUtils.nim b/src/base/stdUtils.nim
index 5da5f26d..85bc4f3d 100644
--- a/src/base/stdUtils.nim
+++ b/src/base/stdUtils.nim
@@ -56,6 +56,13 @@ proc `|`*(f: float, d: int): string =
 template `|-`*(x:SomeNumber, y: int): auto =
   x | -y
 
+proc values*(e: typedesc[enum]): string =
+  result = ""
+  var sep = ""
+  for v in e:
+    result &= sep & $v
+    sep = " "
+
 proc indexOf*[T](x: openArray[T], y: auto): int =
   let n = x.len
   while result<n and x[result]!=y: inc result
diff --git a/src/base/threading.nim b/src/base/threading.nim
index e4acf106..58689754 100644
--- a/src/base/threading.nim
+++ b/src/base/threading.nim
@@ -318,7 +318,7 @@ template threadSum01A*[T](a: T) =
 template threadSum01B*[T](a: T) =
   ## sum value with result on thread 0, barrier version
   block:
-    tic("threadSum01")
+    tic("threadSum01B")
     if threadNum!=0:
       threadLocals.share[threadNum].p = a.addr
     threadBarrier()
diff --git a/src/bench/benchStagProp.nim b/src/bench/benchStagProp.nim
index 4a518367..14aea877 100644
--- a/src/bench/benchStagProp.nim
+++ b/src/bench/benchStagProp.nim
@@ -9,6 +9,8 @@ import os
 import rng
 
 qexInit()
+installHelpParam()
+processHelpParam()
 #var defaultLat = [4,4,4,4]
 #var defaultLat = [8,8,8,8]
 var defaultLat = @[8,8,8,8]
@@ -81,3 +83,4 @@ s3.solve(v2, v1, mass, sp)
 echoProf()
 
 qexFinalize()
+finalizeParams()
diff --git a/src/layout/layoutX.nim b/src/layout/layoutX.nim
index dc514b6c..239adf0b 100644
--- a/src/layout/layoutX.nim
+++ b/src/layout/layoutX.nim
@@ -242,14 +242,21 @@ proc coord*(l: Layout, coord: ptr cint, ri: tuple[rank,index:cint]) =
     #layoutCoordQ(l.lq.addr, cast[ptr cArray[cint]](coord), li.addr)
     layoutCoordQ(l.lq.addr, toOpenArray(ca,0,l.nDim-1), li.addr)
 
+# from physical index to local rank and index
 proc rankIndex*(lo: Layout, lex: int): tuple[rank,index:int] =
   let n = lo.nDim
-  var c = newSeq[cint](n)
   var k = lex
   for i in 0..<n:
-    c[i] = (k mod lo.physGeom[i]).cint
+    lo.temp[i] = (k mod lo.physGeom[i]).cint
     k = k div lo.physGeom[i]
-  rankIndex(lo, c)
+  rankIndex(lo, lo.temp)
+
+proc physIndex*(lo: Layout, rank,index: int): int =
+  lo.coord(lo.temp, rank, index)
+  let n = lo.nDim
+  result = 0
+  for i in countdown(n-1,0):
+    result = (result*lo.physGeom[i]) + lo.temp[i]
 
 proc rankFromRankCoords*(l: Layout, coords: ptr cArray): int =
   for i in 0..<l.nDim:
diff --git a/src/rng/distribution.nim b/src/rng/distribution.nim
index d1399e38..f09b2faa 100644
--- a/src/rng/distribution.nim
+++ b/src/rng/distribution.nim
@@ -69,7 +69,7 @@ func generate*[T:RandomGenerator](d: UniformRealOpenOpen, g: var T): float =
 ########################## normal #####################################
 
 type
-  NormalAlgorithm = enum
+  NormalAlgorithm* = enum
     naBoxMuller, naPolar, naCDF
   NormalDistribution* = object
     mean: float
diff --git a/src/rng/generator.nim b/src/rng/generator.nim
index c4035eed..e0f36580 100644
--- a/src/rng/generator.nim
+++ b/src/rng/generator.nim
@@ -1,3 +1,5 @@
+import field
+
 type
   RandomGeneratorBase* {.inheritable,pure.} = object
     count: int
@@ -50,6 +52,10 @@ func skip*(e: var Milc6Generator, c = 1) =
   e.incCount(c)
   e.state.skip(c)
 
+func seed*(x: Field[1, Milc6Generator]; sed: auto) =
+  for e in x:
+    x[e].seed(sed)
+
 ########################## MRG32k3a #############################
 import mrg32k3a
 
diff --git a/src/rng/milcrng.nim b/src/rng/milcrng.nim
index e887d49f..276fb45a 100644
--- a/src/rng/milcrng.nim
+++ b/src/rng/milcrng.nim
@@ -54,9 +54,11 @@ const
   SCALE = 1.0'f32 / 0x01000000.float32
   #SCALE1 = 1.0'f32 / MASK.float32
 
-#template maxInt*(x: RngMilc6): int = int(MASK)
+template maxInt*(x: RngMilc6): int = int(MASK)
 template maxInt*(x: typedesc[RngMilc6]): int = int(MASK)
-#template numInts*(x: RngMilc6): int = int(NUMINTS)
+template high*(x: RngMilc6): uint = uint(MASK)
+template high*(x: typedesc[RngMilc6]): uint = uint(MASK)
+template numInts*(x: RngMilc6): int = int(NUMINTS)
 template numInts*(x: typedesc[RngMilc6]): int = int(NUMINTS)
 
 when defined(FUELCompat):
@@ -115,7 +117,7 @@ proc seed*(prn: var RngMilc6; sed,index: auto) =
   QMP_broadcast(ss.addr, sizeof(ss).csize_t)
   seedIndep(prn, ss, index)
 
-proc next(prn: var RngMilc6): uint32 {.inline.} =
+proc nextI(prn: var RngMilc6): uint32 {.inline.} =
   ## internal routine to return next value
   let t = (((prn.r5 shr 7) or (prn.r6 shl 17)) xor
       ((prn.r4 shr 1) or (prn.r5 shl 23))) and MASK
@@ -132,16 +134,20 @@ proc next(prn: var RngMilc6): uint32 {.inline.} =
 
 func skip*(prn: var RngMilc6, c = 1) =
   for i in 1..c:
-    discard prn.next
+    discard prn.nextI
 
 proc integer*(prn: var RngMilc6): int =
   ## Return random integer from 0 to maxInt
-  result = int prn.next
+  result = int prn.nextI
+
+proc next*(prn: var RngMilc6): uint =
+  ## Return random integer from 0 to maxInt
+  result = uint prn.nextI
 
 proc uniform*(prn: var RngMilc6): float32 =
   ## Return random number uniform on [0,1)
   ## The choice of including endpoints may vary among different RNGs
-  let i = prn.next
+  let i = prn.nextI
   result = SCALE * float32(i)
 
 #var QLA_use_milc_gaussian* = false
diff --git a/src/rng/mrg32k3a.nim b/src/rng/mrg32k3a.nim
index 8bdd3741..f8da41e2 100644
--- a/src/rng/mrg32k3a.nim
+++ b/src/rng/mrg32k3a.nim
@@ -77,6 +77,8 @@ when a2sq[76]!=[[1511326704u32, 3759209742u32, 1610795712u32], [4292754251u32, 1
 
 #template maxInt*(x: MRG32k3a): int = int 4294967086
 template maxInt*(x: typedesc[MRG32k3a]): int = int 4294967086
+template high*(x: typedesc[MRG32k3a]): uint = uint 4294967086
+template high*(x: MRG32k3a): uint = uint 4294967086
 #template numInts*(x: MRG32k3a): int = 4294967087
 template numInts*(x: typedesc[MRG32k3a]): int = int 4294967087
 
@@ -152,7 +154,7 @@ proc next0(prn: var MRG32k3a): float {.inline.} =
     result = p1 - p2
 ]#
 
-proc next(prn: var MRG32k3a): int {.inline.} =
+proc nextI(prn: var MRG32k3a): int {.inline.} =
   ## Return random integer uniform on [1,m1]
   const
     a12i = int a12
@@ -185,7 +187,11 @@ proc next(prn: var MRG32k3a): int {.inline.} =
 
 proc integer*(prn: var MRG32k3a): int =
   ## Return random integer from 0 to maxInt
-  result = int(prn.next) - 1
+  result = int(prn.nextI) - 1
+
+proc next*(prn: var MRG32k3a): uint =
+  ## Return random integer from 0 to maxInt
+  result = uint(prn.nextI - 1)
 
 #[
 proc uniform*(prn:var MRG32k3a): float =
@@ -214,7 +220,7 @@ proc uniform*(prn:var MRG32k3a): float =
 ]#
 proc uniform*(prn: var MRG32k3a): float =
   ## Return random number uniform on (0,1)
-  result = norm * prn.next.float
+  result = norm * prn.nextI.float
 
 proc gaussian*(prn: var MRG32k3a): float =
   ## Gaussian normal deviate

From 235ed88306aab6c0236c7cd6394a20bc2aa00d60 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Thu, 5 Dec 2024 18:14:05 -0600
Subject: [PATCH 09/12] added fat7 derivative

---
 src/gauge/fat7l.nim      |   5 +
 src/gauge/fat7lderiv.nim | 345 +++++++++++++++++++++++++++++++++++++++
 src/gauge/hypsmear.nim   |  37 +----
 src/gauge/smearutil.nim  |  50 ++++++
 4 files changed, 401 insertions(+), 36 deletions(-)
 create mode 100644 src/gauge/fat7lderiv.nim
 create mode 100644 src/gauge/smearutil.nim

diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim
index 2aae3561..9fdaea10 100644
--- a/src/gauge/fat7l.nim
+++ b/src/gauge/fat7l.nim
@@ -13,6 +13,11 @@ type Fat7lCoefs* = object
   sevenStaple*: float
   lepage*: float
 
+#type Fat7lState*[I,O] = object
+#  coefs*: Fat7lCoefs
+#  in*: I
+#  out*: O
+
 proc `$`*(c: Fat7lCoefs): string =
   result  = "oneLink:     " & $c.oneLink & "\n"
   result &= "threeStaple: " & $c.threeStaple & "\n"
diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim
new file mode 100644
index 00000000..f77b3095
--- /dev/null
+++ b/src/gauge/fat7lderiv.nim
@@ -0,0 +1,345 @@
+import fat7l, smearutil
+import base/profile, layout
+
+let STAPLE_FLOPS = 3*198+216+18
+let SIDE_FORCE_FLOPS = 7*198+3*216+3*18
+
+#[
+proc staple(auto out, auto in0, auto link0, int mu, int nu) =
+#define link     ftmp0[0]
+#define linkmu   ftmp[0][mu]
+#define in       ftmp0[1]
+#define innu     ftmp[1][nu]
+#define linkin   mtmp[0]
+#define back     btmp0[0]
+#define backnu   btmp[0][nu]
+#define linkinnu mtmp[1]
+
+  QDP_M_eq_M(link, link0, QDP_all);
+  QDP_M_eq_sM(linkmu, link, QDP_neighbor[mu], QDP_forward, QDP_all);
+  QDP_M_eq_M(in, in0, QDP_all);
+  QDP_M_eq_sM(innu, in, QDP_neighbor[nu], QDP_forward, QDP_all);
+  QDP_M_eq_Ma_times_M(linkin, link, in, QDP_all);
+  QDP_M_eq_M_times_M(back, linkin, linkmu, QDP_all);
+  QDP_M_eq_sM(backnu, back, QDP_neighbor[nu], QDP_backward, QDP_all);
+  QDP_M_eq_M_times_M(linkinnu, link, innu, QDP_all);
+  QDP_discard_M(innu);
+  QDP_M_peq_M_times_Ma(out, linkinnu, linkmu, QDP_all);
+  QDP_discard_M(linkmu);
+  QDP_M_peq_M(out, backnu, QDP_all);
+  QDP_discard_M(backnu);
+#define STAPLE_FLOPS (3*198+216+18)
+
+#undef link
+#undef linkmu
+#undef in
+#undef innu
+#undef linkin
+#undef back
+#undef backnu
+#undef linkinnu
+}
+]#
+
+#[
+static void
+side_force(QDP_ColorMatrix *force, QDP_ColorMatrix *bot0, QDP_ColorMatrix *side0,
+           QDP_ColorMatrix *top0, int mu, int nu, QDP_ColorMatrix *stpl)
+{
+#define side     ftmp0[0]
+#define sidemu   ftmp[0][mu]
+#define top      ftmp0[1]
+#define topnu    ftmp[1][nu]
+#define bot      ftmp0[2]
+#define botnu    ftmp[2][nu]
+#define sidebot  mtmp[0]
+#define sidetop  mtmp[1]
+#define topnusidebot  btmp0[0]
+#define fbmu          btmp[0][mu]
+#define botnusidetop  btmp0[1]
+#define fmbmu         btmp[1][mu]
+#define sidebotsidemu btmp0[2]
+#define stm           btmp[2][nu]
+#define botnusidemu   mtmp[2]
+#define botsidemu     mtmp[3]
+
+  // force += bot * sidemu * topnu+
+  // force -= bot-mu+ * side-mu * topnu-mu
+  // -= top <-> bot
+  // stpl += side * botnu * sidemu+
+  // stpl += side-nu+ * bot-nu * sidemu-nu
+  QDP_M_eq_M(side, side0, QDP_all);
+  QDP_M_eq_sM(sidemu, side, QDP_neighbor[mu], QDP_forward, QDP_all);
+  QDP_M_eq_M(top, top0, QDP_all);
+  QDP_M_eq_sM(topnu, top, QDP_neighbor[nu], QDP_forward, QDP_all);
+  QDP_M_eq_M(bot, bot0, QDP_all);
+  QDP_M_eq_sM(botnu, bot, QDP_neighbor[nu], QDP_forward, QDP_all);
+  QDP_M_eq_Ma_times_M(sidebot, side, bot, QDP_all);
+  QDP_M_eq_Ma_times_M(sidetop, side, top, QDP_all);
+  QDP_M_eq_Ma_times_M(topnusidebot, topnu, sidebot, QDP_all);
+  QDP_M_eq_sM(fbmu, topnusidebot, QDP_neighbor[mu], QDP_backward, QDP_all);
+  QDP_M_eq_Ma_times_M(botnusidetop, botnu, sidetop, QDP_all);
+  QDP_M_eq_sM(fmbmu, botnusidetop, QDP_neighbor[mu], QDP_backward, QDP_all);
+  QDP_M_eq_M_times_M(sidebotsidemu, sidebot, sidemu, QDP_all);
+  QDP_M_eq_sM(stm, sidebotsidemu, QDP_neighbor[nu], QDP_backward, QDP_all);
+  QDP_M_eq_M_times_Ma(botnusidemu, botnu, sidemu, QDP_all);
+  QDP_discard_M(botnu);
+  QDP_M_peq_M_times_M(stpl, side, botnusidemu, QDP_all);
+ //QDP_M_meq_M_times_Ma(force, top, botnusidemu, QDP_all);
+  QDP_M_peq_M_times_Ma(force, top, botnusidemu, QDP_all);
+  QDP_M_eq_M_times_M(botsidemu, bot, sidemu, QDP_all);
+  QDP_discard_M(sidemu);
+  QDP_M_peq_M_times_Ma(force, botsidemu, topnu, QDP_all);
+  QDP_discard_M(topnu);
+  //QDP_M_meq_Ma(force, fbmu, QDP_all);
+  QDP_M_peq_Ma(force, fbmu, QDP_all);
+  QDP_discard_M(fbmu);
+  QDP_M_peq_Ma(force, fmbmu, QDP_all);
+  QDP_discard_M(fmbmu);
+  QDP_M_peq_M(stpl, stm, QDP_all);
+  QDP_discard_M(stm);
+#define SIDE_FORCE_FLOPS (7*198+3*216+3*18)
+
+#undef side
+#undef sidemu
+#undef top
+#undef topnu
+#undef bot
+#undef botnu
+#undef sidebot
+#undef sidetop
+#undef topnusidebot
+#undef fbmu
+#undef botnusidetop
+#undef fmbmu
+#undef sidebotsidemu
+#undef stm
+#undef botnusidemu
+#undef botsidemu
+}
+]#
+
+proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto,
+                coef: Fat7lCoefs, mid: auto) =
+  tic()
+  var nflops = 0
+  let coefL = coef.lepage
+  let coef1 = coef.oneLink - 6*coefL
+  let coef3 = coef.threeStaple
+  let coef5 = coef.fiveStaple
+  let coef7 = coef.sevenStaple
+  let have5 = (coef5!=0.0) or (coef7!=0.0) or (coefL!=0.0)
+  let have3 = (coef3!=0.0) or have5
+  type lcm = type(gauge[0])
+  let lo = gauge[0].l
+  proc newlcm: lcm = result.new(gauge[0].l)
+  var
+    stpl3: array[4,lcm]
+    mid5: array[4,lcm]
+    stpl5,mid3: lcm
+    s1: array[4,array[4,Shifter[lcm,type(gauge[0][0])]]]
+    s1a,s1b: array[4,Shifter[lcm,type(gauge[0][0])]]
+    tm1,tm2: lcm
+    sm1: array[4,Shifter[lcm,type(gauge[0][0])]]
+  if have3:
+    if have5:
+      for mu in 0..3:
+        stpl3[mu] = newlcm()
+        mid5[mu] = newlcm()
+        s1b[mu] = newShifter(gauge[0], mu, 1)
+    stpl5 = newlcm()
+    mid3 = newlcm()
+    for mu in 0..3:
+      for nu in 0..3:
+        if nu!=mu:
+          s1[mu][nu] = newShifter(gauge[mu], nu, 1)
+      s1a[mu] = newShifter(gauge[0], mu, 1)
+      sm1[mu] = newShifter(gauge[0], mu, -1)
+    tm1 = newlcm()
+    tm2 = newlcm()
+    threads:
+      for mu in 0..<4:
+        for nu in 0..<4:
+          if nu!=mu:
+            discard s1[mu][nu] ^* gauge[mu]
+  for sig in 0..3:
+    if have5:
+      for mu in 0..3:
+        if mu==sig: continue
+        #QDP_M_eq_zero(stpl3[mu], QDP_all);
+        stpl3[mu] := 0
+        #staple(stpl3[mu], gauge[sig], gauge[mu], sig, mu);
+        symStaple(stpl3[mu], 1.0, gauge[mu], gauge[sig],
+                  s1[mu][sig], s1[sig][mu], tm1, sm1[mu])
+        #QDP_M_eq_zero(mid5[mu], QDP_all);
+        mid5[mu] := 0
+        nflops += STAPLE_FLOPS
+      for rho in 0..3:
+        if rho==sig: continue
+        #QDP_M_eq_zero(stpl5, QDP_all);
+        stpl5 := 0
+        if coef7!=0.0:
+          for mu in 0..3:
+            if  mu==sig or mu==rho: continue
+            for nu in 0..3:
+              if nu==sig or nu==rho or nu==mu: continue
+              #staple(stpl5, stpl3[mu], gauge[nu], sig, nu);
+              discard s1a[nu] ^* stpl3[mu]
+              symStaple(stpl5, 1.0, gauge[nu], stpl3[mu],
+                        s1[nu][sig], s1a[nu], tm1, sm1[nu])
+              nflops += STAPLE_FLOPS
+          #QDP_M_eq_r_times_M(stpl5, &coef7, stpl5, QDP_all);
+          stpl5 *= coef7
+          nflops += 18
+        #QDP_M_eq_zero(mid3, QDP_all);
+        mid3 := 0
+        if coefL!=0.0:
+          #QDP_M_peq_r_times_M(stpl5, &coefL, stpl3[rho], QDP_all);
+          stpl5 += coefL * stpl3[rho]
+          nflops += 36
+        if coefL!=0.0 or coef7!=0.0:
+          #side_force(deriv[rho], mid[sig], gauge[rho], stpl5, sig, rho, mid3);
+          discard s1a[rho] ^* stpl5
+          discard s1b[rho] ^* mid[sig]
+          symStapleDeriv(deriv[rho], mid3,
+                         gauge[rho], stpl5, s1[rho][sig], s1a[rho],
+                         mid[sig], s1b[rho], tm1, tm2, sm1[rho], sm1[sig])
+          nflops += SIDE_FORCE_FLOPS
+        if coefL!=0.0:
+          #QDP_M_peq_r_times_M(mid5[rho], &coefL, mid3, QDP_all);
+          mid5[rho] += coefL * mid3
+          nflops += 36
+        #//QDP_M_eqm_r_times_M(mid3, &coef7, mid3, QDP_all);
+        #QDP_M_eq_r_times_M(mid3, &coef7, mid3, QDP_all);
+        mid3 *= coef7
+        #QDP_M_peq_r_times_M(mid3, &coef5, mid[sig], QDP_all);
+        mid3 += coef5 * mid[sig]
+        nflops += 18+36
+        for mu in 0..3:
+          if mu==sig or mu==rho: continue
+          for nu in 0..3:
+            if nu==sig or nu==rho or nu==mu: continue
+            #side_force(deriv[mu], mid3, gauge[mu], stpl3[nu], sig, mu, mid5[nu]);
+            discard s1a[mu] ^* stpl3[nu]
+            discard s1b[mu] ^* mid3
+            symStapleDeriv(deriv[mu], mid5[nu],
+                           gauge[mu], stpl3[nu], s1[mu][sig], s1a[mu],
+                           mid3, s1b[mu], tm1, tm2, sm1[mu], sm1[sig])
+            nflops += SIDE_FORCE_FLOPS
+
+    if have3:
+      #QDP_M_eq_zero(mid3, QDP_all);
+      mid3 := 0
+      for mu in 0..3:
+        if mu==sig: continue
+        if have5:
+          #//QDP_M_eq_r_times_M_minus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all);
+          #QDP_M_eq_r_times_M_plus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all);
+          stpl5 := coef3*mid[sig] + mid5[mu]
+          nflops += 36
+        else:
+          #QDP_M_eq_r_times_M(stpl5, &coef3, mid[sig], QDP_all);
+          stpl5 := coef3*mid[sig]
+          nflops += 18
+        #side_force(deriv[mu], stpl5, gauge[mu], gauge[sig], sig, mu, mid3);
+        discard s1a[mu] ^* stpl5
+        symStapleDeriv(deriv[mu], mid3,
+                       gauge[mu], gauge[sig], s1[mu][sig], s1[sig][mu],
+                       stpl5, s1a[mu], tm1, tm2, sm1[mu], sm1[sig])
+        nflops += SIDE_FORCE_FLOPS
+      #//QDP_M_meq_M(deriv[sig], mid3, QDP_all);
+      #QDP_M_peq_M(deriv[sig], mid3, QDP_all);
+      deriv[sig] += mid3
+      nflops += 18
+    if coef1!=0.0:
+      #QDP_M_peq_r_times_M(deriv[sig], &coef1, mid[sig], QDP_all);
+      deriv[sig] += coef1 * mid[sig]
+      nflops += 36
+
+  if coefL!=0.0:
+    # fix up Lepage term
+    let fixL = -coefL
+    for mu in 0..3:
+      #QDP_M_eq_zero(ftmp0[0], QDP_all);
+      tm1 := 0
+      for nu in 0..3:
+        if nu==mu: continue
+        #QDP_M_eq_Ma_times_M(btmp0[0], mid[nu], gauge[nu], QDP_all);
+        tm2 := mid[nu].adj * gauge[nu]
+        #QDP_M_eq_sM(btmp[0][nu], btmp0[0], QDP_neighbor[nu], QDP_backward, QDP_all);
+        discard sm1[nu] ^* tm2
+        #QDP_M_eq_M_times_Ma(stpl5, mid[nu], gauge[nu], QDP_all);
+        stpl5 := mid[nu] * gauge[nu].adj
+        #QDP_M_meq_M(stpl5, btmp[0][nu], QDP_all);
+        stpl5 -= sm1[nu].field
+        #QDP_discard_M(btmp[0][nu]);
+        #QDP_M_peq_M(ftmp0[0], stpl5, QDP_all);
+        #QDP_M_peq_Ma(ftmp0[0], stpl5, QDP_all);
+        tm1 += stpl5 + stpl5.adj
+      #QDP_M_eq_sM(ftmp[0][mu], ftmp0[0], QDP_neighbor[mu], QDP_forward, QDP_all);
+      discard s1a[mu] ^* tm1
+      #QDP_M_eq_M_times_M(stpl5, ftmp0[0], gauge[mu], QDP_all);
+      stpl5 := tm1 * gauge[mu]
+      #QDP_M_meq_M_times_M(stpl5, gauge[mu], ftmp[0][mu], QDP_all);
+      stpl5 -= gauge[mu] * s1a[mu].field
+      #QDP_discard_M(ftmp[0][mu]);
+      #QDP_M_peq_r_times_M(deriv[mu], &fixL, stpl5, QDP_all);
+      deriv[mu] += fixL * stpl5
+    nflops += 4*(3*(2*198+3*18)+198+216+36)
+
+  #info->final_sec = QOP_time() - dtime;
+  #info->final_flop = nflops*QDP_sites_on_node;
+  #info->status = QOP_SUCCESS;
+
+when isMainModule:
+  import qex
+  import physics/qcdTypes
+  import gauge
+  qexInit()
+  #var defaultGaugeFile = "l88.scidac"
+  let defaultLat = @[8,8,8,8]
+  defaultSetup()
+  for mu in 0..<g.len: g[mu] := 1
+  #g.random
+
+  var info: PerfInfo
+  var coef: Fat7lCoefs
+  coef.oneLink = 1.0
+  coef.threeStaple = 1.0
+  coef.fiveStaple = 1.0
+  coef.sevenStaple = 1.0
+  coef.lepage = 1.0
+
+  var fl = lo.newGauge()
+  var fl2 = lo.newGauge()
+  var dfl = lo.newGauge()
+  var g2 = lo.newGauge()
+  var dg = lo.newGauge()
+  var fd = lo.newGauge()
+  for mu in 0..<dg.len:
+    dg[mu] := 0.00001 * g[mu]
+    g2[mu] := g[mu] + dg[mu]
+    fd[mu] := 0
+
+  echo g.plaq
+  echo g2.plaq
+  makeImpLinks(info, fl, g, coef)
+  makeImpLinks(info, fl2, g2, coef)
+  echo fl.plaq
+  echo fl2.plaq
+  #echo ll.plaq
+  #echo pow(1.0,4)/6.0
+  #echo pow(1.0+6.0,4)/6.0
+  #echo pow(1.0+6.0+6.0*4.0,4)/6.0
+  #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0,4)/6.0
+  #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0+6.0,4)/6.0
+
+  fat7lderiv(info, g, fd, coef, dg)
+  for mu in 0..3:
+    dfl[mu] := fl2[mu] - fl[mu]
+    echo dfl[mu].norm2
+    echo fd[mu].norm2
+    dfl[mu] -= fd[mu]
+    echo dfl[mu].norm2
+
+  qexFinalize()
diff --git a/src/gauge/hypsmear.nim b/src/gauge/hypsmear.nim
index 14f792fd..4f5930b0 100644
--- a/src/gauge/hypsmear.nim
+++ b/src/gauge/hypsmear.nim
@@ -3,6 +3,7 @@ import layout
 import gauge
 #import strUtils
 import fat7l
+import smearutil
 
 export PerfInfo
 
@@ -24,42 +25,6 @@ proc `$`*(c: HypCoefs): string =
   result &= "  alpha3: " & $c.alpha3 & "\n"
   result &= "}"
 
-proc symStaple(s: auto, alp: float, g1: auto, g2: auto,
-               s1: auto, s2: auto, tm: auto, sm: auto) =
-  tic()
-  mixin adj
-  tm := g1.adj * g2 * s1.field
-  discard sm ^* tm
-  s += alp * ( g1 * s2.field * s1.field.adj )
-  s += alp * sm.field
-  let nc = g1[0].nrows
-  let siteFlops = float(nc*nc*((6*nc+2*(nc-1))*5+4*2))
-  toc("symStaple", flops=siteFlops*g1.l.nSites)
-
-proc symStapleDeriv(f1, f2: auto;  # output
-                    g1, g2: auto; s1, s2: auto;  # same as symStaple
-                    c: auto, s: auto;  # chain and shift
-                    tm1, tm2: auto;  # temporary fields
-                    sm1, sm2: auto;  # shifts
-                   ) =
-  tic()
-  mixin adj
-  # ∪ s.field.adj * g1.adj * g2 * s1.field
-  # ∪† s.field * s1.field.adj * g2.adj * g1
-  # ∩ c.adj * g1 * s2.field * s1.field.adj
-  # ∩† c * s1.field * s2.field.adj * g1.adj
-  tm1 := g1.adj * c * s1.field  # ∩†2  s2
-  tm2 := g2.adj * g1 * s.field  # ∪†1  s1
-  tm2 += c.adj * g1 * s2.field  # ∩3   s1
-  discard sm1 ^* tm1
-  discard sm2 ^* tm2
-  f1 += g2 * s1.field * s.field.adj  # ∪1   g1
-  f1 += c * s1.field * s2.field.adj  # ∩†3  g1
-  f2 += g1 * s.field * s1.field.adj  # ∪†2  g2
-  f2 += sm1.field
-  f1 += sm2.field
-  toc("symStapleDeriv")
-
 template proj(x: auto) =
   for e in x:
     x[e].projectU x[e]
diff --git a/src/gauge/smearutil.nim b/src/gauge/smearutil.nim
new file mode 100644
index 00000000..68105572
--- /dev/null
+++ b/src/gauge/smearutil.nim
@@ -0,0 +1,50 @@
+import base, layout
+
+proc symStaple*(s: auto, alp: float, g1: auto, g2: auto,
+                s1: auto, s2: auto, tm: auto, sm: auto) =
+  # s = alp*[ shift from back mu(g1.adj * g2 * s1.field) + g1 * s2.field * s1.field.adj ]
+  # g1: side link
+  # g2: middle link
+  # s1: g1 shifted from forward g2 direction
+  # s2: g2 shifted from forward g1 direction
+  # tm: temp gauge field
+  # sm: shifter for backwards g1 direction
+  tic()
+  mixin adj
+  tm := g1.adj * g2 * s1.field
+  discard sm ^* tm
+  s += alp * ( g1 * s2.field * s1.field.adj )
+  s += alp * sm.field
+  let nc = g1[0].nrows
+  let siteFlops = float(nc*nc*((6*nc+2*(nc-1))*5+4*2))
+  toc("symStaple", flops=siteFlops*g1.l.nSites)
+
+proc symStapleDeriv*(f1, f2: auto;  # output
+                     g1, g2: auto; s1, s2: auto;  # same as symStaple
+                     c: auto, s: auto;  # chain and shift
+                     tm1, tm2: auto;  # temporary fields
+                     sm1, sm2: auto;  # shifts
+                    ) =
+  # f1: deriv wrt g1
+  # f2: deriv wrt g2
+  # s: c shifted from forward g1 direction
+  # sm1: shifter from backwards g1 direction
+  # sm2: shifter from backwards g2 direction
+  tic()
+  mixin adj
+  # ∪ s.field.adj * g1.adj * g2 * s1.field
+  # ∪† s.field * s1.field.adj * g2.adj * g1
+  # ∩ c.adj * g1 * s2.field * s1.field.adj
+  # ∩† c * s1.field * s2.field.adj * g1.adj
+  tm1 := g1.adj * c * s1.field  # ∩†2  s2
+  tm2 := g2.adj * g1 * s.field  # ∪†1  s1
+  tm2 += c.adj * g1 * s2.field  # ∩3   s1
+  discard sm1 ^* tm1
+  discard sm2 ^* tm2
+  f1 += g2 * s1.field * s.field.adj  # ∪1   g1
+  f1 += c * s1.field * s2.field.adj  # ∩†3  g1
+  f2 += g1 * s.field * s1.field.adj  # ∪†2  g2
+  f2 += sm1.field
+  f1 += sm2.field
+  toc("symStapleDeriv")
+

From 476ef08e4b67de2d86fde81de852bdd36e9620d1 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Fri, 6 Dec 2024 10:40:26 -0600
Subject: [PATCH 10/12] move PerfInfo

---
 src/base/profile.nim     | 16 ++++++++++++++++
 src/gauge/fat7l.nim      | 10 +++++-----
 src/gauge/fat7lderiv.nim |  8 +++++---
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/base/profile.nim b/src/base/profile.nim
index de7ca62e..455e3995 100644
--- a/src/base/profile.nim
+++ b/src/base/profile.nim
@@ -7,6 +7,22 @@ getOptimPragmas()
 
 const noTicToc {.boolDefine.} = false
 
+type PerfInfo* = object
+  count*: int
+  flops*: float
+  secs*: float
+template clear*(pi: var PerfInfo) =
+  pi.count = 0.0
+  pi.flops = 0.0
+  pi.secs = 0.0
+template `+=`*(r: var PerfInfo, x: PerfInfo) =
+  r.count += x.count
+  r.flops += x.flops
+  r.secs += x.secs
+proc `$`*(pi: PerfInfo): string =
+  result = system.`$`(pi)
+  result &= &" {1e-9*pi.flops/pi.secs:.2f} Gflops"
+
 type TicType* = distinct int64
 template getTics*: TicType = TicType(getMonoTime().ticks)
 template nsec*(t:TicType):int64 = int64(t)
diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim
index 9fdaea10..1a110247 100644
--- a/src/gauge/fat7l.nim
+++ b/src/gauge/fat7l.nim
@@ -1,10 +1,6 @@
 import base
 import layout
-#import gauge
-#import strUtils
-
-type PerfInfo* = object
-  flops*: float
+export PerfInfo
 
 type Fat7lCoefs* = object
   oneLink*: float
@@ -168,6 +164,10 @@ proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto,
   #info.final_flop = nflop*QDP_sites_on_node;
   #info.status = QOP_SUCCESS;
   toc()
+  inc info.count
+  info.flops += nflop * gf[0].l.localGeom.prod
+  info.secs += getElapsedTime()
+
 proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto) =
   makeImpLinks(info, fl, gf, coef, fl, gf, 0.0)
 
diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim
index f77b3095..a579e4fc 100644
--- a/src/gauge/fat7lderiv.nim
+++ b/src/gauge/fat7lderiv.nim
@@ -292,9 +292,8 @@ proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto,
   #info->status = QOP_SUCCESS;
 
 when isMainModule:
-  import qex
-  import physics/qcdTypes
-  import gauge
+  import qex, physics/qcdTypes
+  import strformat
   qexInit()
   #var defaultGaugeFile = "l88.scidac"
   let defaultLat = @[8,8,8,8]
@@ -324,7 +323,9 @@ when isMainModule:
   echo g.plaq
   echo g2.plaq
   makeImpLinks(info, fl, g, coef)
+  echo info
   makeImpLinks(info, fl2, g2, coef)
+  echo info
   echo fl.plaq
   echo fl2.plaq
   #echo ll.plaq
@@ -335,6 +336,7 @@ when isMainModule:
   #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0+6.0,4)/6.0
 
   fat7lderiv(info, g, fd, coef, dg)
+  echo info
   for mu in 0..3:
     dfl[mu] := fl2[mu] - fl[mu]
     echo dfl[mu].norm2

From 5b97195b7022ca220b85dc3281d4eb16d36588ec Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Fri, 6 Dec 2024 12:50:32 -0600
Subject: [PATCH 11/12] add naik derivative

---
 src/gauge/fat7l.nim         | 13 ++++---
 src/gauge/fat7lderiv.nim    | 74 ++++++++++++++++++++++++++++++++++---
 src/physics/asqtadLinks.nim |  2 +-
 src/physics/hisqLinks.nim   |  4 +-
 4 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim
index 1a110247..49216b4d 100644
--- a/src/gauge/fat7l.nim
+++ b/src/gauge/fat7l.nim
@@ -68,8 +68,9 @@ proc computeGenStaple(staple: auto, mu,nu: int, link: auto, coef: float,
   #if(link!=gauge[mu]) QDP_discard_M(ts0);
   #QDP_discard_M(ts2);
 
-proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto,
-                   ll: auto, gfLong: auto, naik: auto) =
+proc makeImpLinks*(fl: auto, gf: auto, coef: auto,
+                   ll: auto, gfLong: auto, naik: auto,
+                   info: var PerfInfo) =
   tic()
   type lcm = type(gf[0])
   proc QDP_create_M(): lcm = result.new(gf[0].l)
@@ -168,8 +169,8 @@ proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto,
   info.flops += nflop * gf[0].l.localGeom.prod
   info.secs += getElapsedTime()
 
-proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto) =
-  makeImpLinks(info, fl, gf, coef, fl, gf, 0.0)
+proc makeImpLinks*(fl: auto, gf: auto, coef: auto, info: var PerfInfo) =
+  makeImpLinks(fl, gf, coef, fl, gf, 0.0, info)
 
 when isMainModule:
   import qex
@@ -196,8 +197,8 @@ when isMainModule:
   var g3 = g
 
   echo g.plaq
-  makeImpLinks(info, fl, g, coef)
-  makeImpLinks(info, fl, g, coef, ll, g3, naik)
+  makeImpLinks(fl, g, coef, info)
+  makeImpLinks(fl, g, coef, ll, g3, naik, info)
   echo fl.plaq
   echo ll.plaq
   echo pow(1.0,4)/6.0
diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim
index a579e4fc..08edd857 100644
--- a/src/gauge/fat7lderiv.nim
+++ b/src/gauge/fat7lderiv.nim
@@ -119,8 +119,9 @@ side_force(QDP_ColorMatrix *force, QDP_ColorMatrix *bot0, QDP_ColorMatrix *side0
 }
 ]#
 
-proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto,
-                coef: Fat7lCoefs, mid: auto) =
+proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs,
+                llderiv: auto, llgauge: auto, llmid: auto, naik: float,
+                perf: var PerfInfo) =
   tic()
   var nflops = 0
   let coefL = coef.lepage
@@ -287,9 +288,49 @@ proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto,
       deriv[mu] += fixL * stpl5
     nflops += 4*(3*(2*198+3*18)+198+216+36)
 
+  if naik!=0.0:
+    for mu in 0..3:
+      # force += mid * Umumu' * Umu'
+      # force -= U-mu' * mid-mu * Umu'
+      # force += U-mu' * U-mu-mu' * mid-mu-mu
+      #QDP_M_eq_M(Uf, U, QDP_all);
+      #QDP_M_eq_sM(Umu, Uf, QDP_neighbor[mu], QDP_forward, QDP_all);
+      discard s1a[mu] ^* llgauge[mu]
+      #QDP_M_eq_Ma_times_M(Umid, Uf, mid, QDP_all);
+      tm1 := llgauge[mu].adj * llmid[mu]
+      #QDP_M_eq_sM(Umidbmu, Umid, QDP_neighbor[mu], QDP_backward, QDP_all);
+      discard sm1[mu] ^* tm1
+      #QDP_M_eq_Ma_times_Ma(UmuU, Umu, Uf, QDP_all);
+      tm2 := s1a[mu].field.adj * llgauge[mu].adj
+      #QDP_M_eq_sM(UmuUs, UmuU, QDP_neighbor[mu], QDP_forward, QDP_all);
+      discard s1b[mu] ^* tm2
+      #QDP_M_eq_Ma_times_M(f3b, Uf, Umidbmu, QDP_all);
+      llderiv[mu] += naik*(sm1[mu].field*s1a[mu].field.adj)
+      tm1 := llgauge[mu].adj * sm1[mu].field
+      #QDP_M_eq_sM(f3, f3b, QDP_neighbor[mu], QDP_backward, QDP_all);
+      discard sm1[mu] ^* tm1
+      #QDP_M_eq_M_times_M(f, mid, UmuUs, QDP_all);
+      #QDP_discard_M(UmuUs);
+      #QDP_M_peq_M_times_Ma(f, Umidbmu, Umu, QDP_all);
+      #QDP_discard_M(Umidbmu);
+      #QDP_discard_M(Umu);
+      #QDP_M_peq_M(f, f3, QDP_all);
+      #QDP_discard_M(f3);
+      #QDP_M_peq_r_times_M(deriv[mu], &coefN, f, QDP_all);
+      llderiv[mu] += naik*(llmid[mu]*s1b[mu].field+sm1[mu].field)
+
   #info->final_sec = QOP_time() - dtime;
   #info->final_flop = nflops*QDP_sites_on_node;
   #info->status = QOP_SUCCESS;
+  toc()
+  inc perf.count
+  perf.flops += nflops * gauge[0].l.localGeom.prod
+  perf.secs += getElapsedTime()
+
+proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs,
+                perf: var PerfInfo) =
+  fat7lDeriv(deriv, gauge, mid, coef, deriv, gauge, mid, 0.0, perf)
+
 
 when isMainModule:
   import qex, physics/qcdTypes
@@ -308,13 +349,17 @@ when isMainModule:
   coef.fiveStaple = 1.0
   coef.sevenStaple = 1.0
   coef.lepage = 1.0
+  var naik = 1.0
 
   var fl = lo.newGauge()
   var fl2 = lo.newGauge()
+  var ll = lo.newGauge()
+  var ll2 = lo.newGauge()
   var dfl = lo.newGauge()
   var g2 = lo.newGauge()
   var dg = lo.newGauge()
   var fd = lo.newGauge()
+  var ld = lo.newGauge()
   for mu in 0..<dg.len:
     dg[mu] := 0.00001 * g[mu]
     g2[mu] := g[mu] + dg[mu]
@@ -322,9 +367,9 @@ when isMainModule:
 
   echo g.plaq
   echo g2.plaq
-  makeImpLinks(info, fl, g, coef)
+  makeImpLinks(fl, g, coef, info)
   echo info
-  makeImpLinks(info, fl2, g2, coef)
+  makeImpLinks(fl2, g2, coef, info)
   echo info
   echo fl.plaq
   echo fl2.plaq
@@ -335,13 +380,30 @@ when isMainModule:
   #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0,4)/6.0
   #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0+6.0,4)/6.0
 
-  fat7lderiv(info, g, fd, coef, dg)
+  fat7lderiv(fd, g, dg, coef, info)
   echo info
   for mu in 0..3:
     dfl[mu] := fl2[mu] - fl[mu]
+    #echo dfl[mu].norm2
+    #echo fd[mu].norm2
+    dfl[mu] -= fd[mu]
     echo dfl[mu].norm2
-    echo fd[mu].norm2
+
+  for mu in 0..<dg.len:
+    fd[mu] := 0
+    ld[mu] := 0
+  makeImpLinks(fl, g, coef, ll, g, naik, info)
+  makeImpLinks(fl2, g2, coef, ll2, g2, naik, info)
+  fat7lderiv(fd, g, dg, coef, ld, g, dg, naik, info)
+  echo info
+  for mu in 0..3:
+    dfl[mu] := fl2[mu] - fl[mu]
+    #echo dfl[mu].norm2
+    #echo fd[mu].norm2
     dfl[mu] -= fd[mu]
     echo dfl[mu].norm2
+    dfl[mu] := ll2[mu] - ll[mu]
+    dfl[mu] -= ld[mu]
+    echo dfl[mu].norm2
 
   qexFinalize()
diff --git a/src/physics/asqtadLinks.nim b/src/physics/asqtadLinks.nim
index ff0a3444..8574b88c 100644
--- a/src/physics/asqtadLinks.nim
+++ b/src/physics/asqtadLinks.nim
@@ -26,7 +26,7 @@ proc `$`*(c: AsqtadCoefs): string =
 
 proc smear*(c: AsqtadCoefs, g: any, fl,ll: any) =
   var info: PerfInfo
-  makeImpLinks(info, fl, g, c.fat7, ll, g, c.naik)
+  makeImpLinks(fl, g, c.fat7, ll, g, c.naik, info)
 
 when isMainModule:
   import qex
diff --git a/src/physics/hisqLinks.nim b/src/physics/hisqLinks.nim
index 150b8116..092c631c 100644
--- a/src/physics/hisqLinks.nim
+++ b/src/physics/hisqLinks.nim
@@ -32,11 +32,11 @@ proc `$`*(c: HisqCoefs): string =
 proc smear*(c: HisqCoefs, g: auto, fl,ll: auto, t1,t2: auto) =
   mixin projectU
   var info: PerfInfo
-  makeImpLinks(info, t1, g, c.fat7first)
+  makeImpLinks(t1, g, c.fat7first, info)
   for mu in 0..<4:
     for i in t2[mu]:
       projectU(t2[mu][i], t1[mu][i])
-  makeImpLinks(info, fl, t2, c.fat7second, ll, t2, c.naik)
+  makeImpLinks(fl, t2, c.fat7second, ll, t2, c.naik, info)
 proc smear*(c: HisqCoefs, g: auto, fl,ll: auto) =
   var t1 = fl.newOneOf
   var t2 = fl.newOneOf

From ea83f2e1ccc7acdd90c061ece7885ec4c30c7ea8 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Fri, 6 Dec 2024 15:58:33 -0600
Subject: [PATCH 12/12] add threading to fat7

---
 src/base/profile.nim     |   2 +-
 src/base/threading.nim   |  12 ++
 src/gauge/fat7l.nim      |  95 ++++++------
 src/gauge/fat7lderiv.nim | 311 +++++++++++++++++++--------------------
 4 files changed, 210 insertions(+), 210 deletions(-)

diff --git a/src/base/profile.nim b/src/base/profile.nim
index 455e3995..56b665e3 100644
--- a/src/base/profile.nim
+++ b/src/base/profile.nim
@@ -12,7 +12,7 @@ type PerfInfo* = object
   flops*: float
   secs*: float
 template clear*(pi: var PerfInfo) =
-  pi.count = 0.0
+  pi.count = 0
   pi.flops = 0.0
   pi.secs = 0.0
 template `+=`*(r: var PerfInfo, x: PerfInfo) =
diff --git a/src/base/threading.nim b/src/base/threading.nim
index 58689754..1e5faba2 100644
--- a/src/base/threading.nim
+++ b/src/base/threading.nim
@@ -460,6 +460,18 @@ macro threadSum2*(a:varargs[untyped]):auto =
   result.add(m)
   #echo result.treeRepr
 
+type ThreadSingle*[T] = distinct T
+template `[]`*[T](x: ThreadSingle[T]): auto = T(x)
+template `[]=`*[T](x: ThreadSingle[T], y: auto): auto =
+  T(x) = y
+template `=`*(x: var ThreadSingle, y: auto) =
+  threadSingle:
+    x[] = y
+template `+=`*(x: var ThreadSingle, y: auto) =
+  threadSingle:
+    x[] += y
+converter fromThreadSingle*[T](x: ThreadSingle[T]): T = T(x)
+
 when isMainModule:
   threadsInit()
   echo threadNum, "/", numThreads
diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim
index 49216b4d..c8459689 100644
--- a/src/gauge/fat7l.nim
+++ b/src/gauge/fat7l.nim
@@ -71,7 +71,7 @@ proc computeGenStaple(staple: auto, mu,nu: int, link: auto, coef: float,
 proc makeImpLinks*(fl: auto, gf: auto, coef: auto,
                    ll: auto, gfLong: auto, naik: auto,
                    info: var PerfInfo) =
-  tic()
+  tic("makeImpLinks")
   type lcm = type(gf[0])
   proc QDP_create_M(): lcm = result.new(gf[0].l)
   var
@@ -111,60 +111,49 @@ proc makeImpLinks*(fl: auto, gf: auto, coef: auto,
         for nu in 0..<4:
           if dir!=nu:
             tsg[dir][nu] = newShifter(gf[dir], nu, 1)
-            discard tsg[dir][nu] ^* gf[dir]
-
-  for dir in 0..<4:
-    #QDP_M_eq_r_times_M(fl[dir], &coef1, gf[dir], QDP_all);
-    fl[dir] := coef1 * gf[dir]
-    if have3:
-      for nu in 0..<4:
-        if nu!=dir:
-          compute_gen_staple(staple, dir, nu, gf[dir], coef3, gf, fl,
-                             tsg[dir][nu], tsg[nu][dir], t1, t2, ts2[nu])
-          if coefL!=0.0:
-            compute_gen_staple(false, dir, nu, staple, coefL, gf, fl,
-                               tsl[nu], tsg[nu][dir], t1, t2, ts2[nu])
-          if coef5!=0.0 or coef7!=0.0:
-            for rho in 0..<4:
-              if (rho!=dir) and (rho!=nu):
-                compute_gen_staple(tempmat1, dir, rho, staple, coef5, gf, fl,
-                                   tsl[rho], tsg[rho][dir], t1, t2, ts2[rho])
-                if coef7!=0.0:
-                  for sig in 0..<4:
-                    if (sig!=dir) and (sig!=nu) and (sig!=rho):
-                      compute_gen_staple(false, dir, sig, tempmat1,coef7,gf,fl,
-                                         ts1[sig],tsg[sig][dir],t1,t2,ts2[sig])
-
-  # long links
-  if naik!=0.0:
+      threads:
+        for dir in 0..<4:
+          for nu in 0..<4:
+            if dir!=nu:
+              discard tsg[dir][nu] ^*! gf[dir]
+
+  toc("main loop")
+  threads:
     for dir in 0..<4:
-      #QDP_M_eq_sM(staple, gfLong[dir], QDP_neighbor[dir], QDP_forward,QDP_all)
-      #QDP_M_eq_M_times_M(tempmat1, gfLong[dir], staple, QDP_all)
-      #QDP_M_eq_sM(staple, tempmat1, QDP_neighbor[dir], QDP_forward, QDP_all)
-      #QDP_M_eq_M_times_M(ll[dir], gfLong[dir], staple, QDP_all)
-      #QDP_M_eq_r_times_M(ll[dir], &naik, ll[dir], QDP_all)
-      discard tsl[dir] ^* gfLong[dir]
-      discard ts1[dir] ^* (gfLong[dir] * tsl[dir].field)
-      ll[dir] := naik * (gfLong[dir] * ts1[dir].field)
-
-  #[
-  if have3 or naik!=0.0:
-    QDP_destroy_M(staple);
-    QDP_destroy_M(tempmat1);
-    if have3:
-      QDP_destroy_M(t1);
-      QDP_destroy_M(t2);
-      for dir in 0..<4:
-        QDP_destroy_M(tsl[dir]);
-        QDP_destroy_M(ts1[dir]);
-        QDP_destroy_M(ts2[dir]);
+      #QDP_M_eq_r_times_M(fl[dir], &coef1, gf[dir], QDP_all);
+      fl[dir] := coef1 * gf[dir]
+      if have3:
         for nu in 0..<4:
-          if(tsg[dir][nu]!=nil) QDP_destroy_M(tsg[dir][nu])
-  ]#
-  #info.final_sec = dtime;
-  #info.final_flop = nflop*QDP_sites_on_node;
-  #info.status = QOP_SUCCESS;
-  toc()
+          if nu!=dir:
+            compute_gen_staple(staple, dir, nu, gf[dir], coef3, gf, fl,
+                               tsg[dir][nu], tsg[nu][dir], t1, t2, ts2[nu])
+            if coefL!=0.0:
+              compute_gen_staple(false, dir, nu, staple, coefL, gf, fl,
+                                 tsl[nu], tsg[nu][dir], t1, t2, ts2[nu])
+            if coef5!=0.0 or coef7!=0.0:
+              for rho in 0..<4:
+                if (rho!=dir) and (rho!=nu):
+                  compute_gen_staple(tempmat1, dir, rho, staple, coef5, gf, fl,
+                                     tsl[rho], tsg[rho][dir], t1, t2, ts2[rho])
+                  if coef7!=0.0:
+                    for sig in 0..<4:
+                      if (sig!=dir) and (sig!=nu) and (sig!=rho):
+                        compute_gen_staple(false, dir, sig, tempmat1,coef7,gf,fl,
+                                           ts1[sig],tsg[sig][dir],t1,t2,ts2[sig])
+
+    # long links
+    if naik!=0.0:
+      for dir in 0..<4:
+        #QDP_M_eq_sM(staple, gfLong[dir], QDP_neighbor[dir], QDP_forward,QDP_all)
+        #QDP_M_eq_M_times_M(tempmat1, gfLong[dir], staple, QDP_all)
+        #QDP_M_eq_sM(staple, tempmat1, QDP_neighbor[dir], QDP_forward, QDP_all)
+        #QDP_M_eq_M_times_M(ll[dir], gfLong[dir], staple, QDP_all)
+        #QDP_M_eq_r_times_M(ll[dir], &naik, ll[dir], QDP_all)
+        discard tsl[dir] ^* gfLong[dir]
+        discard ts1[dir] ^* (gfLong[dir] * tsl[dir].field)
+        ll[dir] := naik * (gfLong[dir] * ts1[dir].field)
+
+  toc("end")
   inc info.count
   info.flops += nflop * gf[0].l.localGeom.prod
   info.secs += getElapsedTime()
diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim
index 08edd857..5ba34fd7 100644
--- a/src/gauge/fat7lderiv.nim
+++ b/src/gauge/fat7lderiv.nim
@@ -122,8 +122,8 @@ side_force(QDP_ColorMatrix *force, QDP_ColorMatrix *bot0, QDP_ColorMatrix *side0
 proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs,
                 llderiv: auto, llgauge: auto, llmid: auto, naik: float,
                 perf: var PerfInfo) =
-  tic()
-  var nflops = 0
+  tic("fat7lDeriv")
+  var nflops = ThreadSingle[int](0)
   let coefL = coef.lepage
   let coef1 = coef.oneLink - 6*coefL
   let coef3 = coef.threeStaple
@@ -162,167 +162,168 @@ proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs,
       for mu in 0..<4:
         for nu in 0..<4:
           if nu!=mu:
-            discard s1[mu][nu] ^* gauge[mu]
-  for sig in 0..3:
-    if have5:
-      for mu in 0..3:
-        if mu==sig: continue
-        #QDP_M_eq_zero(stpl3[mu], QDP_all);
-        stpl3[mu] := 0
-        #staple(stpl3[mu], gauge[sig], gauge[mu], sig, mu);
-        symStaple(stpl3[mu], 1.0, gauge[mu], gauge[sig],
-                  s1[mu][sig], s1[sig][mu], tm1, sm1[mu])
-        #QDP_M_eq_zero(mid5[mu], QDP_all);
-        mid5[mu] := 0
-        nflops += STAPLE_FLOPS
-      for rho in 0..3:
-        if rho==sig: continue
-        #QDP_M_eq_zero(stpl5, QDP_all);
-        stpl5 := 0
-        if coef7!=0.0:
+            discard s1[mu][nu] ^*! gauge[mu]
+  threads:
+    for sig in 0..3:
+      if have5:
+        for mu in 0..3:
+          if mu==sig: continue
+          #QDP_M_eq_zero(stpl3[mu], QDP_all);
+          stpl3[mu] := 0
+          #staple(stpl3[mu], gauge[sig], gauge[mu], sig, mu);
+          symStaple(stpl3[mu], 1.0, gauge[mu], gauge[sig],
+                    s1[mu][sig], s1[sig][mu], tm1, sm1[mu])
+          #QDP_M_eq_zero(mid5[mu], QDP_all);
+          mid5[mu] := 0
+          nflops += STAPLE_FLOPS
+        for rho in 0..3:
+          if rho==sig: continue
+          #QDP_M_eq_zero(stpl5, QDP_all);
+          stpl5 := 0
+          if coef7!=0.0:
+            for mu in 0..3:
+              if  mu==sig or mu==rho: continue
+              for nu in 0..3:
+                if nu==sig or nu==rho or nu==mu: continue
+                #staple(stpl5, stpl3[mu], gauge[nu], sig, nu);
+                discard s1a[nu] ^* stpl3[mu]
+                symStaple(stpl5, 1.0, gauge[nu], stpl3[mu],
+                          s1[nu][sig], s1a[nu], tm1, sm1[nu])
+                nflops += STAPLE_FLOPS
+            #QDP_M_eq_r_times_M(stpl5, &coef7, stpl5, QDP_all);
+            stpl5 *= coef7
+            nflops += 18
+          #QDP_M_eq_zero(mid3, QDP_all);
+          mid3 := 0
+          if coefL!=0.0:
+            #QDP_M_peq_r_times_M(stpl5, &coefL, stpl3[rho], QDP_all);
+            stpl5 += coefL * stpl3[rho]
+            nflops += 36
+          if coefL!=0.0 or coef7!=0.0:
+            #side_force(deriv[rho], mid[sig], gauge[rho], stpl5, sig, rho, mid3);
+            discard s1a[rho] ^* stpl5
+            discard s1b[rho] ^* mid[sig]
+            symStapleDeriv(deriv[rho], mid3,
+                           gauge[rho], stpl5, s1[rho][sig], s1a[rho],
+                           mid[sig], s1b[rho], tm1, tm2, sm1[rho], sm1[sig])
+            nflops += SIDE_FORCE_FLOPS
+          if coefL!=0.0:
+            #QDP_M_peq_r_times_M(mid5[rho], &coefL, mid3, QDP_all);
+            mid5[rho] += coefL * mid3
+            nflops += 36
+          #//QDP_M_eqm_r_times_M(mid3, &coef7, mid3, QDP_all);
+          #QDP_M_eq_r_times_M(mid3, &coef7, mid3, QDP_all);
+          mid3 *= coef7
+          #QDP_M_peq_r_times_M(mid3, &coef5, mid[sig], QDP_all);
+          mid3 += coef5 * mid[sig]
+          nflops += 18+36
           for mu in 0..3:
-            if  mu==sig or mu==rho: continue
+            if mu==sig or mu==rho: continue
             for nu in 0..3:
               if nu==sig or nu==rho or nu==mu: continue
-              #staple(stpl5, stpl3[mu], gauge[nu], sig, nu);
-              discard s1a[nu] ^* stpl3[mu]
-              symStaple(stpl5, 1.0, gauge[nu], stpl3[mu],
-                        s1[nu][sig], s1a[nu], tm1, sm1[nu])
-              nflops += STAPLE_FLOPS
-          #QDP_M_eq_r_times_M(stpl5, &coef7, stpl5, QDP_all);
-          stpl5 *= coef7
-          nflops += 18
+              #side_force(deriv[mu], mid3, gauge[mu], stpl3[nu], sig, mu, mid5[nu]);
+              discard s1a[mu] ^* stpl3[nu]
+              discard s1b[mu] ^* mid3
+              symStapleDeriv(deriv[mu], mid5[nu],
+                             gauge[mu], stpl3[nu], s1[mu][sig], s1a[mu],
+                             mid3, s1b[mu], tm1, tm2, sm1[mu], sm1[sig])
+              nflops += SIDE_FORCE_FLOPS
+
+      if have3:
         #QDP_M_eq_zero(mid3, QDP_all);
         mid3 := 0
-        if coefL!=0.0:
-          #QDP_M_peq_r_times_M(stpl5, &coefL, stpl3[rho], QDP_all);
-          stpl5 += coefL * stpl3[rho]
-          nflops += 36
-        if coefL!=0.0 or coef7!=0.0:
-          #side_force(deriv[rho], mid[sig], gauge[rho], stpl5, sig, rho, mid3);
-          discard s1a[rho] ^* stpl5
-          discard s1b[rho] ^* mid[sig]
-          symStapleDeriv(deriv[rho], mid3,
-                         gauge[rho], stpl5, s1[rho][sig], s1a[rho],
-                         mid[sig], s1b[rho], tm1, tm2, sm1[rho], sm1[sig])
-          nflops += SIDE_FORCE_FLOPS
-        if coefL!=0.0:
-          #QDP_M_peq_r_times_M(mid5[rho], &coefL, mid3, QDP_all);
-          mid5[rho] += coefL * mid3
-          nflops += 36
-        #//QDP_M_eqm_r_times_M(mid3, &coef7, mid3, QDP_all);
-        #QDP_M_eq_r_times_M(mid3, &coef7, mid3, QDP_all);
-        mid3 *= coef7
-        #QDP_M_peq_r_times_M(mid3, &coef5, mid[sig], QDP_all);
-        mid3 += coef5 * mid[sig]
-        nflops += 18+36
         for mu in 0..3:
-          if mu==sig or mu==rho: continue
-          for nu in 0..3:
-            if nu==sig or nu==rho or nu==mu: continue
-            #side_force(deriv[mu], mid3, gauge[mu], stpl3[nu], sig, mu, mid5[nu]);
-            discard s1a[mu] ^* stpl3[nu]
-            discard s1b[mu] ^* mid3
-            symStapleDeriv(deriv[mu], mid5[nu],
-                           gauge[mu], stpl3[nu], s1[mu][sig], s1a[mu],
-                           mid3, s1b[mu], tm1, tm2, sm1[mu], sm1[sig])
-            nflops += SIDE_FORCE_FLOPS
+          if mu==sig: continue
+          if have5:
+            #//QDP_M_eq_r_times_M_minus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all);
+            #QDP_M_eq_r_times_M_plus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all);
+            stpl5 := coef3*mid[sig] + mid5[mu]
+            nflops += 36
+          else:
+            #QDP_M_eq_r_times_M(stpl5, &coef3, mid[sig], QDP_all);
+            stpl5 := coef3*mid[sig]
+            nflops += 18
+          #side_force(deriv[mu], stpl5, gauge[mu], gauge[sig], sig, mu, mid3);
+          discard s1a[mu] ^* stpl5
+          symStapleDeriv(deriv[mu], mid3,
+                         gauge[mu], gauge[sig], s1[mu][sig], s1[sig][mu],
+                         stpl5, s1a[mu], tm1, tm2, sm1[mu], sm1[sig])
+          nflops += SIDE_FORCE_FLOPS
+        #//QDP_M_meq_M(deriv[sig], mid3, QDP_all);
+        #QDP_M_peq_M(deriv[sig], mid3, QDP_all);
+        deriv[sig] += mid3
+        nflops += 18
+      if coef1!=0.0:
+        #QDP_M_peq_r_times_M(deriv[sig], &coef1, mid[sig], QDP_all);
+        deriv[sig] += coef1 * mid[sig]
+        nflops += 36
 
-    if have3:
-      #QDP_M_eq_zero(mid3, QDP_all);
-      mid3 := 0
+    if coefL!=0.0:
+      # fix up Lepage term
+      let fixL = -coefL
       for mu in 0..3:
-        if mu==sig: continue
-        if have5:
-          #//QDP_M_eq_r_times_M_minus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all);
-          #QDP_M_eq_r_times_M_plus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all);
-          stpl5 := coef3*mid[sig] + mid5[mu]
-          nflops += 36
-        else:
-          #QDP_M_eq_r_times_M(stpl5, &coef3, mid[sig], QDP_all);
-          stpl5 := coef3*mid[sig]
-          nflops += 18
-        #side_force(deriv[mu], stpl5, gauge[mu], gauge[sig], sig, mu, mid3);
-        discard s1a[mu] ^* stpl5
-        symStapleDeriv(deriv[mu], mid3,
-                       gauge[mu], gauge[sig], s1[mu][sig], s1[sig][mu],
-                       stpl5, s1a[mu], tm1, tm2, sm1[mu], sm1[sig])
-        nflops += SIDE_FORCE_FLOPS
-      #//QDP_M_meq_M(deriv[sig], mid3, QDP_all);
-      #QDP_M_peq_M(deriv[sig], mid3, QDP_all);
-      deriv[sig] += mid3
-      nflops += 18
-    if coef1!=0.0:
-      #QDP_M_peq_r_times_M(deriv[sig], &coef1, mid[sig], QDP_all);
-      deriv[sig] += coef1 * mid[sig]
-      nflops += 36
-
-  if coefL!=0.0:
-    # fix up Lepage term
-    let fixL = -coefL
-    for mu in 0..3:
-      #QDP_M_eq_zero(ftmp0[0], QDP_all);
-      tm1 := 0
-      for nu in 0..3:
-        if nu==mu: continue
-        #QDP_M_eq_Ma_times_M(btmp0[0], mid[nu], gauge[nu], QDP_all);
-        tm2 := mid[nu].adj * gauge[nu]
-        #QDP_M_eq_sM(btmp[0][nu], btmp0[0], QDP_neighbor[nu], QDP_backward, QDP_all);
-        discard sm1[nu] ^* tm2
-        #QDP_M_eq_M_times_Ma(stpl5, mid[nu], gauge[nu], QDP_all);
-        stpl5 := mid[nu] * gauge[nu].adj
-        #QDP_M_meq_M(stpl5, btmp[0][nu], QDP_all);
-        stpl5 -= sm1[nu].field
-        #QDP_discard_M(btmp[0][nu]);
-        #QDP_M_peq_M(ftmp0[0], stpl5, QDP_all);
-        #QDP_M_peq_Ma(ftmp0[0], stpl5, QDP_all);
-        tm1 += stpl5 + stpl5.adj
-      #QDP_M_eq_sM(ftmp[0][mu], ftmp0[0], QDP_neighbor[mu], QDP_forward, QDP_all);
-      discard s1a[mu] ^* tm1
-      #QDP_M_eq_M_times_M(stpl5, ftmp0[0], gauge[mu], QDP_all);
-      stpl5 := tm1 * gauge[mu]
-      #QDP_M_meq_M_times_M(stpl5, gauge[mu], ftmp[0][mu], QDP_all);
-      stpl5 -= gauge[mu] * s1a[mu].field
-      #QDP_discard_M(ftmp[0][mu]);
-      #QDP_M_peq_r_times_M(deriv[mu], &fixL, stpl5, QDP_all);
-      deriv[mu] += fixL * stpl5
-    nflops += 4*(3*(2*198+3*18)+198+216+36)
+        #QDP_M_eq_zero(ftmp0[0], QDP_all);
+        tm1 := 0
+        for nu in 0..3:
+          if nu==mu: continue
+          #QDP_M_eq_Ma_times_M(btmp0[0], mid[nu], gauge[nu], QDP_all);
+          tm2 := mid[nu].adj * gauge[nu]
+          #QDP_M_eq_sM(btmp[0][nu], btmp0[0], QDP_neighbor[nu], QDP_backward, QDP_all);
+          discard sm1[nu] ^* tm2
+          #QDP_M_eq_M_times_Ma(stpl5, mid[nu], gauge[nu], QDP_all);
+          stpl5 := mid[nu] * gauge[nu].adj
+          #QDP_M_meq_M(stpl5, btmp[0][nu], QDP_all);
+          stpl5 -= sm1[nu].field
+          #QDP_discard_M(btmp[0][nu]);
+          #QDP_M_peq_M(ftmp0[0], stpl5, QDP_all);
+          #QDP_M_peq_Ma(ftmp0[0], stpl5, QDP_all);
+          tm1 += stpl5 + stpl5.adj
+        #QDP_M_eq_sM(ftmp[0][mu], ftmp0[0], QDP_neighbor[mu], QDP_forward, QDP_all);
+        discard s1a[mu] ^* tm1
+        #QDP_M_eq_M_times_M(stpl5, ftmp0[0], gauge[mu], QDP_all);
+        stpl5 := tm1 * gauge[mu]
+        #QDP_M_meq_M_times_M(stpl5, gauge[mu], ftmp[0][mu], QDP_all);
+        stpl5 -= gauge[mu] * s1a[mu].field
+        #QDP_discard_M(ftmp[0][mu]);
+        #QDP_M_peq_r_times_M(deriv[mu], &fixL, stpl5, QDP_all);
+        deriv[mu] += fixL * stpl5
+      nflops += 4*(3*(2*198+3*18)+198+216+36)
 
-  if naik!=0.0:
-    for mu in 0..3:
-      # force += mid * Umumu' * Umu'
-      # force -= U-mu' * mid-mu * Umu'
-      # force += U-mu' * U-mu-mu' * mid-mu-mu
-      #QDP_M_eq_M(Uf, U, QDP_all);
-      #QDP_M_eq_sM(Umu, Uf, QDP_neighbor[mu], QDP_forward, QDP_all);
-      discard s1a[mu] ^* llgauge[mu]
-      #QDP_M_eq_Ma_times_M(Umid, Uf, mid, QDP_all);
-      tm1 := llgauge[mu].adj * llmid[mu]
-      #QDP_M_eq_sM(Umidbmu, Umid, QDP_neighbor[mu], QDP_backward, QDP_all);
-      discard sm1[mu] ^* tm1
-      #QDP_M_eq_Ma_times_Ma(UmuU, Umu, Uf, QDP_all);
-      tm2 := s1a[mu].field.adj * llgauge[mu].adj
-      #QDP_M_eq_sM(UmuUs, UmuU, QDP_neighbor[mu], QDP_forward, QDP_all);
-      discard s1b[mu] ^* tm2
-      #QDP_M_eq_Ma_times_M(f3b, Uf, Umidbmu, QDP_all);
-      llderiv[mu] += naik*(sm1[mu].field*s1a[mu].field.adj)
-      tm1 := llgauge[mu].adj * sm1[mu].field
-      #QDP_M_eq_sM(f3, f3b, QDP_neighbor[mu], QDP_backward, QDP_all);
-      discard sm1[mu] ^* tm1
-      #QDP_M_eq_M_times_M(f, mid, UmuUs, QDP_all);
-      #QDP_discard_M(UmuUs);
-      #QDP_M_peq_M_times_Ma(f, Umidbmu, Umu, QDP_all);
-      #QDP_discard_M(Umidbmu);
-      #QDP_discard_M(Umu);
-      #QDP_M_peq_M(f, f3, QDP_all);
-      #QDP_discard_M(f3);
-      #QDP_M_peq_r_times_M(deriv[mu], &coefN, f, QDP_all);
-      llderiv[mu] += naik*(llmid[mu]*s1b[mu].field+sm1[mu].field)
+    if naik!=0.0:
+      for mu in 0..3:
+        # force += mid * Umumu' * Umu'
+        # force -= U-mu' * mid-mu * Umu'
+        # force += U-mu' * U-mu-mu' * mid-mu-mu
+        #QDP_M_eq_M(Uf, U, QDP_all);
+        #QDP_M_eq_sM(Umu, Uf, QDP_neighbor[mu], QDP_forward, QDP_all);
+        discard s1a[mu] ^* llgauge[mu]
+        #QDP_M_eq_Ma_times_M(Umid, Uf, mid, QDP_all);
+        tm1 := llgauge[mu].adj * llmid[mu]
+        #QDP_M_eq_sM(Umidbmu, Umid, QDP_neighbor[mu], QDP_backward, QDP_all);
+        discard sm1[mu] ^* tm1
+        #QDP_M_eq_Ma_times_Ma(UmuU, Umu, Uf, QDP_all);
+        tm2 := s1a[mu].field.adj * llgauge[mu].adj
+        #QDP_M_eq_sM(UmuUs, UmuU, QDP_neighbor[mu], QDP_forward, QDP_all);
+        discard s1b[mu] ^* tm2
+        #QDP_M_eq_Ma_times_M(f3b, Uf, Umidbmu, QDP_all);
+        llderiv[mu] += naik*(sm1[mu].field*s1a[mu].field.adj)
+        tm1 := llgauge[mu].adj * sm1[mu].field
+        #QDP_M_eq_sM(f3, f3b, QDP_neighbor[mu], QDP_backward, QDP_all);
+        discard sm1[mu] ^* tm1
+        #QDP_M_eq_M_times_M(f, mid, UmuUs, QDP_all);
+        #QDP_discard_M(UmuUs);
+        #QDP_M_peq_M_times_Ma(f, Umidbmu, Umu, QDP_all);
+        #QDP_discard_M(Umidbmu);
+        #QDP_discard_M(Umu);
+        #QDP_M_peq_M(f, f3, QDP_all);
+        #QDP_discard_M(f3);
+        #QDP_M_peq_r_times_M(deriv[mu], &coefN, f, QDP_all);
+        llderiv[mu] += naik*(llmid[mu]*s1b[mu].field+sm1[mu].field)
 
   #info->final_sec = QOP_time() - dtime;
   #info->final_flop = nflops*QDP_sites_on_node;
   #info->status = QOP_SUCCESS;
-  toc()
+  toc("end")
   inc perf.count
   perf.flops += nflops * gauge[0].l.localGeom.prod
   perf.secs += getElapsedTime()
@@ -368,17 +369,14 @@ when isMainModule:
   echo g.plaq
   echo g2.plaq
   makeImpLinks(fl, g, coef, info)
+  info.clear
+  resetTimers()
+  makeImpLinks(fl, g, coef, info)
   echo info
   makeImpLinks(fl2, g2, coef, info)
   echo info
   echo fl.plaq
   echo fl2.plaq
-  #echo ll.plaq
-  #echo pow(1.0,4)/6.0
-  #echo pow(1.0+6.0,4)/6.0
-  #echo pow(1.0+6.0+6.0*4.0,4)/6.0
-  #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0,4)/6.0
-  #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0+6.0,4)/6.0
 
   fat7lderiv(fd, g, dg, coef, info)
   echo info
@@ -406,4 +404,5 @@ when isMainModule:
     dfl[mu] -= ld[mu]
     echo dfl[mu].norm2
 
+  echoProf()
   qexFinalize()