From d2032ec9e5df56cccf4a6ab9137cef36cd99b450 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Sun, 17 Nov 2024 14:49:30 -0600 Subject: [PATCH 01/12] improve benchmarking --- src/base/globals.nim | 10 +++++++ src/base/profile.nim | 57 ++++++++----------------------------- src/bench/benchLinalg.nim | 12 ++++++-- src/bench/benchStagProp.nim | 4 +-- src/experimental/stagag.nim | 5 +--- src/physics/qcdTypes.nim | 1 - tests/base/tshift.nim | 26 +++++++++++------ 7 files changed, 53 insertions(+), 62 deletions(-) diff --git a/src/base/globals.nim b/src/base/globals.nim index 49325700..428bf6a3 100644 --- a/src/base/globals.nim +++ b/src/base/globals.nim @@ -44,3 +44,13 @@ macro setDefaultNc*(n: static[int]): untyped = result = newEmptyNode() macro getDefaultNc*(): untyped = return newLit(defaultNc) + +template getDefPrecStr:string = + const defPrec {.strdefine.} = "D" + defPrec + +var defPrec* {.compiletime.} = getDefPrecStr() +macro setDefaultSingle* = + defPrec = "S" + echo "Default precision: ", defPrec +static: echo "Default precision: ", defPrec diff --git a/src/base/profile.nim b/src/base/profile.nim index f577bece..506568ca 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -1,6 +1,6 @@ import threading export threading -import comms/comms, stdUtils, base/basicOps +import comms/comms, stdUtils, base/[basicOps,params] import os, strutils, sequtils, std/monotimes, std/tables, std/algorithm, strformat export monotimes getOptimPragmas() @@ -20,7 +20,7 @@ var ##[ -Each Tic starts a local timer. Each Toc records the time difference +Each tic() starts a local timer. Each toc() records the time difference of the current time with the one in the local timer visible in the scope, and then update the timer with the current time. @@ -516,49 +516,10 @@ template tocI(f: SomeNumber; s:SString = ""; n = -1) = localCode {.global.} = newList[CodePoint]() thisCode = CodePoint(-1) if threadNum==0: - when false: - #echo "==== begin toc ",s," ",ii - #echo " rtiStack: ",indent($rtiStack,5) - #echo " cpHeap: ",indent($cpHeap,5) - if unlikely VerboseTimer: echoToc(s,ii) - if prevRTI.int32 >= 0: - if restartTimer: - thawTimers() - restartTimer = false - if not timersFrozen(): - let theTime = getTics() - when not cname: - for c in items(localCode): - if cpHeap[c.int].name.equal(s): - thisCode = c - break - if thisCode.isNil: - thisCode = newCodePoint(ii.addr, s) - when not cname: - localCode.add thisCode - let - ns = theTime-localTimer - thisRTI = record(localTic, prevRTI, thisCode, ns, float(f)) - var oh = rtiStack[thisRTI.int].childrenOverhead - let c = rtiStack[thisRTI.int].children - for i in 0.. DropWasteTimerRatio: - # Signal stop if the overhead is too large. - dropTimer(prevRTI) - if toDropTimer(thisCode): - freezeTimers() - restartTimer = true - localTimer = getTics() - rtiStack[thisRTI.int].overhead = nsec(localTimer-theTime) - prevRTI = thisRTI - #echo "==== end toc ",s," ",ii + when cname: + tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,false) else: - when cname: - tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,false) - else: - tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,addr localCode) + tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,addr localCode) when noTicToc: template toc*() = discard @@ -822,7 +783,7 @@ proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] = t.children.add ri.children[i] do: # loc not found hs[loc] = ri - #let tot = makeHotSpotTable(List[RTInfoObj](ri.children)) + let tot = makeHotSpotTable(List[RTInfoObj](ri.children)) return (nstot, ohtot) proc echoHotspots* = @@ -879,6 +840,12 @@ proc echoHotspots* = let tsnspct = 100.0 * tsns / nstot echo &"{pct:6.3f} {tsnspct:7.3f} {count} {mf} {nc} S {lc} {nm}" +proc echoProf*(def = 0) = + case intParam("prof",def) + of 1: echoHotspots() + of 2: echoTimers() + else: discard + when isMainModule: import os proc test = diff --git a/src/bench/benchLinalg.nim b/src/bench/benchLinalg.nim index 68d61447..928fae78 100644 --- a/src/bench/benchLinalg.nim +++ b/src/bench/benchLinalg.nim @@ -45,6 +45,7 @@ proc checkMem = # fps: flops per site # bps: bytes moved (load+store) per site # mm: memory footprint (bytes) per site +var minNrep = int.high template bench(fps,bps,mm,eqn: untyped) {.dirty.} = block: let vol = lo.nSites.float @@ -73,12 +74,19 @@ template bench(fps,bps,mm,eqn: untyped) {.dirty.} = inc nbench echo "bench: ",nbench|(-6), "secs: ", dt|(6,3), " mf: ", mf|7, " mb: ", mb|7, " mem: ", mem, " nrep: ", nrep echo exp2string(eqn), "\n" + minNrep = min(minNrep, nrep) template bench(fps,bps,eqn: untyped) = bench(fps,bps,0,eqn) proc test(lat:auto) = - let maxl = intParam("maxl",16) - if lat[0] > maxl: return + let minl = intParam("minl",0) + if lat[0] < minl: return + let maxl = intParam("maxl",0) + if maxl>0: + if lat[0] > maxl: return + else: + if minNrep == 1: return + minNrep = int.high var lo = newLayout(lat) template newCV: untyped = lo.ColorVector() template newCM: untyped = lo.ColorMatrix() diff --git a/src/bench/benchStagProp.nim b/src/bench/benchStagProp.nim index 2ac4f8f5..164758cf 100644 --- a/src/bench/benchStagProp.nim +++ b/src/bench/benchStagProp.nim @@ -63,7 +63,7 @@ threads: threadBarrier() echo r.norm2 #echo v2 -echoTimers() +echoProf() var g3:array[8,type(g[0])] for i in 0..3: @@ -76,6 +76,6 @@ var s3 = newStag3(g3) s3.solve(v2, v1, mass, sp) resetTimers() s3.solve(v2, v1, mass, sp) -echoTimers() +echoProf() qexFinalize() diff --git a/src/experimental/stagag.nim b/src/experimental/stagag.nim index e29f9341..4574ce98 100644 --- a/src/experimental/stagag.nim +++ b/src/experimental/stagag.nim @@ -2103,8 +2103,5 @@ if outfn != "": echo "Saving gauge field to file: ", outfn let err = g.saveGauge outfn -case intParam("prof",0) -of 1: echoTimers() -of 2: echoHotspots() -else: discard +echoProf() qexFinalize() diff --git a/src/physics/qcdTypes.nim b/src/physics/qcdTypes.nim index 007ca9f3..2f0bcef2 100644 --- a/src/physics/qcdTypes.nim +++ b/src/physics/qcdTypes.nim @@ -416,7 +416,6 @@ macro makeConstructors(x: untyped): untyped = result = newStmtList() result.add getAst mp(ident(f&"S"), ident("S"&r&"V"), ident"result") result.add getAst mp(ident(f&"D"), ident("D"&r&"V"), ident"result") - const defPrec {.strdefine.} = "D" result.add getAst mp(ident(f), ident(defPrec&r&"V"), ident"result") # non-Simd versions result.add getAst mp(ident(f&"S1"), ident("S"&r), ident"result") diff --git a/tests/base/tshift.nim b/tests/base/tshift.nim index 75d39e5d..4ddc04f1 100644 --- a/tests/base/tshift.nim +++ b/tests/base/tshift.nim @@ -126,18 +126,20 @@ template testS16(Smd: typedesc) = qexInit() template makeSimdArrayX(T,N,B: untyped) {.dirty.} = - makeSimdArray(`T X`, N, B) - type T = Simd[`T X`] - template toDoubleImpl(x: `T X`): untyped = x # always already double + #makeSimdArray(`T X`, N, B) + #type T = Simd[`T X`] + #template toDoubleImpl(x: `T X`): untyped = x # always already double + type T = Simd[SimdArrayObj[N,B]] #testS1(float) + makeSimdArrayX(SD1, 1, float) testS1(SD1) when declared(SimdD1): testS1(SimdD1) -#makeSimdArrayX(SS1, 1, float32) -#testS1(SS1) +makeSimdArrayX(SS1, 1, float32) +testS1(SS1) when declared(SimdS1): testS1(SimdS1) @@ -146,8 +148,8 @@ testS2(SD2) when declared(SimdD2): testS2(SimdD2) -#makeSimdArrayX(SS2, 2, float32) -#testS2(SS2) +makeSimdArrayX(SS2, 2, float32) +testS2(SS2) when declared(SimdS2): testS2(SimdS2) @@ -155,6 +157,9 @@ makeSimdArrayX(SD4, 4, float) testS4(SD4) when declared(SimdD4): testS4(SimdD4) + +makeSimdArrayX(SS4, 4, float32) +testS4(SS4) when declared(SimdS4): testS4(SimdS4) @@ -162,6 +167,9 @@ makeSimdArrayX(SD8, 8, float) testS8(SD8) when declared(SimdD8): testS8(SimdD8) + +makeSimdArrayX(SS8, 8, float32) +testS8(SS8) when declared(SimdS8): testS8(SimdS8) @@ -169,8 +177,10 @@ makeSimdArrayX(SD16, 16, float) testS16(SD16) when declared(SimdD16): testS16(SimdD16) + +makeSimdArrayX(SS16, 16, float32) +testS16(SS16) when declared(SimdS16): testS16(SimdS16) - qexFinalize() From 5d6b7b47a61651bc116484fb35a91f7966f5a268 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 19 Nov 2024 15:19:52 -0600 Subject: [PATCH 02/12] fix threading issue fix issue when doing multiple hotspot outputs add single precision versions of some benchmarks --- src/base/backtrace.nim | 3 +- src/base/omp.nim | 11 ++ src/base/profile.nim | 1 + src/base/stdUtils.nim | 13 ++- src/base/threading.nim | 218 +++++++++++++++++++++++++++-------- src/bench/benchLinalgS.nim | 3 + src/bench/benchStagPropS.nim | 3 + src/comms/commsUtils.nim | 30 ++++- src/examples/staghmc_sh.nim | 3 +- src/layout/shifts.nim | 3 +- 10 files changed, 222 insertions(+), 66 deletions(-) create mode 100644 src/bench/benchLinalgS.nim create mode 100644 src/bench/benchStagPropS.nim diff --git a/src/base/backtrace.nim b/src/base/backtrace.nim index 0a75dbe9..d393cd82 100644 --- a/src/base/backtrace.nim +++ b/src/base/backtrace.nim @@ -8,7 +8,7 @@ proc backtrace_symbols(buffer: ptr pointer, size: cint): ptr UncheckedArray[cstr proc print_trace = #void *array[10]; - const nmax = 10 + const nmax = 20 var arr: array[nmax, pointer] #char **strings; @@ -22,6 +22,7 @@ proc print_trace = proc sigtrace(sig: cint) {.noconv.} = print_trace() + quit() proc setTrace* = c_signal(SIGSEGV, sigtrace) diff --git a/src/base/omp.nim b/src/base/omp.nim index 80b803b2..52b2c3fe 100644 --- a/src/base/omp.nim +++ b/src/base/omp.nim @@ -28,6 +28,11 @@ else: #forceOmpOn() #{. emit:["#pragma omp ", p] .} {. emit:["_Pragma(\"omp ", p, "\")"] .} + template ompPragma(p:string,body:typed) = + {. push stackTrace:off, lineTrace:off, line_dir:off .} + {. emit:["_Pragma(\"omp ", p, "\")"] .} + body + {. pop .} template ompBlock*(p:string; body:untyped) = #{. emit:"#pragma omp " & p .} #{. emit:"{ /* Inserted by ompBlock " & p & " */".} @@ -38,6 +43,12 @@ else: #{. emit:"} /* End ompBlock " & p & " */".} template ompBarrier* = ompPragma("barrier") +template ompFlush* = ompPragma("flush") +template ompFlushAcquire* = ompPragma("flush acquire") +template ompFlushRelease* = ompPragma("flush release") +template ompFlushSeqCst* = ompPragma("flush seq_cst") +template ompAtomicRead*(body) = ompPragma("atomic read acquire", body) +template ompAtomicWrite*(body) = ompPragma("atomic write release", body) template ompParallel*(body:untyped) = ompBlock("parallel"): diff --git a/src/base/profile.nim b/src/base/profile.nim index 506568ca..160f0411 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -787,6 +787,7 @@ proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] = return (nstot, ohtot) proc echoHotspots* = + hs.clear let tot = makeHotspotTable(rtiStack) #let nstot = tot.ns let ohtot = tot.oh diff --git a/src/base/stdUtils.nim b/src/base/stdUtils.nim index e0430c3b..5da5f26d 100644 --- a/src/base/stdUtils.nim +++ b/src/base/stdUtils.nim @@ -8,10 +8,11 @@ type #template `[]`*(x: cArray): untyped = addr x[0] #template `&`*(x: ptr cArray): untyped = addr x[0] -template ptrInt*(x:untyped):untyped = cast[ByteAddress](x) -template addrInt*(x:untyped):untyped = cast[ByteAddress](addr(x)) -template unsafeAddrInt*(x:untyped):untyped = cast[ByteAddress](addr(x)) -template toHex*(x: ptr typed): untyped = toHex(cast[ByteAddress](x)) +template ptrInt*(x: auto): auto = cast[uint](x) +template addrInt*(x: auto): auto = cast[uint](addr(x)) +template unsafeAddrInt*(x: auto): auto = cast[uint](addr(x)) +template toHex*(x: ptr auto): auto = toHex(cast[uint](x)) +template toHex*(x: pointer): auto = toHex(cast[uint](x)) type ConstInt* {.importc:"const int".} = object @@ -31,7 +32,7 @@ proc isInteger*(s: string):bool = var t:int parseInt(s, t) == s.len -template `$&`*(x: untyped): string = +template `$&`*(x: auto): string = toHex(unsafeAddrInt(x)) proc `|`*(s: string, d: tuple[w:int,c:char]): string = @@ -52,7 +53,7 @@ proc `|`*(f: SomeFloat, d: tuple[w,p: int]): string = formatFloat(f, ffDefault, d.p) | d.w proc `|`*(f: float, d: int): string = f | (d,d) -template `|-`*(x:SomeNumber, y: int): untyped = +template `|-`*(x:SomeNumber, y: int): auto = x | -y proc indexOf*[T](x: openArray[T], y: auto): int = diff --git a/src/base/threading.nim b/src/base/threading.nim index 73db56c1..8a01205b 100644 --- a/src/base/threading.nim +++ b/src/base/threading.nim @@ -4,6 +4,8 @@ import stdUtils import macros import omp import metaUtils +import base/basicOps +getOptimPragmas type ThreadShare* = object @@ -18,19 +20,34 @@ var threadNum*{.threadvar.}:int var numThreads*{.threadvar.}:int var threadLocals*{.threadvar.}:ThreadObj var inited = false +var ts: pointer = nil +var nts = 0 -template initThreadLocals*(ts:seq[ThreadShare]):untyped = +proc allocTs* {.alwaysInline.} = + if numThreads > nts and threadNum == 0: + if ts == nil: + ts = allocShared(numThreads*sizeof(ThreadShare)) + else: + ts = reallocShared(ts, numThreads*sizeof(ThreadShare)) + nts = numThreads + +#template initThreadLocals*(ts:seq[ThreadShare]) = +template initThreadLocals* = + bind ts threadLocals.threadNum = threadNum threadLocals.numThreads = numThreads - threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr) + #threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr) + threadLocals.share = cast[ptr cArray[ThreadShare]](ts) threadLocals.share[threadNum].p = nil threadLocals.share[threadNum].counter = 0 proc init = inited = true threadNum = 0 numThreads = 1 - var ts = newSeq[ThreadShare](numThreads) - initThreadLocals(ts) + #var ts = newSeq[ThreadShare](numThreads) + #initThreadLocals(ts) + allocTs() + initThreadLocals() template threadsInit* = if not inited: init() @@ -57,51 +74,61 @@ template emitStackTrace: untyped = template threads*(body:untyped):untyped = checkInit() doAssert(numThreads==1) - let tidOld = threadNum - let nidOld = numThreads - let tlOld = threadLocals + #let tidOld = threadNum + #let nidOld = numThreads + #let tlOld = threadLocals #proc tproc2{.genSym,inline.} = # body proc tproc{.genSym.} = emitStackTrace() - var ts:seq[ThreadShare] + #var ts:seq[ThreadShare] ompParallel: threadNum = ompGetThreadNum() numThreads = ompGetNumThreads() - if threadNum==0: ts.newSeq(numThreads) + #if threadNum==0: ts.newSeq(numThreads) + allocTs() threadBarrierO() - initThreadLocals(ts) + #initThreadLocals(ts) + initThreadLocals() threadBarrierO() #echoAll threadNum, " s: ", ptrInt(threadLocals.share) body #tproc2() threadBarrierO() tproc() - threadNum = tidOld - numThreads = nidOld - threadLocals = tlOld + #threadNum = tidOld + #numThreads = nidOld + #threadLocals = tlOld + threadNum = 0 + numThreads = 1 + initThreadLocals() template threads*(x0:untyped;body:untyped):untyped = checkInit() - let tidOld = threadNum - let nidOld = numThreads - let tlOld = threadLocals + #let tidOld = threadNum + #let nidOld = numThreads + #let tlOld = threadLocals proc tproc(xx:var type(x0)) {.genSym.} = - var ts:seq[ThreadShare] + #var ts:seq[ThreadShare] ompParallel: threadNum = ompGetThreadNum() numThreads = ompGetNumThreads() - if threadNum==0: ts.newSeq(numThreads) + #if threadNum==0: ts.newSeq(numThreads) + allocTs() threadBarrierO() - initThreadLocals(ts) + #initThreadLocals(ts) + initThreadLocals() threadBarrierO() #echoAll threadNum, " s: ", ptrInt(threadLocals.share) subst(x0,xx): body threadBarrierO() tproc(x0) - threadNum = tidOld - numThreads = nidOld - threadLocals = tlOld + #threadNum = tidOld + #numThreads = nidOld + #threadLocals = tlOld + threadNum = 0 + numThreads = 1 + initThreadLocals() template nothreads*(body: untyped): untyped = ## convenient way to turn off threading @@ -114,13 +141,18 @@ template threadBarrierO* = ompBarrier template threadMaster*(x:untyped) = ompMaster(x) template threadSingle*(x:untyped) = ompSingle(x) template threadCritical*(x:untyped) = ompCritical(x) +template threadFlush* = ompFlush +template threadFlushRelease* = ompFlushRelease +template threadFlushAcquire* = ompFlushAcquire +template threadFlushSeqCst* = ompFlushSeqCst +template threadAtomicRead*(body:typed) = ompAtomicRead(body) +template threadAtomicWrite*(body:typed) = ompAtomicWrite(body) template threadDivideLow*(x,y: untyped): untyped = x + (threadNum*(y-x)) div numThreads template threadDivideHigh*(x,y: untyped): untyped = x + ((threadNum+1)*(y-x)) div numThreads - proc tForX*(index,i0,i1,body:NimNode):NimNode = return quote do: let d = 1+`i1` - `i0` @@ -154,39 +186,123 @@ iterator `.|`*[S, T](a: S, b: T): T {.inline.} = inc(res) """ -template t0waitO* = threadBarrier() -template t0waitX* = - if threadNum==0: - inc threadLocals.share[0].counter - let tbar0 = threadLocals.share[0].counter - for b in 1.. 1: + if threadNum==0: + inc threadLocals.share[0].counter + let tbar0 = threadLocals.share[0].counter + for b in 1..= tbar0: break + var t {.noInit.}: type(p[]) + ompAtomicRead: t = p[] + if t >= tbar0: break + else: + #inc threadLocals.share[threadNum].counter + #fence() + #ompRelease + let t = threadLocals.share[threadNum].counter + 1 + ompAtomicWrite: + threadLocals.share[threadNum].counter = t +template t0wait* = t0waitA() +#template t0wait* = t0waitB() + +template twait0B* = threadBarrier() +template twait0A* = + if numThreads > 1: + if threadNum==0: + #inc threadLocals.share[0].counter + #fence() + #ompRelease + let t = threadLocals.share[0].counter + 1 + ompAtomicWrite: + threadLocals.share[0].counter = t + else: + inc threadLocals.share[threadNum].counter + let tbar0 = threadLocals.share[threadNum].counter + let p = threadLocals.share[0].counter.addr while true: - fence() - if p[] >= tbar0: break - else: - inc threadLocals.share[threadNum].counter - fence() -template t0wait* = t0waitO() + #fence() + #ompAcquire + #if p[] >= tbar0: break + var t {.noInit.}: type(p[]) + ompAtomicRead: t = p[] + if t >= tbar0: break +template twait0* = twait0A() +#template twait0* = twait0B() + +template threadBarrierA* = + threadFlushRelease + t0waitA + twait0A + threadFlushAcquire +template threadBarrier* = threadBarrierA +#template threadBarrier* = ompBarrier -template twait0O* = threadBarrier() -template twait0X* = +template threadSum01A*[T](a: T) = + ## sum value with result on thread 0, atomic version if threadNum==0: - inc threadLocals.share[0].counter - fence() + tic("threadSum01A") + t0wait() + toc("t0wait") + for i in 1..= tbar0: break -template twait0* = twait0O() + threadAtomicWrite: + threadLocals.share[threadNum].p = a.addr + t0wait() + twait0() + +template threadSum01B*[T](a: T) = + ## sum value with result on thread 0, barrier version + block: + tic("threadSum01") + if threadNum!=0: + threadLocals.share[threadNum].p = a.addr + threadBarrier() + toc("threadBarrier first") + if threadNum==0: + for i in 1.. Date: Wed, 20 Nov 2024 22:42:28 -0600 Subject: [PATCH 03/12] improve threading --- src/base/profile.nim | 3 +- src/base/threading.nim | 97 +++++++++++++++++++++++++++++++++++-- src/bench/benchStagProp.nim | 2 + 3 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/base/profile.nim b/src/base/profile.nim index 160f0411..6a861f6c 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -15,7 +15,8 @@ template ticDiffSecs*(x,y: TicType): float = 1e-9 * float(x.int64 - y.int64) template `-`*(x,y: TicType): TicType = TicType(x.int64 - y.int64) var - DropWasteTimerRatio* = 0.05 ## Drop children timers if the proportion of their overhead is larger than this. + ## Drop children timers if the proportion of their overhead is larger than this. + DropWasteTimerRatio* = floatParam("dropRatio", 0.05) VerboseTimer* = false ## If true print out all the timers during execution. ##[ diff --git a/src/base/threading.nim b/src/base/threading.nim index 8a01205b..e4acf106 100644 --- a/src/base/threading.nim +++ b/src/base/threading.nim @@ -5,15 +5,18 @@ import macros import omp import metaUtils import base/basicOps +import bitops getOptimPragmas type ThreadShare* = object p*:pointer counter*:int + extra*:int ThreadObj* = object threadNum*:int numThreads*:int + #counter*: int share*:ptr cArray[ThreadShare] var threadNum*{.threadvar.}:int @@ -209,8 +212,29 @@ template t0waitA* = let t = threadLocals.share[threadNum].counter + 1 ompAtomicWrite: threadLocals.share[threadNum].counter = t -template t0wait* = t0waitA() +template t0waitC* = + if numThreads > 1: + if threadNum==0: + inc threadLocals.share[0].counter + let tbar0 = threadLocals.share[0].counter + var left = numThreads - 1 + for i in 1.. 0: + for i in 1.. 0: + let p = threadLocals.share[i].counter.addr + var t {.noInit.}: type(p[]) + ompAtomicRead: t = p[] + if t >= tbar0: + threadLocals.share[i].extra = 0 + dec left + else: + let t = threadLocals.share[threadNum].counter + 1 + ompAtomicWrite: + threadLocals.share[threadNum].counter = t +#template t0wait* = t0waitA() #template t0wait* = t0waitB() +template t0wait* = t0waitC() template twait0B* = threadBarrier() template twait0A* = @@ -244,10 +268,10 @@ template threadBarrierA* = template threadBarrier* = threadBarrierA #template threadBarrier* = ompBarrier -template threadSum01A*[T](a: T) = +template threadSum01A0*[T](a: T) = ## sum value with result on thread 0, atomic version if threadNum==0: - tic("threadSum01A") + tic("threadSum01A0") t0wait() toc("t0wait") for i in 1..= tbar0: break + var p{.noInit.}: pointer + threadAtomicRead: + p = threadLocals.share[b].p + a += cast[ptr T](p)[] + toc("sum") + twait0() + toc("twait0") + else: + threadAtomicWrite: + threadLocals.share[threadNum].p = a.addr + let t = threadLocals.share[threadNum].counter + 1 + threadAtomicWrite: + threadLocals.share[threadNum].counter = t + twait0() + template threadSum01B*[T](a: T) = ## sum value with result on thread 0, barrier version block: @@ -279,15 +330,53 @@ template threadSum01B*[T](a: T) = threadBarrier() toc("threadBarrier last") +template threadSum01T*[T](a: T) = + ## sum value with result on thread 0, tree version + threadAtomicWrite: + threadLocals.share[threadNum].p = a.addr + var c = threadLocals.share[threadNum].counter + var done = false + var b = 1 + while b < numThreads: + inc c + if not done: + let o = bitxor(threadNum, b) + if bitand(threadNum, b) == 0: + if o < numThreads: + while true: + var d{.noInit.}: int + threadAtomicRead: + d = threadLocals.share[o].counter + if d >= c: break + var p{.noInit.}: pointer + threadAtomicRead: + p = threadLocals.share[o].p + a += cast[ptr T](p)[] + threadAtomicWrite: + threadLocals.share[threadNum].counter = c + else: + threadAtomicWrite: + threadLocals.share[threadNum].counter = c + while true: + var d{.noInit.}: int + threadAtomicRead: + d = threadLocals.share[o].counter + if d >= c: break + done = true + else: + threadLocals.share[threadNum].counter = c + b *= 2 + template threadSum01*(a: auto) = threadSum01A(a) #template threadSum01*(a: auto) = threadSum01B(a) +#template threadSum01*(a: auto) = threadSum01T(a) template threadSum0*(a: auto) = threadSum01(a) # threadMax0 FIXME template threadBroadcast1A*[T](a: T) = if threadNum==0: - tic("threadRankSum1") + tic("threadBroadcast1A") threadAtomicWrite: threadLocals.share[0].p = a.addr twait0() diff --git a/src/bench/benchStagProp.nim b/src/bench/benchStagProp.nim index 164758cf..4a518367 100644 --- a/src/bench/benchStagProp.nim +++ b/src/bench/benchStagProp.nim @@ -71,6 +71,8 @@ for i in 0..3: g3[2*i+1] = lo.ColorMatrix() g3[2*i+1].randomSU rs g3[2*i+1] *= 0.1 +for i in 0.. Date: Thu, 21 Nov 2024 14:01:02 -0600 Subject: [PATCH 04/12] fix echo in profile --- src/base/profile.nim | 2 +- src/comms/comms.nim | 10 +---- src/comms/commsEcho.nim | 86 ++++++++++++++++++++++++++++++++++++++++ src/comms/commsTypes.nim | 8 ++++ src/comms/commsUtils.nim | 83 -------------------------------------- 5 files changed, 97 insertions(+), 92 deletions(-) create mode 100644 src/comms/commsEcho.nim diff --git a/src/base/profile.nim b/src/base/profile.nim index 6a861f6c..de7ca62e 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -1,6 +1,6 @@ import threading export threading -import comms/comms, stdUtils, base/[basicOps,params] +import comms/commsEcho, stdUtils, base/[basicOps,params] import os, strutils, sequtils, std/monotimes, std/tables, std/algorithm, strformat export monotimes getOptimPragmas() diff --git a/src/comms/comms.nim b/src/comms/comms.nim index 9e4be9a8..28e3be03 100644 --- a/src/comms/comms.nim +++ b/src/comms/comms.nim @@ -1,14 +1,6 @@ import commsTypes export commsTypes -# globals - -var defaultComm*: Comm -template getDefaultComm*(): Comm = defaultComm -template getComm*(): Comm = getDefaultComm() # temporary alias -var myRank* = 0 -var nRanks* = 1 - # base methods method name*(c: Comm): string {.base.} = discard @@ -157,6 +149,8 @@ commsNames.add "QMP" commsInits.add getQmpComm commsFinis.add commsFinalizeQmp +import commsEcho +export commsEcho import commsUtils export commsUtils diff --git a/src/comms/commsEcho.nim b/src/comms/commsEcho.nim new file mode 100644 index 00000000..b06c68ef --- /dev/null +++ b/src/comms/commsEcho.nim @@ -0,0 +1,86 @@ +import macros +import base/[threading,metaUtils] +import commsTypes + +proc evalArgs*(call:var NimNode; args:NimNode):NimNode = + result = newStmtList() + for i in 0..".} +#proc printfOrdered( +macro printf*(fmt:string; args:varargs[untyped]):auto = + var call = newCall(ident("cprintf"), fmt) + result = evalArgs(call, args) + result.add(quote do: + if myRank==0 and threadNum==0: + `call` + ) +proc echoRaw*(x: varargs[typed, `$`]) {.magic: "Echo".} +macro echoAll*(args:varargs[untyped]):auto = + var call = newCall(bindSym"echoRaw") + result = evalArgs(call, args) + result.add(quote do: + `call` + ) +macro echoRank*(args:varargs[untyped]):auto = + var call = newCall(bindSym"echoRaw") + call.add ident"myRank" + call.add newLit"/" + call.add ident"nRanks" + call.add newLit": " + result = evalArgs(call, args) + template f(x:untyped):untyped = + if threadNum==0: x + result.add getAst(f(call)) +macro echo0*(args: varargs[untyped]): untyped = + var call = newCall(bindSym"echoRaw") + result = evalArgs(call, args) + result.add(quote do: + bind myRank + if myRank==0 and threadNum==0: + `call` + ) + #echo result.repr +macro makeEchos(n:static[int]): untyped = + template ech(x,y: untyped) = + template echo* = + when nimvm: + x + else: + y + result = newStmtList() + for i in 1..n: + var er = newCall(bindSym"echoRaw") + var e0 = newCall(bindSym"echo0") + var ea = newSeq[NimNode](0) + for j in 1..i: + let ai = ident("a" & $j) + er.add ai + e0.add ai + ea.add newNimNode(nnkIdentDefs).add(ai).add(ident"untyped").add(newEmptyNode()) + var t = getAst(ech(er,e0)).peelStmt + #echo t.treerepr + for j in 0..".} -#proc printfOrdered( -macro printf*(fmt:string; args:varargs[untyped]):auto = - var call = newCall(ident("cprintf"), fmt) - result = evalArgs(call, args) - result.add(quote do: - if myRank==0 and threadNum==0: - `call` - ) -proc echoRaw*(x: varargs[typed, `$`]) {.magic: "Echo".} -macro echoAll*(args:varargs[untyped]):auto = - var call = newCall(bindSym"echoRaw") - result = evalArgs(call, args) - result.add(quote do: - `call` - ) -macro echoRank*(args:varargs[untyped]):auto = - var call = newCall(bindSym"echoRaw") - call.add ident"myRank" - call.add newLit"/" - call.add ident"nRanks" - call.add newLit": " - result = evalArgs(call, args) - template f(x:untyped):untyped = - if threadNum==0: x - result.add getAst(f(call)) -macro echo0*(args: varargs[untyped]): untyped = - var call = newCall(bindSym"echoRaw") - result = evalArgs(call, args) - result.add(quote do: - bind myRank - if myRank==0 and threadNum==0: - `call` - ) - #echo result.repr -macro makeEchos(n:static[int]): untyped = - template ech(x,y: untyped) = - template echo* = - when nimvm: - x - else: - y - result = newStmtList() - for i in 1..n: - var er = newCall(bindSym"echoRaw") - var e0 = newCall(bindSym"echo0") - var ea = newSeq[NimNode](0) - for j in 1..i: - let ai = ident("a" & $j) - er.add ai - e0.add ai - ea.add newNimNode(nnkIdentDefs).add(ai).add(ident"untyped").add(newEmptyNode()) - var t = getAst(ech(er,e0)).peelStmt - #echo t.treerepr - for j in 0.. Date: Thu, 21 Nov 2024 14:02:12 -0600 Subject: [PATCH 05/12] add version info --- src/base/version.nim | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/base/version.nim diff --git a/src/base/version.nim b/src/base/version.nim new file mode 100644 index 00000000..66a47f8e --- /dev/null +++ b/src/base/version.nim @@ -0,0 +1,22 @@ +import strutils, times + +const sep = '='.repeat(78) +const head = "QEX Compilation information:" +const comptime = " Time: " & staticExec("date") +const compmach = " Host: " & staticExec("uname -a") +const gitlog = " Log: " & staticExec("git log -1").indent(7).strip +const gitstat = " Status: " & staticExec("git status -uno").indent(10).strip +const buildInfo = [sep,head,comptime,compmach,gitlog,gitstat,sep].join("\n") +const gitrev = staticExec("git rev-parse HEAD") + +static: echo buildInfo + +proc getBuildInfo*(): string = + buildInfo + +proc getVersion*(): string = + gitrev + +when isMainModule: + echo getBuildInfo() + echo getVersion() From b46d18cb5f2b10fffe640a5db7bcdd9cb9f3055f Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 21 Nov 2024 14:04:44 -0600 Subject: [PATCH 06/12] allow setting of hwloc library --- src/hwloc/capi.nim | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hwloc/capi.nim b/src/hwloc/capi.nim index 51cfa6f2..f104dcbc 100644 --- a/src/hwloc/capi.nim +++ b/src/hwloc/capi.nim @@ -48,9 +48,10 @@ ## when hostOs == "macosx": - {.pragma: hwloc, dynlib:"libhwloc.dylib".} + const hwloclib {.strDefine.} = "libhwloc.dylib" else: - {.pragma: hwloc, dynlib:"libhwloc.so".} + const hwloclib {.strDefine.} = "libhwloc.so" +{.pragma: hwloc, dynlib:hwloclib.} type hwloc_bitmap_s {.bycopy.} = object From e5ed8b0a89db86101904a8cc6d1a04d9aca1f90b Mon Sep 17 00:00:00 2001 From: James Osborn Date: Sat, 23 Nov 2024 16:14:14 -0600 Subject: [PATCH 07/12] small update to simd types --- src/simd/simdX86Ops.nim | 8 ++++---- src/simd/simdX86Types.nim | 39 +++++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/simd/simdX86Ops.nim b/src/simd/simdX86Ops.nim index 4edd4375..5f6d1790 100644 --- a/src/simd/simdX86Ops.nim +++ b/src/simd/simdX86Ops.nim @@ -17,22 +17,22 @@ template int2mask*(T: typedesc[m512], i: SomeInteger): mmask16 = cvtu32_mask16(u proc `[]=`*(r:var m128; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = mixin toArray var a = toArray(r) - a[i] := x + a[i] = float32 x assign(r, a) proc `[]=`*(r:var m128d; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = mixin toArray var a = toArray(r) - a[i] := x + a[i] = float x assign(r, a) proc `[]=`*(r:var m256; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = mixin toArray var a = toArray(r) - a[i] := x + a[i] = float32 x assign(r, a) proc `[]=`*(r:var m256d; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = mixin toArray var a = toArray(r) - a[i] := x + a[i] = float x assign(r, a) #proc `[]=`*(r:var m256d; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = # var a {.noInit.}: m256d diff --git a/src/simd/simdX86Types.nim b/src/simd/simdX86Types.nim index bed968b4..f1b001e2 100644 --- a/src/simd/simdX86Types.nim +++ b/src/simd/simdX86Types.nim @@ -2,25 +2,52 @@ import simdWrap export simdWrap {.pragma: imm, header:"immintrin.h".} +{.pragma: imms, header:"immintrin.h", incompleteStruct.} +type + m64* {.importc: "__m64" , imms.} = object + m128* {.importc: "__m128" , imms.} = object + m128d* {.importc: "__m128d", imms.} = object + m128i* {.importc: "__m128i", imms.} = object + m128h* = distinct int64 + m256* {.importc: "__m256" , imms.} = object + m256d* {.importc: "__m256d", imms.} = object + m256i* {.importc: "__m256i", imms.} = object + m256h* = distinct m128i + m512* {.importc: "__m512" , imms.} = object + m512d* {.importc: "__m512d", imms.} = object + m512i* {.importc: "__m512i", imms.} = object + m512h* = distinct m256i + mmask8* {.importc: "__mmask8" , imms.} = object + mmask16* {.importc: "__mmask16", imms.} = object + mmask32* {.importc: "__mmask32", imms.} = object + mmask64* {.importc: "__mmask64", imms.} = object +#[ +{.pragma: imm, header:"immintrin.h", incompleteStruct.} type m64* {.importc: "__m64" , imm.} = object - m128* {.importc: "__m128" , imm.} = object - m128d* {.importc: "__m128d", imm.} = object + a: array[2,float32] + m128* {.importc: "__m128" , imm.} = distinct array[4,float32] + m128d* {.importc: "__m128d", imm.} = distinct array[2,float64] m128i* {.importc: "__m128i", imm.} = object + a: array[4,int32] m128h* = distinct int64 m256* {.importc: "__m256" , imm.} = object - m256d* {.importc: "__m256d", imm.} = object + a: array[8,float32] + m256d* {.importc: "__m256d", imm.} = distinct array[4,float64] m256i* {.importc: "__m256i", imm.} = object + a: array[8,int32] m256h* = distinct m128i - m512* {.importc: "__m512" , imm.} = object - m512d* {.importc: "__m512d", imm.} = object - m512i* {.importc: "__m512i", imm.} = object + m512* {.importc: "__m512" , imm.} = distinct array[16,float32] + m512d* {.importc: "__m512d", imm.} = distinct array[8,float64] + m512i* {.importc: "__m512i", imm.} = distinct array[16,int32] m512h* = distinct m256i mmask8* {.importc: "__mmask8" , imm.} = object mmask16* {.importc: "__mmask16", imm.} = object mmask32* {.importc: "__mmask32", imm.} = object mmask64* {.importc: "__mmask64", imm.} = object +]# +type SimdX86S* = m64 | m128 | m256 | m512 SimdX86D* = m128d | m256d | m512d SimdX86* = SimdX86S | SimdX86D From 01128e27c9d309b2b37953794558d4b2e78d8e50 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Mon, 2 Dec 2024 23:14:33 -0600 Subject: [PATCH 08/12] include compile-time parameters in help small update to random generators --- src/base/params.nim | 57 ++++++++++++++++++++++++++----------- src/base/stdUtils.nim | 7 +++++ src/base/threading.nim | 2 +- src/bench/benchStagProp.nim | 3 ++ src/layout/layoutX.nim | 13 +++++++-- src/rng/distribution.nim | 2 +- src/rng/generator.nim | 6 ++++ src/rng/milcrng.nim | 18 ++++++++---- src/rng/mrg32k3a.nim | 12 ++++++-- 9 files changed, 90 insertions(+), 30 deletions(-) diff --git a/src/base/params.nim b/src/base/params.nim index dd779a1b..7f896e3a 100644 --- a/src/base/params.nim +++ b/src/base/params.nim @@ -1,14 +1,12 @@ import macros, os, strUtils, tables, algorithm, seqUtils -# Note: may get imported before 'echo' is redefined -# uses of 'echo' should be in a template with 'mixin echo' # TODO # include CT in saveParams RT -# include CT in help # check used # only write from rank 0 # only read from rank 0 # add way to represent empty seq/string (versus unset/unknown) +# sort paramHelp output by file and option name type ParamListT = object # used to store set parameters from cmd line or file @@ -28,7 +26,8 @@ var loadParamsCommand {.compileTime.} = "loadParams" # normally not changed var params: Table[string,ParamListT] # store params during setup # param info generated at compile time, as setParam commands are compiled var paramInfoCT {.compileTime.} = newSeq[ParamObj](0) -var paramInfoCTRT = newSeq[ParamObj](0) # conversion of paramInfoCT to RT variable +#var paramInfoCTRT = newSeq[ParamObj](0) # conversion of paramInfoCT to RT variable +var paramInfoCTRT: seq[ParamObj] # conversion of paramInfoCT to RT variable # param info generated at run time, as setParam commands executed (using set values) var paramInfo = newSeq[ParamObj](0) @@ -218,13 +217,16 @@ proc saveParams(x: seq[ParamObj], fn: string, loc: string, thisfile: string) = proc addParam(s,r: string, c: string = "", ii: IiType) = paramInfo.set(s, r, c, ii.filename, ii.line) -proc addComment(s,c:string):string = +let spcName = " " +let spcValue = " #" +let spcLoc = " ## " +proc addComment(s,c,spc:string):string = result = s if c.len>0: let - spc = " ## " m = min(s.len, spc.len-8) result &= spc[m..^1] & c +proc addComment(s,c:string):string = addComment(s,c,spcValue) proc echoParamsX*(warnUnknown=false):string = result = "Params:\n" @@ -243,6 +245,8 @@ template echoParams*(warnUnknown=false) = echo echoParamsX(warnUnknown) proc paramHelp*(p:string = ""):string = + #echo paramInfo + #echo paramInfoCTRT result = "Usage:\n " & getAppFileName() var t: ParamObj var i = paramInfo.find p @@ -256,10 +260,21 @@ proc paramHelp*(p:string = ""):string = result &= addComment(" -" & p & ":" & t.value & " (current value)", t.comment) else: result &= " -OPTION:VALUE ...\nAvailable OPTIONs and current VALUEs:" - let spc = " " - for t in paramInfo: + template p(t) = let nm = t.name - result &= "\n " & (nm & spc[min(spc.len-1,nm.len)..^1] & " : " & t.value).addComment(t.comment) + var s = nm & spcName[min(spcName.len-1,nm.len)..^1] & " : " & t.value + let f = t.file.splitfile[1] + let c = f & "," & $t.line + s = s.addComment(c,spcValue) + s = s.addComment(t.comment,spcLoc) + result &= "\n " & s + var paramnames = newSeq[string](0) + for t in paramInfo: + paramnames.add t.name + p(t) + for t in paramInfoCTRT: + if t.name notin paramnames: # FIXME: should check location too + p(t) template cnvnone(x:typed):untyped = x template makeTypeParam(name,typ,deflt,cnvrt: untyped): untyped {.dirty.} = @@ -375,15 +390,15 @@ template processSaveParams*(index = -1) = let ii = instantiationInfo(index, fullPaths=true) saveParams(paramInfo, saveValue, ii.filename & ":" & $ii.line, ii.filename) +var makeParamInfoCTRT: proc() template processMakeParamInfoCTRT* = - proc makeParamInfoCTRT() {.inject.} - makeParamInfoCTRT() + if not isNil makeParamInfoCTRT: makeParamInfoCTRT() template installStandardParams*(idx= -3) = installSaveParams(index=idx) installLoadParams(index=idx) installHelpParam(index=idx) - processMakeParamInfoCTRT() + #processMakeParamInfoCTRT() # write param file at compile time macro writeParamFileX*(filename: static string, ii: static IiType) = @@ -405,10 +420,14 @@ macro makeParamInfoCTRTX*(): auto = let l = newLit t.line result.add quote do: paramInfoCTRT.add ParamObj(name:`n`,value:`v`,comment:`c`,file:`f`,line:`l`) +template finalizeParams* = + proc makeParamInfoCTRTImpl(): bool = + makeParamInfoCTRTX() + true + #makeParamInfoCTRT = makeParamInfoCTRTImpl + let dum {.global.} = makeParamInfoCTRTImpl() template writeParamFile*(filename: static string = "") = writeParamFileX(filename, instantiationInfo(fullPaths=true)) - proc makeParamInfoCTRT() {.inject.} = - makeParamInfoCTRTX() template assertParam*(p:auto, f:auto) = if not f p: @@ -439,8 +458,10 @@ template CLIset*(p:typed, n:untyped, prefix = "") = discard when isMainModule: - import qex, paramtest + import qex qexInit() + installHelpParam("h") + processHelpParam() letParam: bf = false @@ -462,11 +483,15 @@ when isMainModule: fa1 = @[1.0,1,1,1] fax = if true: @[2.0,2,2,2] else: @[3.0,3,3,3] - installHelpParam("h") echoParams() defaultSetup() + proc paramTest* = + var i = intParam("i", 0, "Test intParam") + var f = floatParam("f", 0.0, "Test floatParam") + echo i, f paramTest() writeParamFile("") qexFinalize() + finalizeParams() diff --git a/src/base/stdUtils.nim b/src/base/stdUtils.nim index 5da5f26d..85bc4f3d 100644 --- a/src/base/stdUtils.nim +++ b/src/base/stdUtils.nim @@ -56,6 +56,13 @@ proc `|`*(f: float, d: int): string = template `|-`*(x:SomeNumber, y: int): auto = x | -y +proc values*(e: typedesc[enum]): string = + result = "" + var sep = "" + for v in e: + result &= sep & $v + sep = " " + proc indexOf*[T](x: openArray[T], y: auto): int = let n = x.len while result Date: Thu, 5 Dec 2024 18:14:05 -0600 Subject: [PATCH 09/12] added fat7 derivative --- src/gauge/fat7l.nim | 5 + src/gauge/fat7lderiv.nim | 345 +++++++++++++++++++++++++++++++++++++++ src/gauge/hypsmear.nim | 37 +---- src/gauge/smearutil.nim | 50 ++++++ 4 files changed, 401 insertions(+), 36 deletions(-) create mode 100644 src/gauge/fat7lderiv.nim create mode 100644 src/gauge/smearutil.nim diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim index 2aae3561..9fdaea10 100644 --- a/src/gauge/fat7l.nim +++ b/src/gauge/fat7l.nim @@ -13,6 +13,11 @@ type Fat7lCoefs* = object sevenStaple*: float lepage*: float +#type Fat7lState*[I,O] = object +# coefs*: Fat7lCoefs +# in*: I +# out*: O + proc `$`*(c: Fat7lCoefs): string = result = "oneLink: " & $c.oneLink & "\n" result &= "threeStaple: " & $c.threeStaple & "\n" diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim new file mode 100644 index 00000000..f77b3095 --- /dev/null +++ b/src/gauge/fat7lderiv.nim @@ -0,0 +1,345 @@ +import fat7l, smearutil +import base/profile, layout + +let STAPLE_FLOPS = 3*198+216+18 +let SIDE_FORCE_FLOPS = 7*198+3*216+3*18 + +#[ +proc staple(auto out, auto in0, auto link0, int mu, int nu) = +#define link ftmp0[0] +#define linkmu ftmp[0][mu] +#define in ftmp0[1] +#define innu ftmp[1][nu] +#define linkin mtmp[0] +#define back btmp0[0] +#define backnu btmp[0][nu] +#define linkinnu mtmp[1] + + QDP_M_eq_M(link, link0, QDP_all); + QDP_M_eq_sM(linkmu, link, QDP_neighbor[mu], QDP_forward, QDP_all); + QDP_M_eq_M(in, in0, QDP_all); + QDP_M_eq_sM(innu, in, QDP_neighbor[nu], QDP_forward, QDP_all); + QDP_M_eq_Ma_times_M(linkin, link, in, QDP_all); + QDP_M_eq_M_times_M(back, linkin, linkmu, QDP_all); + QDP_M_eq_sM(backnu, back, QDP_neighbor[nu], QDP_backward, QDP_all); + QDP_M_eq_M_times_M(linkinnu, link, innu, QDP_all); + QDP_discard_M(innu); + QDP_M_peq_M_times_Ma(out, linkinnu, linkmu, QDP_all); + QDP_discard_M(linkmu); + QDP_M_peq_M(out, backnu, QDP_all); + QDP_discard_M(backnu); +#define STAPLE_FLOPS (3*198+216+18) + +#undef link +#undef linkmu +#undef in +#undef innu +#undef linkin +#undef back +#undef backnu +#undef linkinnu +} +]# + +#[ +static void +side_force(QDP_ColorMatrix *force, QDP_ColorMatrix *bot0, QDP_ColorMatrix *side0, + QDP_ColorMatrix *top0, int mu, int nu, QDP_ColorMatrix *stpl) +{ +#define side ftmp0[0] +#define sidemu ftmp[0][mu] +#define top ftmp0[1] +#define topnu ftmp[1][nu] +#define bot ftmp0[2] +#define botnu ftmp[2][nu] +#define sidebot mtmp[0] +#define sidetop mtmp[1] +#define topnusidebot btmp0[0] +#define fbmu btmp[0][mu] +#define botnusidetop btmp0[1] +#define fmbmu btmp[1][mu] +#define sidebotsidemu btmp0[2] +#define stm btmp[2][nu] +#define botnusidemu mtmp[2] +#define botsidemu mtmp[3] + + // force += bot * sidemu * topnu+ + // force -= bot-mu+ * side-mu * topnu-mu + // -= top <-> bot + // stpl += side * botnu * sidemu+ + // stpl += side-nu+ * bot-nu * sidemu-nu + QDP_M_eq_M(side, side0, QDP_all); + QDP_M_eq_sM(sidemu, side, QDP_neighbor[mu], QDP_forward, QDP_all); + QDP_M_eq_M(top, top0, QDP_all); + QDP_M_eq_sM(topnu, top, QDP_neighbor[nu], QDP_forward, QDP_all); + QDP_M_eq_M(bot, bot0, QDP_all); + QDP_M_eq_sM(botnu, bot, QDP_neighbor[nu], QDP_forward, QDP_all); + QDP_M_eq_Ma_times_M(sidebot, side, bot, QDP_all); + QDP_M_eq_Ma_times_M(sidetop, side, top, QDP_all); + QDP_M_eq_Ma_times_M(topnusidebot, topnu, sidebot, QDP_all); + QDP_M_eq_sM(fbmu, topnusidebot, QDP_neighbor[mu], QDP_backward, QDP_all); + QDP_M_eq_Ma_times_M(botnusidetop, botnu, sidetop, QDP_all); + QDP_M_eq_sM(fmbmu, botnusidetop, QDP_neighbor[mu], QDP_backward, QDP_all); + QDP_M_eq_M_times_M(sidebotsidemu, sidebot, sidemu, QDP_all); + QDP_M_eq_sM(stm, sidebotsidemu, QDP_neighbor[nu], QDP_backward, QDP_all); + QDP_M_eq_M_times_Ma(botnusidemu, botnu, sidemu, QDP_all); + QDP_discard_M(botnu); + QDP_M_peq_M_times_M(stpl, side, botnusidemu, QDP_all); + //QDP_M_meq_M_times_Ma(force, top, botnusidemu, QDP_all); + QDP_M_peq_M_times_Ma(force, top, botnusidemu, QDP_all); + QDP_M_eq_M_times_M(botsidemu, bot, sidemu, QDP_all); + QDP_discard_M(sidemu); + QDP_M_peq_M_times_Ma(force, botsidemu, topnu, QDP_all); + QDP_discard_M(topnu); + //QDP_M_meq_Ma(force, fbmu, QDP_all); + QDP_M_peq_Ma(force, fbmu, QDP_all); + QDP_discard_M(fbmu); + QDP_M_peq_Ma(force, fmbmu, QDP_all); + QDP_discard_M(fmbmu); + QDP_M_peq_M(stpl, stm, QDP_all); + QDP_discard_M(stm); +#define SIDE_FORCE_FLOPS (7*198+3*216+3*18) + +#undef side +#undef sidemu +#undef top +#undef topnu +#undef bot +#undef botnu +#undef sidebot +#undef sidetop +#undef topnusidebot +#undef fbmu +#undef botnusidetop +#undef fmbmu +#undef sidebotsidemu +#undef stm +#undef botnusidemu +#undef botsidemu +} +]# + +proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto, + coef: Fat7lCoefs, mid: auto) = + tic() + var nflops = 0 + let coefL = coef.lepage + let coef1 = coef.oneLink - 6*coefL + let coef3 = coef.threeStaple + let coef5 = coef.fiveStaple + let coef7 = coef.sevenStaple + let have5 = (coef5!=0.0) or (coef7!=0.0) or (coefL!=0.0) + let have3 = (coef3!=0.0) or have5 + type lcm = type(gauge[0]) + let lo = gauge[0].l + proc newlcm: lcm = result.new(gauge[0].l) + var + stpl3: array[4,lcm] + mid5: array[4,lcm] + stpl5,mid3: lcm + s1: array[4,array[4,Shifter[lcm,type(gauge[0][0])]]] + s1a,s1b: array[4,Shifter[lcm,type(gauge[0][0])]] + tm1,tm2: lcm + sm1: array[4,Shifter[lcm,type(gauge[0][0])]] + if have3: + if have5: + for mu in 0..3: + stpl3[mu] = newlcm() + mid5[mu] = newlcm() + s1b[mu] = newShifter(gauge[0], mu, 1) + stpl5 = newlcm() + mid3 = newlcm() + for mu in 0..3: + for nu in 0..3: + if nu!=mu: + s1[mu][nu] = newShifter(gauge[mu], nu, 1) + s1a[mu] = newShifter(gauge[0], mu, 1) + sm1[mu] = newShifter(gauge[0], mu, -1) + tm1 = newlcm() + tm2 = newlcm() + threads: + for mu in 0..<4: + for nu in 0..<4: + if nu!=mu: + discard s1[mu][nu] ^* gauge[mu] + for sig in 0..3: + if have5: + for mu in 0..3: + if mu==sig: continue + #QDP_M_eq_zero(stpl3[mu], QDP_all); + stpl3[mu] := 0 + #staple(stpl3[mu], gauge[sig], gauge[mu], sig, mu); + symStaple(stpl3[mu], 1.0, gauge[mu], gauge[sig], + s1[mu][sig], s1[sig][mu], tm1, sm1[mu]) + #QDP_M_eq_zero(mid5[mu], QDP_all); + mid5[mu] := 0 + nflops += STAPLE_FLOPS + for rho in 0..3: + if rho==sig: continue + #QDP_M_eq_zero(stpl5, QDP_all); + stpl5 := 0 + if coef7!=0.0: + for mu in 0..3: + if mu==sig or mu==rho: continue + for nu in 0..3: + if nu==sig or nu==rho or nu==mu: continue + #staple(stpl5, stpl3[mu], gauge[nu], sig, nu); + discard s1a[nu] ^* stpl3[mu] + symStaple(stpl5, 1.0, gauge[nu], stpl3[mu], + s1[nu][sig], s1a[nu], tm1, sm1[nu]) + nflops += STAPLE_FLOPS + #QDP_M_eq_r_times_M(stpl5, &coef7, stpl5, QDP_all); + stpl5 *= coef7 + nflops += 18 + #QDP_M_eq_zero(mid3, QDP_all); + mid3 := 0 + if coefL!=0.0: + #QDP_M_peq_r_times_M(stpl5, &coefL, stpl3[rho], QDP_all); + stpl5 += coefL * stpl3[rho] + nflops += 36 + if coefL!=0.0 or coef7!=0.0: + #side_force(deriv[rho], mid[sig], gauge[rho], stpl5, sig, rho, mid3); + discard s1a[rho] ^* stpl5 + discard s1b[rho] ^* mid[sig] + symStapleDeriv(deriv[rho], mid3, + gauge[rho], stpl5, s1[rho][sig], s1a[rho], + mid[sig], s1b[rho], tm1, tm2, sm1[rho], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + if coefL!=0.0: + #QDP_M_peq_r_times_M(mid5[rho], &coefL, mid3, QDP_all); + mid5[rho] += coefL * mid3 + nflops += 36 + #//QDP_M_eqm_r_times_M(mid3, &coef7, mid3, QDP_all); + #QDP_M_eq_r_times_M(mid3, &coef7, mid3, QDP_all); + mid3 *= coef7 + #QDP_M_peq_r_times_M(mid3, &coef5, mid[sig], QDP_all); + mid3 += coef5 * mid[sig] + nflops += 18+36 + for mu in 0..3: + if mu==sig or mu==rho: continue + for nu in 0..3: + if nu==sig or nu==rho or nu==mu: continue + #side_force(deriv[mu], mid3, gauge[mu], stpl3[nu], sig, mu, mid5[nu]); + discard s1a[mu] ^* stpl3[nu] + discard s1b[mu] ^* mid3 + symStapleDeriv(deriv[mu], mid5[nu], + gauge[mu], stpl3[nu], s1[mu][sig], s1a[mu], + mid3, s1b[mu], tm1, tm2, sm1[mu], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + + if have3: + #QDP_M_eq_zero(mid3, QDP_all); + mid3 := 0 + for mu in 0..3: + if mu==sig: continue + if have5: + #//QDP_M_eq_r_times_M_minus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); + #QDP_M_eq_r_times_M_plus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); + stpl5 := coef3*mid[sig] + mid5[mu] + nflops += 36 + else: + #QDP_M_eq_r_times_M(stpl5, &coef3, mid[sig], QDP_all); + stpl5 := coef3*mid[sig] + nflops += 18 + #side_force(deriv[mu], stpl5, gauge[mu], gauge[sig], sig, mu, mid3); + discard s1a[mu] ^* stpl5 + symStapleDeriv(deriv[mu], mid3, + gauge[mu], gauge[sig], s1[mu][sig], s1[sig][mu], + stpl5, s1a[mu], tm1, tm2, sm1[mu], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + #//QDP_M_meq_M(deriv[sig], mid3, QDP_all); + #QDP_M_peq_M(deriv[sig], mid3, QDP_all); + deriv[sig] += mid3 + nflops += 18 + if coef1!=0.0: + #QDP_M_peq_r_times_M(deriv[sig], &coef1, mid[sig], QDP_all); + deriv[sig] += coef1 * mid[sig] + nflops += 36 + + if coefL!=0.0: + # fix up Lepage term + let fixL = -coefL + for mu in 0..3: + #QDP_M_eq_zero(ftmp0[0], QDP_all); + tm1 := 0 + for nu in 0..3: + if nu==mu: continue + #QDP_M_eq_Ma_times_M(btmp0[0], mid[nu], gauge[nu], QDP_all); + tm2 := mid[nu].adj * gauge[nu] + #QDP_M_eq_sM(btmp[0][nu], btmp0[0], QDP_neighbor[nu], QDP_backward, QDP_all); + discard sm1[nu] ^* tm2 + #QDP_M_eq_M_times_Ma(stpl5, mid[nu], gauge[nu], QDP_all); + stpl5 := mid[nu] * gauge[nu].adj + #QDP_M_meq_M(stpl5, btmp[0][nu], QDP_all); + stpl5 -= sm1[nu].field + #QDP_discard_M(btmp[0][nu]); + #QDP_M_peq_M(ftmp0[0], stpl5, QDP_all); + #QDP_M_peq_Ma(ftmp0[0], stpl5, QDP_all); + tm1 += stpl5 + stpl5.adj + #QDP_M_eq_sM(ftmp[0][mu], ftmp0[0], QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1a[mu] ^* tm1 + #QDP_M_eq_M_times_M(stpl5, ftmp0[0], gauge[mu], QDP_all); + stpl5 := tm1 * gauge[mu] + #QDP_M_meq_M_times_M(stpl5, gauge[mu], ftmp[0][mu], QDP_all); + stpl5 -= gauge[mu] * s1a[mu].field + #QDP_discard_M(ftmp[0][mu]); + #QDP_M_peq_r_times_M(deriv[mu], &fixL, stpl5, QDP_all); + deriv[mu] += fixL * stpl5 + nflops += 4*(3*(2*198+3*18)+198+216+36) + + #info->final_sec = QOP_time() - dtime; + #info->final_flop = nflops*QDP_sites_on_node; + #info->status = QOP_SUCCESS; + +when isMainModule: + import qex + import physics/qcdTypes + import gauge + qexInit() + #var defaultGaugeFile = "l88.scidac" + let defaultLat = @[8,8,8,8] + defaultSetup() + for mu in 0.. Date: Fri, 6 Dec 2024 10:40:26 -0600 Subject: [PATCH 10/12] move PerfInfo --- src/base/profile.nim | 16 ++++++++++++++++ src/gauge/fat7l.nim | 10 +++++----- src/gauge/fat7lderiv.nim | 8 +++++--- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/base/profile.nim b/src/base/profile.nim index de7ca62e..455e3995 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -7,6 +7,22 @@ getOptimPragmas() const noTicToc {.boolDefine.} = false +type PerfInfo* = object + count*: int + flops*: float + secs*: float +template clear*(pi: var PerfInfo) = + pi.count = 0.0 + pi.flops = 0.0 + pi.secs = 0.0 +template `+=`*(r: var PerfInfo, x: PerfInfo) = + r.count += x.count + r.flops += x.flops + r.secs += x.secs +proc `$`*(pi: PerfInfo): string = + result = system.`$`(pi) + result &= &" {1e-9*pi.flops/pi.secs:.2f} Gflops" + type TicType* = distinct int64 template getTics*: TicType = TicType(getMonoTime().ticks) template nsec*(t:TicType):int64 = int64(t) diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim index 9fdaea10..1a110247 100644 --- a/src/gauge/fat7l.nim +++ b/src/gauge/fat7l.nim @@ -1,10 +1,6 @@ import base import layout -#import gauge -#import strUtils - -type PerfInfo* = object - flops*: float +export PerfInfo type Fat7lCoefs* = object oneLink*: float @@ -168,6 +164,10 @@ proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto, #info.final_flop = nflop*QDP_sites_on_node; #info.status = QOP_SUCCESS; toc() + inc info.count + info.flops += nflop * gf[0].l.localGeom.prod + info.secs += getElapsedTime() + proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto) = makeImpLinks(info, fl, gf, coef, fl, gf, 0.0) diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim index f77b3095..a579e4fc 100644 --- a/src/gauge/fat7lderiv.nim +++ b/src/gauge/fat7lderiv.nim @@ -292,9 +292,8 @@ proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto, #info->status = QOP_SUCCESS; when isMainModule: - import qex - import physics/qcdTypes - import gauge + import qex, physics/qcdTypes + import strformat qexInit() #var defaultGaugeFile = "l88.scidac" let defaultLat = @[8,8,8,8] @@ -324,7 +323,9 @@ when isMainModule: echo g.plaq echo g2.plaq makeImpLinks(info, fl, g, coef) + echo info makeImpLinks(info, fl2, g2, coef) + echo info echo fl.plaq echo fl2.plaq #echo ll.plaq @@ -335,6 +336,7 @@ when isMainModule: #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0+6.0,4)/6.0 fat7lderiv(info, g, fd, coef, dg) + echo info for mu in 0..3: dfl[mu] := fl2[mu] - fl[mu] echo dfl[mu].norm2 From 5b97195b7022ca220b85dc3281d4eb16d36588ec Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 6 Dec 2024 12:50:32 -0600 Subject: [PATCH 11/12] add naik derivative --- src/gauge/fat7l.nim | 13 ++++--- src/gauge/fat7lderiv.nim | 74 ++++++++++++++++++++++++++++++++++--- src/physics/asqtadLinks.nim | 2 +- src/physics/hisqLinks.nim | 4 +- 4 files changed, 78 insertions(+), 15 deletions(-) diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim index 1a110247..49216b4d 100644 --- a/src/gauge/fat7l.nim +++ b/src/gauge/fat7l.nim @@ -68,8 +68,9 @@ proc computeGenStaple(staple: auto, mu,nu: int, link: auto, coef: float, #if(link!=gauge[mu]) QDP_discard_M(ts0); #QDP_discard_M(ts2); -proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto, - ll: auto, gfLong: auto, naik: auto) = +proc makeImpLinks*(fl: auto, gf: auto, coef: auto, + ll: auto, gfLong: auto, naik: auto, + info: var PerfInfo) = tic() type lcm = type(gf[0]) proc QDP_create_M(): lcm = result.new(gf[0].l) @@ -168,8 +169,8 @@ proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto, info.flops += nflop * gf[0].l.localGeom.prod info.secs += getElapsedTime() -proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto) = - makeImpLinks(info, fl, gf, coef, fl, gf, 0.0) +proc makeImpLinks*(fl: auto, gf: auto, coef: auto, info: var PerfInfo) = + makeImpLinks(fl, gf, coef, fl, gf, 0.0, info) when isMainModule: import qex @@ -196,8 +197,8 @@ when isMainModule: var g3 = g echo g.plaq - makeImpLinks(info, fl, g, coef) - makeImpLinks(info, fl, g, coef, ll, g3, naik) + makeImpLinks(fl, g, coef, info) + makeImpLinks(fl, g, coef, ll, g3, naik, info) echo fl.plaq echo ll.plaq echo pow(1.0,4)/6.0 diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim index a579e4fc..08edd857 100644 --- a/src/gauge/fat7lderiv.nim +++ b/src/gauge/fat7lderiv.nim @@ -119,8 +119,9 @@ side_force(QDP_ColorMatrix *force, QDP_ColorMatrix *bot0, QDP_ColorMatrix *side0 } ]# -proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto, - coef: Fat7lCoefs, mid: auto) = +proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs, + llderiv: auto, llgauge: auto, llmid: auto, naik: float, + perf: var PerfInfo) = tic() var nflops = 0 let coefL = coef.lepage @@ -287,9 +288,49 @@ proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto, deriv[mu] += fixL * stpl5 nflops += 4*(3*(2*198+3*18)+198+216+36) + if naik!=0.0: + for mu in 0..3: + # force += mid * Umumu' * Umu' + # force -= U-mu' * mid-mu * Umu' + # force += U-mu' * U-mu-mu' * mid-mu-mu + #QDP_M_eq_M(Uf, U, QDP_all); + #QDP_M_eq_sM(Umu, Uf, QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1a[mu] ^* llgauge[mu] + #QDP_M_eq_Ma_times_M(Umid, Uf, mid, QDP_all); + tm1 := llgauge[mu].adj * llmid[mu] + #QDP_M_eq_sM(Umidbmu, Umid, QDP_neighbor[mu], QDP_backward, QDP_all); + discard sm1[mu] ^* tm1 + #QDP_M_eq_Ma_times_Ma(UmuU, Umu, Uf, QDP_all); + tm2 := s1a[mu].field.adj * llgauge[mu].adj + #QDP_M_eq_sM(UmuUs, UmuU, QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1b[mu] ^* tm2 + #QDP_M_eq_Ma_times_M(f3b, Uf, Umidbmu, QDP_all); + llderiv[mu] += naik*(sm1[mu].field*s1a[mu].field.adj) + tm1 := llgauge[mu].adj * sm1[mu].field + #QDP_M_eq_sM(f3, f3b, QDP_neighbor[mu], QDP_backward, QDP_all); + discard sm1[mu] ^* tm1 + #QDP_M_eq_M_times_M(f, mid, UmuUs, QDP_all); + #QDP_discard_M(UmuUs); + #QDP_M_peq_M_times_Ma(f, Umidbmu, Umu, QDP_all); + #QDP_discard_M(Umidbmu); + #QDP_discard_M(Umu); + #QDP_M_peq_M(f, f3, QDP_all); + #QDP_discard_M(f3); + #QDP_M_peq_r_times_M(deriv[mu], &coefN, f, QDP_all); + llderiv[mu] += naik*(llmid[mu]*s1b[mu].field+sm1[mu].field) + #info->final_sec = QOP_time() - dtime; #info->final_flop = nflops*QDP_sites_on_node; #info->status = QOP_SUCCESS; + toc() + inc perf.count + perf.flops += nflops * gauge[0].l.localGeom.prod + perf.secs += getElapsedTime() + +proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs, + perf: var PerfInfo) = + fat7lDeriv(deriv, gauge, mid, coef, deriv, gauge, mid, 0.0, perf) + when isMainModule: import qex, physics/qcdTypes @@ -308,13 +349,17 @@ when isMainModule: coef.fiveStaple = 1.0 coef.sevenStaple = 1.0 coef.lepage = 1.0 + var naik = 1.0 var fl = lo.newGauge() var fl2 = lo.newGauge() + var ll = lo.newGauge() + var ll2 = lo.newGauge() var dfl = lo.newGauge() var g2 = lo.newGauge() var dg = lo.newGauge() var fd = lo.newGauge() + var ld = lo.newGauge() for mu in 0.. Date: Fri, 6 Dec 2024 15:58:33 -0600 Subject: [PATCH 12/12] add threading to fat7 --- src/base/profile.nim | 2 +- src/base/threading.nim | 12 ++ src/gauge/fat7l.nim | 95 ++++++------ src/gauge/fat7lderiv.nim | 311 +++++++++++++++++++-------------------- 4 files changed, 210 insertions(+), 210 deletions(-) diff --git a/src/base/profile.nim b/src/base/profile.nim index 455e3995..56b665e3 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -12,7 +12,7 @@ type PerfInfo* = object flops*: float secs*: float template clear*(pi: var PerfInfo) = - pi.count = 0.0 + pi.count = 0 pi.flops = 0.0 pi.secs = 0.0 template `+=`*(r: var PerfInfo, x: PerfInfo) = diff --git a/src/base/threading.nim b/src/base/threading.nim index 58689754..1e5faba2 100644 --- a/src/base/threading.nim +++ b/src/base/threading.nim @@ -460,6 +460,18 @@ macro threadSum2*(a:varargs[untyped]):auto = result.add(m) #echo result.treeRepr +type ThreadSingle*[T] = distinct T +template `[]`*[T](x: ThreadSingle[T]): auto = T(x) +template `[]=`*[T](x: ThreadSingle[T], y: auto): auto = + T(x) = y +template `=`*(x: var ThreadSingle, y: auto) = + threadSingle: + x[] = y +template `+=`*(x: var ThreadSingle, y: auto) = + threadSingle: + x[] += y +converter fromThreadSingle*[T](x: ThreadSingle[T]): T = T(x) + when isMainModule: threadsInit() echo threadNum, "/", numThreads diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim index 49216b4d..c8459689 100644 --- a/src/gauge/fat7l.nim +++ b/src/gauge/fat7l.nim @@ -71,7 +71,7 @@ proc computeGenStaple(staple: auto, mu,nu: int, link: auto, coef: float, proc makeImpLinks*(fl: auto, gf: auto, coef: auto, ll: auto, gfLong: auto, naik: auto, info: var PerfInfo) = - tic() + tic("makeImpLinks") type lcm = type(gf[0]) proc QDP_create_M(): lcm = result.new(gf[0].l) var @@ -111,60 +111,49 @@ proc makeImpLinks*(fl: auto, gf: auto, coef: auto, for nu in 0..<4: if dir!=nu: tsg[dir][nu] = newShifter(gf[dir], nu, 1) - discard tsg[dir][nu] ^* gf[dir] - - for dir in 0..<4: - #QDP_M_eq_r_times_M(fl[dir], &coef1, gf[dir], QDP_all); - fl[dir] := coef1 * gf[dir] - if have3: - for nu in 0..<4: - if nu!=dir: - compute_gen_staple(staple, dir, nu, gf[dir], coef3, gf, fl, - tsg[dir][nu], tsg[nu][dir], t1, t2, ts2[nu]) - if coefL!=0.0: - compute_gen_staple(false, dir, nu, staple, coefL, gf, fl, - tsl[nu], tsg[nu][dir], t1, t2, ts2[nu]) - if coef5!=0.0 or coef7!=0.0: - for rho in 0..<4: - if (rho!=dir) and (rho!=nu): - compute_gen_staple(tempmat1, dir, rho, staple, coef5, gf, fl, - tsl[rho], tsg[rho][dir], t1, t2, ts2[rho]) - if coef7!=0.0: - for sig in 0..<4: - if (sig!=dir) and (sig!=nu) and (sig!=rho): - compute_gen_staple(false, dir, sig, tempmat1,coef7,gf,fl, - ts1[sig],tsg[sig][dir],t1,t2,ts2[sig]) - - # long links - if naik!=0.0: + threads: + for dir in 0..<4: + for nu in 0..<4: + if dir!=nu: + discard tsg[dir][nu] ^*! gf[dir] + + toc("main loop") + threads: for dir in 0..<4: - #QDP_M_eq_sM(staple, gfLong[dir], QDP_neighbor[dir], QDP_forward,QDP_all) - #QDP_M_eq_M_times_M(tempmat1, gfLong[dir], staple, QDP_all) - #QDP_M_eq_sM(staple, tempmat1, QDP_neighbor[dir], QDP_forward, QDP_all) - #QDP_M_eq_M_times_M(ll[dir], gfLong[dir], staple, QDP_all) - #QDP_M_eq_r_times_M(ll[dir], &naik, ll[dir], QDP_all) - discard tsl[dir] ^* gfLong[dir] - discard ts1[dir] ^* (gfLong[dir] * tsl[dir].field) - ll[dir] := naik * (gfLong[dir] * ts1[dir].field) - - #[ - if have3 or naik!=0.0: - QDP_destroy_M(staple); - QDP_destroy_M(tempmat1); - if have3: - QDP_destroy_M(t1); - QDP_destroy_M(t2); - for dir in 0..<4: - QDP_destroy_M(tsl[dir]); - QDP_destroy_M(ts1[dir]); - QDP_destroy_M(ts2[dir]); + #QDP_M_eq_r_times_M(fl[dir], &coef1, gf[dir], QDP_all); + fl[dir] := coef1 * gf[dir] + if have3: for nu in 0..<4: - if(tsg[dir][nu]!=nil) QDP_destroy_M(tsg[dir][nu]) - ]# - #info.final_sec = dtime; - #info.final_flop = nflop*QDP_sites_on_node; - #info.status = QOP_SUCCESS; - toc() + if nu!=dir: + compute_gen_staple(staple, dir, nu, gf[dir], coef3, gf, fl, + tsg[dir][nu], tsg[nu][dir], t1, t2, ts2[nu]) + if coefL!=0.0: + compute_gen_staple(false, dir, nu, staple, coefL, gf, fl, + tsl[nu], tsg[nu][dir], t1, t2, ts2[nu]) + if coef5!=0.0 or coef7!=0.0: + for rho in 0..<4: + if (rho!=dir) and (rho!=nu): + compute_gen_staple(tempmat1, dir, rho, staple, coef5, gf, fl, + tsl[rho], tsg[rho][dir], t1, t2, ts2[rho]) + if coef7!=0.0: + for sig in 0..<4: + if (sig!=dir) and (sig!=nu) and (sig!=rho): + compute_gen_staple(false, dir, sig, tempmat1,coef7,gf,fl, + ts1[sig],tsg[sig][dir],t1,t2,ts2[sig]) + + # long links + if naik!=0.0: + for dir in 0..<4: + #QDP_M_eq_sM(staple, gfLong[dir], QDP_neighbor[dir], QDP_forward,QDP_all) + #QDP_M_eq_M_times_M(tempmat1, gfLong[dir], staple, QDP_all) + #QDP_M_eq_sM(staple, tempmat1, QDP_neighbor[dir], QDP_forward, QDP_all) + #QDP_M_eq_M_times_M(ll[dir], gfLong[dir], staple, QDP_all) + #QDP_M_eq_r_times_M(ll[dir], &naik, ll[dir], QDP_all) + discard tsl[dir] ^* gfLong[dir] + discard ts1[dir] ^* (gfLong[dir] * tsl[dir].field) + ll[dir] := naik * (gfLong[dir] * ts1[dir].field) + + toc("end") inc info.count info.flops += nflop * gf[0].l.localGeom.prod info.secs += getElapsedTime() diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim index 08edd857..5ba34fd7 100644 --- a/src/gauge/fat7lderiv.nim +++ b/src/gauge/fat7lderiv.nim @@ -122,8 +122,8 @@ side_force(QDP_ColorMatrix *force, QDP_ColorMatrix *bot0, QDP_ColorMatrix *side0 proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs, llderiv: auto, llgauge: auto, llmid: auto, naik: float, perf: var PerfInfo) = - tic() - var nflops = 0 + tic("fat7lDeriv") + var nflops = ThreadSingle[int](0) let coefL = coef.lepage let coef1 = coef.oneLink - 6*coefL let coef3 = coef.threeStaple @@ -162,167 +162,168 @@ proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs, for mu in 0..<4: for nu in 0..<4: if nu!=mu: - discard s1[mu][nu] ^* gauge[mu] - for sig in 0..3: - if have5: - for mu in 0..3: - if mu==sig: continue - #QDP_M_eq_zero(stpl3[mu], QDP_all); - stpl3[mu] := 0 - #staple(stpl3[mu], gauge[sig], gauge[mu], sig, mu); - symStaple(stpl3[mu], 1.0, gauge[mu], gauge[sig], - s1[mu][sig], s1[sig][mu], tm1, sm1[mu]) - #QDP_M_eq_zero(mid5[mu], QDP_all); - mid5[mu] := 0 - nflops += STAPLE_FLOPS - for rho in 0..3: - if rho==sig: continue - #QDP_M_eq_zero(stpl5, QDP_all); - stpl5 := 0 - if coef7!=0.0: + discard s1[mu][nu] ^*! gauge[mu] + threads: + for sig in 0..3: + if have5: + for mu in 0..3: + if mu==sig: continue + #QDP_M_eq_zero(stpl3[mu], QDP_all); + stpl3[mu] := 0 + #staple(stpl3[mu], gauge[sig], gauge[mu], sig, mu); + symStaple(stpl3[mu], 1.0, gauge[mu], gauge[sig], + s1[mu][sig], s1[sig][mu], tm1, sm1[mu]) + #QDP_M_eq_zero(mid5[mu], QDP_all); + mid5[mu] := 0 + nflops += STAPLE_FLOPS + for rho in 0..3: + if rho==sig: continue + #QDP_M_eq_zero(stpl5, QDP_all); + stpl5 := 0 + if coef7!=0.0: + for mu in 0..3: + if mu==sig or mu==rho: continue + for nu in 0..3: + if nu==sig or nu==rho or nu==mu: continue + #staple(stpl5, stpl3[mu], gauge[nu], sig, nu); + discard s1a[nu] ^* stpl3[mu] + symStaple(stpl5, 1.0, gauge[nu], stpl3[mu], + s1[nu][sig], s1a[nu], tm1, sm1[nu]) + nflops += STAPLE_FLOPS + #QDP_M_eq_r_times_M(stpl5, &coef7, stpl5, QDP_all); + stpl5 *= coef7 + nflops += 18 + #QDP_M_eq_zero(mid3, QDP_all); + mid3 := 0 + if coefL!=0.0: + #QDP_M_peq_r_times_M(stpl5, &coefL, stpl3[rho], QDP_all); + stpl5 += coefL * stpl3[rho] + nflops += 36 + if coefL!=0.0 or coef7!=0.0: + #side_force(deriv[rho], mid[sig], gauge[rho], stpl5, sig, rho, mid3); + discard s1a[rho] ^* stpl5 + discard s1b[rho] ^* mid[sig] + symStapleDeriv(deriv[rho], mid3, + gauge[rho], stpl5, s1[rho][sig], s1a[rho], + mid[sig], s1b[rho], tm1, tm2, sm1[rho], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + if coefL!=0.0: + #QDP_M_peq_r_times_M(mid5[rho], &coefL, mid3, QDP_all); + mid5[rho] += coefL * mid3 + nflops += 36 + #//QDP_M_eqm_r_times_M(mid3, &coef7, mid3, QDP_all); + #QDP_M_eq_r_times_M(mid3, &coef7, mid3, QDP_all); + mid3 *= coef7 + #QDP_M_peq_r_times_M(mid3, &coef5, mid[sig], QDP_all); + mid3 += coef5 * mid[sig] + nflops += 18+36 for mu in 0..3: - if mu==sig or mu==rho: continue + if mu==sig or mu==rho: continue for nu in 0..3: if nu==sig or nu==rho or nu==mu: continue - #staple(stpl5, stpl3[mu], gauge[nu], sig, nu); - discard s1a[nu] ^* stpl3[mu] - symStaple(stpl5, 1.0, gauge[nu], stpl3[mu], - s1[nu][sig], s1a[nu], tm1, sm1[nu]) - nflops += STAPLE_FLOPS - #QDP_M_eq_r_times_M(stpl5, &coef7, stpl5, QDP_all); - stpl5 *= coef7 - nflops += 18 + #side_force(deriv[mu], mid3, gauge[mu], stpl3[nu], sig, mu, mid5[nu]); + discard s1a[mu] ^* stpl3[nu] + discard s1b[mu] ^* mid3 + symStapleDeriv(deriv[mu], mid5[nu], + gauge[mu], stpl3[nu], s1[mu][sig], s1a[mu], + mid3, s1b[mu], tm1, tm2, sm1[mu], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + + if have3: #QDP_M_eq_zero(mid3, QDP_all); mid3 := 0 - if coefL!=0.0: - #QDP_M_peq_r_times_M(stpl5, &coefL, stpl3[rho], QDP_all); - stpl5 += coefL * stpl3[rho] - nflops += 36 - if coefL!=0.0 or coef7!=0.0: - #side_force(deriv[rho], mid[sig], gauge[rho], stpl5, sig, rho, mid3); - discard s1a[rho] ^* stpl5 - discard s1b[rho] ^* mid[sig] - symStapleDeriv(deriv[rho], mid3, - gauge[rho], stpl5, s1[rho][sig], s1a[rho], - mid[sig], s1b[rho], tm1, tm2, sm1[rho], sm1[sig]) - nflops += SIDE_FORCE_FLOPS - if coefL!=0.0: - #QDP_M_peq_r_times_M(mid5[rho], &coefL, mid3, QDP_all); - mid5[rho] += coefL * mid3 - nflops += 36 - #//QDP_M_eqm_r_times_M(mid3, &coef7, mid3, QDP_all); - #QDP_M_eq_r_times_M(mid3, &coef7, mid3, QDP_all); - mid3 *= coef7 - #QDP_M_peq_r_times_M(mid3, &coef5, mid[sig], QDP_all); - mid3 += coef5 * mid[sig] - nflops += 18+36 for mu in 0..3: - if mu==sig or mu==rho: continue - for nu in 0..3: - if nu==sig or nu==rho or nu==mu: continue - #side_force(deriv[mu], mid3, gauge[mu], stpl3[nu], sig, mu, mid5[nu]); - discard s1a[mu] ^* stpl3[nu] - discard s1b[mu] ^* mid3 - symStapleDeriv(deriv[mu], mid5[nu], - gauge[mu], stpl3[nu], s1[mu][sig], s1a[mu], - mid3, s1b[mu], tm1, tm2, sm1[mu], sm1[sig]) - nflops += SIDE_FORCE_FLOPS + if mu==sig: continue + if have5: + #//QDP_M_eq_r_times_M_minus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); + #QDP_M_eq_r_times_M_plus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); + stpl5 := coef3*mid[sig] + mid5[mu] + nflops += 36 + else: + #QDP_M_eq_r_times_M(stpl5, &coef3, mid[sig], QDP_all); + stpl5 := coef3*mid[sig] + nflops += 18 + #side_force(deriv[mu], stpl5, gauge[mu], gauge[sig], sig, mu, mid3); + discard s1a[mu] ^* stpl5 + symStapleDeriv(deriv[mu], mid3, + gauge[mu], gauge[sig], s1[mu][sig], s1[sig][mu], + stpl5, s1a[mu], tm1, tm2, sm1[mu], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + #//QDP_M_meq_M(deriv[sig], mid3, QDP_all); + #QDP_M_peq_M(deriv[sig], mid3, QDP_all); + deriv[sig] += mid3 + nflops += 18 + if coef1!=0.0: + #QDP_M_peq_r_times_M(deriv[sig], &coef1, mid[sig], QDP_all); + deriv[sig] += coef1 * mid[sig] + nflops += 36 - if have3: - #QDP_M_eq_zero(mid3, QDP_all); - mid3 := 0 + if coefL!=0.0: + # fix up Lepage term + let fixL = -coefL for mu in 0..3: - if mu==sig: continue - if have5: - #//QDP_M_eq_r_times_M_minus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); - #QDP_M_eq_r_times_M_plus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); - stpl5 := coef3*mid[sig] + mid5[mu] - nflops += 36 - else: - #QDP_M_eq_r_times_M(stpl5, &coef3, mid[sig], QDP_all); - stpl5 := coef3*mid[sig] - nflops += 18 - #side_force(deriv[mu], stpl5, gauge[mu], gauge[sig], sig, mu, mid3); - discard s1a[mu] ^* stpl5 - symStapleDeriv(deriv[mu], mid3, - gauge[mu], gauge[sig], s1[mu][sig], s1[sig][mu], - stpl5, s1a[mu], tm1, tm2, sm1[mu], sm1[sig]) - nflops += SIDE_FORCE_FLOPS - #//QDP_M_meq_M(deriv[sig], mid3, QDP_all); - #QDP_M_peq_M(deriv[sig], mid3, QDP_all); - deriv[sig] += mid3 - nflops += 18 - if coef1!=0.0: - #QDP_M_peq_r_times_M(deriv[sig], &coef1, mid[sig], QDP_all); - deriv[sig] += coef1 * mid[sig] - nflops += 36 - - if coefL!=0.0: - # fix up Lepage term - let fixL = -coefL - for mu in 0..3: - #QDP_M_eq_zero(ftmp0[0], QDP_all); - tm1 := 0 - for nu in 0..3: - if nu==mu: continue - #QDP_M_eq_Ma_times_M(btmp0[0], mid[nu], gauge[nu], QDP_all); - tm2 := mid[nu].adj * gauge[nu] - #QDP_M_eq_sM(btmp[0][nu], btmp0[0], QDP_neighbor[nu], QDP_backward, QDP_all); - discard sm1[nu] ^* tm2 - #QDP_M_eq_M_times_Ma(stpl5, mid[nu], gauge[nu], QDP_all); - stpl5 := mid[nu] * gauge[nu].adj - #QDP_M_meq_M(stpl5, btmp[0][nu], QDP_all); - stpl5 -= sm1[nu].field - #QDP_discard_M(btmp[0][nu]); - #QDP_M_peq_M(ftmp0[0], stpl5, QDP_all); - #QDP_M_peq_Ma(ftmp0[0], stpl5, QDP_all); - tm1 += stpl5 + stpl5.adj - #QDP_M_eq_sM(ftmp[0][mu], ftmp0[0], QDP_neighbor[mu], QDP_forward, QDP_all); - discard s1a[mu] ^* tm1 - #QDP_M_eq_M_times_M(stpl5, ftmp0[0], gauge[mu], QDP_all); - stpl5 := tm1 * gauge[mu] - #QDP_M_meq_M_times_M(stpl5, gauge[mu], ftmp[0][mu], QDP_all); - stpl5 -= gauge[mu] * s1a[mu].field - #QDP_discard_M(ftmp[0][mu]); - #QDP_M_peq_r_times_M(deriv[mu], &fixL, stpl5, QDP_all); - deriv[mu] += fixL * stpl5 - nflops += 4*(3*(2*198+3*18)+198+216+36) + #QDP_M_eq_zero(ftmp0[0], QDP_all); + tm1 := 0 + for nu in 0..3: + if nu==mu: continue + #QDP_M_eq_Ma_times_M(btmp0[0], mid[nu], gauge[nu], QDP_all); + tm2 := mid[nu].adj * gauge[nu] + #QDP_M_eq_sM(btmp[0][nu], btmp0[0], QDP_neighbor[nu], QDP_backward, QDP_all); + discard sm1[nu] ^* tm2 + #QDP_M_eq_M_times_Ma(stpl5, mid[nu], gauge[nu], QDP_all); + stpl5 := mid[nu] * gauge[nu].adj + #QDP_M_meq_M(stpl5, btmp[0][nu], QDP_all); + stpl5 -= sm1[nu].field + #QDP_discard_M(btmp[0][nu]); + #QDP_M_peq_M(ftmp0[0], stpl5, QDP_all); + #QDP_M_peq_Ma(ftmp0[0], stpl5, QDP_all); + tm1 += stpl5 + stpl5.adj + #QDP_M_eq_sM(ftmp[0][mu], ftmp0[0], QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1a[mu] ^* tm1 + #QDP_M_eq_M_times_M(stpl5, ftmp0[0], gauge[mu], QDP_all); + stpl5 := tm1 * gauge[mu] + #QDP_M_meq_M_times_M(stpl5, gauge[mu], ftmp[0][mu], QDP_all); + stpl5 -= gauge[mu] * s1a[mu].field + #QDP_discard_M(ftmp[0][mu]); + #QDP_M_peq_r_times_M(deriv[mu], &fixL, stpl5, QDP_all); + deriv[mu] += fixL * stpl5 + nflops += 4*(3*(2*198+3*18)+198+216+36) - if naik!=0.0: - for mu in 0..3: - # force += mid * Umumu' * Umu' - # force -= U-mu' * mid-mu * Umu' - # force += U-mu' * U-mu-mu' * mid-mu-mu - #QDP_M_eq_M(Uf, U, QDP_all); - #QDP_M_eq_sM(Umu, Uf, QDP_neighbor[mu], QDP_forward, QDP_all); - discard s1a[mu] ^* llgauge[mu] - #QDP_M_eq_Ma_times_M(Umid, Uf, mid, QDP_all); - tm1 := llgauge[mu].adj * llmid[mu] - #QDP_M_eq_sM(Umidbmu, Umid, QDP_neighbor[mu], QDP_backward, QDP_all); - discard sm1[mu] ^* tm1 - #QDP_M_eq_Ma_times_Ma(UmuU, Umu, Uf, QDP_all); - tm2 := s1a[mu].field.adj * llgauge[mu].adj - #QDP_M_eq_sM(UmuUs, UmuU, QDP_neighbor[mu], QDP_forward, QDP_all); - discard s1b[mu] ^* tm2 - #QDP_M_eq_Ma_times_M(f3b, Uf, Umidbmu, QDP_all); - llderiv[mu] += naik*(sm1[mu].field*s1a[mu].field.adj) - tm1 := llgauge[mu].adj * sm1[mu].field - #QDP_M_eq_sM(f3, f3b, QDP_neighbor[mu], QDP_backward, QDP_all); - discard sm1[mu] ^* tm1 - #QDP_M_eq_M_times_M(f, mid, UmuUs, QDP_all); - #QDP_discard_M(UmuUs); - #QDP_M_peq_M_times_Ma(f, Umidbmu, Umu, QDP_all); - #QDP_discard_M(Umidbmu); - #QDP_discard_M(Umu); - #QDP_M_peq_M(f, f3, QDP_all); - #QDP_discard_M(f3); - #QDP_M_peq_r_times_M(deriv[mu], &coefN, f, QDP_all); - llderiv[mu] += naik*(llmid[mu]*s1b[mu].field+sm1[mu].field) + if naik!=0.0: + for mu in 0..3: + # force += mid * Umumu' * Umu' + # force -= U-mu' * mid-mu * Umu' + # force += U-mu' * U-mu-mu' * mid-mu-mu + #QDP_M_eq_M(Uf, U, QDP_all); + #QDP_M_eq_sM(Umu, Uf, QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1a[mu] ^* llgauge[mu] + #QDP_M_eq_Ma_times_M(Umid, Uf, mid, QDP_all); + tm1 := llgauge[mu].adj * llmid[mu] + #QDP_M_eq_sM(Umidbmu, Umid, QDP_neighbor[mu], QDP_backward, QDP_all); + discard sm1[mu] ^* tm1 + #QDP_M_eq_Ma_times_Ma(UmuU, Umu, Uf, QDP_all); + tm2 := s1a[mu].field.adj * llgauge[mu].adj + #QDP_M_eq_sM(UmuUs, UmuU, QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1b[mu] ^* tm2 + #QDP_M_eq_Ma_times_M(f3b, Uf, Umidbmu, QDP_all); + llderiv[mu] += naik*(sm1[mu].field*s1a[mu].field.adj) + tm1 := llgauge[mu].adj * sm1[mu].field + #QDP_M_eq_sM(f3, f3b, QDP_neighbor[mu], QDP_backward, QDP_all); + discard sm1[mu] ^* tm1 + #QDP_M_eq_M_times_M(f, mid, UmuUs, QDP_all); + #QDP_discard_M(UmuUs); + #QDP_M_peq_M_times_Ma(f, Umidbmu, Umu, QDP_all); + #QDP_discard_M(Umidbmu); + #QDP_discard_M(Umu); + #QDP_M_peq_M(f, f3, QDP_all); + #QDP_discard_M(f3); + #QDP_M_peq_r_times_M(deriv[mu], &coefN, f, QDP_all); + llderiv[mu] += naik*(llmid[mu]*s1b[mu].field+sm1[mu].field) #info->final_sec = QOP_time() - dtime; #info->final_flop = nflops*QDP_sites_on_node; #info->status = QOP_SUCCESS; - toc() + toc("end") inc perf.count perf.flops += nflops * gauge[0].l.localGeom.prod perf.secs += getElapsedTime() @@ -368,17 +369,14 @@ when isMainModule: echo g.plaq echo g2.plaq makeImpLinks(fl, g, coef, info) + info.clear + resetTimers() + makeImpLinks(fl, g, coef, info) echo info makeImpLinks(fl2, g2, coef, info) echo info echo fl.plaq echo fl2.plaq - #echo ll.plaq - #echo pow(1.0,4)/6.0 - #echo pow(1.0+6.0,4)/6.0 - #echo pow(1.0+6.0+6.0*4.0,4)/6.0 - #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0,4)/6.0 - #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0+6.0,4)/6.0 fat7lderiv(fd, g, dg, coef, info) echo info @@ -406,4 +404,5 @@ when isMainModule: dfl[mu] -= ld[mu] echo dfl[mu].norm2 + echoProf() qexFinalize()