From de23e2723cf09cbd964736fded18129ab0fda484 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 9 Aug 2024 11:49:04 -0500 Subject: [PATCH] add hotspot profile --- src/base/profile.nim | 163 +++++++++++++++++++++++++++++-- src/base/threading.nim | 12 +-- src/comms/commsUtils.nim | 5 + src/examples/puregaugehb2du1.nim | 10 +- src/experimental/stagag.nim | 43 ++++---- src/field/fieldET.nim | 16 +-- src/io/crc32.nim | 6 +- src/io/parallelIo.nim | 10 +- src/physics/stagD.nim | 14 +-- src/physics/stagSolve.nim | 20 ++-- src/solvers/cg.nim | 5 +- 11 files changed, 234 insertions(+), 70 deletions(-) diff --git a/src/base/profile.nim b/src/base/profile.nim index 570e190a..8153a168 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -1,7 +1,7 @@ import threading export threading import comms/comms, stdUtils, base/basicOps -import os, strutils, sequtils, std/monotimes +import os, strutils, sequtils, std/monotimes, std/tables, std/algorithm, strformat export monotimes getOptimPragmas() @@ -49,6 +49,10 @@ type CodePoint = distinct int32 CodePointObj = object toDropTimer: bool + #nsec: int64 + #overhead: int64 + count: uint32 + dropcount: uint32 name: string loc: II SString = static[string] | string @@ -117,6 +121,7 @@ iterator items[T](ls:List[T]):T = proc setLen(ls:var RTInfoObjList, len:int32) {.borrow.} proc free(ls:var RTInfoObjList) {.borrow.} +proc add(ls:var RTInfoObjList, x:RTInfoObj) {.borrow.} template len(ls:RTInfoObjList):int32 = List[RTInfoObj](ls).len template `[]`(ls:RTInfoObjList, n:int32):untyped = List[RTInfoObj](ls)[n] iterator mitems(ls:RTInfoObjList):var RTInfoObj = @@ -183,20 +188,49 @@ proc newCodePoint(ii:II, s:SString = ""):CodePoint = let n = cpHeap.len cpHeap.setlen(n+1) cpHeap[n].toDropTimer = false + cpHeap[n].count = 0 + cpHeap[n].dropcount = 0 cpHeap[n].name = s cpHeap[n].loc = ii CodePoint(n) template name(x:CodePoint):string = cpHeap[x.int].name template loc(x:CodePoint):II = cpHeap[x.int].loc +template count(x:CodePoint):uint32 = cpHeap[x.int].count +template dropcount(x:CodePoint):uint32 = cpHeap[x.int].dropcount template toDropTimer(x:CodePoint):bool = cpHeap[x.int].toDropTimer +template dropTimer(x:CodePoint) = + toDropTimer(x) = true + #echo "dropTimer: ", cpHeap[x.int].loc, " ", cpHeap[x.int].name template overhead(x:RTInfoObj):untyped = x.overhead template childrenOverhead(x:RTInfoObj):untyped = x.childrenOverhead template istic(x:RTInfoObj):bool = x.prev.isNil template isnottic(x:RTInfoObj):bool = not x.prev.isNil - -template dropTimer(x:RTInfo):untyped = toDropTimer(rtiStack[x.int].curr) = true +template toDropTimer(x:RTInfoObj):bool = toDropTimer(x.curr) +template dropTimer(x:RTInfoObj) = dropTimer(x.curr) +proc dropTimerRecursive(x:RTInfoObj) = + #echo "dropTimerRecursive: ", x.prev.loc, " ", x.curr.loc, " ", x.curr.name + dropTimer(x) + for i in 0.. DropWasteTimerRatio: - #if ii.filename != "cg.nim": + inc thisCode.count + if oh.float > ns.float*DropWasteTimerRatio: + #if not toDropTimer(prevRTI) and oh.float > ns.float*DropWasteTimerRatio: + inc thisCode.dropcount + #if ii.filename != "scg.nim": # echo "drop timer: ", oh.float, "/", ns.float, "=", oh.float / ns.float # echo " ", prevRTI.int, " ", thisRTI.int, " ", ii, " ", s # Signal stop if the overhead is too large. - dropTimer(prevRTI) + if thisCode.dropcount > 10 and thisCode.dropcount*10 > thisCode.count: + #echo "dropTimer: ", rtiStack[thisRTI.int].tic.name, " ", thisCode.name, " ", thisCode.loc + dropTimer(prevRTI) + dropTimerChildren(thisRTI) + #dropTimer(thisCode) if toDropTimer(thisCode): freezeTimers() restartTimer = true @@ -574,14 +615,14 @@ type Tstr = tuple label: string stats: string +func markMissing(p:bool,str:string):string = + if p: "[" & str & "]" + else: str template ppT(ts: RTInfoObjList, prefix = "-", total = 0'i64, overhead = 0'i64, count = 0'u32, initIx = 0, showAbove = 0.0, showDropped = true): seq[Tstr] = ppT(List[RTInfoObj](ts), prefix, total, overhead, count, initIx, showAbove, showDropped) proc ppT(ts: List[RTInfoObj], prefix = "-", total = 0'i64, overhead = 0'i64, count = 0'u32, initIx = 0, showAbove = 0.0, showDropped = true): seq[Tstr] = - proc markMissing(p:bool,str:string):string = - if p: "[" & str & "]" - else: str var sub:int64 = 0 subo:int64 = 0 @@ -686,6 +727,110 @@ proc echoTimersRaw* = echo cpHeap echo rtiStack +proc getName(t: ptr RTInfoObj): string = + let tn = t.tic.name + let pn = t.prev.name + let name = (if tn=="":"" else:tn&":") & (if pn=="":"" else:pn&"-") & t.curr.name + if t.prev.toDropTimer: + result = "{" & name & "}" + else: result = name + +var hs = initTable[string,RTInfoObj]() +proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] = + var nstot = int64 0 + var ohtot = int64 0 + for ri in lrti: + let nc = ri.count + if ri.istic or nc==0: continue + let + f0 = splitFile(ri.prev.loc.filename)[1] + l0 = ri.prev.loc.line + f = splitFile(ri.curr.loc.filename)[1] + l = ri.curr.loc.line + loc = f0 & "(" & $l0 & "-" & (if f==f0:"" else:f) & $l & ") " & getName(addr ri) + coh = ri.childrenOverhead + soh = ri.overhead + nsec = ri.nsec + ns = nsec - coh + oh = soh + coh + nstot += ns + ohtot += oh + hs.withValue(loc, t): # t is found value for loc + t.nsec += ri.nsec + t.flops += ri.flops + t.overhead += ri.overhead + t.childrenOverhead += ri.childrenOverhead + t.count += ri.count + for i in 0..0: + let ins = v.nsec - v.childrenOverhead + skeys.add (ns: ins, loc: "incl" & k) + #if v.children.len>0: + var sns = v.nsec + if v.children.len==0: sns -= v.childrenOverhead + for i in 0..0.0: mfs.int|7 else: " ") + nc = (if t.children.len>0: t.children.len|4 else: " ") + f0 = splitFile(t.prev.loc.filename)[1] + l0 = t.prev.loc.line + f = splitFile(t.curr.loc.filename)[1] + l = t.curr.loc.line + loc = f0 & "(" & $l0 & "-" & (if f==f0:"" else:f) & $l & ")" + lc = loc|(-nloc,'.') + nm = getName(t) + #echo &"{pct:6.3f} {ohpct:6.3f} {count} {mf} {nc} {lc} {nm}" + if incl: + echo &"{pct:6.3f} {count} {mf} {nc} I {lc} {nm}" + else: + tsns += nk.ns + let tsnspct = 100.0 * tsns / nstot + echo &"{pct:6.3f} {tsnspct:7.3f} {count} {mf} {nc} S {lc} {nm}" + when isMainModule: import os proc test = diff --git a/src/base/threading.nim b/src/base/threading.nim index 631ef61e..73db56c1 100644 --- a/src/base/threading.nim +++ b/src/base/threading.nim @@ -162,26 +162,26 @@ template t0waitX* = for b in 1..= tbar0: break else: inc threadLocals.share[threadNum].counter - #fence() - fence() -template t0wait* = threadBarrier() + fence() +template t0wait* = t0waitO() template twait0O* = threadBarrier() template twait0X* = if threadNum==0: inc threadLocals.share[0].counter - #fence() + fence() else: inc threadLocals.share[threadNum].counter let tbar0 = threadLocals.share[threadNum].counter let p{.volatile.} = threadLocals.share[0].counter.addr while true: + fence() if p[] >= tbar0: break - fence() -template twait0* = threadBarrier() +template twait0* = twait0O() template threadBarrier* = #t0waitX diff --git a/src/comms/commsUtils.nim b/src/comms/commsUtils.nim index 1d027193..9fb89f86 100644 --- a/src/comms/commsUtils.nim +++ b/src/comms/commsUtils.nim @@ -249,13 +249,18 @@ template threadRankSum1*[T](comm: Comm, a: T) = var ta{.global.}: type(a) #var ta2{.global.}:array[512,type(a)] if threadNum==0: + tic("threadRankSum1") t0wait() + toc("t0wait") for i in 1.. 0: - echo "Starting warmups" - #setupMDx() - alwaysAccept = warmmd - for n in 1..nwarm: - m.update - m.clearStats - pacc.clear +block: + tic("warmup") + if nwarm > 0: + echo "Starting warmups" + #setupMDx() + alwaysAccept = warmmd + for n in 1..nwarm: + m.update + m.clearStats + pacc.clear + toc("end warmup") echo "Starting HMC" #setupMD5() @@ -2043,11 +2048,11 @@ for i in 0.. 0: m.clearStats pacc.clear - tic() + tic("inference") for n in 1..trajs: echo "Starting inference: ", n echoParams() @@ -2091,13 +2096,15 @@ if trajs > 0: let ttot = getElapsedTime() echo "End inference update: ", tup, " measure: ", ttot-tup, " total: ", ttot let et = getElapsedTime() - toc() + toc("end inference") echo "Inference time: ", et if outfn != "": echo "Saving gauge field to file: ", outfn let err = g.saveGauge outfn -if intParam("prof",0) != 0: - echoTimers() +case intParam("prof",0) +of 1: echoTimers() +of 2: echoHotspots() +else: discard qexFinalize() diff --git a/src/field/fieldET.nim b/src/field/fieldET.nim index 591990ed..43679098 100644 --- a/src/field/fieldET.nim +++ b/src/field/fieldET.nim @@ -603,7 +603,7 @@ proc mul*(r:Field; x:Field2; y:Field3) = mul(r[e], x[e], y[e]) proc norm2P*(f:SomeField):auto = - tic() + tic("norm2P[" & $type(f) & "]") mixin norm2, inorm2, simdSum, items, toDouble #var n2:type(norm2(f[0])) var n2: evalType(norm2(toDouble(f[0]))) @@ -611,10 +611,10 @@ proc norm2P*(f:SomeField):auto = #let t = f for x in items(f): inorm2(n2, toDouble(f[x])) - toc("norm2 local") + toc("local") #echoAll n2 result = simdSum(n2) - toc("norm2 simd sum") + toc("simdSum") #echoAll myRank, ",", threadNum, ": ", result #threadSum(result) #toc("norm2 thread sum") @@ -622,7 +622,7 @@ proc norm2P*(f:SomeField):auto = #toc("norm2 rank sum") f.l.threadRankSum(result) #echo result - toc("norm2 thread rank sum") + toc("threadRankSum") template norm2*(f:SomeAllField):auto = when declared(subsetObject): #echo "subsetObj" & s @@ -702,7 +702,7 @@ template dot*(f1:SomeAllField; f2:SomeAllField2):untyped = template dot*(f1:Subsetted; f2:SomeAllField2):untyped = dotP(f1, f2) proc redotP*(f1:SomeField; f2:SomeField2):auto = - tic() + tic("redotP[" & $f1.type & "," & $f2.type & "]") mixin redot, iredot, simdSum, items, toDouble, eval #var d:type(redot(f1[0],f2[0])) var d: evalType(toDouble(redot(f1[0],f2[0]))) @@ -711,9 +711,9 @@ proc redotP*(f1:SomeField; f2:SomeField2):auto = for x in items(t1): #iredot(d, t1[x], t2[x]) d += redot(t1[x], t2[x]) - toc("redot local") + toc("local") result = simdSum(d) - toc("redot simd sum") + toc("simdSum") #threadBarrier() #toc("thread barrier") #threadSum(result) @@ -721,7 +721,7 @@ proc redotP*(f1:SomeField; f2:SomeField2):auto = #rankSum(result) #toc("rank sum") f1.l.threadRankSum(result) - toc("redot thread rank sum") + toc("threadRankSum") template redot*(f1:SomeAllField; f2:SomeAllField2):untyped = when declared(subsetObject): #echo "subsetObj redot" diff --git a/src/io/crc32.nim b/src/io/crc32.nim index 608f2184..e06c6a4a 100644 --- a/src/io/crc32.nim +++ b/src/io/crc32.nim @@ -3,7 +3,7 @@ import strutils type Crc32* = uint32 const InitCrc32* = Crc32(not 0'u32) const crc32PolyLow = Crc32(0xedb88320) -const crc32Poly = uint64(0x100000000) + uint64(crc32PolyLow) +#const crc32Poly = uint64(0x100000000) + uint64(crc32PolyLow) proc createCrcTable(): array[0..255, Crc32] = for i in 0..255: @@ -48,6 +48,7 @@ proc polyMul(x0: uint32, y0: uint64): uint64 = y = y shl 1 x = x shr 1 +#[ proc polyRem(x0: uint64, y0: uint64): uint32 = var x = x0 var y = y0 @@ -67,6 +68,7 @@ proc polyRem(x0: uint64, y0: uint64): uint32 = echo (x0 xor polyMul(q, y0)).toHex echo x.toHex result = x.uint32 +]# proc mulRem(r1,r2: Crc32): Crc32 = var t = polyMul(r1, r2) shl 1 @@ -75,12 +77,14 @@ proc mulRem(r1,r2: Crc32): Crc32 = t = t shr 8 result = result xor Crc32(t) +#[ proc zeroPadCrc32X(crc: Crc32, n: int): Crc32 = var fac = Crc32(0x80000000) for i in 0..sbQex>op") threadBarrier() if parEven: stagD2ee(s.se, s.so, a, s.g, b, m*m) @@ -95,8 +95,9 @@ proc solveXX*(s: Staggered; r,x: Field; m: SomeNumber; sp0: var SolverParams; echo "solveEE(QEX): ", sp.getStats else: echo "solveOO(QEX): ", sp.getStats + toc("end sbQex") of sbQuda: - tic() + tic("sbQuda") if parEven: #echo x.even.norm2, " ", sp.r2req s.qudaSolveEE(r,x,m,sp) @@ -111,7 +112,7 @@ proc solveXX*(s: Staggered; r,x: Field; m: SomeNumber; sp0: var SolverParams; if sp0.verbosity>0: echo "solveXX(QUDA): ", sp.getStats of sbGrid: - tic() + tic("sbQuda") if parEven: s.gridSolveEE(r,x,m,sp) toc("gridSolveEE") @@ -127,6 +128,7 @@ proc solveXX*(s: Staggered; r,x: Field; m: SomeNumber; sp0: var SolverParams; sp.iterationsMax = sp.iterations sp.r2.push 0.0 sp0.addStats(sp) + toc("end solveXX") proc solveEE*(s: Staggered; r,x: Field; m: SomeNumber; sp0: var SolverParams) = solveXX(s, r, x, m, sp0, parEven=true) @@ -137,7 +139,7 @@ proc solveOO*(s: Staggered; r,x: Field; m: SomeNumber; sp0: var SolverParams) = # right-preconditioned proc solveReconR(s:Staggered; x,b:Field; m:SomeNumber; sp: var SolverParams; b2e,b2o: float) = - tic() + tic("solveReconR") let b2 = b2e + b2o let r2stop = sp.r2req * b2 let r2stop2 = 0.5 * r2stop @@ -175,7 +177,7 @@ proc solveReconR(s:Staggered; x,b:Field; m:SomeNumber; sp: var SolverParams; # left-preconditioned with odd reconstruction proc solveReconL(s:Staggered; x,b:Field; m:SomeNumber; sp: var SolverParams; b2e,b2o: float) = - tic() + tic("solveReconL") #if b2e == 0.0 or b2o == 0.0: #solveR(s, y, r, m, sp, r2e, r2o) var d = newOneOf(b) @@ -415,7 +417,7 @@ when isMainModule: threads: v1 := 0 resetTimers() - precon = true + #precon = true s.solve(v1, v3, m, sp) threads: r := v1 - v2 diff --git a/src/solvers/cg.nim b/src/solvers/cg.nim index f3237f3f..8f9234dd 100644 --- a/src/solvers/cg.nim +++ b/src/solvers/cg.nim @@ -54,7 +54,7 @@ proc newCgState*[T](x,b: T): CgState[T] = # solves: A x = b proc solve*(state: var CgState; op: auto; sp: var SolverParams) = mixin apply, applyPrecon - tic() + tic("solve") let vrb = sp.verbosity template verb(n:int; body:untyped) = if vrb>=n: body @@ -172,7 +172,7 @@ proc solve*(state: var CgState; op: auto; sp: var SolverParams) = echo("CG iteration: ", itn, " r2/b2: ", r2/b2) while itnr2stop: - tic() + tic("cg loop") if itn == 0 or precon != cpLeftRight: preconL(z, r) # z = L r or z = R r for RightNonHerm else: @@ -195,6 +195,7 @@ proc solve*(state: var CgState; op: auto; sp: var SolverParams) = verb(3): echo "beta: ", beta preconR(p, q) # p = R q + toc("preconR") inc itn op.apply(Ap, p) toc("Ap")