Skip to content

Commit

Permalink
add hotspot profile
Browse files Browse the repository at this point in the history
  • Loading branch information
jcosborn committed Aug 9, 2024
1 parent c6a86cc commit de23e27
Show file tree
Hide file tree
Showing 11 changed files with 234 additions and 70 deletions.
163 changes: 154 additions & 9 deletions src/base/profile.nim
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import threading
export threading
import comms/comms, stdUtils, base/basicOps
import os, strutils, sequtils, std/monotimes
import os, strutils, sequtils, std/monotimes, std/tables, std/algorithm, strformat
export monotimes
getOptimPragmas()

Expand Down Expand Up @@ -49,6 +49,10 @@ type
CodePoint = distinct int32
CodePointObj = object
toDropTimer: bool
#nsec: int64
#overhead: int64
count: uint32
dropcount: uint32
name: string
loc: II
SString = static[string] | string
Expand Down Expand Up @@ -117,6 +121,7 @@ iterator items[T](ls:List[T]):T =

proc setLen(ls:var RTInfoObjList, len:int32) {.borrow.}
proc free(ls:var RTInfoObjList) {.borrow.}
proc add(ls:var RTInfoObjList, x:RTInfoObj) {.borrow.}
template len(ls:RTInfoObjList):int32 = List[RTInfoObj](ls).len
template `[]`(ls:RTInfoObjList, n:int32):untyped = List[RTInfoObj](ls)[n]
iterator mitems(ls:RTInfoObjList):var RTInfoObj =
Expand Down Expand Up @@ -183,20 +188,49 @@ proc newCodePoint(ii:II, s:SString = ""):CodePoint =
let n = cpHeap.len
cpHeap.setlen(n+1)
cpHeap[n].toDropTimer = false
cpHeap[n].count = 0
cpHeap[n].dropcount = 0
cpHeap[n].name = s
cpHeap[n].loc = ii
CodePoint(n)

template name(x:CodePoint):string = cpHeap[x.int].name
template loc(x:CodePoint):II = cpHeap[x.int].loc
template count(x:CodePoint):uint32 = cpHeap[x.int].count
template dropcount(x:CodePoint):uint32 = cpHeap[x.int].dropcount
template toDropTimer(x:CodePoint):bool = cpHeap[x.int].toDropTimer
template dropTimer(x:CodePoint) =
toDropTimer(x) = true
#echo "dropTimer: ", cpHeap[x.int].loc, " ", cpHeap[x.int].name

template overhead(x:RTInfoObj):untyped = x.overhead
template childrenOverhead(x:RTInfoObj):untyped = x.childrenOverhead
template istic(x:RTInfoObj):bool = x.prev.isNil
template isnottic(x:RTInfoObj):bool = not x.prev.isNil

template dropTimer(x:RTInfo):untyped = toDropTimer(rtiStack[x.int].curr) = true
template toDropTimer(x:RTInfoObj):bool = toDropTimer(x.curr)
template dropTimer(x:RTInfoObj) = dropTimer(x.curr)
proc dropTimerRecursive(x:RTInfoObj) =
#echo "dropTimerRecursive: ", x.prev.loc, " ", x.curr.loc, " ", x.curr.name
dropTimer(x)
for i in 0..<x.children.len:
dropTimer(x.children[i].tic)
dropTimer(x.children[i].prev)
dropTimer(x.children[i].curr)
dropTimerRecursive(x.children[i])
proc dropTimerChildren(x:RTInfoObj) =
#echo "dropTimerRecursive: ", x.prev.loc, " ", x.curr.loc, " ", x.curr.name
#dropTimer(x)
for i in 0..<x.children.len:
#dropTimer(x.children[i].tic)
#echo "dropTimer: ", x.children[i].tic.name, " ", x.children[i].curr.name, " ", x.children[i].curr.loc
dropTimer(x.children[i].prev)
#dropTimer(x.children[i].curr)
dropTimerChildren(x.children[i])

template toDropTimer(x:RTInfo):bool = toDropTimer(rtiStack[x.int])
template dropTimer(x:RTInfo) = dropTimer(rtiStack[x.int])
#template dropTimerRecursive(x:RTInfo) = dropTimerRecursive(rtiStack[x.int])
template dropTimerChildren(x:RTInfo) = dropTimerChildren(rtiStack[x.int])

template identical(x,y:RTInfoObj):bool =
x.tic == y.tic and x.prev == y.prev and x.curr == y.curr
Expand Down Expand Up @@ -399,12 +433,19 @@ proc tocSet(localTimer:var TicType, prevRTI:var RTInfo, restartTimer:var bool,
for i in 0..<c.len:
if toDropTimer(c[i].prev):
oh -= c[i].childrenOverhead
if oh.float / ns.float > DropWasteTimerRatio:
#if ii.filename != "cg.nim":
inc thisCode.count
if oh.float > ns.float*DropWasteTimerRatio:
#if not toDropTimer(prevRTI) and oh.float > ns.float*DropWasteTimerRatio:
inc thisCode.dropcount
#if ii.filename != "scg.nim":
# echo "drop timer: ", oh.float, "/", ns.float, "=", oh.float / ns.float
# echo " ", prevRTI.int, " ", thisRTI.int, " ", ii, " ", s
# Signal stop if the overhead is too large.
dropTimer(prevRTI)
if thisCode.dropcount > 10 and thisCode.dropcount*10 > thisCode.count:
#echo "dropTimer: ", rtiStack[thisRTI.int].tic.name, " ", thisCode.name, " ", thisCode.loc
dropTimer(prevRTI)
dropTimerChildren(thisRTI)
#dropTimer(thisCode)
if toDropTimer(thisCode):
freezeTimers()
restartTimer = true
Expand Down Expand Up @@ -574,14 +615,14 @@ type
Tstr = tuple
label: string
stats: string
func markMissing(p:bool,str:string):string =
if p: "[" & str & "]"
else: str
template ppT(ts: RTInfoObjList, prefix = "-", total = 0'i64, overhead = 0'i64,
count = 0'u32, initIx = 0, showAbove = 0.0, showDropped = true): seq[Tstr] =
ppT(List[RTInfoObj](ts), prefix, total, overhead, count, initIx, showAbove, showDropped)
proc ppT(ts: List[RTInfoObj], prefix = "-", total = 0'i64, overhead = 0'i64,
count = 0'u32, initIx = 0, showAbove = 0.0, showDropped = true): seq[Tstr] =
proc markMissing(p:bool,str:string):string =
if p: "[" & str & "]"
else: str
var
sub:int64 = 0
subo:int64 = 0
Expand Down Expand Up @@ -686,6 +727,110 @@ proc echoTimersRaw* =
echo cpHeap
echo rtiStack

proc getName(t: ptr RTInfoObj): string =
let tn = t.tic.name
let pn = t.prev.name
let name = (if tn=="":"" else:tn&":") & (if pn=="":"" else:pn&"-") & t.curr.name
if t.prev.toDropTimer:
result = "{" & name & "}"
else: result = name

var hs = initTable[string,RTInfoObj]()
proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] =
var nstot = int64 0
var ohtot = int64 0
for ri in lrti:
let nc = ri.count
if ri.istic or nc==0: continue
let
f0 = splitFile(ri.prev.loc.filename)[1]
l0 = ri.prev.loc.line
f = splitFile(ri.curr.loc.filename)[1]
l = ri.curr.loc.line
loc = f0 & "(" & $l0 & "-" & (if f==f0:"" else:f) & $l & ") " & getName(addr ri)
coh = ri.childrenOverhead
soh = ri.overhead
nsec = ri.nsec
ns = nsec - coh
oh = soh + coh
nstot += ns
ohtot += oh
hs.withValue(loc, t): # t is found value for loc
t.nsec += ri.nsec
t.flops += ri.flops
t.overhead += ri.overhead
t.childrenOverhead += ri.childrenOverhead
t.count += ri.count
for i in 0..<ri.children.len:
var j = 0
while j < t.children.len:
if ri.children[i].curr.loc != t.children[int32 j].curr.loc: inc j; continue
if ri.children[i].tic.name != t.children[int32 j].tic.name: inc j; continue
break
if j < t.children.len:
t.children[int32 j].nsec += ri.children[i].nsec
t.children[int32 j].overhead += ri.children[i].overhead
else:
t.children.add ri.children[i]
do: # loc not found
hs[loc] = ri
#let tot = makeHotSpotTable(List[RTInfoObj](ri.children))
return (nstot, ohtot)

proc echoHotspots* =
let tot = makeHotspotTable(rtiStack)
#let nstot = tot.ns
let ohtot = tot.oh
let nloc = 24
var skeys = newSeq[tuple[ns:int64,loc:string]]()
var maxcount = 0
var nstot = 0.0
for k,v in hs:
if v.children.len>0:
let ins = v.nsec - v.childrenOverhead
skeys.add (ns: ins, loc: "incl" & k)
#if v.children.len>0:
var sns = v.nsec
if v.children.len==0: sns -= v.childrenOverhead
for i in 0..<v.children.len:
sns -= v.children[i].nsec + v.children[i].overhead
skeys.add (ns: sns, loc: "self" & k)
maxcount = max(maxcount, int v.count)
nstot += sns
skeys.sort(system.cmp, Sortorder.Descending)
let countdigits = max(5, 1 + int log10(float maxcount))
echo "Profile time: ", (1e-9*nstot)|(0,6), " overhead: ", (1e-9*ohtot)|(0,6), " total: ", (1e-9*(nstot+ohtot))|(0,6)
echo "% time % self count Mf/s #child location name (#child: I=inclusive, S=self)"
var tsns = 0.0
for nk in skeys:
let incl = nk.loc.startsWith("incl")
let key = nk.loc[4..^1]
hs.withValue(key, t):
let
pct = 100.0 * nk.ns / nstot
count = t.count|countdigits
#coh = t.childrenOverhead
#soh = t.overhead
#oh = soh + coh
#ohpct = 100.0 * oh / nstot
mfs = 1000.0 * t.flops / nk.ns.float
mf = (if t.flops>0.0: mfs.int|7 else: " ")
nc = (if t.children.len>0: t.children.len|4 else: " ")
f0 = splitFile(t.prev.loc.filename)[1]
l0 = t.prev.loc.line
f = splitFile(t.curr.loc.filename)[1]
l = t.curr.loc.line
loc = f0 & "(" & $l0 & "-" & (if f==f0:"" else:f) & $l & ")"
lc = loc|(-nloc,'.')
nm = getName(t)
#echo &"{pct:6.3f} {ohpct:6.3f} {count} {mf} {nc} {lc} {nm}"
if incl:
echo &"{pct:6.3f} {count} {mf} {nc} I {lc} {nm}"
else:
tsns += nk.ns
let tsnspct = 100.0 * tsns / nstot
echo &"{pct:6.3f} {tsnspct:7.3f} {count} {mf} {nc} S {lc} {nm}"

when isMainModule:
import os
proc test =
Expand Down
12 changes: 6 additions & 6 deletions src/base/threading.nim
Original file line number Diff line number Diff line change
Expand Up @@ -162,26 +162,26 @@ template t0waitX* =
for b in 1..<numThreads:
let p{.volatile.} = threadLocals.share[b].counter.addr
while true:
fence()
if p[] >= tbar0: break
else:
inc threadLocals.share[threadNum].counter
#fence()
fence()
template t0wait* = threadBarrier()
fence()
template t0wait* = t0waitO()

template twait0O* = threadBarrier()
template twait0X* =
if threadNum==0:
inc threadLocals.share[0].counter
#fence()
fence()
else:
inc threadLocals.share[threadNum].counter
let tbar0 = threadLocals.share[threadNum].counter
let p{.volatile.} = threadLocals.share[0].counter.addr
while true:
fence()
if p[] >= tbar0: break
fence()
template twait0* = threadBarrier()
template twait0* = twait0O()

template threadBarrier* =
#t0waitX
Expand Down
5 changes: 5 additions & 0 deletions src/comms/commsUtils.nim
Original file line number Diff line number Diff line change
Expand Up @@ -249,13 +249,18 @@ template threadRankSum1*[T](comm: Comm, a: T) =
var ta{.global.}: type(a)
#var ta2{.global.}:array[512,type(a)]
if threadNum==0:
tic("threadRankSum1")
t0wait()
toc("t0wait")
for i in 1..<numThreads:
a += cast[ptr type(a)](threadLocals.share[i].p)[]
#a += ta2[threadNum]
toc("sum")
rankSum(comm,a)
toc("rankSum")
ta = a
twait0()
toc("twait0")
else:
threadLocals.share[threadNum].p = a.addr
#ta2[threadNum] = a
Expand Down
10 changes: 5 additions & 5 deletions src/examples/puregaugehb2du1.nim
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import qex
import gauge, physics/qcdTypes
import os, strutils, times

proc sumStaples(f:any, g:any, mu:int, sg:any, ftmp:any, sftmp:any) =
proc sumStaples(f:auto, g:auto, mu:int, sg:auto, ftmp:auto, sftmp:auto) =
f := 0
for nu in 0..<g.len:
if mu==nu: continue
Expand All @@ -23,7 +23,7 @@ type HeatBath[F,E] = object
sftmp: array[2,seq[Shifter[F,E]]]
subs: array[2,Subset]

proc newHeatBath(lo:any):auto =
proc newHeatBath(lo:auto):auto =
type F = typeof(lo.ColorMatrix())
var f = lo.ColorMatrix()
type E = type(f[0])
Expand All @@ -40,7 +40,7 @@ proc newHeatBath(lo:any):auto =
r.subs[1].layoutSubset(lo,"o")
r

proc evolve(h:HeatBath, g:array or seq, gc:GaugeActionCoeffs, r:any) =
proc evolve(h:HeatBath, g:array or seq, gc:GaugeActionCoeffs, r:auto) =
tic("heatbath")
const nc = g[0][0].nrows
let
Expand Down Expand Up @@ -103,7 +103,7 @@ proc evolve(h:HeatBath, g:array or seq, gc:GaugeActionCoeffs, r:any) =

proc topo2DU1(g:array or seq):float =
tic()
const nc = g[0][0].nrows
#const nc = g[0][0].nrows
let
lo = g[0].l
nd = lo.nDim
Expand Down Expand Up @@ -143,7 +143,7 @@ threads: echo "thread ", threadNum, "/", numThreads
let
gc = GaugeActionCoeffs(plaq: beta)
lo = lat.newLayout
vol = lo.physVol
#vol = lo.physVol

var r = lo.newRNGField(RngMilc6, seed)

Expand Down
Loading

0 comments on commit de23e27

Please sign in to comment.