Skip to content

Commit

Permalink
fix threading issue
Browse files Browse the repository at this point in the history
fix issue when doing multiple hotspot outputs
add single precision versions of some benchmarks
  • Loading branch information
jcosborn committed Nov 19, 2024
1 parent d2032ec commit 5d6b7b4
Show file tree
Hide file tree
Showing 10 changed files with 222 additions and 66 deletions.
3 changes: 2 additions & 1 deletion src/base/backtrace.nim
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ proc backtrace_symbols(buffer: ptr pointer, size: cint): ptr UncheckedArray[cstr

proc print_trace =
#void *array[10];
const nmax = 10
const nmax = 20
var arr: array[nmax, pointer]
#char **strings;

Expand All @@ -22,6 +22,7 @@ proc print_trace =

proc sigtrace(sig: cint) {.noconv.} =
  ## Signal handler registered by `setTrace` (for SIGSEGV): prints a
  ## backtrace via `print_trace` and then terminates the process.
  ##
  ## `sig` is the signal number handed in by the C runtime; it is not used.
  print_trace()
  # A bare `quit()` exits with QuitSuccess (0), which would report a crashed
  # run as successful to the shell, a test harness or a batch system.
  quit(QuitFailure)

proc setTrace* =
c_signal(SIGSEGV, sigtrace)
Expand Down
11 changes: 11 additions & 0 deletions src/base/omp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ else:
#forceOmpOn()
#{. emit:["#pragma omp ", p] .}
{. emit:["_Pragma(\"omp ", p, "\")"] .}
template ompPragma(p:string,body:typed) =
## Emit `_Pragma("omp <p>")` directly in front of `body`.
## Used in this module for the "atomic read acquire" and
## "atomic write release" directives.
# stackTrace/lineTrace/line_dir are switched off around the pair; presumably
# this keeps generated trace or #line code from landing between the pragma
# and the statement it has to precede -- TODO confirm.
{. push stackTrace:off, lineTrace:off, line_dir:off .}
{. emit:["_Pragma(\"omp ", p, "\")"] .}
body
# Undo the push above.
{. pop .}
template ompBlock*(p:string; body:untyped) =
#{. emit:"#pragma omp " & p .}
#{. emit:"{ /* Inserted by ompBlock " & p & " */".}
Expand All @@ -38,6 +43,12 @@ else:
#{. emit:"} /* End ompBlock " & p & " */".}

# Stand-alone OpenMP directives, each emitted through `ompPragma`.
template ompBarrier* = ompPragma("barrier")
template ompFlush* = ompPragma("flush")
template ompFlushAcquire* = ompPragma("flush acquire")
template ompFlushRelease* = ompPragma("flush release")
template ompFlushSeqCst* = ompPragma("flush seq_cst")
# Directives placed in front of `body`: "atomic read acquire" and
# "atomic write release" respectively.
template ompAtomicRead*(body) = ompPragma("atomic read acquire", body)
template ompAtomicWrite*(body) = ompPragma("atomic write release", body)

template ompParallel*(body:untyped) =
ompBlock("parallel"):
Expand Down
1 change: 1 addition & 0 deletions src/base/profile.nim
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,7 @@ proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] =
return (nstot, ohtot)

proc echoHotspots* =
hs.clear
let tot = makeHotspotTable(rtiStack)
#let nstot = tot.ns
let ohtot = tot.oh
Expand Down
13 changes: 7 additions & 6 deletions src/base/stdUtils.nim
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ type
#template `[]`*(x: cArray): untyped = addr x[0]
#template `&`*(x: ptr cArray): untyped = addr x[0]

template ptrInt*(x:untyped):untyped = cast[ByteAddress](x)
template addrInt*(x:untyped):untyped = cast[ByteAddress](addr(x))
template unsafeAddrInt*(x:untyped):untyped = cast[ByteAddress](addr(x))
template toHex*(x: ptr typed): untyped = toHex(cast[ByteAddress](x))
template ptrInt*(x: auto): auto = cast[uint](x)
template addrInt*(x: auto): auto = cast[uint](addr(x))
template unsafeAddrInt*(x: auto): auto = cast[uint](addr(x))
template toHex*(x: ptr auto): auto = toHex(cast[uint](x))
template toHex*(x: pointer): auto = toHex(cast[uint](x))

type
ConstInt* {.importc:"const int".} = object
Expand All @@ -31,7 +32,7 @@ proc isInteger*(s: string):bool =
var t:int
parseInt(s, t) == s.len

template `$&`*(x: untyped): string =
template `$&`*(x: auto): string =
toHex(unsafeAddrInt(x))

proc `|`*(s: string, d: tuple[w:int,c:char]): string =
Expand All @@ -52,7 +53,7 @@ proc `|`*(f: SomeFloat, d: tuple[w,p: int]): string =
formatFloat(f, ffDefault, d.p) | d.w
proc `|`*(f: float, d: int): string =
## Format `f` by forwarding to the tuple overload of `|` with `d` used for
## both tuple fields (`w` and `p`).
f | (d,d)
template `|-`*(x:SomeNumber, y: int): untyped =
template `|-`*(x:SomeNumber, y: int): auto =
x | -y

proc indexOf*[T](x: openArray[T], y: auto): int =
Expand Down
218 changes: 167 additions & 51 deletions src/base/threading.nim
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import stdUtils
import macros
import omp
import metaUtils
import base/basicOps
getOptimPragmas

type
ThreadShare* = object
Expand All @@ -18,19 +20,34 @@ var threadNum*{.threadvar.}:int
var numThreads*{.threadvar.}:int
var threadLocals*{.threadvar.}:ThreadObj
# Set to true by `init`; `threadsInit` only calls `init` while this is false.
var inited = false
# Untyped pointer to the ThreadShare storage managed by `allocTs`
# (nil until the first allocation).
var ts: pointer = nil
# Number of ThreadShare entries `ts` was last sized for.
var nts = 0

template initThreadLocals*(ts:seq[ThreadShare]):untyped =
proc allocTs* {.alwaysInline.} =
  ## Make sure the shared `ts` buffer has room for `numThreads` ThreadShare
  ## entries.  Only thread 0 ever allocates, and only when the buffer has to
  ## grow; every other call is a no-op.
  if threadNum != 0 or numThreads <= nts:
    return
  let nbytes = numThreads*sizeof(ThreadShare)
  # First use allocates a fresh block, later growth resizes the existing one.
  ts = if ts == nil: allocShared(nbytes)
       else: reallocShared(ts, nbytes)
  nts = numThreads

#template initThreadLocals*(ts:seq[ThreadShare]) =
template initThreadLocals* =
bind ts
threadLocals.threadNum = threadNum
threadLocals.numThreads = numThreads
threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr)
#threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr)
threadLocals.share = cast[ptr cArray[ThreadShare]](ts)
threadLocals.share[threadNum].p = nil
threadLocals.share[threadNum].counter = 0
proc init =
inited = true
threadNum = 0
numThreads = 1
var ts = newSeq[ThreadShare](numThreads)
initThreadLocals(ts)
#var ts = newSeq[ThreadShare](numThreads)
#initThreadLocals(ts)
allocTs()
initThreadLocals()
template threadsInit* =
if not inited:
init()
Expand All @@ -57,51 +74,61 @@ template emitStackTrace: untyped =
template threads*(body:untyped):untyped =
checkInit()
doAssert(numThreads==1)
let tidOld = threadNum
let nidOld = numThreads
let tlOld = threadLocals
#let tidOld = threadNum
#let nidOld = numThreads
#let tlOld = threadLocals
#proc tproc2{.genSym,inline.} =
# body
proc tproc{.genSym.} =
emitStackTrace()
var ts:seq[ThreadShare]
#var ts:seq[ThreadShare]
ompParallel:
threadNum = ompGetThreadNum()
numThreads = ompGetNumThreads()
if threadNum==0: ts.newSeq(numThreads)
#if threadNum==0: ts.newSeq(numThreads)
allocTs()
threadBarrierO()
initThreadLocals(ts)
#initThreadLocals(ts)
initThreadLocals()
threadBarrierO()
#echoAll threadNum, " s: ", ptrInt(threadLocals.share)
body
#tproc2()
threadBarrierO()
tproc()
threadNum = tidOld
numThreads = nidOld
threadLocals = tlOld
#threadNum = tidOld
#numThreads = nidOld
#threadLocals = tlOld
threadNum = 0
numThreads = 1
initThreadLocals()
template threads*(x0:untyped;body:untyped):untyped =
checkInit()
let tidOld = threadNum
let nidOld = numThreads
let tlOld = threadLocals
#let tidOld = threadNum
#let nidOld = numThreads
#let tlOld = threadLocals
proc tproc(xx:var type(x0)) {.genSym.} =
var ts:seq[ThreadShare]
#var ts:seq[ThreadShare]
ompParallel:
threadNum = ompGetThreadNum()
numThreads = ompGetNumThreads()
if threadNum==0: ts.newSeq(numThreads)
#if threadNum==0: ts.newSeq(numThreads)
allocTs()
threadBarrierO()
initThreadLocals(ts)
#initThreadLocals(ts)
initThreadLocals()
threadBarrierO()
#echoAll threadNum, " s: ", ptrInt(threadLocals.share)
subst(x0,xx):
body
threadBarrierO()
tproc(x0)
threadNum = tidOld
numThreads = nidOld
threadLocals = tlOld
#threadNum = tidOld
#numThreads = nidOld
#threadLocals = tlOld
threadNum = 0
numThreads = 1
initThreadLocals()

template nothreads*(body: untyped): untyped =
## convenient way to turn off threading
Expand All @@ -114,13 +141,18 @@ template threadBarrierO* = ompBarrier
# Thin aliases that forward the thread* names to the omp* templates of the
# same suffix.
template threadMaster*(x:untyped) = ompMaster(x)
template threadSingle*(x:untyped) = ompSingle(x)
template threadCritical*(x:untyped) = ompCritical(x)
template threadFlush* = ompFlush
template threadFlushRelease* = ompFlushRelease
template threadFlushAcquire* = ompFlushAcquire
template threadFlushSeqCst* = ompFlushSeqCst
# `body` is forwarded unchanged to ompAtomicRead / ompAtomicWrite.
template threadAtomicRead*(body:typed) = ompAtomicRead(body)
template threadAtomicWrite*(body:typed) = ompAtomicWrite(body)

template threadDivideLow*(x,y: untyped): untyped =
## `x` plus the fraction threadNum/numThreads of the distance `y-x`
## (integer `div`); evaluates to `x` on thread 0.
## NOTE(review): `x` appears twice in the expansion, so an argument with
## side effects is evaluated twice.
x + (threadNum*(y-x)) div numThreads
template threadDivideHigh*(x,y: untyped): untyped =
## Same formula with threadNum+1: equals `threadDivideLow` of the next
## thread, and `y` on the last thread (threadNum == numThreads-1).
x + ((threadNum+1)*(y-x)) div numThreads


proc tForX*(index,i0,i1,body:NimNode):NimNode =
return quote do:
let d = 1+`i1` - `i0`
Expand Down Expand Up @@ -154,39 +186,123 @@ iterator `.|`*[S, T](a: S, b: T): T {.inline.} =
inc(res)
"""

template t0waitO* = threadBarrier()
template t0waitX* =
if threadNum==0:
inc threadLocals.share[0].counter
let tbar0 = threadLocals.share[0].counter
for b in 1..<numThreads:
let p{.volatile.} = threadLocals.share[b].counter.addr
template t0waitB* = threadBarrier()
#template t0waitO* = t0waitB()
template t0waitA* =
if numThreads > 1:
if threadNum==0:
inc threadLocals.share[0].counter
let tbar0 = threadLocals.share[0].counter
for b in 1..<numThreads:
let p = threadLocals.share[b].counter.addr
while true:
#fence()
#ompAcquire
#if p[] >= tbar0: break
var t {.noInit.}: type(p[])
ompAtomicRead: t = p[]
if t >= tbar0: break
else:
#inc threadLocals.share[threadNum].counter
#fence()
#ompRelease
let t = threadLocals.share[threadNum].counter + 1
ompAtomicWrite:
threadLocals.share[threadNum].counter = t
template t0wait* = t0waitA()
#template t0wait* = t0waitB()

template twait0B* = threadBarrier()
template twait0A* =
if numThreads > 1:
if threadNum==0:
#inc threadLocals.share[0].counter
#fence()
#ompRelease
let t = threadLocals.share[0].counter + 1
ompAtomicWrite:
threadLocals.share[0].counter = t
else:
inc threadLocals.share[threadNum].counter
let tbar0 = threadLocals.share[threadNum].counter
let p = threadLocals.share[0].counter.addr
while true:
fence()
if p[] >= tbar0: break
else:
inc threadLocals.share[threadNum].counter
fence()
template t0wait* = t0waitO()
#fence()
#ompAcquire
#if p[] >= tbar0: break
var t {.noInit.}: type(p[])
ompAtomicRead: t = p[]
if t >= tbar0: break
template twait0* = twait0A()
#template twait0* = twait0B()

template threadBarrierA* =
threadFlushRelease
t0waitA
twait0A
threadFlushAcquire
template threadBarrier* = threadBarrierA
#template threadBarrier* = ompBarrier

template twait0O* = threadBarrier()
template twait0X* =
template threadSum01A*[T](a: T) =
## sum value with result on thread 0, atomic version
if threadNum==0:
inc threadLocals.share[0].counter
fence()
tic("threadSum01A")
t0wait()
toc("t0wait")
for i in 1..<numThreads:
var p{.noInit.}: pointer
threadAtomicRead:
p = threadLocals.share[i].p
a += cast[ptr T](p)[]
toc("sum")
twait0()
toc("twait0")
else:
inc threadLocals.share[threadNum].counter
let tbar0 = threadLocals.share[threadNum].counter
let p{.volatile.} = threadLocals.share[0].counter.addr
while true:
fence()
if p[] >= tbar0: break
template twait0* = twait0O()
threadAtomicWrite:
threadLocals.share[threadNum].p = a.addr
t0wait()
twait0()

template threadSum01B*[T](a: T) =
## sum value with result on thread 0, barrier version
## Every thread other than 0 stores the address of its `a` in its own
## `threadLocals.share` slot; thread 0 then adds those values into its `a`.
## `a` on the other threads is not modified.
block:
tic("threadSum01")
if threadNum!=0:
threadLocals.share[threadNum].p = a.addr
# All addresses have to be stored before thread 0 starts reading them.
threadBarrier()
toc("threadBarrier first")
if threadNum==0:
for i in 1..<numThreads:
a += cast[ptr T](threadLocals.share[i].p)[]
toc("sum")
# Presumably holds the other threads until thread 0 has finished reading
# through their pointers -- TODO confirm.
threadBarrier()
toc("threadBarrier last")

# `threadSum01` selects the implementation: the atomic version
# (`threadSum01A`) is active, the barrier version is the commented-out line.
template threadSum01*(a: auto) = threadSum01A(a)
#template threadSum01*(a: auto) = threadSum01B(a)
# `threadSum0` forwards to `threadSum01`.
template threadSum0*(a: auto) = threadSum01(a)

template threadBarrier* =
#t0waitX
#twait0X
ompBarrier
# threadMax0 FIXME

template threadBroadcast1A*[T](a: T) =
  ## Copy the value of `a` on thread 0 into `a` on every other thread,
  ## atomic version.
  ##
  ## Thread 0 publishes the address of its `a` in `threadLocals.share[0].p`;
  ## the other threads load that pointer and assign through it.  `twait0` and
  ## `t0wait` bracket the exchange on all threads.
  if threadNum==0:
    # Label the profile region with this template's own name, as
    # `threadSum01A` does, instead of the copy-pasted "threadRankSum1".
    tic("threadBroadcast1A")
    threadAtomicWrite:
      threadLocals.share[0].p = a.addr
    # Signal the other threads that the pointer has been published.
    twait0()
    toc("twait0")
    # Presumably waits until every other thread has copied the value, so `a`
    # stays untouched while it is being read -- TODO confirm.
    t0wait()
    toc("t0wait")
  else:
    # Wait for thread 0's signal before loading the pointer.
    twait0()
    var p{.noInit.}: pointer
    threadAtomicRead:
      p = threadLocals.share[0].p
    a = cast[ptr T](p)[]
    t0wait()
# Forwarding chain: threadBroadcast -> threadBroadcast1 -> threadBroadcast1A.
template threadBroadcast1*(a: auto) = threadBroadcast1A(a)
template threadBroadcast*(a: auto) = threadBroadcast1(a)

macro threadSum*(a:varargs[untyped]):auto =
#echo a.treeRepr
Expand Down
3 changes: 3 additions & 0 deletions src/bench/benchLinalgS.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import base/globals
setDefaultSingle()
import benchLinalg
3 changes: 3 additions & 0 deletions src/bench/benchStagPropS.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import base/globals
setDefaultSingle()
import benchStagProp
Loading

0 comments on commit 5d6b7b4

Please sign in to comment.