diff --git a/src/bench/benchStagProp.nim b/src/bench/benchStagProp.nim index 9580ffc..2ac4f8f 100644 --- a/src/bench/benchStagProp.nim +++ b/src/bench/benchStagProp.nim @@ -18,8 +18,11 @@ var v1 = lo.ColorVector() var v2 = lo.ColorVector() var r = lo.ColorVector() var rs = newRNGField(RngMilc6, lo, intParam("seed", 987654321).uint64) +let warm0 = 0.29 + 500.0/lo.physVol.float +var warm = floatParam("warm", warm0) threads: - g.random rs + #g.random rs + g.warm warm, rs g.setBC g.stagPhase v1 := 0 @@ -67,6 +70,7 @@ for i in 0..3: g3[2*i] = g[i] g3[2*i+1] = lo.ColorMatrix() g3[2*i+1].randomSU rs + g3[2*i+1] *= 0.1 var s3 = newStag3(g3) #s3.D(v2, v1, m) s3.solve(v2, v1, mass, sp) diff --git a/src/comms/halo.nim b/src/comms/halo.nim new file mode 100644 index 0000000..ba2cc95 --- /dev/null +++ b/src/comms/halo.nim @@ -0,0 +1,441 @@ +import qex, base/hyper, comms/gather +getOptimPragmas() + +type + HaloLayout*[L] = ref object + lo*: L # layout + outerExt*: seq[int32] # extended outer lattice size + offset*: seq[int32] # offset of outer lattice in extended outer + lex*: seq[int32] # outerExt lex index of extended sites + index*: seq[int32] # index of extended site for given lex index + neighborFwd*: seq[seq[int32]] # fwd neighbor for extended outer lattice + neighborBck*: seq[seq[int32]] # bck neighbor for extended outer lattice + nOut*: int # sites in outer lattice + nExt*: int # sites in extended outer lattice + Halo*[L,F,T] = ref object + layout*: HaloLayout[L] + field*: F + halo*: alignedMem[T] + nOut*: int # sites in outer lattice + nExt*: int # sites in extended outer lattice + HaloMap*[L] = ref object + layout*: HaloLayout[L] + offsets*: seq[seq[int32]] # offsets of outer sites needed + gather*: GatherMap # gather map to fetch needed halo sites + gatherRev*: GatherMap # gather map to send halo sites back + GatherHalo*[F,T;Rev:static bool] = object + src: F + dest: ptr UncheckedArray[T] + elemsize: int + vlen: int + +proc norm2*(h: Halo): auto = + var s = h.halo[0].norm2 + for i in 1..= hi[i]: + result = false + break + +proc init[T](s: var seq[T], x: openArray[SomeNumber]) = + s.newSeq(x.len) + for i in 0..= hl.lo.physGeom[l]: y[l] -= int32 hl.lo.physGeom[l] + let ri = hl.lo.rankIndex(y) + let didx = vlen*(i-hl.nOut)+k + rl.add RecvList(didx: int32 didx, srank: int32 ri.rank, sidx: int32 ri.index) + sl.add SendList(sidx: int32 didx, drank: int32 ri.rank, didx: int32 ri.index) + toc("RecvList") + result.gather = c.makeGatherMap(rl) + result.gatherRev = c.makeGatherMap(sl) + toc("makeGatherMap") +proc makeHaloMap*[L,N](hl: HaloLayout[L], c: Comm, offsets: seq[array[N,SomeInteger]]): HaloMap[L] = + var o = newSeq[seq[int32]](offsets.len) + for i in 0.. 0: + h.map.neighborFwd[mu][i] + else: + h.map.neighborBck[mu][i] + +when isMainModule: + qexInit() + tic("main") + var defaultLat = @[4,4,4,4] + defaultSetup() + let nd = lo.nDim + var seed = 987654321'u + #var rng = newRngField(lo, RngMilc6, seed) + var rng = newRngField(lo, MRG32k3a, seed) + g.gaussian rng + toc "gaussian" + #var r0 = lo.Real() + #var cv1 = lo.ColorVector() + #var cv2 = lo.ColorVector() + echo 6.0 * g.plaq + toc "plaq" + #cv0.gaussian rng + resetTimers() + + proc testPlaq = + tic("testPlaq") + #let hl = lo.makeHaloLayout([1,1,1,1],[1,1,1,1]) + let hl = lo.makeHaloLayout([1,1,1,1],[0,0,0,0]) + toc "makeHaloLayout" + type HM = HaloMap[type lo] + let comm = getDefaultComm() + var hm = newSeq[HM](nd) + for mu in 0.. [0,1,0,0],[0,0,1,0],[0,0,0,1], +# [-1,0,0,0],[0,-1,0,0],[0,0,-1,0],[0,0,0,-1], +# [-1,1,0,0],[-1,0,1,0],[-1,0,0,1] + +# [1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1], +# [-1,0,0,0],[0,-1,0,0],[0,0,-1,0],[0,0,0,-1], +# [-1,1,0,0],[-1,0,1,0],[-1,0,0,1], +# [1,-1,0,0],[0,-1,1,0],[0,-1,0,1], +# [1,0,-1,0],[0,1,-1,0],[0,0,-1,1], +# [1,0,0,-1],[0,1,0,-1],[0,0,1,-1] + diff --git a/src/experimental/pgag.nim b/src/experimental/pgag.nim new file mode 100644 index 0000000..02c2b03 --- /dev/null +++ b/src/experimental/pgag.nim @@ -0,0 +1,785 @@ +import qex, gauge, physics/[qcdTypes,stagSolve] +import times, macros, algorithm +import hmc/metropolis +import observables/sources +import quda/qudaWrapper +import hmc/agradOps + +qexinit() + +let + lat = intSeqParam("lat", @[8,8,8,8]) + beta = floatParam("beta", 5.6) + tau0 = floatParam("tau", 0.04) + lam0 = floatParam("lam", 0.19) + sig0 = floatParam("sig", 0.3) + rho0 = floatParam("rho", 0.2) + theta0 = floatParam("theta", 0.2) + ups0 = floatParam("ups", 0.2) + upit = intParam("upit", 1) + gsteps0 = intParam("gsteps", 32) + nwarm = intParam("nwarm", 10) + trajs = intParam("trajs", 10) + seed0 = defaultComm.broadcast(int(1000*epochTime())) + seed = uint64 intParam("seed", seed0) + infn = stringParam("infn", "") + outfn = stringParam("outfn", "") +var + tau = tau0 + lam = lam0 + sig = sig0 + rho = rho0 + ups = ups0 + theta = theta0 + gsteps = gsteps0 + +macro echoparam(x: typed): untyped = + let n = x.repr + result = quote do: + echo `n`, ": ", `x` + +echoparam(beta) +echoparam(tau) +echoparam(lam) +echoparam(sig) +echoparam(rho) +echoparam(theta) +echoparam(ups) +echoparam(upit) +echoparam(gsteps) +echoparam(nwarm) +echoparam(trajs) +echoparam(seed) +echoparam(infn) +echoparam(outfn) + +let + gc = GaugeActionCoeffs(plaq: beta, adjplaq: 0) + lo = lat.newLayout + vol = lo.physVol + +var r = lo.newRNGField(RngMilc6, seed) +var R:RngMilc6 # global RNG +R.seed(seed, 987654321) + +var g = lo.newgauge +type + Gauge = typeof(g) + GaugeV = AgVar[Gauge] +template newGaugeV(c: AgTape, x: Gauge): auto = newGaugeFV(c, x) +case infn +of "": + g.random r + #g.unit +of "hot": + g.random r +of "cold": + g.unit +else: + echo "Loading gauge field from file: ", infn + let err = g.loadGauge infn + +echo "plaq: ", 6.0*g.plaq +echo "gaugeAction2: ", g.gaugeAction2 gc +echo "actionA: ", gc.actionA g + +var + p = lo.newgauge + f = lo.newgauge + g0 = lo.newgauge + +proc norm2*(x: seq): float = + for i in 0.. m.hOld: + costg = m.pAccept * (costg - vtau.obj * params[i].grad) + result.add costg + +proc checkGrad(m: Met) = + #let tg = vtau.grad + #let f0 = vtau.obj * exp(m.hOld-m.hNew) + let eps = 1e-7 + var gs = newSeq[float](0) + for i in 0.. 0: + echo "Starting warmups" + #gsteps = 60 + #fsteps = 40 + #setupMDx() + alwaysAccept = true + for n in 1..nwarm: + m.update + m.clearStats + +echo "Starting HMC" +#gsteps = gsteps0 +#fsteps = fsteps0 +#setupMD5() +alwaysAccept = false +gutime = 0.0 +gftime = 0.0 +#fftime = 0.0 +block: + tic() + for n in 1..trajs: + echo "Starting trajectory: ", n + tic() + m.update + getGrad(m) + if upit > 0: + if n mod upit == 0: + updateParams(0.001) + let tup = getElapsedTime() + measure() + let ttot = getElapsedTime() + echo "End trajectory update: ", tup, " measure: ", ttot-tup, " total: ", ttot + let et = getElapsedTime() + toc() + echo "HMC time: ", et + #let at = gutime + gftime + fftime + #echo &"gu: {gutime} gf: {gftime} ff: {fftime} ot: {et-at} tt: {et}" + +if outfn != "": + echo "Saving gauge field to file: ", outfn + let err = g.saveGauge outfn + +#echoTimers() +qexfinalize() diff --git a/src/gauge/hypsmear2.nim b/src/gauge/hypsmear2.nim new file mode 100644 index 0000000..2dda26d --- /dev/null +++ b/src/gauge/hypsmear2.nim @@ -0,0 +1,544 @@ +import base, layout, gauge, hypsmear, comms/halo, bitops +getOptimPragmas() + +#nflop = 61632.0 +# l1[mu,nu] : g[mu](0,nu,-nu) g[nu](0,mu)(0,-nu) +# l2[mu,nu] : g[mu](0) l1[mu,b](a,-a) l1[a,b](0,mu)(0,-a) +# fl[mu] : g[mu](0) l2[mu,nu](nu,-nu) l2[nu,mu](0,mu)(0,-nu) +# l2[mu,nu] : g[mu](0)+(0,b,-b)(a,-a) +# g[a](0,mu)(0,-a)(0,b,-b) +# g[b](0,mu)(0,-b)(0,a,-a) +# g[0]: (0,+-1,+-1,+-1) (-1,1,+-1,+-1) (-1,+-1,1,+-1) (-1,+-1,+-1,1) + +type + HypTemps*[L,F,T] = object + gf: array[4,F] + hl: HaloLayout[L] + hm: HaloMap[L] + l1x: FieldArray[L.V,T] + l2x: FieldArray[L.V,T] + l1: FieldArray[L.V,T] + l2: FieldArray[L.V,T] + flx: FieldArray[L.V,T] + hgf: array[4,Halo[L,F,T]] + h1x: array[4,array[4,Halo[L,F,T]]] + h2x: array[4,array[4,Halo[L,F,T]]] + h1: array[4,array[4,Halo[L,F,T]]] + h2: array[4,array[4,Halo[L,F,T]]] + info: PerfInfo + +template proj(x,y: auto) = x.projectU y +proc proj[T](x: T): T {.noInit,alwaysInline.} = result.proj x +template projDeriv(r: auto, x: auto, c: auto) = r.projectUDeriv(x, c) +template projDeriv(r: auto, u, x: auto, c: auto) = r.projectUDeriv(u, x, c) + +proc init*[L,F,T,G](ht: var HypTemps[L,F,T], gf: G, comm: Comm) = + tic "HypTemps init" + static: doAssert(type(gf[0]) is F) + static: doAssert(type(gf[0].l) is L) + static: doAssert(type(gf[0][0]) is T) + let lo = gf[0].l + doAssert(lo.nDim == 4) + ht.hl = lo.makeHaloLayout([1,1,1,1],[1,1,1,1]) + toc "makeHaloLayout" + ht.l1x = newFieldArray2(lo,F,[4,4],mu!=nu) + ht.l2x = newFieldArray2(lo,F,[4,4],mu!=nu) + ht.l1 = newFieldArray2(lo,F,[4,4],mu!=nu) + ht.l2 = newFieldArray2(lo,F,[4,4],mu!=nu) + ht.flx = newFieldArray(lo,F,4) + for mu in 0..<4: + ht.gf[mu] = gf[mu] + ht.hgf[mu] = makeHalo(ht.hl, gf[mu]) + for nu in 0..<4: + if nu == mu: continue + ht.h1x[mu][nu] = makeHalo(ht.hl, ht.l1x[mu,nu]) + ht.h2x[mu][nu] = makeHalo(ht.hl, ht.l2x[mu,nu]) + ht.h1[mu][nu] = makeHalo(ht.hl, ht.l1[mu,nu]) + ht.h2[mu][nu] = makeHalo(ht.hl, ht.l2[mu,nu]) + toc "makeHalo" + const hmtype = 0 + when hmtype == 0: + var offsets = newSeq[array[4,int32]](0) + for k in 0..15: # use all corners for simplicity + var t = [1'i32,1,1,1] + for i in 0..<4: + if k.testBit i: t[i] = -1 + offsets.add t + ht.hm = ht.hl.makeHaloMap(comm, offsets) + elif hmtype == 1: + var offsets = newSeq[array[4,int32]](0) + for k in 0..15: + var t = [1'i32,1,1,1] + for i in 0..<4: + if k.testBit i: t[i] = -1 + if k != 0 and k != 15: offsets.add t + for mu in 0..<4: + if t[mu] == 1: + t[mu] = 0 + offsets.add t + t[mu] = 1 + ht.hm = hl.makeHaloMap(comm, offsets) + toc "makeHaloMap" +proc newHypTemps*[G](gf: G): auto = + type + L = type gf[0].l + F = type gf[0] + T = type gf[0][0] + var ht: HypTemps[L,F,T] + let comm = getDefaultComm() + ht.init(gf, comm) + ht + +proc symstaple(s: var auto, a: float, nu0,mu0,nu1,nu2,mu1,nu3: auto) = + let t0 = mu0 * nu1.adj + let t1 = nu0 * t0 + s += a*t1 + let t2 = mu1 * nu3 + let t3 = nu2.adj * t2 + s += a*t3 + +proc symstaple(s: var auto, a: float, x,y: auto, i,fnu,fmu,bnu,fmubnu: SomeInteger, + p: static bool = false) = + when p: + template prj(m: auto): auto = proj(m) + else: + template prj(m: auto): auto = m + let xi = prj x[i] + let yfnu = prj y[fnu] + let xfmu = prj x[fmu] + let xbnu = prj x[bnu] + let ybnu = prj y[bnu] + let xfmubnu = prj x[fmubnu] + var t = xi * yfnu * xfmu.adj + t += xbnu.adj * ybnu * xfmubnu + s += a*t +template symstaplep(s: var auto, a: float, x,y: auto, i,fnu,fmu,bnu,fmubnu: SomeInteger) = + symstaple(s, a, x,y, i,fnu,fmu,bnu,fmubnu, true) + +proc symderiv(s: var auto, x,y,cx,cy: auto, i,fnu,fmu,bnu,fmubnu: SomeInteger, + p: static bool = false): int = + mixin projectUflops + when p: + template prj(m: auto): auto = proj(m) + else: + template prj(m: auto): auto = m + if fnu>=0 and fmu>=0: + let xi = prj x[i] + let yfnu = prj y[fnu] + let xfmu = prj x[fmu] + s += cx[i] * yfnu * xfmu.adj + s += xi * cy[fnu] * xfmu.adj + s += xi * yfnu * cx[fmu].adj + result += 3*projectUflops(3) + 3*(18+2*3*66) + if bnu>=0 and fmubnu>=0: + let xbnu = prj x[bnu] + let ybnu = prj y[bnu] + let xfmubnu = prj x[fmubnu] + s += cx[bnu].adj * ybnu * xfmubnu + s += xbnu.adj * cy[bnu] * xfmubnu + s += xbnu.adj * ybnu * cx[fmubnu] + result += 3*projectUflops(3) + 3*(18+2*3*66) +template symderivp(s: var auto, x,y,cx,cy: auto, i,fnu,fmu,bnu,fmubnu: SomeInteger): int = + symderiv(s, x,y,cx,cy, i,fnu,fmu,bnu,fmubnu, true) + +proc symderiv3(s: var auto, x,y,cx,cy: auto, i,fnu,fmu,bnu,fmubnu,nOut: SomeInteger, + p: static bool = false): int = + mixin projectUflops + when p: + template prj(m: auto): auto = proj(m) + else: + template prj(m: auto): auto = m + if fnu>=0 and fmu>=0: + let xi = prj x[i] + let yfnu = prj y[fnu] + let xfmu = prj x[fmu] + result += 3*projectUflops(3) + if i=0 and fmubnu>=0: + let xbnu = prj x[bnu] + let ybnu = prj y[bnu] + result += 2*projectUflops(3) + if bnu=0 and fmu>=0: + let xi = prj x[i] + let yfnu = prj y[fnu] + let xfmu = prj x[fmu] + s += cx[i] * yfnu * xfmu.adj + s += xi * cy[fnu] * xfmu.adj + s += xi * yfnu * cx[fmu].adj + inc result, 3 + if bnu>=0 and fmubnu>=0: + let xbnu = prj x[bnu] + let ybnu = prj y[bnu] + let xfmubnu = prj x[fmubnu] + s += cx[bnu].adj * ybnu * xfmubnu + s += xbnu.adj * cy[bnu] * xfmubnu + s += xbnu.adj * ybnu * cx[fmubnu] + inc result, 3 + +#[ +# x: side links, y: middle links +# snu: c xmu ynu' +# snu: y xmu cnu' +# snu: c' x-mu ynu-mu +# snu: y' x-mu cnu-mu +# smu: x cnu xmu' +# smu: x-nu' c-nu xmu-nu +# x, xmu, x-mu, x-nu, xmu-nu; y, ynu, ynu-mu; c, cnu, c-nu +proc symderiv(f: var auto, a: float, nu0,mu0,nu1,nu2,mu1,nu3,c,cnu: auto) = + let t0 = mu0 * nu1.adj + let t1 = nu0 * t0 + f += a*t1 + let t2 = mu1 * nu3 + let t3 = nu2.adj * t2 + f += a*t3 +]# + +proc smear*[L,F,T,G](ht: HypTemps[L,F,T], coef: HypCoefs, fl: G) = + tic "HypTemps smear" + static: doAssert(type(fl[0]) is F) + static: doAssert(type(fl[0].l) is L) + static: doAssert(type(fl[0][0]) is T) + let lo = fl[0].l + doAssert(lo.nDim == 4) + let comm = getDefaultComm() + for mu in 0..<4: + ht.hgf[mu].halo := 0 + ht.hgf[mu].update ht.hm, comm + toc "update" + const V = L.V + var + gf = ht.gf + hl = ht.hl + l1x = ht.l1x + l2x = ht.l2x + flx = ht.flx + hgf = ht.hgf + h1x = ht.h1x + h2x = ht.h2x + h1 = ht.h1 + h2 = ht.h2 + let + alp1 = coef.alpha1 / 2.0 + alp2 = coef.alpha2 / 4.0 + alp3 = coef.alpha3 / 6.0 + ma1 = 1 - coef.alpha1 + ma2 = 1 - coef.alpha2 + ma3 = 1 - coef.alpha3 + let nhalo = hl.nExt + threads: + let nc = gf[0][0].nrows + let staplesFlops = float((4*(6*nc+2*(nc-1))+4*2)*nc*nc*V) + let siteFlops1 = staplesFlops + float(2*nc*nc*V) + let siteFlops2 = 2*staplesFlops+float((2*nc*nc+12*projectUflops(nc))*V) + let siteFlops3 = 3*staplesFlops+float((2*nc*nc+19*projectUflops(nc))*V) + var flops = 0.0 + # h1x[mu][nu] mu,nu,-nu,mu-nu + tfor i, 0.. 0: + hl2[mu][nu][i].projDeriv(h2x[mu][nu][i], hl2[mu][nu][i]) + hf[mu][i] += ma2 * hl2[mu][nu][i] + hl2[mu][nu][i] *= alp2 + flops += V*(flp+54+2*projectUflops(nc)) # guess for projDeriv + flops.threadSum + toc "3", flops=flops + flops = 0 + tfor i, 0.. 0: + hl1[mu][nu][i].projDeriv(h1x[mu][nu][i], hl1[mu][nu][i]) + hf[mu][i] += ma1 * hl1[mu][nu][i] + hl1[mu][nu][i] *= alp1 + #t.projDeriv(h1x[mu][nu][i], t) + #hf[mu][i] += ma1 * t + #hl1[mu][nu][i] = alp1 * t + flops += V*(flp+54+2*projectUflops(nc)) # guess for projDeriv + flops.threadSum + toc "2", flops=flops + flops = 0 + tfor i, 0..