fixed log_likelihood.cu merge
daurer committed Mar 25, 2022
1 parent b3681ec commit af9cc8c
Showing 1 changed file with 97 additions and 22 deletions.
119 changes: 97 additions & 22 deletions ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu
@@ -31,8 +31,8 @@ template <class AUX_T>
 inline __device__ void log_likelihood_impl(
     int nmodes,
     AUX_T *aux,
-    const IN_TYPE *weights,
-    const IN_TYPE *I,
+    const IN_TYPE *fmask,
+    const IN_TYPE *fmag,
     const int *addr,
     IN_TYPE *llerr,
     int A,
@@ -47,8 +47,8 @@ inline __device__ void log_likelihood_impl(
   const int *ma = addr + 12 + (blockIdx.x * nmodes) * addr_stride;
 
   aux += ea[0] * A * B;
-  weights += da[0] * A * B;
-  I += ma[0] * A * B;
+  fmag += da[0] * A * B;
+  fmask += ma[0] * A * B;
   llerr += da[0] * A * B;
   MATH_TYPE norm = A * B;
 
@@ -57,13 +57,13 @@ inline __device__ void log_likelihood_impl(
     for (int b = tx; b < B; b += blockDim.x)
     {
       MATH_TYPE acc = 0.0;
-      MATH_TYPE i = I[a * B + b];
       for (int idx = 0; idx < nmodes; ++idx)
       {
         acc += aux_intensity(aux[a * B + b + idx * A * B]);
       }
+      auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]);
       llerr[a * B + b] =
-          MATH_TYPE(weights[a * B + b]) * (acc - i) * (acc - i) / norm;
+          MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm;
     }
   }
 }
@@ -75,36 +75,36 @@ inline __device__ void log_likelihood_impl(
 extern "C" __global__ void __launch_bounds__(1024, 2)
     log_likelihood(int nmodes,
                    complex<OUT_TYPE> *aux,
-                   const IN_TYPE *weights,
-                   const IN_TYPE *I,
+                   const IN_TYPE *fmask,
+                   const IN_TYPE *fmag,
                    const int *addr,
                    IN_TYPE *llerr,
                    int A,
                    int B)
 {
-  log_likelihood_impl(nmodes, aux, weights, I, addr, llerr, A, B);
+  log_likelihood_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B);
 }
 
 extern "C" __global__ void __launch_bounds__(1024, 2)
     log_likelihood_auxintensity(int nmodes,
                                 OUT_TYPE *aux,
-                                const IN_TYPE *weights,
-                                const IN_TYPE *I,
+                                const IN_TYPE *fmask,
+                                const IN_TYPE *fmag,
                                 const int *addr,
                                 IN_TYPE *llerr,
                                 int A,
                                 int B)
 {
-  log_likelihood_impl(nmodes, aux, weights, I, addr, llerr, A, B);
+  log_likelihood_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B);
 }
 
 /////////////////// version with 1 thread block per x dimension only
 template <class AUX_T>
 __device__ inline void log_likelihood2_impl(
     int nmodes,
     AUX_T *aux,
-    const IN_TYPE *weights,
-    const IN_TYPE *I,
+    const IN_TYPE *fmask,
+    const IN_TYPE *fmag,
     const int *addr,
     IN_TYPE *llerr,
     int A,
@@ -122,26 +122,101 @@ __device__ inline void log_likelihood2_impl(
   const int *ma = addr + 12 + (bid * nmodes) * addr_stride;
 
   aux += ea[0] * A * B;
-  weights += da[0] * A * B;
-  I += ma[0] * A * B;
+  fmag += da[0] * A * B;
+  fmask += ma[0] * A * B;
   llerr += da[0] * A * B;
   MATH_TYPE norm = A * B;
 
   for (int b = tx; b < B; b += blockDim.x)
   {
     MATH_TYPE acc = 0.0;
-    MATH_TYPE i = I[a * B + b];
     for (int idx = 0; idx < nmodes; ++idx)
     {
       acc += aux_intensity(aux[a * B + b + idx * A * B]);
     }
+    auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]);
     llerr[a * B + b] =
-        MATH_TYPE(weights[a * B + b]) * (acc - i) * (acc - i) / norm;
+        MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm;
   }
 }
 
 // ML variant which uses weights and intensity directly.
 // Based of log_likelihood
 extern "C" __global__ void __launch_bounds__(1024, 2)
     log_likelihood2(int nmodes,
                     complex<OUT_TYPE> *aux,
+                    const IN_TYPE *fmask,
+                    const IN_TYPE *fmag,
+                    const int *addr,
+                    IN_TYPE *llerr,
+                    int A,
+                    int B)
+{
+  log_likelihood2_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B);
+}
+
+extern "C" __global__ void
+    log_likelihood2_auxintensity(int nmodes,
+                                 OUT_TYPE *aux,
+                                 const IN_TYPE *fmask,
+                                 const IN_TYPE *fmag,
+                                 const int *addr,
+                                 IN_TYPE *llerr,
+                                 int A,
+                                 int B)
+{
+  log_likelihood2_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B);
+}
+
+
+
+// ML variant which uses weights and intensity directly.
+// Based of log_likelihood
+template <class AUX_T>
+inline __device__ void log_likelihood_ml_impl(
+    int nmodes,
+    AUX_T *aux,
+    const IN_TYPE *weights,
+    const IN_TYPE *I,
+    const int *addr,
+    IN_TYPE *llerr,
+    int A,
+    int B)
+{
+  int tx = threadIdx.x;
+  int ty = threadIdx.y;
+  int addr_stride = 15;
+
+  const int *ea = addr + 6 + (blockIdx.x * nmodes) * addr_stride;
+  const int *da = addr + 9 + (blockIdx.x * nmodes) * addr_stride;
+  const int *ma = addr + 12 + (blockIdx.x * nmodes) * addr_stride;
+
+  aux += ea[0] * A * B;
+  weights += da[0] * A * B;
+  I += ma[0] * A * B;
+  llerr += da[0] * A * B;
+  MATH_TYPE norm = A * B;
+
+  for (int a = ty; a < A; a += blockDim.y)
+  {
+    for (int b = tx; b < B; b += blockDim.x)
+    {
+      MATH_TYPE acc = 0.0;
+      MATH_TYPE i = I[a * B + b];
+      for (int idx = 0; idx < nmodes; ++idx)
+      {
+        acc += aux_intensity(aux[a * B + b + idx * A * B]);
+      }
+      llerr[a * B + b] =
+          MATH_TYPE(weights[a * B + b]) * (acc - i) * (acc - i) / norm;
+    }
+  }
+}
+
+// specify max number of threads/block and min number of blocks per SM,
+// to assist the compiler in register optimisations.
+// We achieve a higher occupancy in this case, as less registers are used
+// (guided by profiler)
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    log_likelihood_ml(int nmodes,
+                      complex<OUT_TYPE> *aux,
@@ -152,11 +227,11 @@ extern "C" __global__ void __launch_bounds__(1024, 2)
                       int A,
                       int B)
 {
-  log_likelihood2_impl(nmodes, aux, weights, I, addr, llerr, A, B);
+  log_likelihood_ml_impl(nmodes, aux, weights, I, addr, llerr, A, B);
 }
 
-extern "C" __global__ void
-    log_likelihood2_auxintensity(int nmodes,
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    log_likelihood_ml_auxintensity(int nmodes,
                                    OUT_TYPE *aux,
                                    const IN_TYPE *weights,
                                    const IN_TYPE *I,
@@ -165,5 +240,5 @@ extern "C" __global__ void
                                    int A,
                                    int B)
 {
-  log_likelihood2_impl(nmodes, aux, weights, I, addr, llerr, A, B);
+  log_likelihood_ml_impl(nmodes, aux, weights, I, addr, llerr, A, B);
 }
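
Note on the two kernel families separated by this commit: the fmask/fmag kernels score the summed mode intensity against the squared measured magnitude, with a +1 regulariser in the denominator, while the _ml kernels apply a per-pixel weight to a plain squared intensity difference. The host-side sketch below is an illustrative reconstruction of those per-pixel formulas for a single A x B frame, not code from the commit; the function names, the use of std::vector, and the flattened single-frame layout (the addr-based batching is deliberately omitted) are assumptions.

// Illustrative host-side reference (hypothetical, not part of the commit):
// per-pixel error terms matching the two kernel families for one A x B frame
// with nmodes incoherent modes.
#include <cstddef>
#include <vector>

// Mirrors log_likelihood / log_likelihood2:
//   fmask * (I_model - fmag^2)^2 / (fmag^2 + 1) / (A*B)
std::vector<float> ref_log_likelihood(const std::vector<float> &mode_intensity, // nmodes*A*B
                                      const std::vector<float> &fmask,          // A*B
                                      const std::vector<float> &fmag,           // A*B
                                      int nmodes, int A, int B)
{
  std::vector<float> llerr(static_cast<std::size_t>(A) * B);
  const float norm = static_cast<float>(A) * B;
  for (int p = 0; p < A * B; ++p)
  {
    float acc = 0.0f; // model intensity summed over modes
    for (int m = 0; m < nmodes; ++m)
      acc += mode_intensity[static_cast<std::size_t>(m) * A * B + p];
    const float I = fmag[p] * fmag[p]; // measured intensity from magnitude
    llerr[p] = fmask[p] * (acc - I) * (acc - I) / (I + 1.0f) / norm;
  }
  return llerr;
}

// Mirrors log_likelihood_ml / log_likelihood_ml_auxintensity:
//   weights * (I_model - I)^2 / (A*B), with the intensity I given directly
std::vector<float> ref_log_likelihood_ml(const std::vector<float> &mode_intensity,
                                         const std::vector<float> &weights,
                                         const std::vector<float> &I,
                                         int nmodes, int A, int B)
{
  std::vector<float> llerr(static_cast<std::size_t>(A) * B);
  const float norm = static_cast<float>(A) * B;
  for (int p = 0; p < A * B; ++p)
  {
    float acc = 0.0f;
    for (int m = 0; m < nmodes; ++m)
      acc += mode_intensity[static_cast<std::size_t>(m) * A * B + p];
    llerr[p] = weights[p] * (acc - I[p]) * (acc - I[p]) / norm;
  }
  return llerr;
}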

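The __launch_bounds__(1024, 2) qualifier that the restored comment block refers to tells the compiler to budget registers for blocks of up to 1024 threads while keeping at least 2 blocks resident per SM. A minimal standalone CUDA sketch of the same pattern follows; the toy kernel, block shape, and array size are assumptions for illustration and are not taken from ptypy's host code.

// Minimal illustration (hypothetical) of the __launch_bounds__ pattern used above:
// cap register use for 1024-thread blocks and request >= 2 resident blocks per SM.
#include <cuda_runtime.h>

__global__ void __launch_bounds__(1024, 2)
    scale_kernel(float *data, float factor, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] *= factor;
}

int main()
{
  const int n = 1 << 20;
  float *d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemset(d_data, 0, n * sizeof(float));

  // 1024 threads per block: the maximum allowed by the launch bounds above.
  const dim3 block(1024);
  const dim3 grid((n + block.x - 1) / block.x);
  scale_kernel<<<grid, block>>>(d_data, 2.0f, n);
  cudaDeviceSynchronize();

  cudaFree(d_data);
  return 0;
}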