Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiment with host code generation on old GPGPU 2020 benchmarks #230

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ modules.xml
*.pdf
*.gz
*.sc

*.o
7 changes: 7 additions & 0 deletions src/main/scala/apps/harrisCornerDetection.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ import shine.OpenCL.KernelExecutor._

import scala.reflect.ClassTag

/** This version of Harris follows from the following paper:
* https://dl.acm.org/doi/abs/10.1145/2568058.2568067
*
* Compared to Halide's version:
* - it starts from grayscale images instead of color images
* - it uses a binomial filter instead of a box filter
*/
object harrisCornerDetection {
private val C2D = separableConvolution2D
private val id = C2D.id
Expand Down
242 changes: 242 additions & 0 deletions src/main/scala/apps/harrisCornerDetection2/binomialCoarsity.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
package apps.harrisCornerDetection2

import rise.core._
import rise.core.DSL._
import rise.core.primitives.{id => _, _}
import Type._
import rise.core.types._
import rise.core.types.DataType._
import rise.openCL.DSL._
import rise.openCL.primitives.oclRotateValues
import shine.OpenCL.{GlobalSize, LocalSize}

object binomialCoarsity {
def check(module: shine.OpenCL.Module, h: Int, w: Int, kappa: Float): Unit = {
val main = s"""
#include "src/main/scala/apps/harrisCornerDetection2/common.cpp"

int main(int argc, char** argv) {
Context ctx = createDefaultContext();
size_t in_bytes = $h * $w * sizeof(float);
size_t out_bytes = ${h - 2*bd_h} * ${w - 2*bd_w} * sizeof(float);
Buffer input_ixx = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
Buffer input_ixy = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
Buffer input_iyy = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
Buffer output = createBuffer(ctx, out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE);

float* sxx_gold = (float*) malloc(out_bytes);
float* sxy_gold = (float*) malloc(out_bytes);
float* syy_gold = (float*) malloc(out_bytes);
float* out_gold = (float*) malloc(out_bytes);

std::random_device rand_d;
std::default_random_engine rand_e(rand_d());
// bigger range results in higher output differences
std::uniform_real_distribution<float> dist(0, 200);

float* in_ixx = (float*) hostBufferSync(ctx, input_ixx, in_bytes, HOST_WRITE | HOST_READ);
float* in_ixy = (float*) hostBufferSync(ctx, input_ixy, in_bytes, HOST_WRITE | HOST_READ);
float* in_iyy = (float*) hostBufferSync(ctx, input_iyy, in_bytes, HOST_WRITE | HOST_READ);
for (int y = 0; y < $h; y++) {
for (int x = 0; x < $w; x++) {
in_ixx[y*$w + x] = dist(rand_e);
in_ixy[y*$w + x] = dist(rand_e);
in_iyy[y*$w + x] = dist(rand_e);
}
}

binomial_gold(sxx_gold, $h, $w, in_ixx);
binomial_gold(sxy_gold, $h, $w, in_ixy);
binomial_gold(syy_gold, $h, $w, in_iyy);
coarsity_gold(out_gold, ${h - 2*bd_h}, ${w - 2*bd_w}, sxx_gold, sxy_gold, syy_gold, $kappa);

foo_init_run(ctx, output, $h, $w, input_ixx, input_ixy, input_iyy, $kappa);

ErrorStats errors;
init_error_stats(&errors);
float* out = (float*) hostBufferSync(ctx, output, out_bytes, HOST_READ);
accumulate_error_stats(&errors, out, out_gold, ${h - 2*bd_h}, ${w - 2*bd_w});
finish_error_stats(&errors, 0.05, 0.0001);

free(sxx_gold);
free(sxy_gold);
free(syy_gold);
free(out_gold);
destroyBuffer(ctx, input_ixx);
destroyBuffer(ctx, input_ixy);
destroyBuffer(ctx, input_iyy);
destroyBuffer(ctx, output);
destroyContext(ctx);
return EXIT_SUCCESS;
}
"""
util.ExecuteOpenCL.using_cpp(main, module, "one_copy")
}

val base: ToBeTyped[Expr] =
depFun(hFrom(3), (h: Nat) =>
depFun(wFrom(12), (w: Nat) => fun(
(h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
)((sxx, sxy, syy, kappa) =>
oclRun(LocalSize(1), GlobalSize(num_threads))(
makeArray(3)(sxx)(sxy)(syy) |>
transpose >> map(transpose) >>
map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >>
drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >>
map(transpose) >>
mapGlobal(mapSeq(
map(transpose) >> transpose >>
toPrivateFun(mapSeqUnroll(fun(nbh =>
dotSeqU(join(binomialWeights2d))(join(nbh))
))) >>
letf(fun(s => {
val sxx = s `@` lidx(0, 3)
val sxy = s `@` lidx(1, 3)
val syy = s `@` lidx(2, 3)
val det = sxx * syy - sxy * sxy
val trace = sxx + syy
det - kappa * trace * trace
}))
))
))))

val lineVec: ToBeTyped[Expr] =
depFun(hFrom(3), (h: Nat) =>
depFun(wFrom(12), (w: Nat) => fun(
(h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
)((sxx, sxy, syy, kappa) =>
oclRun(LocalSize(1), GlobalSize(num_threads))(
makeArray(3)(sxx)(sxy)(syy) |>
map(
map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >>
drop(bd_h-1) >> take(h - 2*(bd_h-1)) >>
map(asVectorAligned(vecw))
) >>
transpose >> map(transpose) >>
slide(3)(1) >> mapGlobal(
transpose >> map(transpose) >>
mapSeq(mapSeqUnroll(dotSeqUWV(binomialWeightsV))) >>
toGlobal >>
slide(3)(1) >>
mapSeq(
transpose >> map(shuffle) >>
toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >>
letf(fun(s => {
val sxx = s `@` lidx(0, 3)
val sxy = s `@` lidx(1, 3)
val syy = s `@` lidx(2, 3)
val det = sxx * syy - sxy * sxy
val trace = sxx + syy
det - vectorFromScalar(kappa) * trace * trace
}))
) >> asScalar
)
))))

val rotvVec: ToBeTyped[Expr] =
depFun(hFrom(3), (h: Nat) =>
depFun(wFrom(12), (w: Nat) => fun(
(h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32)
->: f32 ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
)((sxx, sxy, syy, kappa) =>
oclRun(LocalSize(1), GlobalSize(num_threads))(
makeArray(3)(sxx)(sxy)(syy) |>
map(
map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >>
drop(bd_h-1) >> take(h - 2*(bd_h-1)) >>
map(asVectorAligned(vecw))
) >>
transpose >> map(transpose) >>
slide(3)(1) >> mapGlobal(
transpose >> map(transpose) >>
map(map(dotSeqUWV(binomialWeightsV))) >>
oclRotateValues(AddressSpace.Private)(3)(mapSeqUnroll(id)) >> iterateStream(
transpose >> map(shuffle) >>
toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >>
letf(fun(s => {
val sxx = s `@` lidx(0, 3)
val sxy = s `@` lidx(1, 3)
val syy = s `@` lidx(2, 3)
val det = sxx * syy - sxy * sxy
val trace = sxx + syy
det - vectorFromScalar(kappa) * trace * trace
}))
) >> asScalar
)
))))

val tile: ToBeTyped[Expr] = {
val tile_x_in = tile_x + 2
val tile_y_in = tile_y + 2
depFun(hFrom(tile_y), (h: Nat) =>
depFun(wFrom(tile_x), (w: Nat) => fun(
(h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
)((sxx, sxy, syy, kappa) =>
oclRun(LocalSize((tile_x, tile_y)), GlobalSize((w - 2*bd_w, h - 2*bd_h)))(
makeArray(3)(sxx)(sxy)(syy) |>
transpose >> map(transpose) >>
map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(tile_x_in)(tile_x)) >>
drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >>
map(transpose) >>
map(map(
map(slide(3)(1)) >> slide(3)(1) >> map(transpose)
)) >>
mapWorkGroup(1)(mapWorkGroup(0)(
mapLocal(1)(mapLocal(0)(
map(transpose) >> transpose >>
toPrivateFun(mapSeqUnroll(fun(nbh =>
dotSeqU(join(binomialWeights2d))(join(nbh))
))) >>
letf(fun(s => {
val sxx = s `@` lidx(0, 3)
val sxy = s `@` lidx(1, 3)
val syy = s `@` lidx(2, 3)
val det = sxx * syy - sxy * sxy
val trace = sxx + syy
det - kappa * trace * trace
}))
))
)) >> map(transpose) >> join >> map(join)
))))
}

val tileVec: ToBeTyped[Expr] = {
val tile_x_in = tile_x + 2*vecw
val tile_y_in = tile_y + 2
depFun(hFrom(tile_y), (h: Nat) =>
depFun(wFrom(tile_x), (w: Nat) => fun(
(h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
)((sxx, sxy, syy, kappa) =>
oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))(
makeArray(3)(sxx)(sxy)(syy) |>
map(
map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> slide(tile_x_in)(tile_x)) >>
drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >>
map(transpose) >>
map(map(
map(asVectorAligned(vecw) >> slide(3)(1)) >> slide(3)(1) >> map(transpose)
))
) >>
transpose >> map(transpose >> map(transpose >> map(transpose))) >>
mapWorkGroup(1)(mapWorkGroup(0)(
mapLocal(1)(mapLocal(0)(
toPrivateFun(mapSeqUnroll(fun(nbh =>
dotSeqUWV(join(binomialWeights2d))(join(map(shuffle)(nbh)))
))) >>
letf(fun(s => {
val sxx = s `@` lidx(0, 3)
val sxy = s `@` lidx(1, 3)
val syy = s `@` lidx(2, 3)
val det = sxx * syy - sxy * sxy
val trace = sxx + syy
det - vectorFromScalar(kappa) * trace * trace
}))
))
)) >> map(transpose) >> join >> map(join >> asScalar)
))))
}
}
Loading