Skip to content

Commit

Permalink
Merge branch 'master' of gitlab.com:oddkiva/sara
Browse files Browse the repository at this point in the history
  • Loading branch information
oddkiva committed Jan 10, 2024
2 parents 1ed950a + 6eab6d7 commit e0af2ee
Show file tree
Hide file tree
Showing 8 changed files with 517 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ auto main() -> int
auto kernel = Halide::Func{"kernel"};

input(x, y) = x + y;
kernel(x) = Halide::exp(-x * x);
kernel(x) = Halide::exp(-x);

auto conv_x = Halide::Func{"conv_x"};
auto conv_y = Halide::Func{"conv_y"};
Expand All @@ -28,19 +28,26 @@ auto main() -> int
conv_x(x, y) = Halide::sum(input(x + k - ksz / 2, y) * kernel(k), "conv_x");
conv_y(x, y) = Halide::sum(conv_x(x, y + k - ksz / 2) * kernel(k), "conv_y");

// The schedule
kernel.compute_root();

conv_y //
.tile(x, y, xo, yo, xi, yi, 64, 64)
.fuse(xo, yo, tile)
.parallel(tile)
// .parallel(yo)
.vectorize(xi, 16, Halide::TailStrategy::GuardWithIf) //
.vectorize(xi, 4, Halide::TailStrategy::GuardWithIf) //
;
conv_x //
.compute_at(conv_y, xi) //
// .vectorize(k, 16, Halide::TailStrategy::GuardWithIf) //
// .store_at(conv_y, tile)
.compute_at(conv_y, xi) //
.vectorize(x, 4, Halide::TailStrategy::GuardWithIf) //
;

conv_y.print_loop_nest();
conv_y.compile_to_lowered_stmt("separable_conv_2d.stmt.html", {},
Halide::HTML);
// conv_y.compile_to_assembly("separable_conv_2d.s", {});

return 0;
}
3 changes: 3 additions & 0 deletions cpp/examples/Shakti/Halide/Generators/halide_aot_example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ auto halide_pipeline(int argc, char** argv) -> int
//
// Parallelizing the implementation of the linear filtering with OpenMP,
// we are then down to 25ms, not bad at all for a very minimal change!
//
// I have implemented a better schedule for the CPU; it performs better than
// the Halide GPU implementation (OMG!).
apply_gaussian_filter(frame_gray32f, frame_gray32f_blurred, sigma,
truncation_factor);
shakti_gray32f_to_rgb8u_cpu(buffer_gray32f_blurred, buffer_gray8);
Expand Down
16 changes: 11 additions & 5 deletions cpp/src/DO/Shakti/Halide/Components/SeparableConvolution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,19 +146,25 @@ namespace DO::Shakti::HalideBackend {
.vectorize(xi, 16, Halide::TailStrategy::GuardWithIf)
.parallel(y);
conv_x.compute_at(conv_y, x).vectorize(
x, 16, Halide::TailStrategy::GuardWithIf);
x, 32, Halide::TailStrategy::GuardWithIf);
#else
// This schedule is a lot better on these machines on a 4K video:
// - Apple Silicon M2 Max
// - CPU Intel(R) Core(TM) i7-6800K CPU @ 3.40GHz (cat /proc/cpuinfo)
//
// 13-19 ms instead of 25-27 ms
const auto tile = Halide::Var{"tile"};
conv_y
.tile(x, y, xo, yo, xi, yi, 64, 64,
Halide::TailStrategy::GuardWithIf)
.fuse(xo, yo, tile)
.parallel(tile)
.vectorize(xi, 16, Halide::TailStrategy::GuardWithIf);
conv_x //
.store_at(conv_y, tile)
.vectorize(xi, 32,
Halide::TailStrategy::GuardWithIf); // 32 is the
// good size.
conv_x //
.compute_at(conv_y, tile)
.vectorize(x, 16, Halide::TailStrategy::GuardWithIf);
.vectorize(x, 32, Halide::TailStrategy::GuardWithIf);
#endif
}
}
Expand Down
1 change: 1 addition & 0 deletions doc/book/_bookdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ rmd_files:
"random.Rmd",
"random/python_code_that_suck_less.Rmd",
"random/bilinear_interpolation.Rmd",
# "random/vector_intrinsics.Rmd",
"bibliography.Rmd"
]
latex: [
Expand Down
4 changes: 4 additions & 0 deletions doc/book/random/python_code_that_suck_less.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,10 @@ const auto& x_ijk = x[{i, j, k}];
This is aesthetically better and more meaningful than
```{cpp}
const auto& xijk = ...; // too crammed
// The following is fine too as typing the underscore symbol can be a hassle.
// I am not a snake_case zealot either...
const auto xi = x[i];
```

or the ugly
Expand Down
Loading

0 comments on commit e0af2ee

Please sign in to comment.