Merge branch 'master' of gitlab.com:oddkiva/sara

oddkiva · Jan 10, 2024 · e0af2ee · e0af2ee
2 parents 1ed950a + 6eab6d7
commit e0af2ee
Show file tree

Hide file tree

Showing 8 changed files with 517 additions and 11 deletions.
diff --git a/cpp/examples/Shakti/Halide/Components/halide_print_convolution_schedule.cpp b/cpp/examples/Shakti/Halide/Components/halide_print_convolution_schedule.cpp
@@ -16,7 +16,7 @@ auto main() -> int
   auto kernel = Halide::Func{"kernel"};
 
   input(x, y) = x + y;
-  kernel(x) = Halide::exp(-x * x);
+  kernel(x) = Halide::exp(-x);
 
   auto conv_x = Halide::Func{"conv_x"};
   auto conv_y = Halide::Func{"conv_y"};
@@ -28,19 +28,26 @@ auto main() -> int
   conv_x(x, y) = Halide::sum(input(x + k - ksz / 2, y) * kernel(k), "conv_x");
   conv_y(x, y) = Halide::sum(conv_x(x, y + k - ksz / 2) * kernel(k), "conv_y");
 
+  // The schedule
+  kernel.compute_root();
+
   conv_y  //
       .tile(x, y, xo, yo, xi, yi, 64, 64)
       .fuse(xo, yo, tile)
       .parallel(tile)
       // .parallel(yo)
-      .vectorize(xi, 16, Halide::TailStrategy::GuardWithIf)  //
+      .vectorize(xi, 4, Halide::TailStrategy::GuardWithIf)  //
       ;
   conv_x  //
-      .compute_at(conv_y, xi)  //
-      // .vectorize(k, 16, Halide::TailStrategy::GuardWithIf)  //
+          // .store_at(conv_y, tile)
+      .compute_at(conv_y, xi)                               //
+      .vectorize(x, 4, Halide::TailStrategy::GuardWithIf)  //
       ;
 
   conv_y.print_loop_nest();
+  conv_y.compile_to_lowered_stmt("separable_conv_2d.stmt.html", {},
+                                 Halide::HTML);
+  // conv_y.compile_to_assembly("separable_conv_2d.s", {});
 
   return 0;
 }
diff --git a/cpp/examples/Shakti/Halide/Generators/halide_aot_example.cpp b/cpp/examples/Shakti/Halide/Generators/halide_aot_example.cpp
@@ -98,6 +98,9 @@ auto halide_pipeline(int argc, char** argv) -> int
       //
       // Parallelizing the implementation of the linear filtering with OpenMP,
       // we are then down to 25ms, not bad at all for a very minimal change!
+      //
+      // I have implemented a better schedule for the CPU; it performs better
+      // than the Halide GPU implementation (OMG!).
       apply_gaussian_filter(frame_gray32f, frame_gray32f_blurred, sigma,
                             truncation_factor);
       shakti_gray32f_to_rgb8u_cpu(buffer_gray32f_blurred, buffer_gray8);

diff --git a/cpp/src/DO/Shakti/Halide/Components/SeparableConvolution.hpp b/cpp/src/DO/Shakti/Halide/Components/SeparableConvolution.hpp
@@ -146,19 +146,25 @@ namespace DO::Shakti::HalideBackend {
             .vectorize(xi, 16, Halide::TailStrategy::GuardWithIf)
             .parallel(y);
         conv_x.compute_at(conv_y, x).vectorize(
-            x, 16, Halide::TailStrategy::GuardWithIf);
+            x, 32, Halide::TailStrategy::GuardWithIf);
 #else
+        // This schedule is a lot better on these machines on a 4K video:
+        // - Apple Silicon M2 Max
+        // - CPU Intel(R) Core(TM) i7-6800K CPU @ 3.40GHz (cat /proc/cpuinfo)
+        //
+        // 13-19 ms instead of 25-27 ms
         const auto tile = Halide::Var{"tile"};
         conv_y
             .tile(x, y, xo, yo, xi, yi, 64, 64,
                   Halide::TailStrategy::GuardWithIf)
             .fuse(xo, yo, tile)
             .parallel(tile)
-            .vectorize(xi, 16, Halide::TailStrategy::GuardWithIf);
-        conv_x  //
-            .store_at(conv_y, tile)
+            .vectorize(xi, 32,
+                       Halide::TailStrategy::GuardWithIf);  // 32 is the
+                                                            // good size.
+        conv_x                                              //
             .compute_at(conv_y, tile)
-            .vectorize(x, 16, Halide::TailStrategy::GuardWithIf);
+            .vectorize(x, 32, Halide::TailStrategy::GuardWithIf);
 #endif
       }
     }

diff --git a/doc/book/_bookdown.yml b/doc/book/_bookdown.yml
@@ -43,6 +43,7 @@ rmd_files:
     "random.Rmd",
     "random/python_code_that_suck_less.Rmd",
     "random/bilinear_interpolation.Rmd",
+    # "random/vector_intrinsics.Rmd",
     "bibliography.Rmd"
   ]
   latex: [

diff --git a/doc/book/random/python_code_that_suck_less.Rmd b/doc/book/random/python_code_that_suck_less.Rmd
@@ -326,6 +326,10 @@ const auto& x_ijk = x[{i, j, k}];
 This is aesthetically better and more meaningful than
 ```{cpp}
 const auto& xijk = ...; // too crammed
+
+// The following is fine too, as typing the underscore symbol can be a hassle.
+// I am not a snake_case zealot either...
+const auto xi = x[i];
 ```
 
 or the ugly