From b7d6e27793c264c90dfc591884cd24bafc9c7e33 Mon Sep 17 00:00:00 2001
From: Carsten Bauer <crstnbr@gmail.com>
Date: Fri, 2 Feb 2024 11:42:17 +0100
Subject: [PATCH 1/2] tmp

---
 docs/src/examples/integration/integration.jl | 49 ++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 docs/src/examples/integration/integration.jl

diff --git a/docs/src/examples/integration/integration.jl b/docs/src/examples/integration/integration.jl
new file mode 100644
index 00000000..5e5b0b20
--- /dev/null
+++ b/docs/src/examples/integration/integration.jl
@@ -0,0 +1,49 @@
+# # Trapezoidal Integration
+#
+
+# Function to be integrated (from 0 to 1). The analytic result is π.
+
+f(x) = 4 * √(1 - x^2)
+
+# Integration routine
+
+"Evaluate definite integral (from `a` to `b`) by using the trapezoidal rule."
+function trapezoidal(a, b, n, h)
+    y = (f(a) + f(b)) / 2.0
+    for i in 1:n-1
+        x = a + i * h
+        y = y + f(x)
+    end
+    return y * h
+end
+
+using Base.Threads: nthreads
+
+N = 10_000
+
+function integrate_parallel(N)
+    # compute local integration interval etc.
+    h     = 1.0 / N
+    a_loc = [i * n_loc * h for i in 0:nthreads()-1]
+    b_loc = a_loc + n_loc * h
+
+    result = tmapreduce(+, ...) do
+        trapezoidal(a_loc, b_loc, n, h)
+    end
+
+    res_loc = trapezoidal(a_loc, b_loc, n_loc, h)
+
+    println("π (numerical integration) ≈ ", res)
+end
+
+
+# perform local integration
+res_loc = trapezoidal(a_loc, b_loc, n_loc, h)
+
+# parallel reduction
+res = MPI.Reduce(res_loc, +, comm)
+
+# print result
+if 0 == rank
+    @printf("π (numerical integration) ≈ %20.16f\\n", res)
+end

From 635a50f8d3a6b6ad1aedb0b2bb09e9192681bf96 Mon Sep 17 00:00:00 2001
From: Carsten Bauer <crstnbr@gmail.com>
Date: Mon, 5 Feb 2024 17:54:33 +0100
Subject: [PATCH 2/2] done

---
 docs/make.jl                                 |   1 +
 docs/src/examples/integration/Project.toml   |   3 +
 docs/src/examples/integration/integration.jl |  71 +++++++----
 docs/src/examples/integration/integration.md | 124 +++++++++++++++++++
 4 files changed, 174 insertions(+), 25 deletions(-)
 create mode 100644 docs/src/examples/integration/Project.toml
 create mode 100644 docs/src/examples/integration/integration.md

diff --git a/docs/make.jl b/docs/make.jl
index 91f343b9..9e569b5b 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -16,6 +16,7 @@ makedocs(;
          "Examples" => [
              "Parallel Monte Carlo" => "examples/mc/mc.md",
              "Julia Set" => "examples/juliaset/juliaset.md",
+             "Trapezoidal Integration" => "examples/integration/integration.md",
          ],
         #  "Explanations" => [
         #      "B" => "explanations/B.md",
diff --git a/docs/src/examples/integration/Project.toml b/docs/src/examples/integration/Project.toml
new file mode 100644
index 00000000..e3884437
--- /dev/null
+++ b/docs/src/examples/integration/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
diff --git a/docs/src/examples/integration/integration.jl b/docs/src/examples/integration/integration.jl
index 5e5b0b20..e9c0c59c 100644
--- a/docs/src/examples/integration/integration.jl
+++ b/docs/src/examples/integration/integration.jl
@@ -1,49 +1,70 @@
 # # Trapezoidal Integration
 #
-
-# Function to be integrated (from 0 to 1). The analytic result is π.
+# In this example, we want to parallelize the computation of a simple numerical integral
+# via the trapezoidal rule. The latter is given by
+#
+# $\int_{a}^{b}f(x)\,dx \approx h \sum_{i=1}^{N}\frac{f(x_{i-1})+f(x_{i})}{2}.$
+#
+# The function to be integrated is the following.
 
 f(x) = 4 * √(1 - x^2)
 
-# Integration routine
+# The analytic result of the definite integral (from 0 to 1) is known to be $\pi$.
+#
+# ## Sequential
+#
+# Naturally, we implement the trapezoidal rule as a straightforward, sequential `for` loop.
 
-"Evaluate definite integral (from `a` to `b`) by using the trapezoidal rule."
-function trapezoidal(a, b, n, h)
+function trapezoidal(a, b, n; h = (b - a) / n)
     y = (f(a) + f(b)) / 2.0
-    for i in 1:n-1
+    for i in 1:(n - 1)
         x = a + i * h
         y = y + f(x)
     end
     return y * h
 end
 
+# Let's compute the integral of `f` above and see if we get the expected result.
+# For simplicity, we choose `N`, the number of panels used to discretize the integration
+# interval, as a multiple of the number of available Julia threads.
+
 using Base.Threads: nthreads
+@show nthreads()
+N = nthreads() * 1_000_000
 
-N = 10_000
+# Calling `trapezoidal` we do indeed find the (approximate) value of $\pi$.
 
-function integrate_parallel(N)
-    # compute local integration interval etc.
-    h     = 1.0 / N
-    a_loc = [i * n_loc * h for i in 0:nthreads()-1]
-    b_loc = a_loc + n_loc * h
+trapezoidal(0, 1, N) ≈ π
 
-    result = tmapreduce(+, ...) do
-        trapezoidal(a_loc, b_loc, n, h)
-    end
+# ## Parallel
+#
+# Our strategy is the following: Divide the integration interval among the available
+# Julia threads. On each thread, use the sequential trapezoidal rule to compute the partial
+# integral.
+# It is straightforward to implement this strategy with `tmapreduce`. The `map` part
+# is, essentially, the application of `trapezoidal` and the reduction operator is chosen to
+# be `+` to sum up the local integrals.
 
-    res_loc = trapezoidal(a_loc, b_loc, n_loc, h)
+using OhMyThreads
 
-    println("π (numerical integration) ≈ ", res)
+function trapezoidal_parallel(a, b, N)
+    n = N ÷ nthreads()
+    h = (b - a) / N
+    return tmapreduce(+, 1:nthreads()) do i
+        local α = a + (i - 1) * n * h
+        local β = α + n * h
+        trapezoidal(α, β, n; h)
+    end
 end
 
+# First, we check the correctness of our parallel implementation.
+trapezoidal_parallel(0, 1, N) ≈ π
 
-# perform local integration
-res_loc = trapezoidal(a_loc, b_loc, n_loc, h)
+# Then, we benchmark and compare the performance of the sequential and parallel versions.
 
-# parallel reduction
-res = MPI.Reduce(res_loc, +, comm)
+using BenchmarkTools
+@btime trapezoidal(0, 1, $N);
+@btime trapezoidal_parallel(0, 1, $N);
 
-# print result
-if 0 == rank
-    @printf("π (numerical integration) ≈ %20.16f\\n", res)
-end
+# Because the problem is trivially parallel - all threads to the same thing and don't need
+# to communicate - we expect an ideal speedup of (close to) the number of available threads.
diff --git a/docs/src/examples/integration/integration.md b/docs/src/examples/integration/integration.md
new file mode 100644
index 00000000..67b70584
--- /dev/null
+++ b/docs/src/examples/integration/integration.md
@@ -0,0 +1,124 @@
+```@meta
+EditURL = "integration.jl"
+```
+
+# Trapezoidal Integration
+
+In this example, we want to parallelize the computation of a simple numerical integral
+via the trapezoidal rule. The latter is given by
+
+$\int_{a}^{b}f(x)\,dx \approx h \sum_{i=1}^{N}\frac{f(x_{i-1})+f(x_{i})}{2}.$
+
+The function to be integrated is the following.
+
+````julia
+f(x) = 4 * √(1 - x^2)
+````
+
+````
+f (generic function with 1 method)
+````
+
+The analytic result of the definite integral (from 0 to 1) is known to be $\pi$.
+
+## Sequential
+
+Naturally, we implement the trapezoidal rule as a straightforward, sequential `for` loop.
+
+````julia
+function trapezoidal(a, b, n; h = (b - a) / n)
+    y = (f(a) + f(b)) / 2.0
+    for i in 1:(n - 1)
+        x = a + i * h
+        y = y + f(x)
+    end
+    return y * h
+end
+````
+
+````
+trapezoidal (generic function with 1 method)
+````
+
+Let's compute the integral of `f` above and see if we get the expected result.
+For simplicity, we choose `N`, the number of panels used to discretize the integration
+interval, as a multiple of the number of available Julia threads.
+
+````julia
+using Base.Threads: nthreads
+@show nthreads()
+N = nthreads() * 1_000_000
+````
+
+````
+5000000
+````
+
+Calling `trapezoidal` we do indeed find the (approximate) value of $\pi$.
+
+````julia
+trapezoidal(0, 1, N) ≈ π
+````
+
+````
+true
+````
+
+## Parallel
+
+Our strategy is the following: Divide the integration interval among the available
+Julia threads. On each thread, use the sequential trapezoidal rule to compute the partial
+integral.
+It is straightforward to implement this strategy with `tmapreduce`. The `map` part
+is, essentially, the application of `trapezoidal` and the reduction operator is chosen to
+be `+` to sum up the local integrals.
+
+````julia
+using OhMyThreads
+
+function trapezoidal_parallel(a, b, N)
+    n = N ÷ nthreads()
+    h = (b - a) / N
+    return tmapreduce(+, 1:nthreads()) do i
+        local α = a + (i - 1) * n * h
+        local β = α + n * h
+        trapezoidal(α, β, n; h)
+    end
+end
+````
+
+````
+trapezoidal_parallel (generic function with 1 method)
+````
+
+First, we check the correctness of our parallel implementation.
+
+````julia
+trapezoidal_parallel(0, 1, N) ≈ π
+````
+
+````
+true
+````
+
+Then, we benchmark and compare the performance of the sequential and parallel versions.
+
+````julia
+using BenchmarkTools
+@btime trapezoidal(0, 1, $N);
+@btime trapezoidal_parallel(0, 1, $N);
+````
+
+````
+  13.871 ms (0 allocations: 0 bytes)
+  2.781 ms (38 allocations: 3.19 KiB)
+
+````
+
+Because the problem is trivially parallel - all threads to the same thing and don't need
+to communicate - we expect an ideal speedup of (close to) the number of available threads.
+
+---
+
+*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*
+