build based on 889765f

JuliaFolds2 · Feb 1, 2024 · 14b1d47 · 14b1d47
1 parent 54075af
commit 14b1d47
Show file tree

Hide file tree

Showing 15 changed files with 348 additions and 12 deletions.
diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json
@@ -1 +1 @@
-{"documenter":{"julia_version":"1.10.0","generation_timestamp":"2024-02-01T13:48:36","documenter_version":"1.2.1"}}
+{"documenter":{"julia_version":"1.10.0","generation_timestamp":"2024-02-01T19:36:19","documenter_version":"1.2.1"}}
diff --git a/dev/examples/Project.toml b/dev/examples/Project.toml
@@ -0,0 +1,5 @@
+[deps]
+Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
+
+[compat]
+Literate = "2.16"
diff --git a/dev/examples/juliaset/Project.toml b/dev/examples/juliaset/Project.toml
@@ -0,0 +1,5 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+DisplayAs = "0b91fe84-8a4c-11e9-3e1d-67c38462b6d6"
+OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
diff --git a/dev/examples/juliaset/juliaset-10.png b/dev/examples/juliaset/juliaset-10.png
diff --git a/dev/examples/juliaset/juliaset-8.png b/dev/examples/juliaset/juliaset-8.png
diff --git a/dev/examples/juliaset/juliaset.jl b/dev/examples/juliaset/juliaset.jl
@@ -0,0 +1,108 @@
+# # Julia Set
+#
+# In this example, we will compute an image of the
+# [Julia set](https://en.wikipedia.org/wiki/Julia_set) in parallel. We will explore
+# the `schedule` and `nchunks` options that can be used to get load balancing.
+#
+# The value of a single pixel of the Julia set, which corresponds to a point in the
+# complex number plane, can be computed by the following iteration procedure.
+
+function _compute_pixel(i, j, n; max_iter = 255, c = -0.79 + 0.15 * im)
+    x = -2.0 + (j - 1) * 4.0 / (n - 1)
+    y = -2.0 + (i - 1) * 4.0 / (n - 1)
+
+    z = x + y * im
+    iter = max_iter
+    for k in 1:max_iter
+        if abs2(z) > 4.0
+            iter = k - 1
+            break
+        end
+        z = z^2 + c
+    end
+    return iter
+end
+
+# Note that the value of the pixel is the number of performed iterations for the
+# corresponding complex input number. Hence, the computational **workload is non-uniform**.
+
+# ## Sequential computation
+#
+# In our naive implementation, we just loop over the dimensions of the image matrix and call
+# the pixel kernel above.
+
+function compute_juliaset_sequential!(img)
+    N = size(img, 1)
+    for j in 1:N
+        for i in 1:N
+            img[i, j] = _compute_pixel(i, j, N)
+        end
+    end
+    return img
+end
+
+N = 2000
+img = zeros(Int, N, N)
+compute_juliaset_sequential!(img);
+
+# Let's look at the result
+
+using Plots
+using DisplayAs #hide
+p = heatmap(img)
+DisplayAs.PNG(p) #hide
+
+# ## Parallelization
+#
+# The Julia set computation above is a `map!` operation: We apply some function to each
+# element of the array. Hence, we can use `tmap!` for parallelization. We use
+# `CartesianIndices` to map between linear and two-dimensional cartesian indices.
+
+using OhMyThreads: tmap!
+
+function compute_juliaset_parallel!(img; kwargs...)
+    N = size(img, 1)
+    cart = CartesianIndices(img)
+    tmap!(img, eachindex(img); kwargs...) do idx
+        c = cart[idx]
+        _compute_pixel(c[1], c[2], N)
+    end
+    return img
+end
+
+N = 2000
+img = zeros(Int, N, N)
+compute_juliaset_parallel!(img);
+p = heatmap(img)
+DisplayAs.PNG(p) #hide
+
+# ## Benchmark
+#
+# Let's benchmark the variants above.
+
+using BenchmarkTools
+using Base.Threads: nthreads
+
+N = 2000
+img = zeros(Int, N, N)
+
+@btime compute_juliaset_sequential!($img) samples=10 evals=3;
+@btime compute_juliaset_parallel!($img) samples=10 evals=3;
+
+# As hoped, the parallel implementation is faster. But can we improve the performance
+# further?
+
+# ### Tuning `nchunks`
+#
+# As stated above, the per-pixel computation is non-uniform. Hence, we might benefit from
+# load balancing. The simplest way to get it is to increase `nchunks` to a value larger
+# than `nthreads`. This divides the overall workload into smaller tasks than can be
+# dynamically distributed among threads (by Julia's scheduler) to balance the per-thread
+# load.
+
+@btime compute_juliaset_parallel!($img; schedule=:dynamic, nchunks=N) samples=10 evals=3;
+
+# Note that if we opt out of dynamic scheduling and set `schedule=:static`, this strategy
+# doesn't help anymore (because chunks are naively distributed up front).
+
+@btime compute_juliaset_parallel!($img; schedule=:static, nchunks=N) samples=10 evals=3;
diff --git a/dev/examples/juliaset/juliaset/index.html b/dev/examples/juliaset/juliaset/index.html
@@ -0,0 +1,55 @@
+<!DOCTYPE html>
+<html lang="en"><head><meta charset="UTF-8"/><meta name="viewport" content="width=device-width, initial-scale=1.0"/><title>Julia Set · OhMyThreads.jl</title><meta name="title" content="Julia Set · OhMyThreads.jl"/><meta property="og:title" content="Julia Set · OhMyThreads.jl"/><meta property="twitter:title" content="Julia Set · OhMyThreads.jl"/><meta name="description" content="Documentation for OhMyThreads.jl."/><meta property="og:description" content="Documentation for OhMyThreads.jl."/><meta property="twitter:description" content="Documentation for OhMyThreads.jl."/><script data-outdated-warner src="../../../assets/warner.js"></script><link href="https://cdnjs.cloudflare.com/ajax/libs/lato-font/3.0.0/css/lato-font.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/juliamono/0.050/juliamono.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/fontawesome.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/solid.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/brands.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.8/katex.min.css" rel="stylesheet" type="text/css"/><script>documenterBaseURL="../../.."</script><script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" data-main="../../../assets/documenter.js"></script><script src="../../../search_index.js"></script><script src="../../../siteinfo.js"></script><script src="../../../../versions.js"></script><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../../../assets/themes/documenter-dark.css" data-theme-name="documenter-dark" data-theme-primary-dark/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../../../assets/themes/documenter-light.css" data-theme-name="documenter-light" data-theme-primary/><script src="../../../assets/themeswap.js"></script></head><body><div id="documenter"><nav class="docs-sidebar"><div class="docs-package-name"><span class="docs-autofit"><a href="../../../">OhMyThreads.jl</a></span></div><button class="docs-search-query input is-rounded is-small is-clickable my-2 mx-auto py-1 px-2" id="documenter-search-query">Search docs (Ctrl + /)</button><ul class="docs-menu"><li><a class="tocitem" href="../../../">OhMyThreads</a></li><li><input class="collapse-toggle" id="menuitem-2" type="checkbox" checked/><label class="tocitem" for="menuitem-2"><span class="docs-label">Examples</span><i class="docs-chevron"></i></label><ul class="collapsed"><li><a class="tocitem" href="../../mc/mc/">Parallel Monte Carlo</a></li><li class="is-active"><a class="tocitem" href>Julia Set</a><ul class="internal"><li><a class="tocitem" href="#Sequential-computation"><span>Sequential computation</span></a></li><li><a class="tocitem" href="#Parallelization"><span>Parallelization</span></a></li><li><a class="tocitem" href="#Benchmark"><span>Benchmark</span></a></li></ul></li></ul></li><li><input class="collapse-toggle" id="menuitem-3" type="checkbox"/><label class="tocitem" for="menuitem-3"><span class="docs-label">References</span><i class="docs-chevron"></i></label><ul class="collapsed"><li><a class="tocitem" href="../../../refs/api/">Public API</a></li><li><a class="tocitem" href="../../../refs/internal/">Internal</a></li></ul></li></ul><div class="docs-version-selector field has-addons"><div class="control"><span class="docs-label button is-static is-size-7">Version</span></div><div class="docs-selector control is-expanded"><div class="select is-fullwidth is-size-7"><select id="documenter-version-selector"></select></div></div></div></nav><div class="docs-main"><header class="docs-navbar"><a class="docs-sidebar-button docs-navbar-link fa-solid fa-bars is-hidden-desktop" id="documenter-sidebar-button" href="#"></a><nav class="breadcrumb"><ul class="is-hidden-mobile"><li><a class="is-disabled">Examples</a></li><li class="is-active"><a href>Julia Set</a></li></ul><ul class="is-hidden-tablet"><li class="is-active"><a href>Julia Set</a></li></ul></nav><div class="docs-right"><a class="docs-navbar-link" href="https://github.com/JuliaFolds2/OhMyThreads.jl" title="View the repository on GitHub"><span class="docs-icon fa-brands"></span><span class="docs-label is-hidden-touch">GitHub</span></a><a class="docs-navbar-link" href="https://github.com/JuliaFolds2/OhMyThreads.jl/blob/master/docs/src/examples/juliaset/juliaset.jl#" title="Edit source on GitHub"><span class="docs-icon fa-solid"></span></a><a class="docs-settings-button docs-navbar-link fa-solid fa-gear" id="documenter-settings-button" href="#" title="Settings"></a><a class="docs-article-toggle-button fa-solid fa-chevron-up" id="documenter-article-toggle-button" href="javascript:;" title="Collapse all docstrings"></a></div></header><article class="content" id="documenter-page"><h1 id="Julia-Set"><a class="docs-heading-anchor" href="#Julia-Set">Julia Set</a><a id="Julia-Set-1"></a><a class="docs-heading-anchor-permalink" href="#Julia-Set" title="Permalink"></a></h1><p>In this example, we will compute an image of the <a href="https://en.wikipedia.org/wiki/Julia_set">Julia set</a> in parallel. We will explore the <code>schedule</code> and <code>nchunks</code> options that can be used to get load balancing.</p><p>The value of a single pixel of the Julia set, which corresponds to a point in the complex number plane, can be computed by the following iteration procedure.</p><pre><code class="language-julia hljs">function _compute_pixel(i, j, n; max_iter = 255, c = -0.79 + 0.15 * im)
+    x = -2.0 + (j - 1) * 4.0 / (n - 1)
+    y = -2.0 + (i - 1) * 4.0 / (n - 1)
+
+    z = x + y * im
+    iter = max_iter
+    for k in 1:max_iter
+        if abs2(z) &gt; 4.0
+            iter = k - 1
+            break
+        end
+        z = z^2 + c
+    end
+    return iter
+end</code></pre><pre><code class="nohighlight hljs">_compute_pixel (generic function with 1 method)</code></pre><p>Note that the value of the pixel is the number of performed iterations for the corresponding complex input number. Hence, the computational <strong>workload is non-uniform</strong>.</p><h2 id="Sequential-computation"><a class="docs-heading-anchor" href="#Sequential-computation">Sequential computation</a><a id="Sequential-computation-1"></a><a class="docs-heading-anchor-permalink" href="#Sequential-computation" title="Permalink"></a></h2><p>In our naive implementation, we just loop over the dimensions of the image matrix and call the pixel kernel above.</p><pre><code class="language-julia hljs">function compute_juliaset_sequential!(img)
+    N = size(img, 1)
+    for j in 1:N
+        for i in 1:N
+            img[i, j] = _compute_pixel(i, j, N)
+        end
+    end
+    return img
+end
+
+N = 2000
+img = zeros(Int, N, N)
+compute_juliaset_sequential!(img);</code></pre><p>Let&#39;s look at the result</p><pre><code class="language-julia hljs">using Plots
+p = heatmap(img)</code></pre><p><img src="../juliaset-8.png" alt/></p><h2 id="Parallelization"><a class="docs-heading-anchor" href="#Parallelization">Parallelization</a><a id="Parallelization-1"></a><a class="docs-heading-anchor-permalink" href="#Parallelization" title="Permalink"></a></h2><p>The Julia set computation above is a <code>map!</code> operation: We apply some function to each element of the array. Hence, we can use <code>tmap!</code> for parallelization. We use <code>CartesianIndices</code> to map between linear and two-dimensional cartesian indices.</p><pre><code class="language-julia hljs">using OhMyThreads: tmap!
+
+function compute_juliaset_parallel!(img; kwargs...)
+    N = size(img, 1)
+    cart = CartesianIndices(img)
+    tmap!(img, eachindex(img); kwargs...) do idx
+        c = cart[idx]
+        _compute_pixel(c[1], c[2], N)
+    end
+    return img
+end
+
+N = 2000
+img = zeros(Int, N, N)
+compute_juliaset_parallel!(img);
+p = heatmap(img)</code></pre><p><img src="../juliaset-10.png" alt/></p><h2 id="Benchmark"><a class="docs-heading-anchor" href="#Benchmark">Benchmark</a><a id="Benchmark-1"></a><a class="docs-heading-anchor-permalink" href="#Benchmark" title="Permalink"></a></h2><p>Let&#39;s benchmark the variants above.</p><pre><code class="language-julia hljs">using BenchmarkTools
+using Base.Threads: nthreads
+
+N = 2000
+img = zeros(Int, N, N)
+
+@btime compute_juliaset_sequential!($img) samples=10 evals=3;
+@btime compute_juliaset_parallel!($img) samples=10 evals=3;</code></pre><pre><code class="nohighlight hljs">  138.377 ms (0 allocations: 0 bytes)
+  63.707 ms (39 allocations: 3.30 KiB)
+</code></pre><p>As hoped, the parallel implementation is faster. But can we improve the performance further?</p><h3 id="Tuning-nchunks"><a class="docs-heading-anchor" href="#Tuning-nchunks">Tuning <code>nchunks</code></a><a id="Tuning-nchunks-1"></a><a class="docs-heading-anchor-permalink" href="#Tuning-nchunks" title="Permalink"></a></h3><p>As stated above, the per-pixel computation is non-uniform. Hence, we might benefit from load balancing. The simplest way to get it is to increase <code>nchunks</code> to a value larger than <code>nthreads</code>. This divides the overall workload into smaller tasks than can be dynamically distributed among threads (by Julia&#39;s scheduler) to balance the per-thread load.</p><pre><code class="language-julia hljs">@btime compute_juliaset_parallel!($img; schedule=:dynamic, nchunks=N) samples=10 evals=3;</code></pre><pre><code class="nohighlight hljs">  32.000 ms (12013 allocations: 1.14 MiB)
+</code></pre><p>Note that if we opt out of dynamic scheduling and set <code>schedule=:static</code>, this strategy doesn&#39;t help anymore (because chunks are naively distributed up front).</p><pre><code class="language-julia hljs">@btime compute_juliaset_parallel!($img; schedule=:static, nchunks=N) samples=10 evals=3;</code></pre><pre><code class="nohighlight hljs">  63.439 ms (42 allocations: 3.37 KiB)
+</code></pre><hr/><p><em>This page was generated using <a href="https://github.com/fredrikekre/Literate.jl">Literate.jl</a>.</em></p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../../mc/mc/">« Parallel Monte Carlo</a><a class="docs-footer-nextpage" href="../../../refs/api/">Public API »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="auto">Automatic (OS)</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.2.1 on <span class="colophon-date" title="Thursday 1 February 2024 19:36">Thursday 1 February 2024</span>. Using Julia version 1.10.0.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/examples/mc/Project.toml b/dev/examples/mc/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
diff --git a/dev/examples/mc/mc.jl b/dev/examples/mc/mc.jl
@@ -0,0 +1,80 @@
+# # Parallel Monte Carlo
+#
+# Calculate the value of $\pi$ through parallel direct Monte Carlo.
+#
+# A unit circle is inscribed inside a unit square with side length 2 (from -1 to 1).
+# The area of the circle is $\pi$, the area of the square is 4, and the ratio is $\pi/4$.
+# This means that, if you throw $N$ darts randomly at the square, approximately $M=N\pi/4$
+# of those darts will land inside the unit circle.
+#
+# Throw darts randomly at a unit square and count how many of them ($M$) landed inside of
+# a unit circle. Approximate $\pi \approx 4M/N$.
+#
+# ## Sequential implementation:
+
+function mc(N)
+    M = 0 # number of darts that landed in the circle
+    for i in 1:N
+        if rand()^2 + rand()^2 < 1.0
+            M += 1
+        end
+    end
+    pi = 4 * M / N
+    return pi
+end
+
+N = 100_000_000
+
+mc(N)
+
+# ## Parallelization with `tmapreduce`
+#
+# To parallelize the Monte Carlo simulation, we use [`tmapreduce`](@ref) with `+` as the reduction
+# operator. For the map part, we take `1:N` as our input collection and "throw one dart" per
+# element.
+
+using OhMyThreads
+
+function mc_parallel(N)
+    M = tmapreduce(+, 1:N) do i
+        rand()^2 + rand()^2 < 1.0
+    end
+    pi = 4 * M / N
+    return pi
+end
+
+mc_parallel(N)
+
+# Let's run a quick benchmark.
+
+using BenchmarkTools
+using Base.Threads: nthreads
+
+@assert nthreads() > 1 # make sure we have multiple Julia threads
+@show nthreads()       # print out the number of threads
+
+@btime mc($N) samples=10 evals=3;
+@btime mc_parallel($N) samples=10 evals=3;
+
+# ## Manual parallelization
+#
+# First, using the `chunks` function, we divide the iteration interval `1:N` into
+# `nthreads()` parts. Then, we apply a regular (sequential) `map` to spawn a Julia task
+# per chunk. Each task will locally and independently perform a sequential Monte Carlo
+# simulation. Finally, we fetch the results and compute the average estimate for $\pi$.
+
+using OhMyThreads: @spawn
+
+function mc_parallel_manual(N; nchunks=nthreads())
+    tasks = map(chunks(1:N; n = nchunks)) do idcs # TODO: replace by `tmap` once ready
+        @spawn mc(length(idcs))
+    end
+    pi = sum(fetch, tasks) / nchunks
+    return pi
+end
+
+mc_parallel_manual(N)
+
+# And this is the performance:
+
+@btime mc_parallel_manual($N) samples=10 evals=3;
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"documenter":{"julia_version":"1.10.0","generation_timestamp":"2024-02-01T13:48:36","documenter_version":"1.2.1"}}
		{"documenter":{"julia_version":"1.10.0","generation_timestamp":"2024-02-01T19:36:19","documenter_version":"1.2.1"}}