From 51b730ec570bfe0f682c16e7f215c1ea107607ea Mon Sep 17 00:00:00 2001
From: Carsten Bauer
Date: Fri, 3 May 2024 11:52:10 +0200
Subject: [PATCH 1/5] false sharing

---
 docs/src/literate/falsesharing/Project.toml   |    4 +
 .../literate/falsesharing/false_sharing.svg   | 2108 +++++++++++++++++
 .../src/literate/falsesharing/falsesharing.jl |  155 ++
 .../src/literate/falsesharing/falsesharing.md |  213 ++
 4 files changed, 2480 insertions(+)
 create mode 100644 docs/src/literate/falsesharing/Project.toml
 create mode 100644 docs/src/literate/falsesharing/false_sharing.svg
 create mode 100644 docs/src/literate/falsesharing/falsesharing.jl
 create mode 100644 docs/src/literate/falsesharing/falsesharing.md

diff --git a/docs/src/literate/falsesharing/Project.toml b/docs/src/literate/falsesharing/Project.toml
new file mode 100644
index 00000000..f8067280
--- /dev/null
+++ b/docs/src/literate/falsesharing/Project.toml
@@ -0,0 +1,4 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
+ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
diff --git a/docs/src/literate/falsesharing/false_sharing.svg b/docs/src/literate/falsesharing/false_sharing.svg
new file mode 100644
index 00000000..756ca78f
--- /dev/null
+++ b/docs/src/literate/falsesharing/false_sharing.svg
@@ -0,0 +1,2108 @@
+[2108 lines of SVG markup omitted: vector diagram illustrating false sharing]
diff --git a/docs/src/literate/falsesharing/falsesharing.jl b/docs/src/literate/falsesharing/falsesharing.jl
new file mode 100644
index 00000000..10e811f4
--- /dev/null
+++ b/docs/src/literate/falsesharing/falsesharing.jl
@@ -0,0 +1,155 @@
+# # [False Sharing](@id FalseSharing)
+#
+# *False Sharing* is a very common but subtle performance issue that comes up again and
+# again when writing parallel code manually. For this reason, we shall discuss what it is
+# about and how to avoid it.
+#
+# For simplicity, let's focus on a specific example: parallel summation.
+#
+# ## Baseline: sequential summation
+#
+# To establish a baseline that we can later compare against, we define some fake data,
+# which we'll sum up, and benchmark Julia's built-in, non-parallel `sum` function.
+
+using Base.Threads: nthreads
+using BenchmarkTools
+
+data = rand(1_000_000 * nthreads());
+@btime sum($data);
+
+#
+# ## The problematic parallel implementation
+#
+# A conceptually simple (and valid) approach to parallelizing the summation is to divide
+# the full computation into parts. Specifically, the idea is to divide the data into chunks,
+# compute the partial sums of these chunks in parallel, and finally sum up the partial
+# results. (Note that we will not concern ourselves with potential minor or
+# catastrophic numerical errors due to potential rearrangements of terms in the summation here.)
+#
+# A common, manual implementation of this idea might look like this:
+
+using OhMyThreads: @spawn
+using ChunkSplitters: chunks
+
+function parallel_sum_falsesharing(data; nchunks = nthreads())
+    psums = zeros(eltype(data), nchunks)
+    @sync for (c, idcs) in enumerate(chunks(data; n = nchunks))
+        @spawn begin
+            for i in idcs
+                psums[c] += data[i]
+            end
+        end
+    end
+    return sum(psums)
+end
+
+# The code is pretty straightforward: We allocate space for the results of the partial sums
+# (`psums`) and, on `nchunks` many tasks, add up the data elements of each partial sum in
+# parallel. More importantly, and in this context perhaps surprisingly, the code is also
+# **correct** in the sense that it produces the desired result.
+
+using Test
+@test sum(data) ≈ parallel_sum_falsesharing(data)
+
+# This is just a reflection of the fact that there is no logical sharing of data - because
+# each parallel task modifies a different element of `psums` - implying the absence of
+# race conditions.
+#
+# What's the issue then?! Well, the sole purpose of parallelization is to reduce runtime.
+# So let's see how well we're doing in this respect.
+
+nthreads()
+
+#
+
+@btime parallel_sum_falsesharing($data);
+
+# A **slowdown**?! Clearly, that's the opposite of what we tried to achieve!
+
+#
+# ## The issue: False sharing
+#
+# Although our parallel summation above is semantically correct, it has a
+# big **performance issue**: *False sharing*. To understand false sharing, we have to think
+# a little bit about how computers work. Specifically, we need to realize that processors
+# cache memory in lines (rather than individual elements) and that caches of different processors
+# are kept coherent.
+# When two (or more) different CPU cores operate on independent data elements that **fall
+# into the same cache line** (i.e. they are part of the same memory address region),
+# the **cache coherency mechanism leads to costly synchronization** between cores.
+
+# In our case, this happens despite the fact that different parallel tasks
+# (on different CPU cores) *logically* don't care about the rest of the data in the cache line
+# at all.
+
+# ![](false_sharing.svg)
+
+# Given these insights, we can come up with a few workarounds that mitigate the issue.
+# The most prominent is probably padding, where one simply adds sufficiently many unused
+# zeros to `psums` such that different partial sum counters don't fall into the same cache
+# line. However, let's discuss a more fundamental, more efficient, and more elegant solution.
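+
+# To make the padding idea concrete, here is a minimal sketch of such a variant (the
+# name `parallel_sum_padded` and the `pad` parameter are illustrative; we assume a
+# cache line size of 64 bytes, i.e. 8 `Float64`s, which is common but not universal,
+# and the array start need not be aligned to a cache line boundary):
+
+function parallel_sum_padded(data; nchunks = nthreads(), pad = 8)
+    psums = zeros(eltype(data), nchunks * pad) # only every `pad`-th slot is used
+    @sync for (c, idcs) in enumerate(chunks(data; n = nchunks))
+        @spawn begin
+            for i in idcs
+                psums[(c - 1) * pad + 1] += data[i] # counters are `pad` elements apart
+            end
+        end
+    end
+    return sum(psums) # unused slots are zero and don't change the result
+end
+
+@test sum(data) ≈ parallel_sum_padded(data)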
+
+#
+# ## Task-local parallel summation
+#
+# The key mistake in `parallel_sum_falsesharing` above is the non-local modification of
+# (implicitly) shared state (cache lines of `psums`) very frequently (in the innermost loop).
+# We can simply avoid this by making the code more task-local. To this end, we introduce a
+# **task-local accumulator variable**, which we use to perform the task-local partial sums.
+# Only at the very end do we communicate the result to the main thread, e.g. by writing it
+# into `psums` (once!).
+
+function parallel_sum_tasklocal(data; nchunks = nthreads())
+    psums = zeros(eltype(data), nchunks)
+    @sync for (c, idcs) in enumerate(chunks(data; n = nchunks))
+        @spawn begin
+            local s = zero(eltype(data))
+            @simd for i in idcs
+                @inbounds s += data[i]
+            end
+            psums[c] = s
+        end
+    end
+    return sum(psums)
+end
+
+@test sum(data) ≈ parallel_sum_tasklocal(data)
+@btime parallel_sum_tasklocal($data);
+
+# Finally, there is our expected speedup! 🎉
+#
+# Two comments are in order.
+#
+# First, we note that the only role that `psums` plays is
+# as temporary storage for the results from the parallel tasks to be able to sum them
+# up eventually. We could get rid of it entirely by using a `Threads.Atomic` instead, which
+# would get updated via `Threads.atomic_add!` from each task directly. However,
+# for our discussion, this is a detail and we won't discuss it further.
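+
+# For completeness, a rough sketch of what that could look like (the name
+# `parallel_sum_atomic` is only illustrative). Note that each task still accumulates
+# into a task-local variable, so only a single atomic update happens per task:
+
+function parallel_sum_atomic(data; nchunks = nthreads())
+    total = Threads.Atomic{eltype(data)}(zero(eltype(data)))
+    @sync for idcs in chunks(data; n = nchunks)
+        @spawn begin
+            local s = zero(eltype(data))
+            for i in idcs
+                s += data[i]
+            end
+            Threads.atomic_add!(total, s) # one synchronized update per task
+        end
+    end
+    return total[]
+end
+
+@test sum(data) ≈ parallel_sum_atomic(data)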
+
+# Secondly, while keeping the general idea, we can drastically simplify the above code by
+# using `map` and reusing the built-in (sequential) `sum` function on each parallel task:
+
+function parallel_sum_map(data; nchunks = nthreads())
+    ts = map(chunks(data; n = nchunks)) do idcs
+        @spawn @views sum(data[idcs])
+    end
+    return sum(fetch.(ts))
+end
+
+@test sum(data) ≈ parallel_sum_map(data)
+@btime parallel_sum_map($data);
+
+# This implementation has comparable performance and, more importantly, is conceptually
+# clearer in that there is no explicit modification of shared state, i.e. no `psums[c] = s`,
+# anywhere at all. We can't run into false sharing if we don't modify shared state 😉.
+
+#
+# ## Parallel summation with OhMyThreads
+#
+# Finally, all of the above is abstracted away for you if you simply use [`treduce`](@ref)
+# to implement the parallel summation. It also only takes a single line and function call.
+
+using OhMyThreads: treduce
+
+@test sum(data) ≈ treduce(+, data; ntasks = nthreads())
+@btime treduce($+, $data; ntasks = $nthreads());
diff --git a/docs/src/literate/falsesharing/falsesharing.md b/docs/src/literate/falsesharing/falsesharing.md
new file mode 100644
index 00000000..27ec2244
--- /dev/null
+++ b/docs/src/literate/falsesharing/falsesharing.md
@@ -0,0 +1,213 @@
+```@meta
+EditURL = "falsesharing.jl"
+```
+
+# [False Sharing](@id FalseSharing)
+
+*False Sharing* is a very common but subtle performance issue that comes up again and
+again when writing parallel code manually. For this reason, we shall discuss what it is
+about and how to avoid it.
+
+For simplicity, let's focus on a specific example: parallel summation.
+
+## Baseline: sequential summation
+
+To establish a baseline that we can later compare against, we define some fake data,
+which we'll sum up, and benchmark Julia's built-in, non-parallel `sum` function.
+
+````julia
+using Base.Threads: nthreads
+using BenchmarkTools
+
+data = rand(1_000_000 * nthreads());
+@btime sum($data);
+````
+
+````
+  1.641 ms (0 allocations: 0 bytes)
+
+````
+
+## The problematic parallel implementation
+
+A conceptually simple (and valid) approach to parallelizing the summation is to divide
+the full computation into parts. Specifically, the idea is to divide the data into chunks,
+compute the partial sums of these chunks in parallel, and finally sum up the partial
+results. (Note that we will not concern ourselves with potential minor or
+catastrophic numerical errors due to potential rearrangements of terms in the summation.)
+
+A common, manual implementation of this idea might look like this:
+
+````julia
+using OhMyThreads: @spawn
+using ChunkSplitters: chunks
+
+function parallel_sum_falsesharing(data; nchunks = nthreads())
+    psums = zeros(eltype(data), nchunks)
+    @sync for (c, idcs) in enumerate(chunks(data; n = nchunks))
+        @spawn begin
+            for i in idcs
+                psums[c] += data[i]
+            end
+        end
+    end
+    return sum(psums)
+end
+````
+
+````
+parallel_sum_falsesharing (generic function with 1 method)
+````
+
+The code is pretty straightforward: We allocate space for the results of the partial sums
+(`psums`) and, on `nchunks` many tasks, add up the data elements of each partial sum in
+parallel. More importantly, and in this context perhaps surprisingly, the code is also
+**correct** in the sense that it produces the desired result.
+
+````julia
+using Test
+@test sum(data) ≈ parallel_sum_falsesharing(data)
+````
+
+````
+Test Passed
+````
+
+This is just a reflection of the fact that there is no logical sharing of data - because
+each parallel task modifies a different element of `psums` - implying the absence of
+race conditions.
+
+**What's the issue then?!** Well, the sole purpose of parallelization is to reduce runtime.
+So let's see how well we're doing in this respect.
+
+````julia
+nthreads()
+````
+
+````
+10
+````
+
+````julia
+@btime parallel_sum_falsesharing($data);
+````
+
+````
+  7.530 ms (221 allocations: 18.47 KiB)
+
+````
+
+Clearly, that's the opposite of what we tried to achieve!
+
+## The issue: False sharing.
+
+Although our parallel summation above is semantically correct, it has a
+big **performance issue**: *False sharing*. To understand false sharing, we have to think
+a little bit about how computers work. Specifically, we need to realize that processors
+cache memory in lines (rather than individual elements) and that caches of processors
+are kept coherent.
+When two (or more) different CPU cores operate on **independent data elements that fall
+into the same cache line** (i.e. they are part of the same memory address region),
+the **cache coherency mechanism leads to costly synchronization** between cores.
+
+In our case, this happens despite the fact that different parallel tasks
+(on different CPU cores) *logically* don't care about the rest of the data in the cache line
+at all.
+
+(figure: false_sharing.svg)
+
+Given these insights, we can come up with a few workarounds that mitigate the issue.
+The most prominent is probably padding, where one simply adds sufficiently many unused
+zeros to `psums` such that different partial sum counters don't fall into the same cache
+line. However, let's discuss a more fundamental, more efficient, and more elegant solution.
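+
+To make the padding idea concrete, a sketch of such a variant might look like this (the
+name `parallel_sum_padded` and the `pad` parameter are illustrative; we assume a 64-byte
+cache line, i.e. 8 `Float64`s, which is common but not universal):
+
+````julia
+function parallel_sum_padded(data; nchunks = nthreads(), pad = 8)
+    psums = zeros(eltype(data), nchunks * pad) # only every `pad`-th slot is used
+    @sync for (c, idcs) in enumerate(chunks(data; n = nchunks))
+        @spawn begin
+            for i in idcs
+                psums[(c - 1) * pad + 1] += data[i] # counters are `pad` elements apart
+            end
+        end
+    end
+    return sum(psums) # unused slots are zero and don't change the result
+end
+````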
+
+## Task-local parallel summation
+
+The key mistake in `parallel_sum_falsesharing` above is the non-local modification of
+(implicitly) shared state (cache lines of `psums`) very frequently (in the innermost loop).
+We can simply avoid this by making the code more task-local. To this end, we introduce a
+task-local accumulator variable, which we use to perform the task-local partial sums.
+Only at the very end do we communicate the result to the main thread, e.g. by writing it
+into `psums` (once!).
+
+````julia
+function parallel_sum_tasklocal(data; nchunks = nthreads())
+    psums = zeros(eltype(data), nchunks)
+    @sync for (c, idcs) in enumerate(chunks(data; n = nchunks))
+        @spawn begin
+            local s = zero(eltype(data))
+            @simd for i in idcs
+                @inbounds s += data[i]
+            end
+            psums[c] = s
+        end
+    end
+    return sum(psums)
+end
+
+@test sum(data) ≈ parallel_sum_tasklocal(data)
+@btime parallel_sum_tasklocal($data);
+````
+
+````
+  1.407 ms (221 allocations: 18.55 KiB)
+
+````
+
+Finally, there is our expected speedup! 🎉
+
+Two comments are in order.
+
+First, we note that the only role that `psums` plays is
+as temporary storage for the results from the parallel tasks to be able to sum them
+up eventually. We could get rid of it entirely by using a `Threads.Atomic` instead, which
+would get updated via `Threads.atomic_add!` from each task directly. However,
+for our discussion, this is a detail and we won't discuss it further.
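+
+For completeness, a rough sketch of that atomic alternative (the name
+`parallel_sum_atomic` is only illustrative; each task still accumulates locally, so only
+one synchronized update happens per task):
+
+````julia
+function parallel_sum_atomic(data; nchunks = nthreads())
+    total = Threads.Atomic{eltype(data)}(zero(eltype(data)))
+    @sync for idcs in chunks(data; n = nchunks)
+        @spawn begin
+            local s = zero(eltype(data))
+            for i in idcs
+                s += data[i]
+            end
+            Threads.atomic_add!(total, s) # one atomic update per task
+        end
+    end
+    return total[]
+end
+````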
+
+Secondly, while keeping the general idea, we can drastically simplify the above code by
+using `map` and reusing the built-in (sequential) `sum` function on each parallel task:
+
+````julia
+function parallel_sum_map(data; nchunks = nthreads())
+    ts = map(chunks(data; n = nchunks)) do idcs
+        @spawn @views sum(data[idcs])
+    end
+    return sum(fetch.(ts))
+end
+
+@test sum(data) ≈ parallel_sum_map(data)
+@btime parallel_sum_map($data);
+````
+
+````
+  1.370 ms (64 allocations: 5.72 KiB)
+
+````
+
+This implementation has comparable performance and, more importantly, is conceptually
+clearer in that there is no explicit modification of shared state, i.e. no `psums[c] = s`,
+anywhere at all. We can't run into false sharing if we don't modify shared state 😉.
+
+## Parallel summation with OhMyThreads
+
+Finally, all of the above is abstracted away for you if you simply use [`treduce`](@ref)
+to implement the parallel summation. It also only takes a single line and function call.
+
+````julia
+using OhMyThreads: treduce
+
+@test sum(data) ≈ treduce(+, data; ntasks = nthreads())
+@btime treduce($+, $data; ntasks = $nthreads());
+````
+
+````
+  1.386 ms (68 allocations: 5.92 KiB)
+
+````
+
+---
+
+*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*
+

From f167e67d2b7646f3c778a20a5314ab9fc708d441 Mon Sep 17 00:00:00 2001
From: Carsten Bauer
Date: Fri, 3 May 2024 11:53:02 +0200
Subject: [PATCH 2/5] doc nav entry

---
 docs/make.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/make.jl b/docs/make.jl
index 903c479a..19f8b78e 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -12,13 +12,14 @@ makedocs(;
     doctest = false,
     pages = [
         "OhMyThreads" => "index.md",
-        "Translation Guide" => "translation.md",
-        "Thread-Safe Storage" => "literate/tls/tls.md",
         "Examples" => [
            "Parallel Monte Carlo" => "literate/mc/mc.md",
            "Julia Set" => "literate/juliaset/juliaset.md",
            "Trapezoidal Integration" => "literate/integration/integration.md"
         ],
+        "Translation Guide" => "translation.md",
+        "Thread-Safe Storage" => "literate/tls/tls.md",
+        "False Sharing" => "literate/falsesharing/falsesharing.md",
         # "Explanations" => [
         #     "Task-Based Multithreading" => "explain/taskbasedmt.md",
         # ],

From f9d12bb6c6380b13840225c6bac68d9b93286159 Mon Sep 17 00:00:00 2001
From: Carsten Bauer
Date: Fri, 3 May 2024 12:11:29 +0200
Subject: [PATCH 3/5] run on n2

---
 .../src/literate/falsesharing/falsesharing.jl |  2 +-
 .../src/literate/falsesharing/falsesharing.md | 30 +++++++++----------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/docs/src/literate/falsesharing/falsesharing.jl b/docs/src/literate/falsesharing/falsesharing.jl
index 10e811f4..9ef6709d 100644
--- a/docs/src/literate/falsesharing/falsesharing.jl
+++ b/docs/src/literate/falsesharing/falsesharing.jl
@@ -14,7 +14,7 @@
 using Base.Threads: nthreads
 using BenchmarkTools
 
-data = rand(1_000_000 * nthreads());
+data = rand(10_000_000 * nthreads());
 @btime sum($data);
 
 #
diff --git a/docs/src/literate/falsesharing/falsesharing.md b/docs/src/literate/falsesharing/falsesharing.md
index 27ec2244..d7173944 100644
--- a/docs/src/literate/falsesharing/falsesharing.md
+++ b/docs/src/literate/falsesharing/falsesharing.md
@@ -19,12 +19,12 @@ which we'll sum up, and benchmark Julia's built-in, non-parallel `sum` function.
 using Base.Threads: nthreads
 using BenchmarkTools
 
-data = rand(1_000_000 * nthreads());
+data = rand(10_000_000 * nthreads());
 @btime sum($data);
 ````
 
 ````
-  1.641 ms (0 allocations: 0 bytes)
+  27.834 ms (0 allocations: 0 bytes)
 
 ````
 
@@ -34,7 +34,7 @@ A conceptually simple (and valid) approach to parallelizing the summation is to
 the full computation into parts. Specifically, the idea is to divide the data into chunks,
 compute the partial sums of these chunks in parallel, and finally sum up the partial
 results. (Note that we will not concern ourselves with potential minor or
-catastrophic numerical errors due to potential rearrangements of terms in the summation.)
+catastrophic numerical errors due to potential rearrangements of terms in the summation here.)
 
 A common, manual implementation of this idea might look like this:
 
 ````julia
@@ -77,7 +77,7 @@ This is just a reflection of the fact that there is no logical sharing of data -
 each parallel task modifies a different element of `psums` - implying the absence of
 race conditions.
 
-**What's the issue then?!** Well, the sole purpose of parallelization is to reduce runtime.
+What's the issue then?! Well, the sole purpose of parallelization is to reduce runtime.
 So let's see how well we're doing in this respect.
 
 ````julia
 nthreads()
 ````
 
 ````
 10
 ````
 
 ````julia
 @btime parallel_sum_falsesharing($data);
 ````
 
 ````
-  7.530 ms (221 allocations: 18.47 KiB)
+  348.539 ms (221 allocations: 18.47 KiB)
 
 ````
 
-Clearly, that's the opposite of what we tried to achieve!
+A **slowdown**?! Clearly, that's the opposite of what we tried to achieve!
 
-## The issue: False sharing.
+## The issue: False sharing
 
 Although our parallel summation above is semantically correct, it has a
 big **performance issue**: *False sharing*. To understand false sharing, we have to think
 a little bit about how computers work. Specifically, we need to realize that processors
-cache memory in lines (rather than individual elements) and that caches of processors
+cache memory in lines (rather than individual elements) and that caches of different processors
 are kept coherent.
-When two (or more) different CPU cores operate on **independent data elements that fall
-into the same cache line** (i.e. they are part of the same memory address region),
+When two (or more) different CPU cores operate on independent data elements that **fall
+into the same cache line** (i.e. they are part of the same memory address region),
 the **cache coherency mechanism leads to costly synchronization** between cores.
 
 In our case, this happens despite the fact that different parallel tasks
 (on different CPU cores) *logically* don't care about the rest of the data in the cache line
 at all.
 
-(figure: false_sharing.svg)
+![](false_sharing.svg)
 
 Given these insights, we can come up with a few workarounds that mitigate the issue.
 The most prominent is probably padding, where one simply adds sufficiently many unused
 zeros to `psums` such that different partial sum counters don't fall into the same cache
 line. However, let's discuss a more fundamental, more efficient, and more elegant solution.
 
 ## Task-local parallel summation
 
 The key mistake in `parallel_sum_falsesharing` above is the non-local modification of
 (implicitly) shared state (cache lines of `psums`) very frequently (in the innermost loop).
 We can simply avoid this by making the code more task-local. To this end, we introduce a
-task-local accumulator variable, which we use to perform the task-local partial sums.
+**task-local accumulator variable**, which we use to perform the task-local partial sums.
 Only at the very end do we communicate the result to the main thread, e.g. by writing it
 into `psums` (once!).
@@ -152,7 +150,7 @@ end ```` ```` - 1.407 ms (221 allocations: 18.55 KiB) + 50.021 ms (221 allocations: 18.55 KiB) ```` @@ -182,7 +180,7 @@ end ```` ```` - 1.370 ms (64 allocations: 5.72 KiB) + 51.305 ms (64 allocations: 5.72 KiB) ```` @@ -203,7 +201,7 @@ using OhMyThreads: treduce ```` ```` - 1.386 ms (68 allocations: 5.92 KiB) + 50.873 ms (68 allocations: 5.92 KiB) ```` From e7dfce36cf7d27905615b20fe116f4ecae55e3b9 Mon Sep 17 00:00:00 2001 From: Carsten Bauer Date: Sat, 4 May 2024 20:29:38 +0200 Subject: [PATCH 4/5] Update falsesharing.jl --- docs/src/literate/falsesharing/falsesharing.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/src/literate/falsesharing/falsesharing.jl b/docs/src/literate/falsesharing/falsesharing.jl index 9ef6709d..933427fe 100644 --- a/docs/src/literate/falsesharing/falsesharing.jl +++ b/docs/src/literate/falsesharing/falsesharing.jl @@ -28,8 +28,7 @@ data = rand(10_000_000 * nthreads()); # # A common, manual implementation of this idea might look like this: -using OhMyThreads: @spawn -using ChunkSplitters: chunks +using OhMyThreads: @spawn, chunks function parallel_sum_falsesharing(data; nchunks = nthreads()) psums = zeros(eltype(data), nchunks) From f660cf25c12452b5e0cbefacd4a7e103d7795a6b Mon Sep 17 00:00:00 2001 From: Carsten Bauer Date: Sat, 4 May 2024 20:30:02 +0200 Subject: [PATCH 5/5] Update falsesharing.md --- docs/src/literate/falsesharing/falsesharing.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/src/literate/falsesharing/falsesharing.md b/docs/src/literate/falsesharing/falsesharing.md index d7173944..a92394ce 100644 --- a/docs/src/literate/falsesharing/falsesharing.md +++ b/docs/src/literate/falsesharing/falsesharing.md @@ -39,8 +39,7 @@ catastrophic numerical errors due to potential rearrangements of terms in the su A common, manual implementation of this idea might look like this: ````julia -using OhMyThreads: @spawn -using ChunkSplitters: chunks +using OhMyThreads: @spawn, chunks function parallel_sum_falsesharing(data; nchunks = nthreads()) psums = zeros(eltype(data), nchunks)