diff --git a/doc/pub/week9/html/._week9-bs000.html b/doc/pub/week9/html/._week9-bs000.html index 0ee9f022..eb84da37 100644 --- a/doc/pub/week9/html/._week9-bs000.html +++ b/doc/pub/week9/html/._week9-bs000.html @@ -79,11 +79,388 @@ 2, None, 'blocking-transformations-final-expressions'), + ('More on the blocking method', + 2, + None, + 'more-on-the-blocking-method'), ('Example code form last week', 2, None, 'example-code-form-last-week'), - ('Resampling analysis', 2, None, 'resampling-analysis')]} + ('Resampling analysis', 2, None, 'resampling-analysis'), + ('Content', 2, None, 'content'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('More on optimization', 2, None, 'more-on-optimization'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('Optimization and debugging', + 2, + None, + 'optimization-and-debugging'), + ('Other hints', 2, None, 'other-hints'), + ('Vectorization and the basic idea behind parallel computing', + 2, + None, + 'vectorization-and-the-basic-idea-behind-parallel-computing'), + ('A rough classification of hardware models', + 2, + None, + 'a-rough-classification-of-hardware-models'), + ('Shared memory and distributed memory', + 2, + None, + 'shared-memory-and-distributed-memory'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('What is vectorization?', 2, None, 'what-is-vectorization'), + ('Number of elements that can acted upon', + 2, + None, + 'number-of-elements-that-can-acted-upon'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Operation counts for scalar operation', + 2, + None, + 'operation-counts-for-scalar-operation'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Number of operations when vectorized', + 2, + None, + 'number-of-operations-when-vectorized'), + ('"A simple test case with and without ' + 'vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"', + 2, + None, + 'a-simple-test-case-with-and-without-vectorization-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program7-cpp'), + ('Compiling with and without vectorization', + 2, + None, + 'compiling-with-and-without-vectorization'), + ('Compiling with and without vectorization using clang', + 2, + None, + 'compiling-with-and-without-vectorization-using-clang'), + ('Automatic vectorization and vectorization inhibitors, criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-criteria'), + ('Automatic vectorization and vectorization inhibitors, exit ' + 'criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-exit-criteria'), + ('Automatic vectorization and vectorization inhibitors, ' + 'straight-line code', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-straight-line-code'), + ('Automatic vectorization and vectorization inhibitors, nested ' + 'loops', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-nested-loops'), + ('Automatic vectorization and vectorization inhibitors, function ' + 'calls', + 2, + None, + 
'automatic-vectorization-and-vectorization-inhibitors-function-calls'), + ('Automatic vectorization and vectorization inhibitors, data ' + 'dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, more ' + 'data dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-more-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, memory ' + 'stride', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-memory-stride'), + ('Memory management', 2, None, 'memory-management'), + ('Memory and communication', 2, None, 'memory-and-communication'), + ('Measuring performance', 2, None, 'measuring-performance'), + ('Problems with measuring time', + 2, + None, + 'problems-with-measuring-time'), + ('Problems with cold start', 2, None, 'problems-with-cold-start'), + ('Problems with smart compilers', + 2, + None, + 'problems-with-smart-compilers'), + ('Problems with interference', + 2, + None, + 'problems-with-interference'), + ('Problems with measuring performance', + 2, + None, + 'problems-with-measuring-performance'), + ('Thomas algorithm for tridiagonal linear algebra equations', + 2, + None, + 'thomas-algorithm-for-tridiagonal-linear-algebra-equations'), + ('Thomas algorithm, forward substitution', + 2, + None, + 'thomas-algorithm-forward-substitution'), + ('Thomas algorithm, backward substitution', + 2, + None, + 'thomas-algorithm-backward-substitution'), + ('Thomas algorithm and counting of operations (floating point ' + 'and memory)', + 2, + None, + 'thomas-algorithm-and-counting-of-operations-floating-point-and-memory'), + ('"Example: Transpose of a ' + 'matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"', + 2, + None, + 'example-transpose-of-a-matrix-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program8-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program9-cpp'), + ('How do we define speedup? Simplest form', + 2, + None, + 'how-do-we-define-speedup-simplest-form'), + ('How do we define speedup? Correct baseline', + 2, + None, + 'how-do-we-define-speedup-correct-baseline'), + ('Parallel speedup', 2, None, 'parallel-speedup'), + ('Speedup and memory', 2, None, 'speedup-and-memory'), + ('Upper bounds on speedup', 2, None, 'upper-bounds-on-speedup'), + ("Amdahl's law", 2, None, 'amdahl-s-law'), + ('How much is parallelizable', + 2, + None, + 'how-much-is-parallelizable'), + ("Today's situation of parallel computing", + 2, + None, + 'today-s-situation-of-parallel-computing'), + ('Overhead present in parallel computing', + 2, + None, + 'overhead-present-in-parallel-computing'), + ('Parallelizing a sequential algorithm', + 2, + None, + 'parallelizing-a-sequential-algorithm'), + ('Strategies', 2, None, 'strategies'), + ('How do I run MPI on a PC/Laptop? MPI', + 2, + None, + 'how-do-i-run-mpi-on-a-pc-laptop-mpi'), + ('Can I do it on my own PC/laptop? 
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -137,8 +514,127 @@
  • Blocking Transformations
  • Blocking Transformations, getting there
  • Blocking Transformations, final expressions
  • -
  • Example code from last week
  • -
  • Resampling analysis
  • +
  • More on the blocking method
  • +
  • Example code from last week
  • +
  • Resampling analysis
  • +
  • Content
  • +
  • Optimization and profiling
  • +
  • More on optimization
  • +
  • Optimization and profiling
  • +
  • Optimization and debugging
  • +
  • Other hints
  • +
  • Vectorization and the basic idea behind parallel computing
  • +
  • A rough classification of hardware models
  • +
  • Shared memory and distributed memory
  • +
  • Different parallel programming paradigms
  • +
  • Different parallel programming paradigms
  • +
  • What is vectorization?
  • +
  • Number of elements that can be acted upon
  • +
  • Number of elements that can be acted upon, examples
  • +
  • Operation counts for scalar operation
  • +
  • Number of elements that can be acted upon, examples
  • +
  • Number of operations when vectorized
  • +
  • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
  • +
  • Compiling with and without vectorization
  • +
  • Compiling with and without vectorization using clang
  • +
  • Automatic vectorization and vectorization inhibitors, criteria
  • +
  • Automatic vectorization and vectorization inhibitors, exit criteria
  • +
  • Automatic vectorization and vectorization inhibitors, straight-line code
  • +
  • Automatic vectorization and vectorization inhibitors, nested loops
  • +
  • Automatic vectorization and vectorization inhibitors, function calls
  • +
  • Automatic vectorization and vectorization inhibitors, data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, more data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, memory stride
  • +
  • Memory management
  • +
  • Memory and communication
  • +
  • Measuring performance
  • +
  • Problems with measuring time
  • +
  • Problems with cold start
  • +
  • Problems with smart compilers
  • +
  • Problems with interference
  • +
  • Problems with measuring performance
  • +
  • Thomas algorithm for tridiagonal linear algebra equations
  • +
  • Thomas algorithm, forward substitution
  • +
  • Thomas algorithm, backward substitution
  • +
  • Thomas algorithm and counting of operations (floating point and memory)
  • +
  • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
  • +
  • How do we define speedup? Simplest form
  • +
  • How do we define speedup? Correct baseline
  • +
  • Parallel speedup
  • +
  • Speedup and memory
  • +
  • Upper bounds on speedup
  • +
  • Amdahl's law
  • +
  • How much is parallelizable
  • +
  • Today's situation of parallel computing
  • +
  • Overhead present in parallel computing
  • +
  • Parallelizing a sequential algorithm
  • +
  • Strategies
  • +
  • How do I run MPI on a PC/Laptop? MPI
  • +
  • Can I do it on my own PC/laptop? OpenMP installation
  • +
  • Installing MPI
  • +
  • Installing MPI and using Qt
  • +
  • What is Message Passing Interface (MPI)?
  • +
  • Going Parallel with MPI
  • +
  • MPI is a library
  • +
  • Bindings to MPI routines
  • +
  • Communicator
  • +
  • Some of the most important MPI functions
  • +
  • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
  • +
  • The Fortran program
  • +
  • Note 1
  • +
  • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
  • +
  • Note 2
  • +
  • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
  • +
  • Note 3
  • +
  • Note 4
  • +
  • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
  • +
  • Dissection of trapezoidal rule with \( MPI\_reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Integrating with MPI
  • +
  • How do I use \( MPI\_reduce \)?
  • +
  • More on \( MPI\_Reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Dissection of trapezoidal rule
  • +
  • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
  • +
  • What is OpenMP
  • +
  • Getting started, things to remember
  • +
  • OpenMP syntax
  • +
  • Different OpenMP styles of parallelism
  • +
  • General code structure
  • +
  • Parallel region
  • +
  • Hello world, not again, please!
  • +
  • Hello world, yet another variant
  • +
  • Important OpenMP library routines
  • +
  • Private variables
  • +
  • Master region
  • +
  • Parallel for loop
  • +
  • Parallel computations and loops
  • +
  • Scheduling of loop computations
  • +
  • Example code for loop scheduling
  • +
  • Example code for loop scheduling, guided instead of dynamic
  • +
  • More on Parallel for loop
  • +
  • What can happen with this loop?
  • +
  • Inner product
  • +
  • Different threads do different tasks
  • +
  • Single execution
  • +
  • Coordination and synchronization
  • +
  • Data scope
  • +
  • Some remarks
  • +
  • Parallelizing nested for-loops
  • +
  • Nested parallelism
  • +
  • Parallel tasks
  • +
  • Common mistakes
  • +
  • Not all computations are simple
  • +
  • Not all computations are simple, competing threads
  • +
  • How to find the max value using OpenMP
  • +
  • Then deal with the race conditions
  • +
  • What can slow down OpenMP performance?
  • +
  • What can slow down OpenMP performance?
  • +
  • Find the max location for each thread
  • +
  • Combine the values from each thread
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"
  • @@ -193,7 +689,7 @@

    March 11-15

  • 9
  • 10
  • ...
  • -
  • 22
  • +
  • 141
  • »
  • diff --git a/doc/pub/week9/html/._week9-bs001.html b/doc/pub/week9/html/._week9-bs001.html index 977e8485..78e442fe 100644 --- a/doc/pub/week9/html/._week9-bs001.html +++ b/doc/pub/week9/html/._week9-bs001.html @@ -79,11 +79,388 @@ 2, None, 'blocking-transformations-final-expressions'), + ('More on the blocking method', + 2, + None, + 'more-on-the-blocking-method'), ('Example code form last week', 2, None, 'example-code-form-last-week'), - ('Resampling analysis', 2, None, 'resampling-analysis')]} + ('Resampling analysis', 2, None, 'resampling-analysis'), + ('Content', 2, None, 'content'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('More on optimization', 2, None, 'more-on-optimization'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('Optimization and debugging', + 2, + None, + 'optimization-and-debugging'), + ('Other hints', 2, None, 'other-hints'), + ('Vectorization and the basic idea behind parallel computing', + 2, + None, + 'vectorization-and-the-basic-idea-behind-parallel-computing'), + ('A rough classification of hardware models', + 2, + None, + 'a-rough-classification-of-hardware-models'), + ('Shared memory and distributed memory', + 2, + None, + 'shared-memory-and-distributed-memory'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('What is vectorization?', 2, None, 'what-is-vectorization'), + ('Number of elements that can acted upon', + 2, + None, + 'number-of-elements-that-can-acted-upon'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Operation counts for scalar operation', + 2, + None, + 'operation-counts-for-scalar-operation'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Number of operations when vectorized', + 2, + None, + 'number-of-operations-when-vectorized'), + ('"A simple test case with and without ' + 'vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"', + 2, + None, + 'a-simple-test-case-with-and-without-vectorization-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program7-cpp'), + ('Compiling with and without vectorization', + 2, + None, + 'compiling-with-and-without-vectorization'), + ('Compiling with and without vectorization using clang', + 2, + None, + 'compiling-with-and-without-vectorization-using-clang'), + ('Automatic vectorization and vectorization inhibitors, criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-criteria'), + ('Automatic vectorization and vectorization inhibitors, exit ' + 'criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-exit-criteria'), + ('Automatic vectorization and vectorization inhibitors, ' + 'straight-line code', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-straight-line-code'), + ('Automatic vectorization and vectorization inhibitors, nested ' + 'loops', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-nested-loops'), + ('Automatic vectorization and vectorization inhibitors, function ' + 'calls', + 2, + None, + 
'automatic-vectorization-and-vectorization-inhibitors-function-calls'), + ('Automatic vectorization and vectorization inhibitors, data ' + 'dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, more ' + 'data dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-more-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, memory ' + 'stride', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-memory-stride'), + ('Memory management', 2, None, 'memory-management'), + ('Memory and communication', 2, None, 'memory-and-communication'), + ('Measuring performance', 2, None, 'measuring-performance'), + ('Problems with measuring time', + 2, + None, + 'problems-with-measuring-time'), + ('Problems with cold start', 2, None, 'problems-with-cold-start'), + ('Problems with smart compilers', + 2, + None, + 'problems-with-smart-compilers'), + ('Problems with interference', + 2, + None, + 'problems-with-interference'), + ('Problems with measuring performance', + 2, + None, + 'problems-with-measuring-performance'), + ('Thomas algorithm for tridiagonal linear algebra equations', + 2, + None, + 'thomas-algorithm-for-tridiagonal-linear-algebra-equations'), + ('Thomas algorithm, forward substitution', + 2, + None, + 'thomas-algorithm-forward-substitution'), + ('Thomas algorithm, backward substitution', + 2, + None, + 'thomas-algorithm-backward-substitution'), + ('Thomas algorithm and counting of operations (floating point ' + 'and memory)', + 2, + None, + 'thomas-algorithm-and-counting-of-operations-floating-point-and-memory'), + ('"Example: Transpose of a ' + 'matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"', + 2, + None, + 'example-transpose-of-a-matrix-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program8-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program9-cpp'), + ('How do we define speedup? Simplest form', + 2, + None, + 'how-do-we-define-speedup-simplest-form'), + ('How do we define speedup? Correct baseline', + 2, + None, + 'how-do-we-define-speedup-correct-baseline'), + ('Parallel speedup', 2, None, 'parallel-speedup'), + ('Speedup and memory', 2, None, 'speedup-and-memory'), + ('Upper bounds on speedup', 2, None, 'upper-bounds-on-speedup'), + ("Amdahl's law", 2, None, 'amdahl-s-law'), + ('How much is parallelizable', + 2, + None, + 'how-much-is-parallelizable'), + ("Today's situation of parallel computing", + 2, + None, + 'today-s-situation-of-parallel-computing'), + ('Overhead present in parallel computing', + 2, + None, + 'overhead-present-in-parallel-computing'), + ('Parallelizing a sequential algorithm', + 2, + None, + 'parallelizing-a-sequential-algorithm'), + ('Strategies', 2, None, 'strategies'), + ('How do I run MPI on a PC/Laptop? MPI', + 2, + None, + 'how-do-i-run-mpi-on-a-pc-laptop-mpi'), + ('Can I do it on my own PC/laptop? 
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -137,8 +514,127 @@
  • Blocking Transformations
  • Blocking Transformations, getting there
  • Blocking Transformations, final expressions
  • -
  • Example code from last week
  • -
  • Resampling analysis
  • +
  • More on the blocking method
  • +
  • Example code from last week
  • +
  • Resampling analysis
  • +
  • Content
  • +
  • Optimization and profiling
  • +
  • More on optimization
  • +
  • Optimization and profiling
  • +
  • Optimization and debugging
  • +
  • Other hints
  • +
  • Vectorization and the basic idea behind parallel computing
  • +
  • A rough classification of hardware models
  • +
  • Shared memory and distributed memory
  • +
  • Different parallel programming paradigms
  • +
  • Different parallel programming paradigms
  • +
  • What is vectorization?
  • +
  • Number of elements that can be acted upon
  • +
  • Number of elements that can be acted upon, examples
  • +
  • Operation counts for scalar operation
  • +
  • Number of elements that can be acted upon, examples
  • +
  • Number of operations when vectorized
  • +
  • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
  • +
  • Compiling with and without vectorization
  • +
  • Compiling with and without vectorization using clang
  • +
  • Automatic vectorization and vectorization inhibitors, criteria
  • +
  • Automatic vectorization and vectorization inhibitors, exit criteria
  • +
  • Automatic vectorization and vectorization inhibitors, straight-line code
  • +
  • Automatic vectorization and vectorization inhibitors, nested loops
  • +
  • Automatic vectorization and vectorization inhibitors, function calls
  • +
  • Automatic vectorization and vectorization inhibitors, data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, more data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, memory stride
  • +
  • Memory management
  • +
  • Memory and communication
  • +
  • Measuring performance
  • +
  • Problems with measuring time
  • +
  • Problems with cold start
  • +
  • Problems with smart compilers
  • +
  • Problems with interference
  • +
  • Problems with measuring performance
  • +
  • Thomas algorithm for tridiagonal linear algebra equations
  • +
  • Thomas algorithm, forward substitution
  • +
  • Thomas algorithm, backward substitution
  • +
  • Thomas algorithm and counting of operations (floating point and memory)
  • +
  • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
  • +
  • How do we define speedup? Simplest form
  • +
  • How do we define speedup? Correct baseline
  • +
  • Parallel speedup
  • +
  • Speedup and memory
  • +
  • Upper bounds on speedup
  • +
  • Amdahl's law
  • +
  • How much is parallelizable
  • +
  • Today's situation of parallel computing
  • +
  • Overhead present in parallel computing
  • +
  • Parallelizing a sequential algorithm
  • +
  • Strategies
  • +
  • How do I run MPI on a PC/Laptop? MPI
  • +
  • Can I do it on my own PC/laptop? OpenMP installation
  • +
  • Installing MPI
  • +
  • Installing MPI and using Qt
  • +
  • What is Message Passing Interface (MPI)?
  • +
  • Going Parallel with MPI
  • +
  • MPI is a library
  • +
  • Bindings to MPI routines
  • +
  • Communicator
  • +
  • Some of the most important MPI functions
  • +
  • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
  • +
  • The Fortran program
  • +
  • Note 1
  • +
  • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
  • +
  • Note 2
  • +
  • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
  • +
  • Note 3
  • +
  • Note 4
  • +
  • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
  • +
  • Dissection of trapezoidal rule with \( MPI\_reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Integrating with MPI
  • +
  • How do I use \( MPI\_reduce \)?
  • +
  • More on \( MPI\_Reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Dissection of trapezoidal rule
  • +
  • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
  • +
  • What is OpenMP
  • +
  • Getting started, things to remember
  • +
  • OpenMP syntax
  • +
  • Different OpenMP styles of parallelism
  • +
  • General code structure
  • +
  • Parallel region
  • +
  • Hello world, not again, please!
  • +
  • Hello world, yet another variant
  • +
  • Important OpenMP library routines
  • +
  • Private variables
  • +
  • Master region
  • +
  • Parallel for loop
  • +
  • Parallel computations and loops
  • +
  • Scheduling of loop computations
  • +
  • Example code for loop scheduling
  • +
  • Example code for loop scheduling, guided instead of dynamic
  • +
  • More on Parallel for loop
  • +
  • What can happen with this loop?
  • +
  • Inner product
  • +
  • Different threads do different tasks
  • +
  • Single execution
  • +
  • Coordination and synchronization
  • +
  • Data scope
  • +
  • Some remarks
  • +
  • Parallelizing nested for-loops
  • +
  • Nested parallelism
  • +
  • Parallel tasks
  • +
  • Common mistakes
  • +
  • Not all computations are simple
  • +
  • Not all computations are simple, competing threads
  • +
  • How to find the max value using OpenMP
  • +
  • Then deal with the race conditions
  • +
  • What can slow down OpenMP performance?
  • +
  • What can slow down OpenMP performance?
  • +
  • Find the max location for each thread
  • +
  • Combine the values from each thread
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"
  • @@ -166,16 +662,7 @@

    Overview of week 11, March 11-15

    Note that these notes contain additional material on optimization and parallelization. Parts of this material will be discussed this week.

    @@ -193,7 +680,7 @@

    Overview of week 11, Mar
  • 10
  • 11
  • ...
  • -
  • 22
  • +
  • 141
  • »
  • diff --git a/doc/pub/week9/html/._week9-bs002.html b/doc/pub/week9/html/._week9-bs002.html index c96461c9..d9e8847c 100644 --- a/doc/pub/week9/html/._week9-bs002.html +++ b/doc/pub/week9/html/._week9-bs002.html @@ -79,11 +79,388 @@ 2, None, 'blocking-transformations-final-expressions'), + ('More on the blocking method', + 2, + None, + 'more-on-the-blocking-method'), ('Example code form last week', 2, None, 'example-code-form-last-week'), - ('Resampling analysis', 2, None, 'resampling-analysis')]} + ('Resampling analysis', 2, None, 'resampling-analysis'), + ('Content', 2, None, 'content'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('More on optimization', 2, None, 'more-on-optimization'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('Optimization and debugging', + 2, + None, + 'optimization-and-debugging'), + ('Other hints', 2, None, 'other-hints'), + ('Vectorization and the basic idea behind parallel computing', + 2, + None, + 'vectorization-and-the-basic-idea-behind-parallel-computing'), + ('A rough classification of hardware models', + 2, + None, + 'a-rough-classification-of-hardware-models'), + ('Shared memory and distributed memory', + 2, + None, + 'shared-memory-and-distributed-memory'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('What is vectorization?', 2, None, 'what-is-vectorization'), + ('Number of elements that can acted upon', + 2, + None, + 'number-of-elements-that-can-acted-upon'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Operation counts for scalar operation', + 2, + None, + 'operation-counts-for-scalar-operation'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Number of operations when vectorized', + 2, + None, + 'number-of-operations-when-vectorized'), + ('"A simple test case with and without ' + 'vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"', + 2, + None, + 'a-simple-test-case-with-and-without-vectorization-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program7-cpp'), + ('Compiling with and without vectorization', + 2, + None, + 'compiling-with-and-without-vectorization'), + ('Compiling with and without vectorization using clang', + 2, + None, + 'compiling-with-and-without-vectorization-using-clang'), + ('Automatic vectorization and vectorization inhibitors, criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-criteria'), + ('Automatic vectorization and vectorization inhibitors, exit ' + 'criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-exit-criteria'), + ('Automatic vectorization and vectorization inhibitors, ' + 'straight-line code', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-straight-line-code'), + ('Automatic vectorization and vectorization inhibitors, nested ' + 'loops', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-nested-loops'), + ('Automatic vectorization and vectorization inhibitors, function ' + 'calls', + 2, + None, + 
'automatic-vectorization-and-vectorization-inhibitors-function-calls'), + ('Automatic vectorization and vectorization inhibitors, data ' + 'dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, more ' + 'data dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-more-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, memory ' + 'stride', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-memory-stride'), + ('Memory management', 2, None, 'memory-management'), + ('Memory and communication', 2, None, 'memory-and-communication'), + ('Measuring performance', 2, None, 'measuring-performance'), + ('Problems with measuring time', + 2, + None, + 'problems-with-measuring-time'), + ('Problems with cold start', 2, None, 'problems-with-cold-start'), + ('Problems with smart compilers', + 2, + None, + 'problems-with-smart-compilers'), + ('Problems with interference', + 2, + None, + 'problems-with-interference'), + ('Problems with measuring performance', + 2, + None, + 'problems-with-measuring-performance'), + ('Thomas algorithm for tridiagonal linear algebra equations', + 2, + None, + 'thomas-algorithm-for-tridiagonal-linear-algebra-equations'), + ('Thomas algorithm, forward substitution', + 2, + None, + 'thomas-algorithm-forward-substitution'), + ('Thomas algorithm, backward substitution', + 2, + None, + 'thomas-algorithm-backward-substitution'), + ('Thomas algorithm and counting of operations (floating point ' + 'and memory)', + 2, + None, + 'thomas-algorithm-and-counting-of-operations-floating-point-and-memory'), + ('"Example: Transpose of a ' + 'matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"', + 2, + None, + 'example-transpose-of-a-matrix-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program8-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program9-cpp'), + ('How do we define speedup? Simplest form', + 2, + None, + 'how-do-we-define-speedup-simplest-form'), + ('How do we define speedup? Correct baseline', + 2, + None, + 'how-do-we-define-speedup-correct-baseline'), + ('Parallel speedup', 2, None, 'parallel-speedup'), + ('Speedup and memory', 2, None, 'speedup-and-memory'), + ('Upper bounds on speedup', 2, None, 'upper-bounds-on-speedup'), + ("Amdahl's law", 2, None, 'amdahl-s-law'), + ('How much is parallelizable', + 2, + None, + 'how-much-is-parallelizable'), + ("Today's situation of parallel computing", + 2, + None, + 'today-s-situation-of-parallel-computing'), + ('Overhead present in parallel computing', + 2, + None, + 'overhead-present-in-parallel-computing'), + ('Parallelizing a sequential algorithm', + 2, + None, + 'parallelizing-a-sequential-algorithm'), + ('Strategies', 2, None, 'strategies'), + ('How do I run MPI on a PC/Laptop? MPI', + 2, + None, + 'how-do-i-run-mpi-on-a-pc-laptop-mpi'), + ('Can I do it on my own PC/laptop? 
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -137,8 +514,127 @@
  • Blocking Transformations
  • Blocking Transformations, getting there
  • Blocking Transformations, final expressions
  • -
  • Example code from last week
  • -
  • Resampling analysis
  • +
  • More on the blocking method
  • +
  • Example code from last week
  • +
  • Resampling analysis
  • +
  • Content
  • +
  • Optimization and profiling
  • +
  • More on optimization
  • +
  • Optimization and profiling
  • +
  • Optimization and debugging
  • +
  • Other hints
  • +
  • Vectorization and the basic idea behind parallel computing
  • +
  • A rough classification of hardware models
  • +
  • Shared memory and distributed memory
  • +
  • Different parallel programming paradigms
  • +
  • Different parallel programming paradigms
  • +
  • What is vectorization?
  • +
  • Number of elements that can be acted upon
  • +
  • Number of elements that can be acted upon, examples
  • +
  • Operation counts for scalar operation
  • +
  • Number of elements that can be acted upon, examples
  • +
  • Number of operations when vectorized
  • +
  • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
  • +
  • Compiling with and without vectorization
  • +
  • Compiling with and without vectorization using clang
  • +
  • Automatic vectorization and vectorization inhibitors, criteria
  • +
  • Automatic vectorization and vectorization inhibitors, exit criteria
  • +
  • Automatic vectorization and vectorization inhibitors, straight-line code
  • +
  • Automatic vectorization and vectorization inhibitors, nested loops
  • +
  • Automatic vectorization and vectorization inhibitors, function calls
  • +
  • Automatic vectorization and vectorization inhibitors, data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, more data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, memory stride
  • +
  • Memory management
  • +
  • Memory and communication
  • +
  • Measuring performance
  • +
  • Problems with measuring time
  • +
  • Problems with cold start
  • +
  • Problems with smart compilers
  • +
  • Problems with interference
  • +
  • Problems with measuring performance
  • +
  • Thomas algorithm for tridiagonal linear algebra equations
  • +
  • Thomas algorithm, forward substitution
  • +
  • Thomas algorithm, backward substitution
  • +
  • Thomas algorithm and counting of operations (floating point and memory)
  • +
  • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
  • +
  • How do we define speedup? Simplest form
  • +
  • How do we define speedup? Correct baseline
  • +
  • Parallel speedup
  • +
  • Speedup and memory
  • +
  • Upper bounds on speedup
  • +
  • Amdahl's law
  • +
  • How much is parallelizable
  • +
  • Today's situation of parallel computing
  • +
  • Overhead present in parallel computing
  • +
  • Parallelizing a sequential algorithm
  • +
  • Strategies
  • +
  • How do I run MPI on a PC/Laptop? MPI
  • +
  • Can I do it on my own PC/laptop? OpenMP installation
  • +
  • Installing MPI
  • +
  • Installing MPI and using Qt
  • +
  • What is Message Passing Interface (MPI)?
  • +
  • Going Parallel with MPI
  • +
  • MPI is a library
  • +
  • Bindings to MPI routines
  • +
  • Communicator
  • +
  • Some of the most important MPI functions
  • +
  • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
  • +
  • The Fortran program
  • +
  • Note 1
  • +
  • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
  • +
  • Note 2
  • +
  • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
  • +
  • Note 3
  • +
  • Note 4
  • +
  • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
  • +
  • Dissection of trapezoidal rule with \( MPI\_reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Integrating with MPI
  • +
  • How do I use \( MPI\_reduce \)?
  • +
  • More on \( MPI\_Reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Dissection of trapezoidal rule
  • +
  • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
  • +
  • What is OpenMP
  • +
  • Getting started, things to remember
  • +
  • OpenMP syntax
  • +
  • Different OpenMP styles of parallelism
  • +
  • General code structure
  • +
  • Parallel region
  • +
  • Hello world, not again, please!
  • +
  • Hello world, yet another variant
  • +
  • Important OpenMP library routines
  • +
  • Private variables
  • +
  • Master region
  • +
  • Parallel for loop
  • +
  • Parallel computations and loops
  • +
  • Scheduling of loop computations
  • +
  • Example code for loop scheduling
  • +
  • Example code for loop scheduling, guided instead of dynamic
  • +
  • More on Parallel for loop
  • +
  • What can happen with this loop?
  • +
  • Inner product
  • +
  • Different threads do different tasks
  • +
  • Single execution
  • +
  • Coordination and synchronization
  • +
  • Data scope
  • +
  • Some remarks
  • +
  • Parallelizing nested for-loops
  • +
  • Nested parallelism
  • +
  • Parallel tasks
  • +
  • Common mistakes
  • +
  • Not all computations are simple
  • +
  • Not all computations are simple, competing threads
  • +
  • How to find the max value using OpenMP
  • +
  • Then deal with the race conditions
  • +
  • What can slow down OpenMP performance?
  • +
  • What can slow down OpenMP performance?
  • +
  • Find the max location for each thread
  • +
  • Combine the values from each thread
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"
  • @@ -180,7 +676,7 @@

    Why resampling methods ?

  • 11
  • 12
  • ...
  • - 22
  • + 141
  • »
  • diff --git a/doc/pub/week9/html/._week9-bs003.html b/doc/pub/week9/html/._week9-bs003.html index 426da770..93beacca 100644 --- a/doc/pub/week9/html/._week9-bs003.html +++ b/doc/pub/week9/html/._week9-bs003.html
  • @@ -185,7 +681,7 @@

    Statistical analysis

  • 12
  • 13
  • ...
  • - 22
  • + 141
  • »
  • diff --git a/doc/pub/week9/html/._week9-bs004.html b/doc/pub/week9/html/._week9-bs004.html index 18244a7c..63514813 100644 --- a/doc/pub/week9/html/._week9-bs004.html +++ b/doc/pub/week9/html/._week9-bs004.html
  • @@ -181,7 +677,7 @@

    And why do we use such me
  • 13
  • 14
  • ...
  • - 22
  • + 141
  • »
  • diff --git a/doc/pub/week9/html/._week9-bs005.html b/doc/pub/week9/html/._week9-bs005.html index d6b04049..8c6f395a 100644 --- a/doc/pub/week9/html/._week9-bs005.html +++ b/doc/pub/week9/html/._week9-bs005.html
'automatic-vectorization-and-vectorization-inhibitors-function-calls'), + ('Automatic vectorization and vectorization inhibitors, data ' + 'dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, more ' + 'data dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-more-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, memory ' + 'stride', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-memory-stride'), + ('Memory management', 2, None, 'memory-management'), + ('Memory and communication', 2, None, 'memory-and-communication'), + ('Measuring performance', 2, None, 'measuring-performance'), + ('Problems with measuring time', + 2, + None, + 'problems-with-measuring-time'), + ('Problems with cold start', 2, None, 'problems-with-cold-start'), + ('Problems with smart compilers', + 2, + None, + 'problems-with-smart-compilers'), + ('Problems with interference', + 2, + None, + 'problems-with-interference'), + ('Problems with measuring performance', + 2, + None, + 'problems-with-measuring-performance'), + ('Thomas algorithm for tridiagonal linear algebra equations', + 2, + None, + 'thomas-algorithm-for-tridiagonal-linear-algebra-equations'), + ('Thomas algorithm, forward substitution', + 2, + None, + 'thomas-algorithm-forward-substitution'), + ('Thomas algorithm, backward substitution', + 2, + None, + 'thomas-algorithm-backward-substitution'), + ('Thomas algorithm and counting of operations (floating point ' + 'and memory)', + 2, + None, + 'thomas-algorithm-and-counting-of-operations-floating-point-and-memory'), + ('"Example: Transpose of a ' + 'matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"', + 2, + None, + 'example-transpose-of-a-matrix-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program8-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program9-cpp'), + ('How do we define speedup? Simplest form', + 2, + None, + 'how-do-we-define-speedup-simplest-form'), + ('How do we define speedup? Correct baseline', + 2, + None, + 'how-do-we-define-speedup-correct-baseline'), + ('Parallel speedup', 2, None, 'parallel-speedup'), + ('Speedup and memory', 2, None, 'speedup-and-memory'), + ('Upper bounds on speedup', 2, None, 'upper-bounds-on-speedup'), + ("Amdahl's law", 2, None, 'amdahl-s-law'), + ('How much is parallelizable', + 2, + None, + 'how-much-is-parallelizable'), + ("Today's situation of parallel computing", + 2, + None, + 'today-s-situation-of-parallel-computing'), + ('Overhead present in parallel computing', + 2, + None, + 'overhead-present-in-parallel-computing'), + ('Parallelizing a sequential algorithm', + 2, + None, + 'parallelizing-a-sequential-algorithm'), + ('Strategies', 2, None, 'strategies'), + ('How do I run MPI on a PC/Laptop? MPI', + 2, + None, + 'how-do-i-run-mpi-on-a-pc-laptop-mpi'), + ('Can I do it on my own PC/laptop? 
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -137,8 +514,127 @@
  • Blocking Transformations
  • Blocking Transformations, getting there
  • Blocking Transformations, final expressions
- • Example code form last week
- • Resampling analysis
+ • More on the blocking method
+ • Example code form last week
+ • Resampling analysis
+ • Content
+ • Optimization and profiling
+ • More on optimization
+ • Optimization and profiling
+ • Optimization and debugging
+ • Other hints
+ • Vectorization and the basic idea behind parallel computing
+ • A rough classification of hardware models
+ • Shared memory and distributed memory
+ • Different parallel programming paradigms
+ • Different parallel programming paradigms
+ • What is vectorization?
+ • Number of elements that can acted upon
+ • Number of elements that can acted upon, examples
+ • Operation counts for scalar operation
+ • Number of elements that can acted upon, examples
+ • Number of operations when vectorized
+ • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
+ • Compiling with and without vectorization
+ • Compiling with and without vectorization using clang
+ • Automatic vectorization and vectorization inhibitors, criteria
+ • Automatic vectorization and vectorization inhibitors, exit criteria
+ • Automatic vectorization and vectorization inhibitors, straight-line code
+ • Automatic vectorization and vectorization inhibitors, nested loops
+ • Automatic vectorization and vectorization inhibitors, function calls
+ • Automatic vectorization and vectorization inhibitors, data dependencies
+ • Automatic vectorization and vectorization inhibitors, more data dependencies
+ • Automatic vectorization and vectorization inhibitors, memory stride
+ • Memory management
+ • Memory and communication
+ • Measuring performance
+ • Problems with measuring time
+ • Problems with cold start
+ • Problems with smart compilers
+ • Problems with interference
+ • Problems with measuring performance
+ • Thomas algorithm for tridiagonal linear algebra equations
+ • Thomas algorithm, forward substitution
+ • Thomas algorithm, backward substitution
+ • Thomas algorithm and counting of operations (floating point and memory)
+ • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
+ • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
+ • How do we define speedup? Simplest form
+ • How do we define speedup? Correct baseline
+ • Parallel speedup
+ • Speedup and memory
+ • Upper bounds on speedup
+ • Amdahl's law
+ • How much is parallelizable
+ • Today's situation of parallel computing
+ • Overhead present in parallel computing
+ • Parallelizing a sequential algorithm
+ • Strategies
+ • How do I run MPI on a PC/Laptop? MPI
+ • Can I do it on my own PC/laptop? OpenMP installation
+ • Installing MPI
+ • Installing MPI and using Qt
+ • What is Message Passing Interface (MPI)?
+ • Going Parallel with MPI
+ • MPI is a library
+ • Bindings to MPI routines
+ • Communicator
+ • Some of the most important MPI functions
+ • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
+ • The Fortran program
+ • Note 1
+ • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
+ • Note 2
+ • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
+ • Note 3
+ • Note 4
+ • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
+ • Dissection of trapezoidal rule with \( MPI\_reduce \)
+ • Dissection of trapezoidal rule
+ • Integrating with MPI
+ • How do I use \( MPI\_reduce \)?
+ • More on \( MPI\_Reduce \)
+ • Dissection of trapezoidal rule
+ • Dissection of trapezoidal rule
+ • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
+ • What is OpenMP
+ • Getting started, things to remember
+ • OpenMP syntax
+ • Different OpenMP styles of parallelism
+ • General code structure
+ • Parallel region
+ • Hello world, not again, please!
+ • Hello world, yet another variant
+ • Important OpenMP library routines
+ • Private variables
+ • Master region
+ • Parallel for loop
+ • Parallel computations and loops
+ • Scheduling of loop computations
+ • Example code for loop scheduling
+ • Example code for loop scheduling, guided instead of dynamic
+ • More on Parallel for loop
+ • What can happen with this loop?
+ • Inner product
+ • Different threads do different tasks
+ • Single execution
+ • Coordination and synchronization
+ • Data scope
+ • Some remarks
+ • Parallelizing nested for-loops
+ • Nested parallelism
+ • Parallel tasks
+ • Common mistakes
+ • Not all computations are simple
+ • Not all computations are simple, competing threads
+ • How to find the max value using OpenMP
+ • Then deal with the race conditions
+ • What can slow down OpenMP performance?
+ • What can slow down OpenMP performance?
+ • Find the max location for each thread
+ • Combine the values from each thread
+ • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
+ • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"

@@ -192,7 +688,7 @@

    Central limit theorem

  • 14
  • 15
  • ...
- • 22
+ • 141
  • »
diff --git a/doc/pub/week9/html/._week9-bs006.html b/doc/pub/week9/html/._week9-bs006.html
index c5569c77..b4cebfed 100644
--- a/doc/pub/week9/html/._week9-bs006.html
+++ b/doc/pub/week9/html/._week9-bs006.html
diff --git a/doc/pub/week9/html/._week9-bs007.html b/doc/pub/week9/html/._week9-bs007.html
index 4c38912b..ac828571 100644
--- a/doc/pub/week9/html/._week9-bs007.html
+++ b/doc/pub/week9/html/._week9-bs007.html
diff --git a/doc/pub/week9/html/._week9-bs008.html b/doc/pub/week9/html/._week9-bs008.html
index a20bb699..5eae9dce 100644
--- a/doc/pub/week9/html/._week9-bs008.html
+++ b/doc/pub/week9/html/._week9-bs008.html
@@ -137,8 +514,127 @@
  • Blocking Transformations
  • Blocking Transformations, getting there
  • Blocking Transformations, final expressions
  • -
  • Example code form last week
  • -
  • Resampling analysis
  • +
  • More on the blocking method
  • +
  • Example code form last week
  • +
  • Resampling analysis
  • +
  • Content
  • +
  • Optimization and profiling
  • +
  • More on optimization
  • +
  • Optimization and profiling
  • +
  • Optimization and debugging
  • +
  • Other hints
  • +
  • Vectorization and the basic idea behind parallel computing
  • +
  • A rough classification of hardware models
  • +
  • Shared memory and distributed memory
  • +
  • Different parallel programming paradigms
  • +
  • Different parallel programming paradigms
  • +
  • What is vectorization?
  • +
  • Number of elements that can acted upon
  • +
  • Number of elements that can acted upon, examples
  • +
  • Operation counts for scalar operation
  • +
  • Number of elements that can acted upon, examples
  • +
  • Number of operations when vectorized
  • +
  • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
  • +
  • Compiling with and without vectorization
  • +
  • Compiling with and without vectorization using clang
  • +
  • Automatic vectorization and vectorization inhibitors, criteria
  • +
  • Automatic vectorization and vectorization inhibitors, exit criteria
  • +
  • Automatic vectorization and vectorization inhibitors, straight-line code
  • +
  • Automatic vectorization and vectorization inhibitors, nested loops
  • +
  • Automatic vectorization and vectorization inhibitors, function calls
  • +
  • Automatic vectorization and vectorization inhibitors, data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, more data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, memory stride
  • +
  • Memory management
  • +
  • Memory and communication
  • +
  • Measuring performance
  • +
  • Problems with measuring time
  • +
  • Problems with cold start
  • +
  • Problems with smart compilers
  • +
  • Problems with interference
  • +
  • Problems with measuring performance
  • +
  • Thomas algorithm for tridiagonal linear algebra equations
  • +
  • Thomas algorithm, forward substitution
  • +
  • Thomas algorithm, backward substitution
  • +
  • Thomas algorithm and counting of operations (floating point and memory)
  • +
  • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
  • +
  • How do we define speedup? Simplest form
  • +
  • How do we define speedup? Correct baseline
  • +
  • Parallel speedup
  • +
  • Speedup and memory
  • +
  • Upper bounds on speedup
  • +
  • Amdahl's law
  • +
  • How much is parallelizable
  • +
  • Today's situation of parallel computing
  • +
  • Overhead present in parallel computing
  • +
  • Parallelizing a sequential algorithm
  • +
  • Strategies
  • +
  • How do I run MPI on a PC/Laptop? MPI
  • +
  • Can I do it on my own PC/laptop? OpenMP installation
  • +
  • Installing MPI
  • +
  • Installing MPI and using Qt
  • +
  • What is Message Passing Interface (MPI)?
  • +
  • Going Parallel with MPI
  • +
  • MPI is a library
  • +
  • Bindings to MPI routines
  • +
  • Communicator
  • +
  • Some of the most important MPI functions
  • +
  • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
  • +
  • The Fortran program
  • +
  • Note 1
  • +
  • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
  • +
  • Note 2
  • +
  • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
  • +
  • Note 3
  • +
  • Note 4
  • +
  • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
  • +
  • Dissection of trapezoidal rule with \( MPI\_reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Integrating with MPI
  • +
  • How do I use \( MPI\_reduce \)?
  • +
  • More on \( MPI\_Reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Dissection of trapezoidal rule
  • +
  • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
  • +
  • What is OpenMP
  • +
  • Getting started, things to remember
  • +
  • OpenMP syntax
  • +
  • Different OpenMP styles of parallelism
  • +
  • General code structure
  • +
  • Parallel region
  • +
  • Hello world, not again, please!
  • +
  • Hello world, yet another variant
  • +
  • Important OpenMP library routines
  • +
  • Private variables
  • +
  • Master region
  • +
  • Parallel for loop
  • +
  • Parallel computations and loops
  • +
  • Scheduling of loop computations
  • +
  • Example code for loop scheduling
  • +
  • Example code for loop scheduling, guided instead of dynamic
  • +
  • More on Parallel for loop
  • +
  • What can happen with this loop?
  • +
  • Inner product
  • +
  • Different threads do different tasks
  • +
  • Single execution
  • +
  • Coordination and synchronization
  • +
  • Data scope
  • +
  • Some remarks
  • +
  • Parallelizing nested for-loops
  • +
  • Nested parallelism
  • +
  • Parallel tasks
  • +
  • Common mistakes
  • +
  • Not all computations are simple
  • +
  • Not all computations are simple, competing threads
  • +
  • How to find the max value using OpenMP
  • +
  • Then deal with the race conditions
  • +
  • What can slow down OpenMP performance?
  • +
  • What can slow down OpenMP performance?
  • +
  • Find the max location for each thread
  • +
  • Combine the values from each thread
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"
  • @@ -191,7 +687,7 @@

    Adding more definitions

  • 17
  • 18
  • ...
  • -
  • 22
  • +
  • 141
  • »
  • diff --git a/doc/pub/week9/html/._week9-bs009.html b/doc/pub/week9/html/._week9-bs009.html index 9ecf8b6f..07e5de6b 100644 --- a/doc/pub/week9/html/._week9-bs009.html +++ b/doc/pub/week9/html/._week9-bs009.html

    Further rewriting

  • 18
  • 19
  • ...
  • -
  • 22
  • +
  • 141
  • »
  • diff --git a/doc/pub/week9/html/._week9-bs010.html b/doc/pub/week9/html/._week9-bs010.html index f41eadcc..14a3cd97 100644 --- a/doc/pub/week9/html/._week9-bs010.html +++ b/doc/pub/week9/html/._week9-bs010.html

    The covariance term

  • 19
  • 20
  • ...
  • -
  • 22
  • +
  • 141
  • »
  • diff --git a/doc/pub/week9/html/._week9-bs011.html b/doc/pub/week9/html/._week9-bs011.html index 765472ef..5980ce77 100644 --- a/doc/pub/week9/html/._week9-bs011.html +++ b/doc/pub/week9/html/._week9-bs011.html
diff --git a/doc/pub/week9/html/._week9-bs012.html b/doc/pub/week9/html/._week9-bs012.html
index 083ad35c..7610e62a 100644
--- a/doc/pub/week9/html/._week9-bs012.html
+++ b/doc/pub/week9/html/._week9-bs012.html
diff --git a/doc/pub/week9/html/._week9-bs013.html b/doc/pub/week9/html/._week9-bs013.html
index b3926051..39f540a0 100644
--- a/doc/pub/week9/html/._week9-bs013.html
+++ b/doc/pub/week9/html/._week9-bs013.html
@@ -153,9 +649,9 @@

    Resampling methods: Blocking

 The blocking method was made popular by Flyvbjerg and Pedersen (1989)
-and has become one of the standard ways to estimate
-\( V(\widehat{\theta}) \) for exactly one \( \widehat{\theta} \), namely
-\( \widehat{\theta} = \overline{X} \).
+and has become one of the standard ways to estimate the variance
+\( \mathrm{var}(\widehat{\theta}) \) for exactly one estimator \( \widehat{\theta} \), namely
+\( \widehat{\theta} = \overline{X} \), the mean value.

Assume \( n = 2^d \) for some integer \( d>1 \) and \( X_1,X_2,\cdots, X_n \) is a stationary time series to begin with.
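To make this setup concrete, the sketch below shows the blocking idea in plain Python. It is a minimal illustration, not the course's reference implementation, and all function and variable names are chosen here for the example. Each blocking transformation averages neighbouring pairs of the series, halving its length, and the naive estimate \( \widehat{\sigma}^2_k/n_k \) of \( \mathrm{var}(\overline{X}) \) is recomputed after every transformation; for correlated data these estimates grow with the blocking level and settle on a plateau.

import numpy as np

def block(x):
    # One blocking transformation: average neighbouring pairs,
    # halving the length of the (stationary) series x.
    return 0.5 * (x[0::2] + x[1::2])

def blocking_estimates(x):
    # Naive estimate var(x)/n of var(mean) at every blocking level,
    # assuming len(x) = 2^d for some integer d > 1.
    x = np.asarray(x, dtype=float)
    out = []
    while x.size >= 2:
        out.append(x.var(ddof=1) / x.size)
        x = block(x)
    return np.array(out)

# Illustrative use on a correlated AR(1) toy series of length 2^16 (hypothetical data).
rng = np.random.default_rng(2024)
n, phi = 2**16, 0.9
series = np.empty(n)
series[0] = rng.normal()
for i in range(1, n):
    series[i] = phi * series[i - 1] + rng.normal()

print(blocking_estimates(series))  # values grow, then plateau near var(mean)

The plateau, where successive estimates agree within their statistical uncertainty, gives the blocking estimate of \( \mathrm{var}(\overline{X}) \).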

diff --git a/doc/pub/week9/html/._week9-bs014.html b/doc/pub/week9/html/._week9-bs014.html
index 17f75c4d..fa7b50aa 100644
[tocinfo comment updated with the new section entries; the additions are listed in the navigation menu below]
  Blocking Transformations
  Blocking Transformations, getting there
  Blocking Transformations, final expressions
- Example code form last week
- Resampling analysis
+ More on the blocking method
+ Example code form last week
+ Resampling analysis
+ Content
+ Optimization and profiling
+ More on optimization
+ Optimization and profiling
+ Optimization and debugging
+ Other hints
+ Vectorization and the basic idea behind parallel computing
+ A rough classification of hardware models
+ Shared memory and distributed memory
+ Different parallel programming paradigms
+ Different parallel programming paradigms
+ What is vectorization?
+ Number of elements that can acted upon
+ Number of elements that can acted upon, examples
+ Operation counts for scalar operation
+ Number of elements that can acted upon, examples
+ Number of operations when vectorized
+ "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
+ Compiling with and without vectorization
+ Compiling with and without vectorization using clang
+ Automatic vectorization and vectorization inhibitors, criteria
+ Automatic vectorization and vectorization inhibitors, exit criteria
+ Automatic vectorization and vectorization inhibitors, straight-line code
+ Automatic vectorization and vectorization inhibitors, nested loops
+ Automatic vectorization and vectorization inhibitors, function calls
+ Automatic vectorization and vectorization inhibitors, data dependencies
+ Automatic vectorization and vectorization inhibitors, more data dependencies
+ Automatic vectorization and vectorization inhibitors, memory stride
+ Memory management
+ Memory and communication
+ Measuring performance
+ Problems with measuring time
+ Problems with cold start
+ Problems with smart compilers
+ Problems with interference
+ Problems with measuring performance
+ Thomas algorithm for tridiagonal linear algebra equations
+ Thomas algorithm, forward substitution
+ Thomas algorithm, backward substitution
+ Thomas algorithm and counting of operations (floating point and memory)
+ "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
+ "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
+ How do we define speedup? Simplest form
+ How do we define speedup? Correct baseline
+ Parallel speedup
+ Speedup and memory
+ Upper bounds on speedup
+ Amdahl's law
+ How much is parallelizable
+ Today's situation of parallel computing
+ Overhead present in parallel computing
+ Parallelizing a sequential algorithm
+ Strategies
+ How do I run MPI on a PC/Laptop? MPI
+ Can I do it on my own PC/laptop? OpenMP installation
+ Installing MPI
+ Installing MPI and using Qt
+ What is Message Passing Interface (MPI)?
+ Going Parallel with MPI
+ MPI is a library
+ Bindings to MPI routines
+ Communicator
+ Some of the most important MPI functions
+ "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
+ The Fortran program
+ Note 1
+ "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
+ Note 2
+ "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
+ Note 3
+ Note 4
+ "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
+ Dissection of trapezoidal rule with \( MPI\_reduce \)
+ Dissection of trapezoidal rule
+ Integrating with MPI
+ How do I use \( MPI\_reduce \)?
+ More on \( MPI\_Reduce \)
+ Dissection of trapezoidal rule
+ Dissection of trapezoidal rule
+ "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
+ What is OpenMP
+ Getting started, things to remember
+ OpenMP syntax
+ Different OpenMP styles of parallelism
+ General code structure
+ Parallel region
+ Hello world, not again, please!
+ Hello world, yet another variant
+ Important OpenMP library routines
+ Private variables
+ Master region
+ Parallel for loop
+ Parallel computations and loops
+ Scheduling of loop computations
+ Example code for loop scheduling
+ Example code for loop scheduling, guided instead of dynamic
+ More on Parallel for loop
+ What can happen with this loop?
+ Inner product
+ Different threads do different tasks
+ Single execution
+ Coordination and synchronization
+ Data scope
+ Some remarks
+ Parallelizing nested for-loops
+ Nested parallelism
+ Parallel tasks
+ Common mistakes
+ Not all computations are simple
+ Not all computations are simple, competing threads
+ How to find the max value using OpenMP
+ Then deal with the race conditions
+ What can slow down OpenMP performance?
+ What can slow down OpenMP performance?
+ Find the max location for each thread
+ Combine the values from each thread
+ "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
+ "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"

    Why blocking?

diff --git a/doc/pub/week9/html/._week9-bs015.html b/doc/pub/week9/html/._week9-bs015.html
index 58375c48..72673cc9 100644
[same tocinfo and navigation-menu additions as in ._week9-bs014.html above]

    Blocking Transformations

  • diff --git a/doc/pub/week9/html/._week9-bs016.html b/doc/pub/week9/html/._week9-bs016.html index 2e67d0cc..74235b3b 100644 --- a/doc/pub/week9/html/._week9-bs016.html +++ b/doc/pub/week9/html/._week9-bs016.html @@ -79,11 +79,388 @@ 2, None, 'blocking-transformations-final-expressions'), + ('More on the blocking method', + 2, + None, + 'more-on-the-blocking-method'), ('Example code form last week', 2, None, 'example-code-form-last-week'), - ('Resampling analysis', 2, None, 'resampling-analysis')]} + ('Resampling analysis', 2, None, 'resampling-analysis'), + ('Content', 2, None, 'content'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('More on optimization', 2, None, 'more-on-optimization'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('Optimization and debugging', + 2, + None, + 'optimization-and-debugging'), + ('Other hints', 2, None, 'other-hints'), + ('Vectorization and the basic idea behind parallel computing', + 2, + None, + 'vectorization-and-the-basic-idea-behind-parallel-computing'), + ('A rough classification of hardware models', + 2, + None, + 'a-rough-classification-of-hardware-models'), + ('Shared memory and distributed memory', + 2, + None, + 'shared-memory-and-distributed-memory'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('What is vectorization?', 2, None, 'what-is-vectorization'), + ('Number of elements that can acted upon', + 2, + None, + 'number-of-elements-that-can-acted-upon'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Operation counts for scalar operation', + 2, + None, + 'operation-counts-for-scalar-operation'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Number of operations when vectorized', + 2, + None, + 'number-of-operations-when-vectorized'), + ('"A simple test case with and without ' + 'vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"', + 2, + None, + 'a-simple-test-case-with-and-without-vectorization-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program7-cpp'), + ('Compiling with and without vectorization', + 2, + None, + 'compiling-with-and-without-vectorization'), + ('Compiling with and without vectorization using clang', + 2, + None, + 'compiling-with-and-without-vectorization-using-clang'), + ('Automatic vectorization and vectorization inhibitors, criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-criteria'), + ('Automatic vectorization and vectorization inhibitors, exit ' + 'criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-exit-criteria'), + ('Automatic vectorization and vectorization inhibitors, ' + 'straight-line code', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-straight-line-code'), + ('Automatic vectorization and vectorization inhibitors, nested ' + 'loops', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-nested-loops'), + ('Automatic vectorization and vectorization inhibitors, function ' + 'calls', + 2, + None, + 
'automatic-vectorization-and-vectorization-inhibitors-function-calls'), + ('Automatic vectorization and vectorization inhibitors, data ' + 'dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, more ' + 'data dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-more-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, memory ' + 'stride', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-memory-stride'), + ('Memory management', 2, None, 'memory-management'), + ('Memory and communication', 2, None, 'memory-and-communication'), + ('Measuring performance', 2, None, 'measuring-performance'), + ('Problems with measuring time', + 2, + None, + 'problems-with-measuring-time'), + ('Problems with cold start', 2, None, 'problems-with-cold-start'), + ('Problems with smart compilers', + 2, + None, + 'problems-with-smart-compilers'), + ('Problems with interference', + 2, + None, + 'problems-with-interference'), + ('Problems with measuring performance', + 2, + None, + 'problems-with-measuring-performance'), + ('Thomas algorithm for tridiagonal linear algebra equations', + 2, + None, + 'thomas-algorithm-for-tridiagonal-linear-algebra-equations'), + ('Thomas algorithm, forward substitution', + 2, + None, + 'thomas-algorithm-forward-substitution'), + ('Thomas algorithm, backward substitution', + 2, + None, + 'thomas-algorithm-backward-substitution'), + ('Thomas algorithm and counting of operations (floating point ' + 'and memory)', + 2, + None, + 'thomas-algorithm-and-counting-of-operations-floating-point-and-memory'), + ('"Example: Transpose of a ' + 'matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"', + 2, + None, + 'example-transpose-of-a-matrix-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program8-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program9-cpp'), + ('How do we define speedup? Simplest form', + 2, + None, + 'how-do-we-define-speedup-simplest-form'), + ('How do we define speedup? Correct baseline', + 2, + None, + 'how-do-we-define-speedup-correct-baseline'), + ('Parallel speedup', 2, None, 'parallel-speedup'), + ('Speedup and memory', 2, None, 'speedup-and-memory'), + ('Upper bounds on speedup', 2, None, 'upper-bounds-on-speedup'), + ("Amdahl's law", 2, None, 'amdahl-s-law'), + ('How much is parallelizable', + 2, + None, + 'how-much-is-parallelizable'), + ("Today's situation of parallel computing", + 2, + None, + 'today-s-situation-of-parallel-computing'), + ('Overhead present in parallel computing', + 2, + None, + 'overhead-present-in-parallel-computing'), + ('Parallelizing a sequential algorithm', + 2, + None, + 'parallelizing-a-sequential-algorithm'), + ('Strategies', 2, None, 'strategies'), + ('How do I run MPI on a PC/Laptop? MPI', + 2, + None, + 'how-do-i-run-mpi-on-a-pc-laptop-mpi'), + ('Can I do it on my own PC/laptop? 
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -137,8 +514,127 @@
  • Blocking Transformations
  • Blocking Transformations, getting there
  • Blocking Transformations, final expressions
  • -
  • Example code form last week
  • -
  • Resampling analysis
  • +
  • More on the blocking method
  • +
  • Example code form last week
  • +
  • Resampling analysis
  • +
  • Content
  • +
  • Optimization and profiling
  • +
  • More on optimization
  • +
  • Optimization and profiling
  • +
  • Optimization and debugging
  • +
  • Other hints
  • +
  • Vectorization and the basic idea behind parallel computing
  • +
  • A rough classification of hardware models
  • +
  • Shared memory and distributed memory
  • +
  • Different parallel programming paradigms
  • +
  • Different parallel programming paradigms
  • +
  • What is vectorization?
  • +
  • Number of elements that can acted upon
  • +
  • Number of elements that can acted upon, examples
  • +
  • Operation counts for scalar operation
  • +
  • Number of elements that can acted upon, examples
  • +
  • Number of operations when vectorized
  • +
  • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
  • +
  • Compiling with and without vectorization
  • +
  • Compiling with and without vectorization using clang
  • +
  • Automatic vectorization and vectorization inhibitors, criteria
  • +
  • Automatic vectorization and vectorization inhibitors, exit criteria
  • +
  • Automatic vectorization and vectorization inhibitors, straight-line code
  • +
  • Automatic vectorization and vectorization inhibitors, nested loops
  • +
  • Automatic vectorization and vectorization inhibitors, function calls
  • +
  • Automatic vectorization and vectorization inhibitors, data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, more data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, memory stride
  • +
  • Memory management
  • +
  • Memory and communication
  • +
  • Measuring performance
  • +
  • Problems with measuring time
  • +
  • Problems with cold start
  • +
  • Problems with smart compilers
  • +
  • Problems with interference
  • Problems with measuring performance
  • Thomas algorithm for tridiagonal linear algebra equations
  • Thomas algorithm, forward substitution
  • Thomas algorithm, backward substitution
  • Thomas algorithm and counting of operations (floating point and memory)
  • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
  • How do we define speedup? Simplest form
  • How do we define speedup? Correct baseline
  • Parallel speedup
  • Speedup and memory
  • Upper bounds on speedup
  • Amdahl's law
  • How much is parallelizable
  • Today's situation of parallel computing
  • Overhead present in parallel computing
  • Parallelizing a sequential algorithm
  • Strategies
  • How do I run MPI on a PC/Laptop? MPI
  • Can I do it on my own PC/laptop? OpenMP installation
  • Installing MPI
  • Installing MPI and using Qt
  • What is Message Passing Interface (MPI)?
  • Going Parallel with MPI
  • MPI is a library
  • Bindings to MPI routines
  • Communicator
  • Some of the most important MPI functions
  • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
  • The Fortran program
  • Note 1
  • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
  • Note 2
  • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
  • Note 3
  • Note 4
  • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
  • Dissection of trapezoidal rule with \( MPI\_reduce \)
  • Dissection of trapezoidal rule
  • Integrating with MPI
  • How do I use \( MPI\_reduce \)?
  • More on \( MPI\_Reduce \)
  • Dissection of trapezoidal rule
  • Dissection of trapezoidal rule
  • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
  • What is OpenMP
  • Getting started, things to remember
  • OpenMP syntax
  • Different OpenMP styles of parallelism
  • General code structure
  • Parallel region
  • Hello world, not again, please!
  • Hello world, yet another variant
  • Important OpenMP library routines
  • Private variables
  • Master region
  • Parallel for loop
  • Parallel computations and loops
  • Scheduling of loop computations
  • Example code for loop scheduling
  • Example code for loop scheduling, guided instead of dynamic
  • More on Parallel for loop
  • What can happen with this loop?
  • Inner product
  • Different threads do different tasks
  • Single execution
  • Coordination and synchronization
  • Data scope
  • Some remarks
  • Parallelizing nested for-loops
  • Nested parallelism
  • Parallel tasks
  • Common mistakes
  • Not all computations are simple
  • Not all computations are simple, competing threads
  • How to find the max value using OpenMP
  • Then deal with the race conditions
  • What can slow down OpenMP performance?
  • What can slow down OpenMP performance?
  • Find the max location for each thread
  • Combine the values from each thread
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"

    Blocking transformations

  • diff --git a/doc/pub/week9/html/._week9-bs017.html b/doc/pub/week9/html/._week9-bs017.html

    Blocking Transformations

  • diff --git a/doc/pub/week9/html/._week9-bs018.html b/doc/pub/week9/html/._week9-bs018.html

    Blocking Transfor
  • diff --git a/doc/pub/week9/html/._week9-bs019.html b/doc/pub/week9/html/._week9-bs019.html
  • @@ -168,16 +664,6 @@

Blocking Transformations, final expressions
\end{align} $$

-Flyvbjerg and Petersen demonstrated that the sequence
-\( \{e_k\}_{k=0}^{d-1} \) is decreasing, and conjectured that the term
-\( e_k \) can be made as small as we would like by making \( k \) (and hence
-\( d \)) sufficiently large. The sequence is decreasing (Master of Science thesis
-by Marius Jonsson, UiO 2018). This means we can apply blocking transformations until
-\( e_k \) is sufficiently small, and then estimate \( \mathrm{var}(\overline{X}) \) by
-\( \widehat{\sigma}^2_k/n_k \).
-

For an elegant solution and proof of the blocking method, see the recent article of Marius Jonsson (former MSc student of the Computational Physics group).

diff --git a/doc/pub/week9/html/._week9-bs020.html b/doc/pub/week9/html/._week9-bs020.html
index 29d40bbe..5dc169b1 100644
--- a/doc/pub/week9/html/._week9-bs020.html
+++ b/doc/pub/week9/html/._week9-bs020.html
  • @@ -150,245 +646,18 @@

     

     

     

    -

Example code from last week

    - - -
    -
    -
    -
    -
    -
    # 2-electron VMC code for 2dim quantum dot with importance sampling
    -# Using gaussian rng for new positions and Metropolis- Hastings 
    -# Added energy minimization
    -from math import exp, sqrt
    -from random import random, seed, normalvariate
    -import numpy as np
    -import matplotlib.pyplot as plt
    -from mpl_toolkits.mplot3d import Axes3D
    -from matplotlib import cm
    -from matplotlib.ticker import LinearLocator, FormatStrFormatter
    -from scipy.optimize import minimize
    -import sys
    -import os
    -
    -# Where to save data files
    -PROJECT_ROOT_DIR = "Results"
    -DATA_ID = "Results/EnergyMin"
    +

    More on the blocking method

[... remainder of the removed 2-electron VMC program (the same program appears in full under "Resampling analysis" in ._week9-bs021.html below) ...]
    +

Flyvbjerg and Petersen demonstrated that the sequence
+\( \{e_k\}_{k=0}^{d-1} \) is decreasing, and conjectured that the term
+\( e_k \) can be made as small as we would like by making \( k \) (and hence
+\( d \)) sufficiently large. Since the sequence is decreasing, we can apply
+blocking transformations until \( e_k \) is sufficiently small, and then estimate
+\( \mathrm{var}(\overline{X}) \) by \( \widehat{\sigma}^2_k/n_k \).
+

    +

    For an elegant solution and proof of the blocking method, see the recent article of Marius Jonsson (former MSc student of the Computational Physics group).
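To make the recipe above concrete, here is a minimal Python sketch (an illustrative helper with the hypothetical name blocking_variance_levels, not the code from the article): it repeatedly applies the blocking transformation, averaging neighbouring pairs of samples, and records the estimate \( \widehat{\sigma}^2_k/n_k \) of \( \mathrm{var}(\overline{X}) \) at every level \( k \). The automated stopping test based on the quantity \( M_k \) is what the block function in the resampling code further below implements.

import numpy as np

def blocking_variance_levels(x):
    # Illustrative helper: return sigma_k^2/n_k for every blocking level k.
    x = np.asarray(x, dtype=float)
    d = int(np.log2(len(x)))               # number of possible blocking transformations
    estimates = []
    for k in range(d):
        n_k = len(x)
        estimates.append(np.var(x) / n_k)  # naive variance of the mean at level k
        x = 0.5 * (x[0::2] + x[1::2])      # blocking transformation: average neighbouring pairs
    return estimates

# Toy usage on synthetic, correlated (AR(1)-like) data of length 2^14
rng = np.random.default_rng(2024)
noise = rng.normal(size=2**14)
samples = np.empty_like(noise)
samples[0] = noise[0]
for i in range(1, len(noise)):
    samples[i] = 0.9 * samples[i - 1] + noise[i]
print(blocking_variance_levels(samples))

For correlated data like this the printed estimates typically grow with \( k \) and then flatten out; the plateau value is the blocking estimate of \( \mathrm{var}(\overline{X}) \).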

diff --git a/doc/pub/week9/html/._week9-bs021.html b/doc/pub/week9/html/._week9-bs021.html
index a017baf4..12543a6e 100644
--- a/doc/pub/week9/html/._week9-bs021.html
+++ b/doc/pub/week9/html/._week9-bs021.html
  • @@ -150,13 +646,7 @@

     

     

     

    -

    Resampling analysis

    - -

The next step is then to use the above data sets and perform a
-resampling analysis using the blocking method.
-The blocking code, based on the article of Marius Jonsson, is given here.

    - +

Example code from last week

    @@ -164,60 +654,221 @@

    Resampling analysis

    -
    # Common imports
    +  
    # 2-electron VMC code for 2dim quantum dot with importance sampling
+# Using gaussian rng for new positions and Metropolis-Hastings
    +# Added energy minimization
    +from math import exp, sqrt
    +from random import random, seed, normalvariate
    +import numpy as np
    +import matplotlib.pyplot as plt
    +from mpl_toolkits.mplot3d import Axes3D
    +from matplotlib import cm
    +from matplotlib.ticker import LinearLocator, FormatStrFormatter
    +from scipy.optimize import minimize
    +import sys
     import os
     
    -# Where to save the figures and data files
    +# Where to save data files
    +PROJECT_ROOT_DIR = "Results"
     DATA_ID = "Results/EnergyMin"
     
    +if not os.path.exists(PROJECT_ROOT_DIR):
    +    os.mkdir(PROJECT_ROOT_DIR)
    +
    +if not os.path.exists(DATA_ID):
    +    os.makedirs(DATA_ID)
    +
     def data_path(dat_id):
         return os.path.join(DATA_ID, dat_id)
     
    -infile = open(data_path("Energies.dat"),'r')
    -
    -from numpy import log2, zeros, mean, var, sum, loadtxt, arange, array, cumsum, dot, transpose, diagonal, sqrt
    -from numpy.linalg import inv
    -
    -def block(x):
    -    # preliminaries
    -    n = len(x)
    -    d = int(log2(n))
    -    s, gamma = zeros(d), zeros(d)
    -    mu = mean(x)
    -
    -    # estimate the auto-covariance and variances 
    -    # for each blocking transformation
    -    for i in arange(0,d):
    -        n = len(x)
    -        # estimate autocovariance of x
    -        gamma[i] = (n)**(-1)*sum( (x[0:(n-1)]-mu)*(x[1:n]-mu) )
    -        # estimate variance of x
    -        s[i] = var(x)
    -        # perform blocking transformation
    -        x = 0.5*(x[0::2] + x[1::2])
    -   
    -    # generate the test observator M_k from the theorem
    -    M = (cumsum( ((gamma/s)**2*2**arange(1,d+1)[::-1])[::-1] )  )[::-1]
    -
    -    # we need a list of magic numbers
    -    q =array([6.634897,9.210340, 11.344867, 13.276704, 15.086272, 16.811894, 18.475307, 20.090235, 21.665994, 23.209251, 24.724970, 26.216967, 27.688250, 29.141238, 30.577914, 31.999927, 33.408664, 34.805306, 36.190869, 37.566235, 38.932173, 40.289360, 41.638398, 42.979820, 44.314105, 45.641683, 46.962942, 48.278236, 49.587884, 50.892181])
    -
    -    # use magic to determine when we should have stopped blocking
    -    for k in arange(0,d):
    -        if(M[k] < q[k]):
    -            break
    -    if (k >= d-1):
    -        print("Warning: Use more data")
    -    return mu, s[k]/2**(d-k)
    -
    -
    -x = loadtxt(infile)
    -(mean, var) = block(x) 
    -std = sqrt(var)
    +outfile = open(data_path("Energies.dat"),'w')
    +
    +
    +# Trial wave function for the 2-electron quantum dot in two dims
    +def WaveFunction(r,alpha,beta):
    +    r1 = r[0,0]**2 + r[0,1]**2
    +    r2 = r[1,0]**2 + r[1,1]**2
    +    r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
    +    deno = r12/(1+beta*r12)
    +    return exp(-0.5*alpha*(r1+r2)+deno)
    +
    +# Local energy  for the 2-electron quantum dot in two dims, using analytical local energy
    +def LocalEnergy(r,alpha,beta):
    +    
    +    r1 = (r[0,0]**2 + r[0,1]**2)
    +    r2 = (r[1,0]**2 + r[1,1]**2)
    +    r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
    +    deno = 1.0/(1+beta*r12)
    +    deno2 = deno*deno
    +    return 0.5*(1-alpha*alpha)*(r1 + r2) +2.0*alpha + 1.0/r12+deno2*(alpha*r12-deno2+2*beta*deno-1.0/r12)
    +
+# Derivative of the wave function ansatz as a function of the variational parameters
    +def DerivativeWFansatz(r,alpha,beta):
    +    
    +    WfDer  = np.zeros((2), np.double)
    +    r1 = (r[0,0]**2 + r[0,1]**2)
    +    r2 = (r[1,0]**2 + r[1,1]**2)
    +    r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
    +    deno = 1.0/(1+beta*r12)
    +    deno2 = deno*deno
    +    WfDer[0] = -0.5*(r1+r2)
    +    WfDer[1] = -r12*r12*deno2
    +    return  WfDer
    +
    +# Setting up the quantum force for the two-electron quantum dot, recall that it is a vector
    +def QuantumForce(r,alpha,beta):
    +
    +    qforce = np.zeros((NumberParticles,Dimension), np.double)
    +    r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
    +    deno = 1.0/(1+beta*r12)
    +    qforce[0,:] = -2*r[0,:]*alpha*(r[0,:]-r[1,:])*deno*deno/r12
    +    qforce[1,:] = -2*r[1,:]*alpha*(r[1,:]-r[0,:])*deno*deno/r12
    +    return qforce
    +    
    +
    +# Computing the derivative of the energy and the energy 
    +def EnergyDerivative(x0):
    +
    +    
    +    # Parameters in the Fokker-Planck simulation of the quantum force
    +    D = 0.5
    +    TimeStep = 0.05
    +    # positions
    +    PositionOld = np.zeros((NumberParticles,Dimension), np.double)
    +    PositionNew = np.zeros((NumberParticles,Dimension), np.double)
    +    # Quantum force
    +    QuantumForceOld = np.zeros((NumberParticles,Dimension), np.double)
    +    QuantumForceNew = np.zeros((NumberParticles,Dimension), np.double)
    +
    +    energy = 0.0
    +    DeltaE = 0.0
    +    alpha = x0[0]
    +    beta = x0[1]
    +    EnergyDer = 0.0
    +    DeltaPsi = 0.0
    +    DerivativePsiE = 0.0 
    +    #Initial position
    +    for i in range(NumberParticles):
    +        for j in range(Dimension):
    +            PositionOld[i,j] = normalvariate(0.0,1.0)*sqrt(TimeStep)
    +    wfold = WaveFunction(PositionOld,alpha,beta)
    +    QuantumForceOld = QuantumForce(PositionOld,alpha, beta)
    +
+    # Loop over MC cycles
+    for MCcycle in range(NumberMCcycles):
+        # Trial position, moving one particle at a time
    +        for i in range(NumberParticles):
    +            for j in range(Dimension):
    +                PositionNew[i,j] = PositionOld[i,j]+normalvariate(0.0,1.0)*sqrt(TimeStep)+\
    +                                       QuantumForceOld[i,j]*TimeStep*D
    +            wfnew = WaveFunction(PositionNew,alpha,beta)
    +            QuantumForceNew = QuantumForce(PositionNew,alpha, beta)
    +            GreensFunction = 0.0
    +            for j in range(Dimension):
    +                GreensFunction += 0.5*(QuantumForceOld[i,j]+QuantumForceNew[i,j])*\
    +	                              (D*TimeStep*0.5*(QuantumForceOld[i,j]-QuantumForceNew[i,j])-\
    +                                      PositionNew[i,j]+PositionOld[i,j])
    +      
    +            GreensFunction = exp(GreensFunction)
    +            ProbabilityRatio = GreensFunction*wfnew**2/wfold**2
    +            #Metropolis-Hastings test to see whether we accept the move
    +            if random() <= ProbabilityRatio:
    +                for j in range(Dimension):
    +                    PositionOld[i,j] = PositionNew[i,j]
    +                    QuantumForceOld[i,j] = QuantumForceNew[i,j]
    +                wfold = wfnew
    +        DeltaE = LocalEnergy(PositionOld,alpha,beta)
    +        DerPsi = DerivativeWFansatz(PositionOld,alpha,beta)
    +        DeltaPsi += DerPsi
    +        energy += DeltaE
    +        DerivativePsiE += DerPsi*DeltaE
    +            
    +    # We calculate mean values
    +    energy /= NumberMCcycles
    +    DerivativePsiE /= NumberMCcycles
    +    DeltaPsi /= NumberMCcycles
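+    # gradient of the expectation value of the local energy with respect to
+    # the variational parameters: dE/dp = 2*( <E_L dlnPsi/dp> - <E_L><dlnPsi/dp> )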
    +    EnergyDer  = 2*(DerivativePsiE-DeltaPsi*energy)
    +    return EnergyDer
    +
    +
    +# Computing the expectation value of the local energy 
    +def Energy(x0):
    +    # Parameters in the Fokker-Planck simulation of the quantum force
    +    D = 0.5
    +    TimeStep = 0.05
    +    # positions
    +    PositionOld = np.zeros((NumberParticles,Dimension), np.double)
    +    PositionNew = np.zeros((NumberParticles,Dimension), np.double)
    +    # Quantum force
    +    QuantumForceOld = np.zeros((NumberParticles,Dimension), np.double)
    +    QuantumForceNew = np.zeros((NumberParticles,Dimension), np.double)
    +
    +    energy = 0.0
    +    DeltaE = 0.0
    +    alpha = x0[0]
    +    beta = x0[1]
    +    #Initial position
    +    for i in range(NumberParticles):
    +        for j in range(Dimension):
    +            PositionOld[i,j] = normalvariate(0.0,1.0)*sqrt(TimeStep)
    +    wfold = WaveFunction(PositionOld,alpha,beta)
    +    QuantumForceOld = QuantumForce(PositionOld,alpha, beta)
    +
+    # Loop over MC cycles
+    for MCcycle in range(NumberMCcycles):
+        # Trial position, moving one particle at a time
    +        for i in range(NumberParticles):
    +            for j in range(Dimension):
    +                PositionNew[i,j] = PositionOld[i,j]+normalvariate(0.0,1.0)*sqrt(TimeStep)+\
    +                                       QuantumForceOld[i,j]*TimeStep*D
    +            wfnew = WaveFunction(PositionNew,alpha,beta)
    +            QuantumForceNew = QuantumForce(PositionNew,alpha, beta)
    +            GreensFunction = 0.0
    +            for j in range(Dimension):
    +                GreensFunction += 0.5*(QuantumForceOld[i,j]+QuantumForceNew[i,j])*\
    +	                              (D*TimeStep*0.5*(QuantumForceOld[i,j]-QuantumForceNew[i,j])-\
    +                                      PositionNew[i,j]+PositionOld[i,j])
    +      
    +            GreensFunction = exp(GreensFunction)
    +            ProbabilityRatio = GreensFunction*wfnew**2/wfold**2
    +            #Metropolis-Hastings test to see whether we accept the move
    +            if random() <= ProbabilityRatio:
    +                for j in range(Dimension):
    +                    PositionOld[i,j] = PositionNew[i,j]
    +                    QuantumForceOld[i,j] = QuantumForceNew[i,j]
    +                wfold = wfnew
    +        DeltaE = LocalEnergy(PositionOld,alpha,beta)
    +        energy += DeltaE
    +        if Printout: 
    +           outfile.write('%f\n' %(energy/(MCcycle+1.0)))            
    +    # We calculate mean values
    +    energy /= NumberMCcycles
    +    return energy
    +
    +#Here starts the main program with variable declarations
    +NumberParticles = 2
    +Dimension = 2
+# seed the random number generator
    +seed()
    +# Monte Carlo cycles for parameter optimization
    +Printout = False
    +NumberMCcycles= 10000
    +# guess for variational parameters
    +x0 = np.array([0.9,0.2])
+# Using the quasi-Newton BFGS method to find the optimal parameters
    +res = minimize(Energy, x0, method='BFGS', jac=EnergyDerivative, options={'gtol': 1e-4,'disp': True})
    +x0 = res.x
+# Compute the energy again with the optimal parameters and an increased number of Monte Carlo cycles
    +NumberMCcycles= 2**19
    +Printout = True
    +FinalEnergy = Energy(x0)
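+# the final energy is repeated so that this column matches the length of x0 in the DataFrame below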
    +EResult = np.array([FinalEnergy,FinalEnergy])
    +outfile.close()
    +#nice printout with Pandas
     import pandas as pd
     from pandas import DataFrame
    -data ={'Mean':[mean], 'STDev':[std]}
    -frame = pd.DataFrame(data,index=['Values'])
    +data ={'Optimal Parameters':x0, 'Final Energy':EResult}
    +frame = pd.DataFrame(data)
     print(frame)
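
The optimization step above follows a standard pattern: scipy.optimize.minimize with method='BFGS' and an analytic gradient supplied through the jac argument. As a minimal, self-contained sketch of that pattern (the function names f and df and the quadratic test problem are illustrative only, not part of the course code), the same call can be tried on a deterministic function first:

import numpy as np
from scipy.optimize import minimize

def f(x):
    # simple quadratic with minimum at (1, -2)
    return (x[0] - 1.0)**2 + 2.0*(x[1] + 2.0)**2

def df(x):
    # analytic gradient of f, playing the role of EnergyDerivative above
    return np.array([2.0*(x[0] - 1.0), 4.0*(x[1] + 2.0)])

res = minimize(f, np.array([0.0, 0.0]), method='BFGS', jac=df,
               options={'gtol': 1e-6, 'disp': True})
print(res.x)   # close to [1, -2]

In the VMC program the gradient is itself a Monte Carlo estimate, so the convergence tolerance and the number of cycles used during the optimization stage have to be chosen with the statistical noise in mind.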
     
diff --git a/doc/pub/week9/html/._week9-bs022.html b/doc/pub/week9/html/._week9-bs022.html
index bc5be66d..e49fae5f 100644
--- a/doc/pub/week9/html/._week9-bs022.html
+++ b/doc/pub/week9/html/._week9-bs022.html
@@ -191,28 +646,90 @@

     

     

     

Resampling methods: Blocking

The blocking method was made popular by Flyvbjerg and Pedersen (1989)
and has become one of the standard ways to estimate
\( V(\widehat{\theta}) \) for exactly one \( \widehat{\theta} \), namely
\( \widehat{\theta} = \overline{X} \).

Assume \( n = 2^d \) for some integer \( d>1 \) and \( X_1,X_2,\cdots, X_n \) is a stationary time series to begin with.
Moreover, assume that the time series is asymptotically uncorrelated. We switch to vector notation by arranging \( X_1,X_2,\cdots,X_n \) in an \( n \)-tuple. Define:

$$
\begin{align*}
\hat{X} = (X_1,X_2,\cdots,X_n).
\end{align*}
$$

The strength of the blocking method shows when the number of
observations \( n \) is large. For large \( n \), the complexity of dependent
bootstrapping scales poorly, but the blocking method does not;
moreover, it becomes more accurate the larger \( n \) is.

Resampling analysis

The next step is then to use the above data sets and perform a
resampling analysis using the blocking method.
The blocking code, based on the article of Marius Jonsson, is given here.
    # Common imports
    +import os
    +
    +# Where to save the figures and data files
    +DATA_ID = "Results/EnergyMin"
    +
    +def data_path(dat_id):
    +    return os.path.join(DATA_ID, dat_id)
    +
    +infile = open(data_path("Energies.dat"),'r')
    +
    +from numpy import log2, zeros, mean, var, sum, loadtxt, arange, array, cumsum, dot, transpose, diagonal, sqrt
    +from numpy.linalg import inv
    +
    +def block(x):
    +    # preliminaries
    +    n = len(x)
    +    d = int(log2(n))
    +    s, gamma = zeros(d), zeros(d)
    +    mu = mean(x)
    +
    +    # estimate the auto-covariance and variances 
    +    # for each blocking transformation
    +    for i in arange(0,d):
    +        n = len(x)
    +        # estimate autocovariance of x
    +        gamma[i] = (n)**(-1)*sum( (x[0:(n-1)]-mu)*(x[1:n]-mu) )
    +        # estimate variance of x
    +        s[i] = var(x)
    +        # perform blocking transformation
    +        x = 0.5*(x[0::2] + x[1::2])
    +   
+    # generate the test statistic M_k from the theorem
    +    M = (cumsum( ((gamma/s)**2*2**arange(1,d+1)[::-1])[::-1] )  )[::-1]
    +
    +    # we need a list of magic numbers
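+    # (these are the 0.99 quantiles of the chi-squared distribution with 1, 2, 3, ... degrees of freedom)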
    +    q =array([6.634897,9.210340, 11.344867, 13.276704, 15.086272, 16.811894, 18.475307, 20.090235, 21.665994, 23.209251, 24.724970, 26.216967, 27.688250, 29.141238, 30.577914, 31.999927, 33.408664, 34.805306, 36.190869, 37.566235, 38.932173, 40.289360, 41.638398, 42.979820, 44.314105, 45.641683, 46.962942, 48.278236, 49.587884, 50.892181])
    +
    +    # use magic to determine when we should have stopped blocking
    +    for k in arange(0,d):
    +        if(M[k] < q[k]):
    +            break
    +    if (k >= d-1):
    +        print("Warning: Use more data")
    +    return mu, s[k]/2**(d-k)
    +
    +
    +x = loadtxt(infile)
+(mean_value, variance) = block(x)
+std = sqrt(variance)
+import pandas as pd
+from pandas import DataFrame
+data = {'Mean':[mean_value], 'STDev':[std]}
    +frame = pd.DataFrame(data,index=['Values'])
    +print(frame)
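
As a quick sanity check of the blocking estimate, one can feed the block function above a synthetic, strongly correlated time series and compare its error estimate with the naive one that ignores autocorrelations. The sketch below is an illustration only: it assumes the block function from the listing above is defined in the same session, and the AR(1) coefficient 0.9 is an arbitrary choice; the blocking error should come out clearly larger than the naive one.

import numpy as np

np.random.seed(2019)
n = 2**16
phi = 0.9                                # AR(1) coefficient: strong positive correlation
noise = np.random.normal(0.0, 1.0, n)
x = np.zeros(n)
for t in range(1, n):
    x[t] = phi*x[t-1] + noise[t]

naive_error = np.std(x)/np.sqrt(n)       # ignores correlations between successive samples
mu, var_mean = block(x)                  # blocking estimate of the variance of the mean
print("naive error   :", naive_error)
print("blocking error:", np.sqrt(var_mean))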

  • +
  • Combine the values from each thread
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"
  • @@ -191,41 +646,17 @@

     

     

     

    -

    Blocking Transformations

    -

We now define the blocking transformations. The idea is to take the mean of subsequent pairs of elements from \( \vec{X} \) and form a new vector \( \vec{X}_1 \). Continuing in the same way by taking the mean of subsequent pairs of elements of \( \vec{X}_1 \) we obtain \( \vec{X}_2 \), and so on. Define \( \vec{X}_i \) recursively by:

$$
\begin{align}
(\vec{X}_0)_k &\equiv (\vec{X})_k \nonumber \\
(\vec{X}_{i+1})_k &\equiv \frac{1}{2}\Big( (\vec{X}_i)_{2k-1} + (\vec{X}_i)_{2k} \Big) \qquad \text{for all} \qquad 1 \leq i \leq d-1
\tag{5}
\end{align}
$$

The quantity \( \vec{X}_k \) is subject to \( k \) blocking transformations. We now have \( d \) vectors \( \vec{X}_0, \vec{X}_1, \cdots, \vec{X}_{d-1} \) containing the subsequent averages of observations. It turns out that if the components of \( \vec{X} \) form a stationary time series, then the components of \( \vec{X}_i \) form a stationary time series for all \( 0 \leq i \leq d-1 \).

We can then compute the autocovariance, the variance, the sample mean, and the number of observations for each \( i \). Let \( \gamma_i, \sigma_i^2, \overline{X}_i \) denote the autocovariance, variance and average of the elements of \( \vec{X}_i \), and let \( n_i \) be the number of elements of \( \vec{X}_i \). It follows by induction that \( n_i = n/2^i \).
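As a concrete illustration of the recursion in Eq. (5), the sketch below performs one blocking transformation by pairwise averaging and repeats it level by level. This is a minimal C++ sketch, not part of the original slides; the function name blocking_step is our own choice.

#include <cstddef>
#include <iostream>
#include <vector>

// One blocking transformation: (X_{i+1})_k = ( (X_i)_{2k-1} + (X_i)_{2k} )/2.
// With zero-based indexing this becomes x[2k] and x[2k+1]; the input length
// must be even, which holds at every level when n = 2^d.
std::vector<double> blocking_step(const std::vector<double>& x) {
  std::vector<double> xnew(x.size() / 2);
  for (std::size_t k = 0; k < xnew.size(); ++k)
    xnew[k] = 0.5 * (x[2 * k] + x[2 * k + 1]);
  return xnew;
}

int main() {
  std::vector<double> x = {1., 2., 3., 4., 5., 6., 7., 8.};  // n = 2^3, so d = 3
  for (int i = 0; x.size() >= 2; ++i) {
    std::cout << "level " << i << ": n_i = " << x.size() << '\n';
    x = blocking_step(x);   // produces the next vector X_{i+1}
  }
  return 0;
}

Running this prints n_i = 8, 4, 2, that is n_i = n/2^i at every level, as stated above.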

    - +

    Content

    +

    diff --git a/doc/pub/week9/html/._week9-bs024.html b/doc/pub/week9/html/._week9-bs024.html index 4381840f..2d43eb0d 100644 --- a/doc/pub/week9/html/._week9-bs024.html +++ b/doc/pub/week9/html/._week9-bs024.html @@ -47,6 +47,7 @@ None, 'and-why-do-we-use-such-methods'), ('Central limit theorem', 2, None, 'central-limit-theorem'), + ('Further remarks', 2, None, 'further-remarks'), ('Running many measurements', 2, None, @@ -62,62 +63,404 @@ 2, None, 'introducing-the-correlation-function'), - ('Statistics, wrapping up from last week', + ('Resampling methods: Blocking', 2, None, - 'statistics-wrapping-up-from-last-week'), - ('Statistics, final expression', + 'resampling-methods-blocking'), + ('Why blocking?', 2, None, 'why-blocking'), + ('Blocking Transformations', 2, None, 'blocking-transformations'), + ('Blocking transformations', 2, None, 'blocking-transformations'), + ('Blocking Transformations', 2, None, 'blocking-transformations'), + ('Blocking Transformations, getting there', 2, None, - 'statistics-final-expression'), - ('Statistics, effective number of correlations', + 'blocking-transformations-getting-there'), + ('Blocking Transformations, final expressions', 2, None, - 'statistics-effective-number-of-correlations'), - ('Can we understand this? Time Auto-correlation Function', + 'blocking-transformations-final-expressions'), + ('More on the blocking method', 2, None, - 'can-we-understand-this-time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'more-on-the-blocking-method'), + ('Example code form last week', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'example-code-form-last-week'), + ('Resampling analysis', 2, None, 'resampling-analysis'), + ('Content', 2, None, 'content'), + ('Optimization and profiling', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'optimization-and-profiling'), + ('More on optimization', 2, None, 'more-on-optimization'), + ('Optimization and profiling', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'optimization-and-profiling'), + ('Optimization and debugging', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'optimization-and-debugging'), + ('Other hints', 2, None, 'other-hints'), + ('Vectorization and the basic idea behind parallel computing', 2, None, - 'time-auto-correlation-function'), - ('Correlation Time', 2, None, 'correlation-time'), - ('Resampling methods: Blocking', + 'vectorization-and-the-basic-idea-behind-parallel-computing'), + ('A rough classification of hardware models', 2, None, - 'resampling-methods-blocking'), - ('Blocking Transformations', 2, None, 'blocking-transformations'), - ('Blocking Transformations', 2, None, 'blocking-transformations'), - ('Blocking Transformations, getting there', + 'a-rough-classification-of-hardware-models'), + ('Shared memory and distributed memory', 2, None, - 'blocking-transformations-getting-there'), - ('Blocking Transformations, final expressions', + 'shared-memory-and-distributed-memory'), + ('Different parallel programming paradigms', 2, None, - 'blocking-transformations-final-expressions'), - ('Example code form last week', + 'different-parallel-programming-paradigms'), + ('Different parallel programming paradigms', 2, None, - 'example-code-form-last-week'), - ('Resampling analysis', 2, None, 'resampling-analysis')]} + 'different-parallel-programming-paradigms'), + ('What is vectorization?', 2, None, 'what-is-vectorization'), + 
('Number of elements that can acted upon', + 2, + None, + 'number-of-elements-that-can-acted-upon'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Operation counts for scalar operation', + 2, + None, + 'operation-counts-for-scalar-operation'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Number of operations when vectorized', + 2, + None, + 'number-of-operations-when-vectorized'), + ('"A simple test case with and without ' + 'vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"', + 2, + None, + 'a-simple-test-case-with-and-without-vectorization-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program7-cpp'), + ('Compiling with and without vectorization', + 2, + None, + 'compiling-with-and-without-vectorization'), + ('Compiling with and without vectorization using clang', + 2, + None, + 'compiling-with-and-without-vectorization-using-clang'), + ('Automatic vectorization and vectorization inhibitors, criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-criteria'), + ('Automatic vectorization and vectorization inhibitors, exit ' + 'criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-exit-criteria'), + ('Automatic vectorization and vectorization inhibitors, ' + 'straight-line code', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-straight-line-code'), + ('Automatic vectorization and vectorization inhibitors, nested ' + 'loops', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-nested-loops'), + ('Automatic vectorization and vectorization inhibitors, function ' + 'calls', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-function-calls'), + ('Automatic vectorization and vectorization inhibitors, data ' + 'dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, more ' + 'data dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-more-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, memory ' + 'stride', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-memory-stride'), + ('Memory management', 2, None, 'memory-management'), + ('Memory and communication', 2, None, 'memory-and-communication'), + ('Measuring performance', 2, None, 'measuring-performance'), + ('Problems with measuring time', + 2, + None, + 'problems-with-measuring-time'), + ('Problems with cold start', 2, None, 'problems-with-cold-start'), + ('Problems with smart compilers', + 2, + None, + 'problems-with-smart-compilers'), + ('Problems with interference', + 2, + None, + 'problems-with-interference'), + ('Problems with measuring performance', + 2, + None, + 'problems-with-measuring-performance'), + ('Thomas algorithm for tridiagonal linear algebra equations', + 2, + None, + 'thomas-algorithm-for-tridiagonal-linear-algebra-equations'), + ('Thomas algorithm, forward substitution', + 2, + None, + 'thomas-algorithm-forward-substitution'), + ('Thomas algorithm, backward substitution', + 2, + None, + 'thomas-algorithm-backward-substitution'), + ('Thomas algorithm and counting of operations (floating point ' + 'and memory)', + 2, + 
None, + 'thomas-algorithm-and-counting-of-operations-floating-point-and-memory'), + ('"Example: Transpose of a ' + 'matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"', + 2, + None, + 'example-transpose-of-a-matrix-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program8-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program9-cpp'), + ('How do we define speedup? Simplest form', + 2, + None, + 'how-do-we-define-speedup-simplest-form'), + ('How do we define speedup? Correct baseline', + 2, + None, + 'how-do-we-define-speedup-correct-baseline'), + ('Parallel speedup', 2, None, 'parallel-speedup'), + ('Speedup and memory', 2, None, 'speedup-and-memory'), + ('Upper bounds on speedup', 2, None, 'upper-bounds-on-speedup'), + ("Amdahl's law", 2, None, 'amdahl-s-law'), + ('How much is parallelizable', + 2, + None, + 'how-much-is-parallelizable'), + ("Today's situation of parallel computing", + 2, + None, + 'today-s-situation-of-parallel-computing'), + ('Overhead present in parallel computing', + 2, + None, + 'overhead-present-in-parallel-computing'), + ('Parallelizing a sequential algorithm', + 2, + None, + 'parallelizing-a-sequential-algorithm'), + ('Strategies', 2, None, 'strategies'), + ('How do I run MPI on a PC/Laptop? MPI', + 2, + None, + 'how-do-i-run-mpi-on-a-pc-laptop-mpi'), + ('Can I do it on my own PC/laptop? 
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -157,29 +500,141 @@
  • Statistical analysis
  • And why do we use such methods?
  • Central limit theorem
  • -
  • Running many measurements
  • -
  • Adding more definitions
  • -
  • Further rewriting
  • -
  • The covariance term
  • -
  • Rewriting the covariance term
  • -
  • Introducing the correlation function
  • -
  • Statistics, wrapping up from last week
  • -
  • Statistics, final expression
  • -
  • Statistics, effective number of correlations
  • -
  • Can we understand this? Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Correlation Time
  • -
  • Resampling methods: Blocking
  • -
  • Blocking Transformations
  • -
  • Blocking Transformations
  • -
  • Blocking Transformations, getting there
  • -
  • Blocking Transformations, final expressions
  • -
  • Example code form last week
  • -
  • Resampling analysis
  • +
  • Further remarks
  • +
  • Running many measurements
  • +
  • Adding more definitions
  • +
  • Further rewriting
  • +
  • The covariance term
  • +
  • Rewriting the covariance term
  • +
  • Introducing the correlation function
  • +
  • Resampling methods: Blocking
  • +
  • Why blocking?
  • +
  • Blocking Transformations
  • +
  • Blocking transformations
  • +
  • Blocking Transformations
  • +
  • Blocking Transformations, getting there
  • +
  • Blocking Transformations, final expressions
  • +
  • More on the blocking method
  • +
  • Example code form last week
  • +
  • Resampling analysis
  • +
  • Content
  • +
  • Optimization and profiling
  • +
  • More on optimization
  • +
  • Optimization and profiling
  • +
  • Optimization and debugging
  • +
  • Other hints
  • +
  • Vectorization and the basic idea behind parallel computing
  • +
  • A rough classification of hardware models
  • +
  • Shared memory and distributed memory
  • +
  • Different parallel programming paradigms
  • +
  • Different parallel programming paradigms
  • +
  • What is vectorization?
  • +
  • Number of elements that can acted upon
  • +
  • Number of elements that can acted upon, examples
  • +
  • Operation counts for scalar operation
  • +
  • Number of elements that can acted upon, examples
  • +
  • Number of operations when vectorized
  • +
  • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
  • +
  • Compiling with and without vectorization
  • +
  • Compiling with and without vectorization using clang
  • +
  • Automatic vectorization and vectorization inhibitors, criteria
  • +
  • Automatic vectorization and vectorization inhibitors, exit criteria
  • +
  • Automatic vectorization and vectorization inhibitors, straight-line code
  • +
  • Automatic vectorization and vectorization inhibitors, nested loops
  • +
  • Automatic vectorization and vectorization inhibitors, function calls
  • +
  • Automatic vectorization and vectorization inhibitors, data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, more data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, memory stride
  • +
  • Memory management
  • +
  • Memory and communication
  • +
  • Measuring performance
  • +
  • Problems with measuring time
  • +
  • Problems with cold start
  • +
  • Problems with smart compilers
  • +
  • Problems with interference
  • +
  • Problems with measuring performance
  • +
  • Thomas algorithm for tridiagonal linear algebra equations
  • +
  • Thomas algorithm, forward substitution
  • +
  • Thomas algorithm, backward substitution
  • +
  • Thomas algorithm and counting of operations (floating point and memory)
  • +
  • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
  • +
  • How do we define speedup? Simplest form
  • +
  • How do we define speedup? Correct baseline
  • +
  • Parallel speedup
  • +
  • Speedup and memory
  • +
  • Upper bounds on speedup
  • +
  • Amdahl's law
  • +
  • How much is parallelizable
  • +
  • Today's situation of parallel computing
  • +
  • Overhead present in parallel computing
  • +
  • Parallelizing a sequential algorithm
  • +
  • Strategies
  • +
  • How do I run MPI on a PC/Laptop? MPI
  • +
  • Can I do it on my own PC/laptop? OpenMP installation
  • +
  • Installing MPI
  • +
  • Installing MPI and using Qt
  • +
  • What is Message Passing Interface (MPI)?
  • +
  • Going Parallel with MPI
  • +
  • MPI is a library
  • +
  • Bindings to MPI routines
  • +
  • Communicator
  • +
  • Some of the most important MPI functions
  • +
  • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
  • +
  • The Fortran program
  • +
  • Note 1
  • +
  • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
  • +
  • Note 2
  • +
  • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
  • +
  • Note 3
  • +
  • Note 4
  • +
  • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
  • +
  • Dissection of trapezoidal rule with \( MPI\_reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Integrating with MPI
  • +
  • How do I use \( MPI\_reduce \)?
  • +
  • More on \( MPI\_Reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Dissection of trapezoidal rule
  • +
  • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
  • +
  • What is OpenMP
  • +
  • Getting started, things to remember
  • +
  • OpenMP syntax
  • +
  • Different OpenMP styles of parallelism
  • +
  • General code structure
  • +
  • Parallel region
  • +
  • Hello world, not again, please!
  • +
  • Hello world, yet another variant
  • +
  • Important OpenMP library routines
  • +
  • Private variables
  • +
  • Master region
  • +
  • Parallel for loop
  • +
  • Parallel computations and loops
  • +
  • Scheduling of loop computations
  • +
  • Example code for loop scheduling
  • +
  • Example code for loop scheduling, guided instead of dynamic
  • +
  • More on Parallel for loop
  • +
  • What can happen with this loop?
  • +
  • Inner product
  • +
  • Different threads do different tasks
  • +
  • Single execution
  • +
  • Coordination and synchronization
  • +
  • Data scope
  • +
  • Some remarks
  • +
  • Parallelizing nested for-loops
  • +
  • Nested parallelism
  • +
  • Parallel tasks
  • +
  • Common mistakes
  • +
  • Not all computations are simple
  • +
  • Not all computations are simple, competing threads
  • +
  • How to find the max value using OpenMP
  • +
  • Then deal with the race conditions
  • +
  • What can slow down OpenMP performance?
  • +
  • What can slow down OpenMP performance?
  • +
  • Find the max location for each thread
  • +
  • Combine the values from each thread
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"
  • @@ -191,25 +646,70 @@

     

     

     

    -

    Blocking Transformations

    +

    Optimization and profiling

    +
    +
    + + +

Till now we have not paid much attention to speed and to the optimization possibilities inherent in the various compilers. We have compiled and linked as

    + + +
    +
    +
    +
    +
    +
    c++  -c  mycode.cpp
c++  -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

Using the definition of the blocking transformation and the distributive property of the covariance, it is clear that since \( h = |i-j| \) we can define

$$
\begin{align}
\gamma_{k+1}(h) &= cov\left( ({X}_{k+1})_{i}, ({X}_{k+1})_{j} \right) \nonumber \\
&= \frac{1}{4}cov\left( ({X}_{k})_{2i-1} + ({X}_{k})_{2i}, ({X}_{k})_{2j-1} + ({X}_{k})_{2j} \right) \nonumber \\
&= \frac{1}{2}\gamma_{k}(2h) + \frac{1}{2}\gamma_k(2h+1) \qquad h = 0
\tag{6}\\
&= \frac{1}{4}\gamma_k(2h-1) + \frac{1}{2}\gamma_k(2h) + \frac{1}{4}\gamma_k(2h+1) \qquad \text{else}
\tag{7}
\end{align}
$$

The quantity \( \hat{X} \) is asymptotically uncorrelated by assumption, so \( \hat{X}_k \) is also asymptotically uncorrelated. Let us turn our attention to the variance of the sample mean \( V(\overline{X}) \).

For Fortran, replace c++ with for example gfortran or ifort. This is what we call a flat compiler option and it should be used while we develop the code. It normally produces a very large and slow code when translated to machine instructions. We use this option for debugging and for establishing the correct program output, because every operation is done precisely as the user specified it.

    +

    It is instructive to look up the compiler manual for further instructions by writing

    + + +
    +
    +
    +
    +
    +
    man c++
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    @@ -230,6 +730,13 @@

    Blocking Transformations

  • diff --git a/doc/pub/week9/html/._week9-bs025.html b/doc/pub/week9/html/._week9-bs025.html index 72e4f7f7..1121a3ae 100644 --- a/doc/pub/week9/html/._week9-bs025.html +++ b/doc/pub/week9/html/._week9-bs025.html @@ -47,6 +47,7 @@ None, 'and-why-do-we-use-such-methods'), ('Central limit theorem', 2, None, 'central-limit-theorem'), + ('Further remarks', 2, None, 'further-remarks'), ('Running many measurements', 2, None, @@ -62,62 +63,404 @@ 2, None, 'introducing-the-correlation-function'), - ('Statistics, wrapping up from last week', + ('Resampling methods: Blocking', 2, None, - 'statistics-wrapping-up-from-last-week'), - ('Statistics, final expression', + 'resampling-methods-blocking'), + ('Why blocking?', 2, None, 'why-blocking'), + ('Blocking Transformations', 2, None, 'blocking-transformations'), + ('Blocking transformations', 2, None, 'blocking-transformations'), + ('Blocking Transformations', 2, None, 'blocking-transformations'), + ('Blocking Transformations, getting there', 2, None, - 'statistics-final-expression'), - ('Statistics, effective number of correlations', + 'blocking-transformations-getting-there'), + ('Blocking Transformations, final expressions', 2, None, - 'statistics-effective-number-of-correlations'), - ('Can we understand this? Time Auto-correlation Function', + 'blocking-transformations-final-expressions'), + ('More on the blocking method', 2, None, - 'can-we-understand-this-time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'more-on-the-blocking-method'), + ('Example code form last week', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'example-code-form-last-week'), + ('Resampling analysis', 2, None, 'resampling-analysis'), + ('Content', 2, None, 'content'), + ('Optimization and profiling', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'optimization-and-profiling'), + ('More on optimization', 2, None, 'more-on-optimization'), + ('Optimization and profiling', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'optimization-and-profiling'), + ('Optimization and debugging', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'optimization-and-debugging'), + ('Other hints', 2, None, 'other-hints'), + ('Vectorization and the basic idea behind parallel computing', 2, None, - 'time-auto-correlation-function'), - ('Correlation Time', 2, None, 'correlation-time'), - ('Resampling methods: Blocking', + 'vectorization-and-the-basic-idea-behind-parallel-computing'), + ('A rough classification of hardware models', 2, None, - 'resampling-methods-blocking'), - ('Blocking Transformations', 2, None, 'blocking-transformations'), - ('Blocking Transformations', 2, None, 'blocking-transformations'), - ('Blocking Transformations, getting there', + 'a-rough-classification-of-hardware-models'), + ('Shared memory and distributed memory', 2, None, - 'blocking-transformations-getting-there'), - ('Blocking Transformations, final expressions', + 'shared-memory-and-distributed-memory'), + ('Different parallel programming paradigms', 2, None, - 'blocking-transformations-final-expressions'), - ('Example code form last week', + 'different-parallel-programming-paradigms'), + ('Different parallel programming paradigms', 2, None, - 'example-code-form-last-week'), - ('Resampling analysis', 2, None, 'resampling-analysis')]} + 'different-parallel-programming-paradigms'), + ('What is vectorization?', 2, None, 'what-is-vectorization'), + 
('Number of elements that can acted upon', + 2, + None, + 'number-of-elements-that-can-acted-upon'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Operation counts for scalar operation', + 2, + None, + 'operation-counts-for-scalar-operation'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Number of operations when vectorized', + 2, + None, + 'number-of-operations-when-vectorized'), + ('"A simple test case with and without ' + 'vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"', + 2, + None, + 'a-simple-test-case-with-and-without-vectorization-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program7-cpp'), + ('Compiling with and without vectorization', + 2, + None, + 'compiling-with-and-without-vectorization'), + ('Compiling with and without vectorization using clang', + 2, + None, + 'compiling-with-and-without-vectorization-using-clang'), + ('Automatic vectorization and vectorization inhibitors, criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-criteria'), + ('Automatic vectorization and vectorization inhibitors, exit ' + 'criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-exit-criteria'), + ('Automatic vectorization and vectorization inhibitors, ' + 'straight-line code', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-straight-line-code'), + ('Automatic vectorization and vectorization inhibitors, nested ' + 'loops', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-nested-loops'), + ('Automatic vectorization and vectorization inhibitors, function ' + 'calls', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-function-calls'), + ('Automatic vectorization and vectorization inhibitors, data ' + 'dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, more ' + 'data dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-more-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, memory ' + 'stride', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-memory-stride'), + ('Memory management', 2, None, 'memory-management'), + ('Memory and communication', 2, None, 'memory-and-communication'), + ('Measuring performance', 2, None, 'measuring-performance'), + ('Problems with measuring time', + 2, + None, + 'problems-with-measuring-time'), + ('Problems with cold start', 2, None, 'problems-with-cold-start'), + ('Problems with smart compilers', + 2, + None, + 'problems-with-smart-compilers'), + ('Problems with interference', + 2, + None, + 'problems-with-interference'), + ('Problems with measuring performance', + 2, + None, + 'problems-with-measuring-performance'), + ('Thomas algorithm for tridiagonal linear algebra equations', + 2, + None, + 'thomas-algorithm-for-tridiagonal-linear-algebra-equations'), + ('Thomas algorithm, forward substitution', + 2, + None, + 'thomas-algorithm-forward-substitution'), + ('Thomas algorithm, backward substitution', + 2, + None, + 'thomas-algorithm-backward-substitution'), + ('Thomas algorithm and counting of operations (floating point ' + 'and memory)', + 2, + 
None, + 'thomas-algorithm-and-counting-of-operations-floating-point-and-memory'), + ('"Example: Transpose of a ' + 'matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"', + 2, + None, + 'example-transpose-of-a-matrix-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program8-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program9-cpp'), + ('How do we define speedup? Simplest form', + 2, + None, + 'how-do-we-define-speedup-simplest-form'), + ('How do we define speedup? Correct baseline', + 2, + None, + 'how-do-we-define-speedup-correct-baseline'), + ('Parallel speedup', 2, None, 'parallel-speedup'), + ('Speedup and memory', 2, None, 'speedup-and-memory'), + ('Upper bounds on speedup', 2, None, 'upper-bounds-on-speedup'), + ("Amdahl's law", 2, None, 'amdahl-s-law'), + ('How much is parallelizable', + 2, + None, + 'how-much-is-parallelizable'), + ("Today's situation of parallel computing", + 2, + None, + 'today-s-situation-of-parallel-computing'), + ('Overhead present in parallel computing', + 2, + None, + 'overhead-present-in-parallel-computing'), + ('Parallelizing a sequential algorithm', + 2, + None, + 'parallelizing-a-sequential-algorithm'), + ('Strategies', 2, None, 'strategies'), + ('How do I run MPI on a PC/Laptop? MPI', + 2, + None, + 'how-do-i-run-mpi-on-a-pc-laptop-mpi'), + ('Can I do it on my own PC/laptop? 
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -157,29 +500,141 @@
  • Statistical analysis
  • And why do we use such methods?
  • Central limit theorem
  • -
  • Running many measurements
  • -
  • Adding more definitions
  • -
  • Further rewriting
  • -
  • The covariance term
  • -
  • Rewriting the covariance term
  • -
  • Introducing the correlation function
  • -
  • Statistics, wrapping up from last week
  • -
  • Statistics, final expression
  • -
  • Statistics, effective number of correlations
  • -
  • Can we understand this? Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Correlation Time
  • -
  • Resampling methods: Blocking
  • -
  • Blocking Transformations
  • -
  • Blocking Transformations
  • -
  • Blocking Transformations, getting there
  • -
  • Blocking Transformations, final expressions
  • -
  • Example code form last week
  • -
  • Resampling analysis
  • +
  • Further remarks
  • +
  • Running many measurements
  • +
  • Adding more definitions
  • +
  • Further rewriting
  • +
  • The covariance term
  • +
  • Rewriting the covariance term
  • +
  • Introducing the correlation function
  • +
  • Resampling methods: Blocking
  • +
  • Why blocking?
  • +
  • Blocking Transformations
  • +
  • Blocking transformations
  • +
  • Blocking Transformations
  • +
  • Blocking Transformations, getting there
  • +
  • Blocking Transformations, final expressions
  • +
  • More on the blocking method
  • +
  • Example code form last week
  • +
  • Resampling analysis
  • +
  • Content
  • +
  • Optimization and profiling
  • +
  • More on optimization
  • +
  • Optimization and profiling
  • +
  • Optimization and debugging
  • +
  • Other hints
  • +
  • Vectorization and the basic idea behind parallel computing
  • +
  • A rough classification of hardware models
  • +
  • Shared memory and distributed memory
  • +
  • Different parallel programming paradigms
  • +
  • Different parallel programming paradigms
  • +
  • What is vectorization?
  • +
  • Number of elements that can acted upon
  • +
  • Number of elements that can acted upon, examples
  • +
  • Operation counts for scalar operation
  • +
  • Number of elements that can acted upon, examples
  • +
  • Number of operations when vectorized
  • +
  • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
  • +
  • Compiling with and without vectorization
  • +
  • Compiling with and without vectorization using clang
  • +
  • Automatic vectorization and vectorization inhibitors, criteria
  • +
  • Automatic vectorization and vectorization inhibitors, exit criteria
  • +
  • Automatic vectorization and vectorization inhibitors, straight-line code
  • +
  • Automatic vectorization and vectorization inhibitors, nested loops
  • +
  • Automatic vectorization and vectorization inhibitors, function calls
  • +
  • Automatic vectorization and vectorization inhibitors, data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, more data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, memory stride
  • +
  • Memory management
  • +
  • Memory and communication
  • +
  • Measuring performance
  • +
  • Problems with measuring time
  • +
  • Problems with cold start
  • +
  • Problems with smart compilers
  • +
  • Problems with interference
  • +
  • Problems with measuring performance
  • +
  • Thomas algorithm for tridiagonal linear algebra equations
  • +
  • Thomas algorithm, forward substitution
  • +
  • Thomas algorithm, backward substitution
  • +
  • Thomas algorithm and counting of operations (floating point and memory)
  • +
  • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
  • +
  • How do we define speedup? Simplest form
  • +
  • How do we define speedup? Correct baseline
  • +
  • Parallel speedup
  • +
  • Speedup and memory
  • +
  • Upper bounds on speedup
  • +
  • Amdahl's law
  • +
  • How much is parallelizable
  • +
  • Today's situation of parallel computing
  • +
  • Overhead present in parallel computing
  • +
  • Parallelizing a sequential algorithm
  • +
  • Strategies
  • +
  • How do I run MPI on a PC/Laptop? MPI
  • +
  • Can I do it on my own PC/laptop? OpenMP installation
  • +
  • Installing MPI
  • +
  • Installing MPI and using Qt
  • +
  • What is Message Passing Interface (MPI)?
  • +
  • Going Parallel with MPI
  • +
  • MPI is a library
  • +
  • Bindings to MPI routines
  • +
  • Communicator
  • +
  • Some of the most important MPI functions
  • +
  • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
  • +
  • The Fortran program
  • +
  • Note 1
  • +
  • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
  • +
  • Note 2
  • +
  • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
  • +
  • Note 3
  • +
  • Note 4
  • +
  • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
  • +
  • Dissection of trapezoidal rule with \( MPI\_reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Integrating with MPI
  • +
  • How do I use \( MPI\_reduce \)?
  • +
  • More on \( MPI\_Reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Dissection of trapezoidal rule
  • +
  • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
  • +
  • What is OpenMP
  • +
  • Getting started, things to remember
  • +
  • OpenMP syntax
  • +
  • Different OpenMP styles of parallelism
  • +
  • General code structure
  • +
  • Parallel region
  • +
  • Hello world, not again, please!
  • +
  • Hello world, yet another variant
  • +
  • Important OpenMP library routines
  • +
  • Private variables
  • +
  • Master region
  • +
  • Parallel for loop
  • +
  • Parallel computations and loops
  • +
  • Scheduling of loop computations
  • +
  • Example code for loop scheduling
  • +
  • Example code for loop scheduling, guided instead of dynamic
  • +
  • More on Parallel for loop
  • +
  • What can happen with this loop?
  • +
  • Inner product
  • +
  • Different threads do different tasks
  • +
  • Single execution
  • +
  • Coordination and synchronization
  • +
  • Data scope
  • +
  • Some remarks
  • +
  • Parallelizing nested for-loops
  • +
  • Nested parallelism
  • +
  • Parallel tasks
  • +
  • Common mistakes
  • +
  • Not all computations are simple
  • +
  • Not all computations are simple, competing threads
  • +
  • How to find the max value using OpenMP
  • +
  • Then deal with the race conditions
  • +
  • What can slow down OpenMP performance?
  • +
  • What can slow down OpenMP performance?
  • +
  • Find the max location for each thread
  • +
  • Combine the values from each thread
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"
  • @@ -191,24 +646,42 @@

     

     

     

    -

    Blocking Transformations, getting there

    -

    We have

$$
\begin{align}
V(\overline{X}_k) = \frac{\sigma_k^2}{n_k} + \underbrace{\frac{2}{n_k} \sum_{h=1}^{n_k-1}\left( 1 - \frac{h}{n_k} \right)\gamma_k(h)}_{\equiv e_k} = \frac{\sigma^2_k}{n_k} + e_k \quad \text{if} \quad \gamma_k(0) = \sigma_k^2.
\tag{8}
\end{align}
$$

    More on optimization

    +
    +
    + +

We have additional compiler options for optimization. These may include procedure inlining, where performance may be improved, moving constants inside loops outside the loop, identifying potential parallelism, automatic vectorization, or replacing a division by a reciprocal and a multiplication if this speeds up the code.
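To illustrate the last point, the sketch below shows by hand the rewrite a compiler with relaxed floating-point semantics (for instance with -Ofast or -ffast-math; plain -O3 will typically not do it, since the rounding changes slightly) may apply to a division by a loop-invariant constant. The function and variable names are our own illustration, not taken from the course programs.

#include <cstddef>
#include <vector>

// Naive version: one floating-point division per iteration.
void scale_naive(std::vector<double>& a, const std::vector<double>& b, double c) {
  for (std::size_t i = 0; i < a.size(); ++i)
    a[i] = b[i] / c;
}

// Rewritten version: hoist the division out of the loop and multiply by the
// reciprocal instead; a division is typically several times more expensive
// than a multiplication.
void scale_reciprocal(std::vector<double>& a, const std::vector<double>& b, double c) {
  const double cinv = 1.0 / c;
  for (std::size_t i = 0; i < a.size(); ++i)
    a[i] = b[i] * cinv;
}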

    -

    The term \( e_k \) is called the truncation error:

$$
\begin{equation}
e_k = \frac{2}{n_k} \sum_{h=1}^{n_k-1}\left( 1 - \frac{h}{n_k} \right)\gamma_k(h).
\tag{9}
\end{equation}
$$
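In practice Eqs. (8) and (9) are used by computing the naive estimate \( \sigma_k^2/n_k \) at every blocking level and reading off the plateau where further blocking no longer changes it, that is where \( e_k \) has become negligible. The following is a minimal C++ sketch of that loop; it is our own illustration with arbitrary function names (the course code automates the choice of plateau).

#include <cstddef>
#include <numeric>
#include <vector>

// Mean and (biased) variance of the samples at the current blocking level.
double mean(const std::vector<double>& x) {
  return std::accumulate(x.begin(), x.end(), 0.0) / x.size();
}

double variance(const std::vector<double>& x) {
  const double m = mean(x);
  double s = 0.0;
  for (double v : x) s += (v - m) * (v - m);
  return s / x.size();
}

// Return sigma_k^2 / n_k for k = 0, 1, ...; the estimate of V(mean) is read
// off where these values level out (the plateau).
std::vector<double> blocking_variances(std::vector<double> x) {
  std::vector<double> estimates;
  while (x.size() >= 2) {
    estimates.push_back(variance(x) / x.size());
    std::vector<double> xnew(x.size() / 2);
    for (std::size_t k = 0; k < xnew.size(); ++k)
      xnew[k] = 0.5 * (x[2 * k] + x[2 * k + 1]);   // one blocking transformation
    x = xnew;
  }
  return estimates;
}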
    +
    +
    +
    +
    +
    c++  -O3 -c  mycode.cpp
c++  -O3 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    We can show that \( V(\overline{X}_i) = V(\overline{X}_j) \) for all \( 0 \leq i \leq d-1 \) and \( 0 \leq j \leq d-1 \).

    +

    This (other options are -O2 or -Ofast) is the recommended option.

    +
    +

    @@ -228,6 +701,14 @@

    Blocking Transfor
  • diff --git a/doc/pub/week9/html/._week9-bs026.html b/doc/pub/week9/html/._week9-bs026.html index ba2f41f2..bed0c48d 100644 --- a/doc/pub/week9/html/._week9-bs026.html +++ b/doc/pub/week9/html/._week9-bs026.html @@ -47,6 +47,7 @@ None, 'and-why-do-we-use-such-methods'), ('Central limit theorem', 2, None, 'central-limit-theorem'), + ('Further remarks', 2, None, 'further-remarks'), ('Running many measurements', 2, None, @@ -62,62 +63,404 @@ 2, None, 'introducing-the-correlation-function'), - ('Statistics, wrapping up from last week', + ('Resampling methods: Blocking', 2, None, - 'statistics-wrapping-up-from-last-week'), - ('Statistics, final expression', + 'resampling-methods-blocking'), + ('Why blocking?', 2, None, 'why-blocking'), + ('Blocking Transformations', 2, None, 'blocking-transformations'), + ('Blocking transformations', 2, None, 'blocking-transformations'), + ('Blocking Transformations', 2, None, 'blocking-transformations'), + ('Blocking Transformations, getting there', 2, None, - 'statistics-final-expression'), - ('Statistics, effective number of correlations', + 'blocking-transformations-getting-there'), + ('Blocking Transformations, final expressions', 2, None, - 'statistics-effective-number-of-correlations'), - ('Can we understand this? Time Auto-correlation Function', + 'blocking-transformations-final-expressions'), + ('More on the blocking method', 2, None, - 'can-we-understand-this-time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'more-on-the-blocking-method'), + ('Example code form last week', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'example-code-form-last-week'), + ('Resampling analysis', 2, None, 'resampling-analysis'), + ('Content', 2, None, 'content'), + ('Optimization and profiling', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'optimization-and-profiling'), + ('More on optimization', 2, None, 'more-on-optimization'), + ('Optimization and profiling', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'optimization-and-profiling'), + ('Optimization and debugging', 2, None, - 'time-auto-correlation-function'), - ('Time Auto-correlation Function', + 'optimization-and-debugging'), + ('Other hints', 2, None, 'other-hints'), + ('Vectorization and the basic idea behind parallel computing', 2, None, - 'time-auto-correlation-function'), - ('Correlation Time', 2, None, 'correlation-time'), - ('Resampling methods: Blocking', + 'vectorization-and-the-basic-idea-behind-parallel-computing'), + ('A rough classification of hardware models', 2, None, - 'resampling-methods-blocking'), - ('Blocking Transformations', 2, None, 'blocking-transformations'), - ('Blocking Transformations', 2, None, 'blocking-transformations'), - ('Blocking Transformations, getting there', + 'a-rough-classification-of-hardware-models'), + ('Shared memory and distributed memory', 2, None, - 'blocking-transformations-getting-there'), - ('Blocking Transformations, final expressions', + 'shared-memory-and-distributed-memory'), + ('Different parallel programming paradigms', 2, None, - 'blocking-transformations-final-expressions'), - ('Example code form last week', + 'different-parallel-programming-paradigms'), + ('Different parallel programming paradigms', 2, None, - 'example-code-form-last-week'), - ('Resampling analysis', 2, None, 'resampling-analysis')]} + 'different-parallel-programming-paradigms'), + ('What is vectorization?', 2, None, 'what-is-vectorization'), + 
('Number of elements that can acted upon', + 2, + None, + 'number-of-elements-that-can-acted-upon'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Operation counts for scalar operation', + 2, + None, + 'operation-counts-for-scalar-operation'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Number of operations when vectorized', + 2, + None, + 'number-of-operations-when-vectorized'), + ('"A simple test case with and without ' + 'vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"', + 2, + None, + 'a-simple-test-case-with-and-without-vectorization-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program7-cpp'), + ('Compiling with and without vectorization', + 2, + None, + 'compiling-with-and-without-vectorization'), + ('Compiling with and without vectorization using clang', + 2, + None, + 'compiling-with-and-without-vectorization-using-clang'), + ('Automatic vectorization and vectorization inhibitors, criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-criteria'), + ('Automatic vectorization and vectorization inhibitors, exit ' + 'criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-exit-criteria'), + ('Automatic vectorization and vectorization inhibitors, ' + 'straight-line code', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-straight-line-code'), + ('Automatic vectorization and vectorization inhibitors, nested ' + 'loops', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-nested-loops'), + ('Automatic vectorization and vectorization inhibitors, function ' + 'calls', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-function-calls'), + ('Automatic vectorization and vectorization inhibitors, data ' + 'dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, more ' + 'data dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-more-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, memory ' + 'stride', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-memory-stride'), + ('Memory management', 2, None, 'memory-management'), + ('Memory and communication', 2, None, 'memory-and-communication'), + ('Measuring performance', 2, None, 'measuring-performance'), + ('Problems with measuring time', + 2, + None, + 'problems-with-measuring-time'), + ('Problems with cold start', 2, None, 'problems-with-cold-start'), + ('Problems with smart compilers', + 2, + None, + 'problems-with-smart-compilers'), + ('Problems with interference', + 2, + None, + 'problems-with-interference'), + ('Problems with measuring performance', + 2, + None, + 'problems-with-measuring-performance'), + ('Thomas algorithm for tridiagonal linear algebra equations', + 2, + None, + 'thomas-algorithm-for-tridiagonal-linear-algebra-equations'), + ('Thomas algorithm, forward substitution', + 2, + None, + 'thomas-algorithm-forward-substitution'), + ('Thomas algorithm, backward substitution', + 2, + None, + 'thomas-algorithm-backward-substitution'), + ('Thomas algorithm and counting of operations (floating point ' + 'and memory)', + 2, + 
None, + 'thomas-algorithm-and-counting-of-operations-floating-point-and-memory'), + ('"Example: Transpose of a ' + 'matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"', + 2, + None, + 'example-transpose-of-a-matrix-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program8-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program9-cpp'), + ('How do we define speedup? Simplest form', + 2, + None, + 'how-do-we-define-speedup-simplest-form'), + ('How do we define speedup? Correct baseline', + 2, + None, + 'how-do-we-define-speedup-correct-baseline'), + ('Parallel speedup', 2, None, 'parallel-speedup'), + ('Speedup and memory', 2, None, 'speedup-and-memory'), + ('Upper bounds on speedup', 2, None, 'upper-bounds-on-speedup'), + ("Amdahl's law", 2, None, 'amdahl-s-law'), + ('How much is parallelizable', + 2, + None, + 'how-much-is-parallelizable'), + ("Today's situation of parallel computing", + 2, + None, + 'today-s-situation-of-parallel-computing'), + ('Overhead present in parallel computing', + 2, + None, + 'overhead-present-in-parallel-computing'), + ('Parallelizing a sequential algorithm', + 2, + None, + 'parallelizing-a-sequential-algorithm'), + ('Strategies', 2, None, 'strategies'), + ('How do I run MPI on a PC/Laptop? MPI', + 2, + None, + 'how-do-i-run-mpi-on-a-pc-laptop-mpi'), + ('Can I do it on my own PC/laptop? 
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -157,29 +500,141 @@

     

     

     

-Blocking Transformations, final expressions
+Optimization and profiling

+It is also useful to profile your program during the development stage.
+You would then compile with

-We can then wrap up
-$$
-\begin{align}
-n_{j+1} \overline{X}_{j+1} &= \sum_{i=1}^{n_{j+1}} (\hat{X}_{j+1})_i = \frac{1}{2}\sum_{i=1}^{n_{j}/2} \left[ (\hat{X}_{j})_{2i-1} + (\hat{X}_{j})_{2i} \right] \nonumber \\
-&= \frac{1}{2}\left[ (\hat{X}_j)_1 + (\hat{X}_j)_2 + \cdots + (\hat{X}_j)_{n_j} \right] = \underbrace{\frac{n_j}{2}}_{=n_{j+1}} \overline{X}_j = n_{j+1}\overline{X}_j.
-\tag{10}
-\end{align}
-$$

+c++  -pg -O3 -c  mycode.cpp
+c++  -pg -O3 -o  mycode.exe  mycode.o

+After you have run the code you can obtain the profiling information via

-By repeated use of this equation we get \( V(\overline{X}_i) = V(\overline{X}_0) = V(\overline{X}) \) for all \( 0 \leq i \leq d-1 \). This has the consequence that
-$$
-\begin{align}
-V(\overline{X}) = \frac{\sigma_k^2}{n_k} + e_k \qquad \text{for all} \qquad 0 \leq k \leq d-1. \tag{11}
-\end{align}
-$$

+gprof mycode.exe >  ProfileOutput

-Flyvbjerg and Petersen demonstrated that the sequence \( \{e_k\}_{k=0}^{d-1} \) is decreasing, and conjectured that the term \( e_k \) can be made as small as we would like by making \( k \) (and hence \( d \)) sufficiently large. The sequence is decreasing (see the Master of Science thesis by Marius Jonsson, UiO 2018). This means we can apply blocking transformations until \( e_k \) is sufficiently small, and then estimate \( V(\overline{X}) \) by \( \widehat{\sigma}^2_k/n_k \).

+When you have profiled your code properly, you must remove this option again, since it slows down performance.
+For memory tests, use valgrind. An excellent environment for all these aspects, and much more, is Qt Creator.

-For an elegant solution and proof of the blocking method, see the recent article by Marius Jonsson (former MSc student of the Computational Physics group).
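To make the profiling workflow above concrete, here is a small, self-contained C++ sketch; the file name mycode.cpp and the function heavy_sum are illustrative assumptions, not part of the lecture material. Compiling it with the -pg flags shown above, running the executable once, and then calling gprof on it should attribute most of the runtime to heavy_sum.

// mycode.cpp -- hypothetical example used only to illustrate the -pg/gprof workflow
// Build:  c++ -pg -O3 -c mycode.cpp ; c++ -pg -O3 -o mycode.exe mycode.o
// Run:    ./mycode.exe          (writes gmon.out in the working directory)
// Then:   gprof mycode.exe > ProfileOutput
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Deliberately expensive routine; the profile should show most of the time spent here.
double heavy_sum(const std::vector<double>& x) {
  double s = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    s += std::sin(x[i]) * std::cos(x[i]);
  }
  return s;
}

int main() {
  const std::size_t n = 10000000;
  std::vector<double> x(n);
  for (std::size_t i = 0; i < n; ++i) {
    x[i] = 1.0e-6 * static_cast<double>(i);
  }
  double total = 0.0;
  // Call the hot routine a few times so the profile has something to report.
  for (int rep = 0; rep < 10; ++rep) {
    total += heavy_sum(x);
  }
  std::cout << "total = " << total << std::endl;
  return 0;
}

The flat profile in ProfileOutput lists, per function, the fraction of total runtime and the number of calls, which is usually what one needs in order to decide where optimization effort pays off.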

diff --git a/doc/pub/week9/html/._week9-bs027.html b/doc/pub/week9/html/._week9-bs027.html index 9533d45f..19c3cbe9 100644 --- a/doc/pub/week9/html/._week9-bs027.html +++ b/doc/pub/week9/html/._week9-bs027.html

     

     

     

-Example code from last week
+Optimization and debugging

+Adding debugging options is a very useful alternative during the development stage of a program.
+You would then compile with
    -
    # 2-electron VMC code for 2dim quantum dot with importance sampling
    -# Using gaussian rng for new positions and Metropolis- Hastings 
    -# Added energy minimization
    -from math import exp, sqrt
    -from random import random, seed, normalvariate
    -import numpy as np
    -import matplotlib.pyplot as plt
    -from mpl_toolkits.mplot3d import Axes3D
    -from matplotlib import cm
    -from matplotlib.ticker import LinearLocator, FormatStrFormatter
    -from scipy.optimize import minimize
    -import sys
    -import os
    -
    -# Where to save data files
    -PROJECT_ROOT_DIR = "Results"
    -DATA_ID = "Results/EnergyMin"
    -
    -if not os.path.exists(PROJECT_ROOT_DIR):
    -    os.mkdir(PROJECT_ROOT_DIR)
    -
    -if not os.path.exists(DATA_ID):
    -    os.makedirs(DATA_ID)
    -
    -def data_path(dat_id):
    -    return os.path.join(DATA_ID, dat_id)
    -
    -outfile = open(data_path("Energies.dat"),'w')
    -
    -
    -# Trial wave function for the 2-electron quantum dot in two dims
    -def WaveFunction(r,alpha,beta):
    -    r1 = r[0,0]**2 + r[0,1]**2
    -    r2 = r[1,0]**2 + r[1,1]**2
    -    r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
    -    deno = r12/(1+beta*r12)
    -    return exp(-0.5*alpha*(r1+r2)+deno)
    -
    -# Local energy  for the 2-electron quantum dot in two dims, using analytical local energy
    -def LocalEnergy(r,alpha,beta):
    -    
    -    r1 = (r[0,0]**2 + r[0,1]**2)
    -    r2 = (r[1,0]**2 + r[1,1]**2)
    -    r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
    -    deno = 1.0/(1+beta*r12)
    -    deno2 = deno*deno
    -    return 0.5*(1-alpha*alpha)*(r1 + r2) +2.0*alpha + 1.0/r12+deno2*(alpha*r12-deno2+2*beta*deno-1.0/r12)
    -
    -# Derivate of wave function ansatz as function of variational parameters
    -def DerivativeWFansatz(r,alpha,beta):
    -    
    -    WfDer  = np.zeros((2), np.double)
    -    r1 = (r[0,0]**2 + r[0,1]**2)
    -    r2 = (r[1,0]**2 + r[1,1]**2)
    -    r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
    -    deno = 1.0/(1+beta*r12)
    -    deno2 = deno*deno
    -    WfDer[0] = -0.5*(r1+r2)
    -    WfDer[1] = -r12*r12*deno2
    -    return  WfDer
    -
    -# Setting up the quantum force for the two-electron quantum dot, recall that it is a vector
    -def QuantumForce(r,alpha,beta):
    -
    -    qforce = np.zeros((NumberParticles,Dimension), np.double)
    -    r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
    -    deno = 1.0/(1+beta*r12)
    -    qforce[0,:] = -2*r[0,:]*alpha*(r[0,:]-r[1,:])*deno*deno/r12
    -    qforce[1,:] = -2*r[1,:]*alpha*(r[1,:]-r[0,:])*deno*deno/r12
    -    return qforce
    -    
    -
    -# Computing the derivative of the energy and the energy 
    -def EnergyDerivative(x0):
    -
    -    
    -    # Parameters in the Fokker-Planck simulation of the quantum force
    -    D = 0.5
    -    TimeStep = 0.05
    -    # positions
    -    PositionOld = np.zeros((NumberParticles,Dimension), np.double)
    -    PositionNew = np.zeros((NumberParticles,Dimension), np.double)
    -    # Quantum force
    -    QuantumForceOld = np.zeros((NumberParticles,Dimension), np.double)
    -    QuantumForceNew = np.zeros((NumberParticles,Dimension), np.double)
    -
    -    energy = 0.0
    -    DeltaE = 0.0
    -    alpha = x0[0]
    -    beta = x0[1]
    -    EnergyDer = 0.0
    -    DeltaPsi = 0.0
    -    DerivativePsiE = 0.0 
    -    #Initial position
    -    for i in range(NumberParticles):
    -        for j in range(Dimension):
    -            PositionOld[i,j] = normalvariate(0.0,1.0)*sqrt(TimeStep)
    -    wfold = WaveFunction(PositionOld,alpha,beta)
    -    QuantumForceOld = QuantumForce(PositionOld,alpha, beta)
    -
    -    #Loop over MC MCcycles
    -    for MCcycle in range(NumberMCcycles):
    -        #Trial position moving one particle at the time
    -        for i in range(NumberParticles):
    -            for j in range(Dimension):
    -                PositionNew[i,j] = PositionOld[i,j]+normalvariate(0.0,1.0)*sqrt(TimeStep)+\
    -                                       QuantumForceOld[i,j]*TimeStep*D
    -            wfnew = WaveFunction(PositionNew,alpha,beta)
    -            QuantumForceNew = QuantumForce(PositionNew,alpha, beta)
    -            GreensFunction = 0.0
    -            for j in range(Dimension):
    -                GreensFunction += 0.5*(QuantumForceOld[i,j]+QuantumForceNew[i,j])*\
    -	                              (D*TimeStep*0.5*(QuantumForceOld[i,j]-QuantumForceNew[i,j])-\
    -                                      PositionNew[i,j]+PositionOld[i,j])
    -      
    -            GreensFunction = exp(GreensFunction)
    -            ProbabilityRatio = GreensFunction*wfnew**2/wfold**2
    -            #Metropolis-Hastings test to see whether we accept the move
    -            if random() <= ProbabilityRatio:
    -                for j in range(Dimension):
    -                    PositionOld[i,j] = PositionNew[i,j]
    -                    QuantumForceOld[i,j] = QuantumForceNew[i,j]
    -                wfold = wfnew
    -        DeltaE = LocalEnergy(PositionOld,alpha,beta)
    -        DerPsi = DerivativeWFansatz(PositionOld,alpha,beta)
    -        DeltaPsi += DerPsi
    -        energy += DeltaE
    -        DerivativePsiE += DerPsi*DeltaE
    -            
    -    # We calculate mean values
    -    energy /= NumberMCcycles
    -    DerivativePsiE /= NumberMCcycles
    -    DeltaPsi /= NumberMCcycles
    -    EnergyDer  = 2*(DerivativePsiE-DeltaPsi*energy)
    -    return EnergyDer
    -
    -
    -# Computing the expectation value of the local energy 
    -def Energy(x0):
    -    # Parameters in the Fokker-Planck simulation of the quantum force
    -    D = 0.5
    -    TimeStep = 0.05
    -    # positions
    -    PositionOld = np.zeros((NumberParticles,Dimension), np.double)
    -    PositionNew = np.zeros((NumberParticles,Dimension), np.double)
    -    # Quantum force
    -    QuantumForceOld = np.zeros((NumberParticles,Dimension), np.double)
    -    QuantumForceNew = np.zeros((NumberParticles,Dimension), np.double)
    -
    -    energy = 0.0
    -    DeltaE = 0.0
    -    alpha = x0[0]
    -    beta = x0[1]
    -    #Initial position
    -    for i in range(NumberParticles):
    -        for j in range(Dimension):
    -            PositionOld[i,j] = normalvariate(0.0,1.0)*sqrt(TimeStep)
    -    wfold = WaveFunction(PositionOld,alpha,beta)
    -    QuantumForceOld = QuantumForce(PositionOld,alpha, beta)
    -
    -    #Loop over MC MCcycles
    -    for MCcycle in range(NumberMCcycles):
    -        #Trial position moving one particle at the time
    -        for i in range(NumberParticles):
    -            for j in range(Dimension):
    -                PositionNew[i,j] = PositionOld[i,j]+normalvariate(0.0,1.0)*sqrt(TimeStep)+\
    -                                       QuantumForceOld[i,j]*TimeStep*D
    -            wfnew = WaveFunction(PositionNew,alpha,beta)
    -            QuantumForceNew = QuantumForce(PositionNew,alpha, beta)
    -            GreensFunction = 0.0
    -            for j in range(Dimension):
    -                GreensFunction += 0.5*(QuantumForceOld[i,j]+QuantumForceNew[i,j])*\
    -	                              (D*TimeStep*0.5*(QuantumForceOld[i,j]-QuantumForceNew[i,j])-\
    -                                      PositionNew[i,j]+PositionOld[i,j])
    -      
    -            GreensFunction = exp(GreensFunction)
    -            ProbabilityRatio = GreensFunction*wfnew**2/wfold**2
    -            #Metropolis-Hastings test to see whether we accept the move
    -            if random() <= ProbabilityRatio:
    -                for j in range(Dimension):
    -                    PositionOld[i,j] = PositionNew[i,j]
    -                    QuantumForceOld[i,j] = QuantumForceNew[i,j]
    -                wfold = wfnew
    -        DeltaE = LocalEnergy(PositionOld,alpha,beta)
    -        energy += DeltaE
    -        if Printout: 
    -           outfile.write('%f\n' %(energy/(MCcycle+1.0)))            
    -    # We calculate mean values
    -    energy /= NumberMCcycles
    -    return energy
    -
    -#Here starts the main program with variable declarations
    -NumberParticles = 2
    -Dimension = 2
    -# seed for rng generator 
    -seed()
    -# Monte Carlo cycles for parameter optimization
    -Printout = False
    -NumberMCcycles= 10000
    -# guess for variational parameters
    -x0 = np.array([0.9,0.2])
    -# Using Broydens method to find optimal parameters
    -res = minimize(Energy, x0, method='BFGS', jac=EnergyDerivative, options={'gtol': 1e-4,'disp': True})
    -x0 = res.x
    -# Compute the energy again with the optimal parameters and increased number of Monte Cycles
    -NumberMCcycles= 2**19
    -Printout = True
    -FinalEnergy = Energy(x0)
    -EResult = np.array([FinalEnergy,FinalEnergy])
    -outfile.close()
    -#nice printout with Pandas
    -import pandas as pd
    -from pandas import DataFrame
    -data ={'Optimal Parameters':x0, 'Final Energy':EResult}
    -frame = pd.DataFrame(data)
    -print(frame)
+c++  -g -O0 -c  mycode.cpp
+c++  -g -O0 -o  mycode.exe  mycode.o
     
    @@ -430,6 +677,19 @@

Example code from last week

+This option generates debugging information, allowing you to trace, for example, whether an array is properly allocated. Some compilers work best with the no-optimization option -O0.
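As a hypothetical illustration of what this debugging information is good for (the file name buggy.cpp and the off-by-one bug below are made up for this sketch): when such a program is built with -g -O0, a memory checker such as valgrind, mentioned earlier for memory tests, can typically report the invalid write together with the source file and line number.

// buggy.cpp -- hypothetical sketch; illustrates why -g (and -O0) help when hunting memory errors
// Build:  c++ -g -O0 -o buggy.exe buggy.cpp
// Check:  valgrind ./buggy.exe
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<double> a(10, 0.0);
  // Off-by-one error: the last valid index is a.size()-1 = 9,
  // so the final iteration writes one element past the end of the allocation.
  for (std::size_t i = 0; i <= a.size(); ++i) {
    a[i] = static_cast<double>(i);
  }
  std::cout << a[5] << std::endl;
  return 0;
}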

+Depending on the compiler, one can add flags which generate code that catches integer overflow errors.
+The flag -ftrapv does this for the Clang compiler on OS X operating systems.
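A minimal, hypothetical sketch of the kind of error -ftrapv is meant to catch (the file name overflow.cpp is an assumption): the signed multiplication below overflows a 32-bit int, which is undefined behaviour, and a binary built with clang++ -ftrapv is expected to abort at the overflowing operation instead of quietly continuing with a wrapped-around value.

// overflow.cpp -- hypothetical sketch illustrating the effect of -ftrapv
// Without trapping:  clang++ -O2 -o overflow.exe overflow.cpp
// With trapping:     clang++ -O2 -ftrapv -o overflow.exe overflow.cpp
#include <cstdlib>
#include <iostream>

int main(int argc, char* argv[]) {
  // Read n from the command line so the compiler cannot fold the product away at compile time.
  int n = (argc > 1) ? std::atoi(argv[1]) : 100000;
  int n2 = n * n;  // 100000*100000 = 10^10 overflows a 32-bit signed int
  // With -ftrapv the multiplication above is expected to trap (abort) at run time;
  // without it the program silently prints a meaningless, wrapped-around number.
  std::cout << "n*n = " << n2 << std::endl;
  return 0;
}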

diff --git a/doc/pub/week9/html/._week9-bs028.html b/doc/pub/week9/html/._week9-bs028.html index ff6b5365..c6698375 100644 --- a/doc/pub/week9/html/._week9-bs028.html +++ b/doc/pub/week9/html/._week9-bs028.html
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -157,29 +500,141 @@
  • Statistical analysis
  • And why do we use such methods?
  • Central limit theorem
  • -
  • Running many measurements
  • -
  • Adding more definitions
  • -
  • Further rewriting
  • -
  • The covariance term
  • -
  • Rewriting the covariance term
  • -
  • Introducing the correlation function
  • -
  • Statistics, wrapping up from last week
  • -
  • Statistics, final expression
  • -
  • Statistics, effective number of correlations
  • -
  • Can we understand this? Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Time Auto-correlation Function
  • -
  • Correlation Time
  • -
  • Resampling methods: Blocking
  • -
  • Blocking Transformations
  • -
  • Blocking Transformations
  • -
  • Blocking Transformations, getting there
  • -
  • Blocking Transformations, final expressions
  • -
  • Example code form last week
  • -
  • Resampling analysis
  • +
  • Further remarks
  • +
  • Running many measurements
  • +
  • Adding more definitions
  • +
  • Further rewriting
  • +
  • The covariance term
  • +
  • Rewriting the covariance term
  • +
  • Introducing the correlation function
  • +
  • Resampling methods: Blocking
  • +
  • Why blocking?
  • +
  • Blocking Transformations
  • +
  • Blocking transformations
  • +
  • Blocking Transformations
  • +
  • Blocking Transformations, getting there
  • +
  • Blocking Transformations, final expressions
  • +
  • More on the blocking method
  • +
  • Example code form last week
  • +
  • Resampling analysis
  • +
  • Content
  • +
  • Optimization and profiling
  • +
  • More on optimization
  • +
  • Optimization and profiling
  • +
  • Optimization and debugging
  • +
  • Other hints
  • +
  • Vectorization and the basic idea behind parallel computing
  • +
  • A rough classification of hardware models
  • +
  • Shared memory and distributed memory
  • +
  • Different parallel programming paradigms
  • +
  • Different parallel programming paradigms
  • +
  • What is vectorization?
  • +
  • Number of elements that can acted upon
  • +
  • Number of elements that can acted upon, examples
  • +
  • Operation counts for scalar operation
  • +
  • Number of elements that can acted upon, examples
  • +
  • Number of operations when vectorized
  • +
  • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
  • +
  • Compiling with and without vectorization
  • +
  • Compiling with and without vectorization using clang
  • +
  • Automatic vectorization and vectorization inhibitors, criteria
  • +
  • Automatic vectorization and vectorization inhibitors, exit criteria
  • +
  • Automatic vectorization and vectorization inhibitors, straight-line code
  • +
  • Automatic vectorization and vectorization inhibitors, nested loops
  • +
  • Automatic vectorization and vectorization inhibitors, function calls
  • +
  • Automatic vectorization and vectorization inhibitors, data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, more data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, memory stride
  • +
  • Memory management
  • +
  • Memory and communication
  • +
  • Measuring performance
  • +
  • Problems with measuring time
  • +
  • Problems with cold start
  • +
  • Problems with smart compilers
  • +
  • Problems with interference
  • +
  • Problems with measuring performance
  • +
  • Thomas algorithm for tridiagonal linear algebra equations
  • +
  • Thomas algorithm, forward substitution
  • +
  • Thomas algorithm, backward substitution
  • +
  • Thomas algorithm and counting of operations (floating point and memory)
  • +
  • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
  • +
  • How do we define speedup? Simplest form
  • +
  • How do we define speedup? Correct baseline
  • +
  • Parallel speedup
  • +
  • Speedup and memory
  • +
  • Upper bounds on speedup
  • +
  • Amdahl's law
  • +
  • How much is parallelizable
  • +
  • Today's situation of parallel computing
  • +
  • Overhead present in parallel computing
  • +
  • Parallelizing a sequential algorithm
  • +
  • Strategies
  • +
  • How do I run MPI on a PC/Laptop? MPI
  • +
  • Can I do it on my own PC/laptop? OpenMP installation
  • +
  • Installing MPI
  • +
  • Installing MPI and using Qt
  • +
  • What is Message Passing Interface (MPI)?
  • +
  • Going Parallel with MPI
  • +
  • MPI is a library
  • +
  • Bindings to MPI routines
  • +
  • Communicator
  • +
  • Some of the most important MPI functions
  • +
  • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
  • +
  • The Fortran program
  • +
  • Note 1
  • +
  • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
  • +
  • Note 2
  • +
  • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
  • +
  • Note 3
  • +
  • Note 4
  • +
  • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
  • +
  • Dissection of trapezoidal rule with \( MPI\_reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Integrating with MPI
  • +
  • How do I use \( MPI\_reduce \)?
  • +
  • More on \( MPI\_Reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Dissection of trapezoidal rule
  • +
  • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
  • +
  • What is OpenMP
  • +
  • Getting started, things to remember
  • +
  • OpenMP syntax
  • +
  • Different OpenMP styles of parallelism
  • +
  • General code structure
  • +
  • Parallel region
  • +
  • Hello world, not again, please!
  • +
  • Hello world, yet another variant
  • +
  • Important OpenMP library routines
  • +
  • Private variables
  • +
  • Master region
  • +
  • Parallel for loop
  • +
  • Parallel computations and loops
  • +
  • Scheduling of loop computations
  • +
  • Example code for loop scheduling
  • +
  • Example code for loop scheduling, guided instead of dynamic
  • +
  • More on Parallel for loop
  • +
  • What can happen with this loop?
  • +
  • Inner product
  • +
  • Different threads do different tasks
  • +
  • Single execution
  • +
  • Coordination and synchronization
  • +
  • Data scope
  • +
  • Some remarks
  • +
  • Parallelizing nested for-loops
  • +
  • Nested parallelism
  • +
  • Parallel tasks
  • +
  • Common mistakes
  • +
  • Not all computations are simple
  • +
  • Not all computations are simple, competing threads
  • +
  • How to find the max value using OpenMP
  • +
  • Then deal with the race conditions
  • +
  • What can slow down OpenMP performance?
  • +
  • What can slow down OpenMP performance?
  • +
  • Find the max location for each thread
  • +
  • Combine the values from each thread
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"
  • @@ -191,75 +646,56 @@

     

     

     

    -

    Resampling analysis

    - -

    The next step is then to use the above data sets and perform a -resampling analysis using the blocking method -The blocking code, based on the article of Marius Jonsson is given here -

    - +

    Other hints

    +
    +
    + +

    In general, irrespective of compiler options, it is useful to

    +
      +
• avoid if tests or calls to functions inside loops, if possible.
    • +
    • avoid multiplication with constants inside loops if possible
    • +
    +

    Here is an example of a part of a program where specific operations lead to a slower code

    - +
    -
    # Common imports
    -import os
    -
    -# Where to save the figures and data files
    -DATA_ID = "Results/EnergyMin"
    -
    -def data_path(dat_id):
    -    return os.path.join(DATA_ID, dat_id)
    -
    -infile = open(data_path("Energies.dat"),'r')
    -
    -from numpy import log2, zeros, mean, var, sum, loadtxt, arange, array, cumsum, dot, transpose, diagonal, sqrt
    -from numpy.linalg import inv
    -
    -def block(x):
    -    # preliminaries
    -    n = len(x)
    -    d = int(log2(n))
    -    s, gamma = zeros(d), zeros(d)
    -    mu = mean(x)
    -
    -    # estimate the auto-covariance and variances 
    -    # for each blocking transformation
    -    for i in arange(0,d):
    -        n = len(x)
    -        # estimate autocovariance of x
    -        gamma[i] = (n)**(-1)*sum( (x[0:(n-1)]-mu)*(x[1:n]-mu) )
    -        # estimate variance of x
    -        s[i] = var(x)
    -        # perform blocking transformation
    -        x = 0.5*(x[0::2] + x[1::2])
    -   
    -    # generate the test observator M_k from the theorem
    -    M = (cumsum( ((gamma/s)**2*2**arange(1,d+1)[::-1])[::-1] )  )[::-1]
    -
    -    # we need a list of magic numbers
    -    q =array([6.634897,9.210340, 11.344867, 13.276704, 15.086272, 16.811894, 18.475307, 20.090235, 21.665994, 23.209251, 24.724970, 26.216967, 27.688250, 29.141238, 30.577914, 31.999927, 33.408664, 34.805306, 36.190869, 37.566235, 38.932173, 40.289360, 41.638398, 42.979820, 44.314105, 45.641683, 46.962942, 48.278236, 49.587884, 50.892181])
    -
    -    # use magic to determine when we should have stopped blocking
    -    for k in arange(0,d):
    -        if(M[k] < q[k]):
    -            break
    -    if (k >= d-1):
    -        print("Warning: Use more data")
    -    return mu, s[k]/2**(d-k)
    +  
    k = n-1;
    +for (i = 0; i < n; i++){
    +    a[i] = b[i] +c*d;
    +    e = g[k];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    A better code is

    -x = loadtxt(infile) -(mean, var) = block(x) -std = sqrt(var) -import pandas as pd -from pandas import DataFrame -data ={'Mean':[mean], 'STDev':[std]} -frame = pd.DataFrame(data,index=['Values']) -print(frame) + +
    +
    +
    +
    +
    +
    temp = c*d;
    +for (i = 0; i < n; i++){
    +    a[i] = b[i] + temp;
    +}
    +e = g[n-1];
     
    @@ -275,6 +711,12 @@

    Resampling analysis

    +

    Here we avoid a repeated multiplication inside a loop. +Most compilers, depending on compiler flags, identify and optimize such bottlenecks on their own, without requiring any particular action by the programmer. However, it is always useful to single out and avoid code examples like the first one discussed here. +
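In the same spirit, a function call whose result does not depend on the loop index can be computed once before the loop. The following is a minimal sketch (not part of the lecture code; the function name scale and the argument myangle are made up for illustration):

// Minimal sketch (illustration only): hoisting a loop-invariant function call.
// sin(myangle) does not depend on the loop index, so it is computed once.
#include <cmath>

void scale(double* a, const double* b, int n, double myangle) {
  double s = sin(myangle);      // computed once, outside the loop
  for (int i = 0; i < n; i++) {
    a[i] = b[i]*s;              // no function call and no if test inside the loop
  }
}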

    +
    +
    +

    @@ -291,6 +733,18 @@

    Resampling analysis

  • diff --git a/doc/pub/week9/html/._week9-bs029.html b/doc/pub/week9/html/._week9-bs029.html index a19a51b9..c7c38c51 100644 --- a/doc/pub/week9/html/._week9-bs029.html +++ b/doc/pub/week9/html/._week9-bs029.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - +

    Vectorization and the basic idea behind parallel computing

    +
    +
    + +

    Present CPUs are highly parallel processors with varying levels of parallelism. The typical situation can be described via the following three statements.

    +
      +
    • Pursuit of shorter computation time and larger simulation size gives rise to parallel computing.
    • +
    • Multiple processors are involved to solve a global problem.
    • +
    • The essence is to divide the entire computation evenly among collaborative processors. Divide and conquer.
    • +
    +

    Before we proceed with a more detailed discussion of topics like vectorization and parallelization, we need to remind ourselves about some basic features of different hardware models.

    +
    +
    -

    Automatic vectorization and vectorization inhibitors, memory stride

    - -

    -For C++ programmers it is also worth keeping in mind that an array notation is preferred to the more compact use of pointers to access array elements. The compiler can often not tell if it is safe to vectorize the code. - -

    -When dealing with arrays, you should also avoid memory stride, since this slows down considerably vectorization. When you access array element, write for example the inner loop to vectorize using unit stride, that is, access successively the next array element in memory, as shown here -

    - -

      for (int i = 0; i < n; i++) {
    -      for (int j = 0; j < n; j++) {
    -           a[i][j] += b[i][j];
    -      }  
    -  }
    -
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs030.html b/doc/pub/week9/html/._week9-bs030.html index 888eeec7..02a3522a 100644 --- a/doc/pub/week9/html/._week9-bs030.html +++ b/doc/pub/week9/html/._week9-bs030.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Memory management

    -The main memory contains the program data - -
      -
-  1. Cache memory contains a copy of the main memory data
-  2. Cache is faster but consumes more space and power. It is normally assumed to be much faster than main memory
-  3. Registers contain working data only
-     • Modern CPUs perform most or all operations only on data in register
-  4. Multiple Cache memories contain a copy of the main memory data
+

      A rough classification of hardware models

      +
      +
      +
        -
-  • Cache items accessed by their address in main memory
-  • L1 cache is the fastest but has the least capacity
-  • L2, L3 provide intermediate performance/size tradeoffs
+  • Conventional single-processor computers are named SISD (single-instruction-single-data) machines.
+  • SIMD (single-instruction-multiple-data) machines incorporate the idea of parallel processing, using a large number of processing units to execute the same instruction on different data.
+  • Modern parallel computers are so-called MIMD (multiple-instruction-multiple-data) machines and can execute different instruction streams in parallel on different data.
      +
      +
      -
    - -Loads and stores to memory can be as important as floating point operations when we measure performance. - -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs031.html b/doc/pub/week9/html/._week9-bs031.html index d97cafb4..00d8962c 100644 --- a/doc/pub/week9/html/._week9-bs031.html +++ b/doc/pub/week9/html/._week9-bs031.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Memory and communication

    - -
      -
-  1. Most communication in a computer is carried out in chunks, blocks of bytes of data that move together
-  2. In the memory hierarchy, data moves between memory and cache, and between different levels of cache, in groups called lines
+

      Shared memory and distributed memory

      +
      +
      + +

      One way of categorizing modern parallel computers is to look at the memory configuration.

        -
-  • Lines are typically 64-128 bytes, or 8-16 double precision words
-  • Even if you do not use the data, it is moved and occupies space in the cache
+  • In shared memory systems the CPUs share the same address space. Any CPU can access any data in the global memory.
+  • In distributed memory systems each CPU has its own memory.
      +

      The CPUs are connected by some network and may exchange messages.

      +
      +
      -
    - -Many of these performance features are not captured in most programming languages. -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs032.html b/doc/pub/week9/html/._week9-bs032.html index 135aa4d9..9f51a05f 100644 --- a/doc/pub/week9/html/._week9-bs032.html +++ b/doc/pub/week9/html/._week9-bs032.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    Different parallel programming paradigms

    +
    +
    + -

    Measuring performance

    - -

    -How do we measure performance? What is wrong with this code to time a loop? -

    +

      +
    • Task parallelism: the work of a global problem can be divided into a number of independent tasks, which rarely need to synchronize. Monte Carlo simulations represent a typical situation. Integration is another. However this paradigm is of limited use.
    • +
• Data parallelism: use of multiple threads (e.g. one or more threads per processor) to dissect loops over arrays etc. Communication and synchronization between processors are often hidden, thus easy to program. However, the user surrenders much control to a specialized compiler. Examples of data parallelism are compiler-based parallelization and OpenMP directives; a minimal OpenMP sketch is given after this list.
    • +
    +
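As a concrete illustration of data parallelism, here is a minimal OpenMP sketch (an illustration, not taken from these slides); it assumes compilation with OpenMP support, for example with the -fopenmp flag:

// Minimal sketch: data parallelism with an OpenMP directive.
// The loop iterations are split among the available threads.
#include <omp.h>

void add(double* a, const double* b, const double* c, int n) {
#pragma omp parallel for
  for (int i = 0; i < n; i++) {
    a[i] = b[i] + c[i];
  }
}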
    +
    - -
      clock_t start, finish;
    -  start = clock();
    -  for (int j = 0; j < i; j++) {
    -    a[j] = b[j]+b[j]*c[j];
    -  }
    -  finish = clock();
    -  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    -
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs033.html b/doc/pub/week9/html/._week9-bs033.html index 9798d735..2e6b278d 100644 --- a/doc/pub/week9/html/._week9-bs033.html +++ b/doc/pub/week9/html/._week9-bs033.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    Different parallel programming paradigms

    +
    +
    + -

    Problems with measuring time

    +
      +
    • Message passing: all involved processors have an independent memory address space. The user is responsible for partitioning the data/work of a global problem and distributing the subproblems to the processors. Collaboration between processors is achieved by explicit message passing, which is used for data transfer plus synchronization.
    • +
    • This paradigm is the most general one where the user has full control. Better parallel efficiency is usually achieved by explicit message passing. However, message-passing programming is more difficult.
    • +
    +
    +
    -
      -
-  1. Timers are not infinitely accurate
-  2. All clocks have a granularity, the minimum time that they can measure
-  3. The error in a time measurement, even if everything is perfect, may be the size of this granularity (sometimes called a clock tick)
-  4. Always know what your clock granularity is
-  5. Ensure that your measurement is for a long enough duration (say 100 times the tick)

    @@ -624,29 +684,22 @@

Problems with measuring time
  • -

    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs034.html b/doc/pub/week9/html/._week9-bs034.html index 96280c9b..a5e43a70 100644 --- a/doc/pub/week9/html/._week9-bs034.html +++ b/doc/pub/week9/html/._week9-bs034.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - + +

    What is vectorization?

    +

Vectorization is a special case of Single Instruction Multiple Data (SIMD) processing: a single instruction stream operates on multiple data elements in parallel. We can think of vectorization as the unrolling of loops accompanied by SIMD instructions.

    -

    Problems with cold start

    +

Vectorization is the process of converting an algorithm that performs scalar operations (typically one operation at a time) to vector operations, where a single operation refers to many simultaneous operations. Consider the following example

    -

    -What happens when the code is executed? The assumption is that the code is ready to -execute. But + +

    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -
      -
-  1. Code may still be on disk, and not even read into memory.
-  2. Data may be in slow memory rather than fast (which may be wrong or right for what you are measuring)
-  3. Multiple tests often necessary to ensure that cold start effects are not present
-  4. Special effort often required to ensure data in the intended part of the memory hierarchy.
    +

If the code is not vectorized, the compiler will simply start with the first element and then perform subsequent additions operating on one address in memory at a time.

    @@ -627,29 +712,22 @@

    Problems with cold start

  • -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs035.html b/doc/pub/week9/html/._week9-bs035.html index 6749fc37..4b8dbfca 100644 --- a/doc/pub/week9/html/._week9-bs035.html +++ b/doc/pub/week9/html/._week9-bs035.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - - -

    Problems with smart compilers

    + +

    Number of elements that can acted upon

    +

A SIMD instruction can operate on multiple data elements in one single instruction. It uses the so-called 128-bit SIMD floating-point register. In this sense, vectorization adds some form of parallelism, since one instruction is applied to many parts of, say, a vector.

    -
      -
-  1. If the result of the computation is not used, the compiler may eliminate the code
-  2. Performance will look impossibly fantastic
-  3. Even worse, eliminate some of the code so the performance looks plausible
-  4. Ensure that the results are (or may be) used.
    +

The number of elements which can be operated on in parallel ranges from four single-precision floating-point data elements in the so-called Streaming SIMD Extensions, and two double-precision floating-point data elements in Streaming SIMD Extensions 2, to sixteen byte operations in a 128-bit register in Streaming SIMD Extensions 2. Thus, the vector length ranges from 2 to 16, depending on the instruction extensions used and on the data type.

    +

In summary, our instructions operate on 128-bit (16-byte) operands.
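To make the 128-bit registers concrete, the following minimal sketch (not from the lecture material) uses SSE intrinsics on an x86 CPU; a single _mm_add_ps instruction adds four single-precision floats at once. The variable names are chosen for illustration only.

// Minimal sketch: one SSE instruction operates on four packed floats.
#include <xmmintrin.h>
#include <iostream>

int main() {
  alignas(16) float b[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  alignas(16) float c[4] = {10.0f, 20.0f, 30.0f, 40.0f};
  alignas(16) float a[4];
  __m128 vb = _mm_load_ps(b);      // load four floats into a 128-bit register
  __m128 vc = _mm_load_ps(c);
  __m128 va = _mm_add_ps(vb, vc);  // four additions in one instruction
  _mm_store_ps(a, va);
  for (int i = 0; i < 4; i++) std::cout << a[i] << " ";
  std::cout << std::endl;
  return 0;
}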

    +

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs036.html b/doc/pub/week9/html/._week9-bs036.html index 71406233..66e077b3 100644 --- a/doc/pub/week9/html/._week9-bs036.html +++ b/doc/pub/week9/html/._week9-bs036.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - - -

    Problems with interference

    - -
      -
-  1. Other activities are sharing your processor
-  2. Make multiple tests and report
-  3. Easy choices include
+

      Number of elements that can acted upon, examples

      +

      We start with the simple scalar operations given by

      + + +
      +
      +
      +
      +
      +
      for (i = 0; i < n; i++){
      +    a[i] = b[i] + c[i];
      +}
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      - +

If the code is not vectorized and we have a 128-bit register to store a 32-bit floating-point number, it means that we have \( 3\times 32 \) bits that are not used.

      -
    +

    We have thus unused space in our SIMD registers. These registers could hold three additional integers.

    @@ -633,29 +704,22 @@

Problems with interference
  • -

    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs037.html b/doc/pub/week9/html/._week9-bs037.html index fe6f5e53..a3465561 100644 --- a/doc/pub/week9/html/._week9-bs037.html +++ b/doc/pub/week9/html/._week9-bs037.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - + +

    Operation counts for scalar operation

    +

    The code

    -

    Problems with measuring performance

    + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    has for \( n \) repeats

      -
-  1. Accurate, reproducible performance measurement is hard
-  2. Think carefully about your experiment:
-  3. What is it, precisely, that you want to measure?
-  4. How representative is your test to the situation that you are trying to measure?
+  1. one load for \( c[i] \) in address 1
+  2. one load for \( b[i] \) in address 2
+  3. add \( c[i] \) and \( b[i] \) to give \( a[i] \)
+  4. store \( a[i] \) in address 2
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs038.html b/doc/pub/week9/html/._week9-bs038.html index e1606a25..91d5dede 100644 --- a/doc/pub/week9/html/._week9-bs038.html +++ b/doc/pub/week9/html/._week9-bs038.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - + +

    Number of elements that can acted upon, examples

    +

    If we vectorize the code, we can perform, with a 128-bit register four simultaneous operations, that is +we have +

    -

    Thomas algorithm for tridiagonal linear algebra equations

    -
    -
    -

    -$$ -\left( \begin{array}{ccccc} - b_0 & c_0 & & & \\ - a_0 & b_1 & c_1 & & \\ - & & \ddots & & \\ - & & a_{m-3} & b_{m-2} & c_{m-2} \\ - & & & a_{m-2} & b_{m-1} - \end{array} \right) -\left( \begin{array}{c} - x_0 \\ - x_1 \\ - \vdots \\ - x_{m-2} \\ - x_{m-1} - \end{array} \right)=\left( \begin{array}{c} - f_0 \\ - f_1 \\ - \vdots \\ - f_{m-2} \\ - f_{m-1} \\ - \end{array} \right) -$$ + +

    +
    +
    +
    +
    +
    for (i = 0; i < n; i+=4){
    +    a[i] = b[i] + c[i];
    +    a[i+1] = b[i+1] + c[i+1];
    +    a[i+2] = b[i+2] + c[i+2];
    +    a[i+3] = b[i+3] + c[i+3];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Four additions are now done in a single step.

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs039.html b/doc/pub/week9/html/._week9-bs039.html index 9a3b4d79..15837176 100644 --- a/doc/pub/week9/html/._week9-bs039.html +++ b/doc/pub/week9/html/._week9-bs039.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - - -

    Thomas algorithm, forward substitution

    -
    -
    -

    -The first step is to multiply the first row by \( a_0/b_0 \) and subtract it from the second row. This is known as the forward substitution step. We obtain then -$$ - a_i = 0, -$$ - - -$$ - b_i = b_i - \frac{a_{i-1}}{b_{i-1}}c_{i-1}, -$$ - -and -$$ - f_i = f_i - \frac{a_{i-1}}{b_{i-1}}f_{i-1}. -$$ - -At this point the simplified equation, with only an upper triangular matrix takes the form -$$ -\left( \begin{array}{ccccc} - b_0 & c_0 & & & \\ - & b_1 & c_1 & & \\ - & & \ddots & & \\ - & & & b_{m-2} & c_{m-2} \\ - & & & & b_{m-1} - \end{array} \right)\left( \begin{array}{c} - x_0 \\ - x_1 \\ - \vdots \\ - x_{m-2} \\ - x_{m-1} - \end{array} \right)=\left( \begin{array}{c} - f_0 \\ - f_1 \\ - \vdots \\ - f_{m-2} \\ - f_{m-1} \\ - \end{array} \right) -$$ -

    -
    - - -

    + +

    Number of operations when vectorized

    +

    For \( n/4 \) repeats assuming floats or integers

    +
      +
+  1. one vector load for \( c[i] \) in address 1
+  2. one load for \( b[i] \) in address 2
+  3. add \( c[i] \) and \( b[i] \) to give \( a[i] \)
+  4. store \( a[i] \) in address 2

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs040.html b/doc/pub/week9/html/._week9-bs040.html index 6686edb6..f32f8c13 100644 --- a/doc/pub/week9/html/._week9-bs040.html +++ b/doc/pub/week9/html/._week9-bs040.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Thomas algorithm, backward substitution

    -
    -
    -

    -The next step is the backward substitution step. The last row is multiplied by \( c_{N-3}/b_{N-2} \) and subtracted from the second to last row, thus eliminating \( c_{N-3} \) from the last row. The general backward substitution procedure is -$$ - c_i = 0, -$$ - -and -$$ - f_{i-1} = f_{i-1} - \frac{c_{i-1}}{b_i}f_i -$$ - -All that ramains to be computed is the solution, which is the very straight forward process of -$$ -x_i = \frac{f_i}{b_i} -$$ +

    A simple test case with and without vectorization

    +

We implement these operations in a simple C++ program that computes at the end the norm of a vector.

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double s = 1.0/sqrt( (double) n);
    +  double *a, *b, *c;
    +  // Start timing
    +  clock_t start, finish;
    +  start = clock();
    +// Allocate space for the vectors to be used
    +    a = new double [n]; b = new double [n]; c = new double [n];
    +  // Define parallel region
    +  // Set up values for vectors  a and b
    +  for (int i = 0; i < n; i++){
    +    double angle = 2.0*M_PI*i/ (( double ) n);
    +    a[i] = s*(sin(angle) + cos(angle));
    +    b[i] =  s*sin(2.0*angle);
    +    c[i] = 0.0;
    +  }
    +  // Then perform the vector addition
    +  for (int i = 0; i < n; i++){
    +    c[i] += a[i]+b[i];
    +  }
    +  // Compute now the norm-2
    +  double Norm2 = 0.0;
    +  for (int i = 0; i < n; i++){
    +    Norm2  += c[i]*c[i];
    +  }
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for norm computation=" << timeused  << endl;
    +  cout << "  Norm-2  = " << Norm2 << endl;
    +  // Free up space
    +  delete[] a;
    +  delete[] b;
    +  delete[] c;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs041.html b/doc/pub/week9/html/._week9-bs041.html index d3483569..d2c79367 100644 --- a/doc/pub/week9/html/._week9-bs041.html +++ b/doc/pub/week9/html/._week9-bs041.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - + +

    Compiling with and without vectorization

    +

    We can compile and link without vectorization using the clang c++ compiler

    -

    Thomas algorithm and counting of operations (floating point and memory)

    -
    -
    -

    + +

    +
    +
    +
    +
    +
    clang -o novec.x vecexample.cpp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    +

    and with vectorization (and additional optimizations)

    -
    -
    - - - - - - - - - - - -
    Operation Floating Point
    Memory Reads \( 14(N-2) \)
    Memory Writes \( 4(N-2) \)
    Subtractions \( 3(N-2) \)
    Multiplications \( 3(N-2) \)
    Divisions \( 4(N-2) \)
    -
    -
    -

    + +

    +
    +
    +
    +
    +
    clang++ -O3 -Rpass=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    The speedup depends on the size of the vectors. In the example here we have run with \( 10^7 \) elements. +The example here was run on an IMac17.1 with OSX El Capitan (10.11.4) as operating system and an Intel i5 3.3 GHz CPU. +

    -

    -

    -
    -

    -

    + +

    +
    +
    +
    +
    +
    Compphys:~ hjensen$ ./vec.x 10000000
    +Time used  for norm computation=0.04720500000
    +Compphys:~ hjensen$ ./novec.x 10000000
    +Time used  for norm computation=0.03311700000
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

This particular C++ compiler speeds up the above loop operations by a factor of 1.5. Performing the same operations for \( 10^9 \) elements results in a smaller speedup, since reading from main memory is required; here the non-vectorized code is seemingly faster.

    -
    // Forward substitution    
    -// Note that we can simplify by precalculating a[i-1]/b[i-1]
    -  for (int i=1; i < n; i++) {
    -     b[i] = b[i] - (a[i-1]*c[i-1])/b[i-1];
    -     f[i] = g[i] - (a[i-1]*f[i-1])/b[i-1];
    -  }
    -  x[n-1] = f[n-1] / b[n-1];
    -  // Backwards substitution                                                           
    -  for (int i = n-2; i >= 0; i--) {
    -     f[i] = f[i] - c[i]*f[i+1]/b[i+1];
    -     x[i] = f[i]/b[i];
    -  }
    -
    -

    +

    +
    +
    +
    +
    +
    Compphys:~ hjensen$ ./vec.x 1000000000
    +Time used  for norm computation=58.41391100
    +Compphys:~ hjensen$ ./novec.x 1000000000
    +Time used  for norm computation=46.51295300
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    We will discuss these issues further in the next slides.
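Since a single timing of the loop is sensitive to cold start and to interference from other processes, a common remedy is to repeat the measurement, keep the smallest time, and make sure the result is actually used so that the compiler cannot remove the loop. The following is a minimal sketch under these assumptions (not one of the lecture programs):

// Minimal sketch: repeat the timed loop and keep the best time.
#include <cstdlib>
#include <cmath>
#include <iostream>
#include "time.h"

int main(int argc, char* argv[])
{
  int n = atoi(argv[1]);
  const int repeats = 5;
  double *a = new double[n], *b = new double[n], *c = new double[n];
  for (int i = 0; i < n; i++) { a[i] = sin(i*1.0); b[i] = cos(i*1.0); c[i] = 0.0; }
  double best = 1.0e99;
  for (int r = 0; r < repeats; r++) {
    clock_t start = clock();
    for (int i = 0; i < n; i++) c[i] = a[i] + b[i];
    clock_t finish = clock();
    double t = (double) (finish - start)/CLOCKS_PER_SEC;
    if (t < best) best = t;
  }
  std::cout << "Best time over " << repeats << " repeats = " << best << std::endl;
  std::cout << "c[n-1] = " << c[n-1] << std::endl;  // use the result
  delete[] a; delete[] b; delete[] c;
  return 0;
}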

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs042.html b/doc/pub/week9/html/._week9-bs042.html index 37a2188a..1d38757d 100644 --- a/doc/pub/week9/html/._week9-bs042.html +++ b/doc/pub/week9/html/._week9-bs042.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - + +

    Compiling with and without vectorization using clang

    +

    We can compile and link without vectorization with clang compiler

    + + +
    +
    +
    +
    +
    +
clang++ -fno-vectorize -o novec.x vecexample.cpp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    Example: Transpose of a matrix

    +

    and with vectorization

    -

    + +

    +
    +
    +
    +
    +
    clang++ -O3 -Rpass=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    - -
    #include <cstdlib>
    -#include <iostream>
    -#include <cmath>
    -#include <iomanip>
    -#include "time.h"
    +

    We can also add vectorization analysis, see for example

    -using namespace std; // note use of namespace -int main (int argc, char* argv[]) -{ - // read in dimension of square matrix - int n = atoi(argv[1]); - double **A, **B; - // Allocate space for the two matrices - A = new double*[n]; B = new double*[n]; - for (int i = 0; i < n; i++){ - A[i] = new double[n]; - B[i] = new double[n]; - } - // Set up values for matrix A - for (int i = 0; i < n; i++){ - for (int j = 0; j < n; j++) { - A[i][j] = cos(i*1.0)*sin(j*3.0); - } - } - clock_t start, finish; - start = clock(); - // Then compute the transpose - for (int i = 0; i < n; i++){ - for (int j = 0; j < n; j++) { - B[i][j]= A[j][i]; - } - } + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass-analysis=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    or figure out if vectorization was missed

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass-missed=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    - finish = clock(); - double timeused = (double) (finish - start)/(CLOCKS_PER_SEC ); - cout << setiosflags(ios::showpoint | ios::uppercase); - cout << setprecision(10) << setw(20) << "Time used for setting up transpose of matrix=" << timeused << endl; - // Free up space - for (int i = 0; i < n; i++){ - delete[] A[i]; - delete[] B[i]; - } - delete[] A; - delete[] B; - return 0; -} -
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs043.html b/doc/pub/week9/html/._week9-bs043.html index 44eaa3c8..a8f6a19d 100644 --- a/doc/pub/week9/html/._week9-bs043.html +++ b/doc/pub/week9/html/._week9-bs043.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - +

    Automatic vectorization and vectorization inhibitors, criteria

    -

    Matrix-matrix multiplication

    -This the matrix-matrix multiplication code with plain c++ memory allocation. It computes at the end the Frobenius norm. +

    Not all loops can be vectorized, as discussed in Intel's guide to vectorization

    -

    +

An important criterion is that the loop counter \( n \) is known at the entry of the loop.

    - -
    #include <cstdlib>
    -#include <iostream>
    -#include <cmath>
    -#include <iomanip>
    -#include "time.h"
    +
    +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    a[j] = cos(j*1.0);
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

The variable \( n \) does not need to be known at compile time. However, it must stay the same for the entire duration of the loop, which implies that an exit statement inside the loop cannot be data dependent.
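As a minimal sketch of this point (an illustration, not part of the lecture code), the trip count below is only known at run time, read from the command line, but it is fixed before the loop starts, so the loop can still be vectorized:

// Minimal sketch: the trip count n is known only at run time, yet fixed at loop entry.
#include <cmath>
#include <cstdlib>

int main(int argc, char* argv[]) {
  int n = atoi(argv[1]);          // known only at run time
  double* a = new double[n];
  for (int j = 0; j < n; j++) {   // trip count fixed when the loop is entered
    a[j] = cos(j*1.0);
  }
  delete[] a;
  return 0;
}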

    -using namespace std; // note use of namespace -int main (int argc, char* argv[]) -{ - // read in dimension of square matrix - int n = atoi(argv[1]); - double s = 1.0/sqrt( (double) n); - double **A, **B, **C; - // Start timing - clock_t start, finish; - start = clock(); - // Allocate space for the two matrices - A = new double*[n]; B = new double*[n]; C = new double*[n]; - for (int i = 0; i < n; i++){ - A[i] = new double[n]; - B[i] = new double[n]; - C[i] = new double[n]; - } - // Set up values for matrix A and B and zero matrix C - for (int i = 0; i < n; i++){ - for (int j = 0; j < n; j++) { - double angle = 2.0*M_PI*i*j/ (( double ) n); - A[i][j] = s * ( sin ( angle ) + cos ( angle ) ); - B[j][i] = A[i][j]; - } - } - // Then perform the matrix-matrix multiplication - for (int i = 0; i < n; i++){ - for (int j = 0; j < n; j++) { - double sum = 0.0; - for (int k = 0; k < n; k++) { - sum += B[i][k]*A[k][j]; - } - C[i][j] = sum; - } - } - // Compute now the Frobenius norm - double Fsum = 0.0; - for (int i = 0; i < n; i++){ - for (int j = 0; j < n; j++) { - Fsum += C[i][j]*C[i][j]; - } - } - Fsum = sqrt(Fsum); - finish = clock(); - double timeused = (double) (finish - start)/(CLOCKS_PER_SEC ); - cout << setiosflags(ios::showpoint | ios::uppercase); - cout << setprecision(10) << setw(20) << "Time used for matrix-matrix multiplication=" << timeused << endl; - cout << " Frobenius norm = " << Fsum << endl; - // Free up space - for (int i = 0; i < n; i++){ - delete[] A[i]; - delete[] B[i]; - delete[] C[i]; - } - delete[] A; - delete[] B; - delete[] C; - return 0; -} -
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs044.html b/doc/pub/week9/html/._week9-bs044.html index ad170bd0..a009250c 100644 --- a/doc/pub/week9/html/._week9-bs044.html +++ b/doc/pub/week9/html/._week9-bs044.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - +

    Automatic vectorization and vectorization inhibitors, exit criteria

    -

    How do we define speedup? Simplest form

    -
    -
    -

    +

    An exit statement should in general be avoided. +If the exit statement contains data-dependent conditions, the loop cannot be vectorized. +The following is an example of a non-vectorizable loop +

    -
      -
    • Speedup measures the ratio of performance between two objects
    • -
    • Versions of same code, with different number of processors
    • -
    • Serial and vector versions
    • -
    • Try different programing languages, c++ and Fortran
    • -
    • Two algorithms computing the same result
    • -
    + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    a[j] = cos(j*1.0);
    +    if (a[j] < 0 ) break;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

Avoid data-dependent loop termination conditions and use a single loop counter \( n \). The lower and upper bounds have to be kept fixed within the loop.

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs045.html b/doc/pub/week9/html/._week9-bs045.html index 848a2843..208b044e 100644 --- a/doc/pub/week9/html/._week9-bs045.html +++ b/doc/pub/week9/html/._week9-bs045.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    How do we define speedup? Correct baseline

    -
    -
    -

    -The key is choosing the correct baseline for comparison - -

      -
    • For our serial vs. vectorization examples, using compiler-provided vectorization, the baseline is simple; the same code, with vectorization turned off
    • - -
        -
      • For parallel applications, this is much harder:
      • - -
          -
        • Choice of algorithm, decomposition, performance of baseline case etc.
        • -
        - -
      - -
    +

    Automatic vectorization and vectorization inhibitors, straight-line code

    + +

SIMD instructions perform the same type of operation multiple times. A switch statement therefore leads to a non-vectorizable loop, since different iterations cannot branch to different statements. The following code can however be vectorized, since the if statement is implemented as a masked assignment.

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    double x  = cos(j*1.0);
    +    if (x > 0 ) {
    +       a[j] =  x*sin(j*2.0); 
    +    }
    +    else {
    +       a[j] = 0.0;
    +    }
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

These operations can be performed for all data elements, but only those elements for which the mask evaluates as true are stored. In general, one should avoid branches such as switch, goto, or return statements, as well as if constructs that cannot be treated as masked assignments.

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs046.html b/doc/pub/week9/html/._week9-bs046.html index b3244f9e..69aac9c7 100644 --- a/doc/pub/week9/html/._week9-bs046.html +++ b/doc/pub/week9/html/._week9-bs046.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Parallel speedup

    -
    -
    -

    -For parallel applications, speedup is typically defined as - -

      -
    • Speedup \( =T_1/T_p \)
    • -
    - -Here \( T_1 \) is the time on one processor and \( T_p \) is the time using \( p \) processors. - -
      -
    • Can the speedup become larger than \( p \)? That means using \( p \) processors is more than \( p \) times faster than using one processor.
    • -
    +

    Automatic vectorization and vectorization inhibitors, nested loops

    + +

    Only the innermost loop of the following example is vectorized

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      for (int j = 0; j < n; j++) {
    +           a[i][j] += b[i][j];
    +      }  
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

The exception is if an original outer loop is transformed into an inner loop as the result of compiler optimizations, as illustrated below.
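As an illustration (a sketch, not taken from the original code), the loop nest below is a candidate for such a transformation: as written, the innermost loop over \( i \) strides through memory with step \( n \), and an optimizing compiler may interchange the two loops so that the original outer loop over \( j \) ends up innermost, with unit stride, and is the one that gets vectorized. The arrays a and b and the dimension n are assumed to be declared as in the example above.

  for (int j = 0; j < n; j++) {
    for (int i = 0; i < n; i++) {
      a[i][j] += b[i][j];   // inner loop over i has stride n in memory
    }
  }
  // After loop interchange the loop over j (originally the outer loop)
  // becomes the innermost, unit-stride loop and can be vectorized.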

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs047.html b/doc/pub/week9/html/._week9-bs047.html index 9272d7e6..cfb30d7b 100644 --- a/doc/pub/week9/html/._week9-bs047.html +++ b/doc/pub/week9/html/._week9-bs047.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - +

    Automatic vectorization and vectorization inhibitors, function calls

    -

    Speedup and memory

    -
    -
    -

    -The speedup on \( p \) processors can -be greater than \( p \) if memory usage is optimal! -Consider the case of a memorybound computation with \( M \) words of memory +

Calls to programmer-defined functions ruin vectorization. However, calls to intrinsic functions like \( \sin{x} \), \( \cos{x} \), \( \exp{x} \) etc. are allowed since they are normally efficiently vectorized. The following example is fully vectorizable

    -
      -
    • If \( M/p \) fits into cache while \( M \) does not, the time to access memory will be different in the two cases:
    • -
    • \( T_1 \) uses the main memory bandwidth
    • -
    • \( T_p \) uses the appropriate cache bandwidth
    • -
    + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      a[i] = log10(i)*cos(i);
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

Similarly, inline functions defined by the programmer allow for vectorization, since the function statements are glued into the actual place where the function is called.
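A minimal sketch of this (the function myvalue is a hypothetical example, not from the original code; a, b and n are assumed to be declared as in the previous examples):

  // An inline function whose body the compiler can glue into the loop.
  inline double myvalue(double x) { return x*x + 2.0*x; }

  // With the body inlined, the loop is plain arithmetic and can be vectorized.
  for (int i = 0; i < n; i++) {
    a[i] = myvalue(b[i]);
  }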

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs048.html b/doc/pub/week9/html/._week9-bs048.html index bdcad49a..a733da36 100644 --- a/doc/pub/week9/html/._week9-bs048.html +++ b/doc/pub/week9/html/._week9-bs048.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - +

    Automatic vectorization and vectorization inhibitors, data dependencies

    + +

    One has to keep in mind that vectorization changes the order of operations inside a loop. A so-called +read-after-write statement with an explicit flow dependency cannot be vectorized. The following code +

    + + +
    +
    +
    +
    +
    +
      double b = 15.;
    +  for (int i = 1; i < n; i++) {
    +      a[i] = a[i-1] + b;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    Upper bounds on speedup

    -
    -
    -

    -Assume that almost all parts of a code are perfectly -parallelizable (fraction \( f \)). The remainder, -fraction \( (1-f) \) cannot be parallelized at all. - -

    -That is, there is work that takes time \( W \) on one process; a fraction \( f \) of that work will take -time \( Wf/p \) on \( p \) processors. - -

      -
    • What is the maximum possible speedup as a function of \( f \)?
    • -
    +

is an example of a flow dependency and gives wrong numerical results if vectorized. For a scalar operation, the value \( a[i-1] \) computed in the previous iteration is loaded into the right-hand side and the results are correct. In vector mode however, with a vector length of four, the old values \( a[0] \), \( a[1] \), \( a[2] \) and \( a[3] \) are loaded into the right-hand side before they have been updated, and produce wrong results. That is, we have

    + + +
    +
    +
    +
    +
    +
       a[1] = a[0] + b;
    +   a[2] = a[1] + b;
    +   a[3] = a[2] + b;
    +   a[4] = a[3] + b;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

and if the first two iterations are executed at the same time by the SIMD instruction, the value of, say, \( a[1] \) could be used by the second iteration before it has been calculated by the first iteration, thereby leading to wrong results.

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs049.html b/doc/pub/week9/html/._week9-bs049.html index 780f6641..81e0eddb 100644 --- a/doc/pub/week9/html/._week9-bs049.html +++ b/doc/pub/week9/html/._week9-bs049.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Amdahl's law

    -
    -
    -

    -On one processor we have -$$ -T_1 = (1-f)W + fW = W -$$ - -On \( p \) processors we have -$$ -T_p = (1-f)W + \frac{fW}{p}, -$$ - -resulting in a speedup of -$$ -\frac{T_1}{T_p} = \frac{W}{(1-f)W+fW/p} -$$ - -

    -As \( p \) goes to infinity, \( fW/p \) goes to zero, and the maximum speedup is -$$ -\frac{1}{1-f}, -$$ - -meaning that if -if \( f = 0.99 \) (all but \( 1\% \) parallelizable), the maximum speedup -is \( 1/(1-.99)=100 \)! +

    Automatic vectorization and vectorization inhibitors, more data dependencies

    + +

    On the other hand, a so-called +write-after-read statement can be vectorized. The following code +

    + + +
    +
    +
    +
    +
    +
      double b = 15.;
    +  for (int i = 1; i < n; i++) {
    +      a[i-1] = a[i] + b;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

is an example of a write-after-read (anti-)dependency that can be vectorized, since no iteration with a higher value of \( i \) can complete before an iteration with a lower value of \( i \). However, such code leads to problems with parallelization.

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs050.html b/doc/pub/week9/html/._week9-bs050.html index 1c2b3c42..41ad4205 100644 --- a/doc/pub/week9/html/._week9-bs050.html +++ b/doc/pub/week9/html/._week9-bs050.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    How much is parallelizable

    -
    -
    -

    -If any non-parallel code slips into the -application, the parallel -performance is limited. - -

    -In many simulations, however, the fraction of non-parallelizable work -is \( 10^{-6} \) or less due to large arrays or objects that are perfectly parallelizable. - -

    +

    Automatic vectorization and vectorization inhibitors, memory stride

    + +

For C++ programmers it is also worth keeping in mind that array notation is preferable to the more compact use of pointers for accessing array elements; with raw pointers the compiler often cannot tell whether it is safe to vectorize the code. A sketch of the difference follows below.
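The fragment below is a hypothetical sketch (a, b and n assumed declared): with raw pointer arithmetic the compiler may not be able to prove that the two arrays do not overlap, whereas plain indexing is easier to analyse.

  // Pointer version: possible aliasing between pa and pb may inhibit vectorization.
  double *pa = a, *pb = b;
  for (int i = 0; i < n; i++) {
    *pa++ = *pb++ + 2.0;
  }
  // Array-notation version: simpler for the compiler to analyse and vectorize.
  for (int i = 0; i < n; i++) {
    a[i] = b[i] + 2.0;
  }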

    + +

When dealing with arrays, you should also avoid large memory strides, since they slow down vectorization considerably. When you access array elements, write the inner loop (the one to be vectorized) with unit stride, that is, access successive array elements in memory, as shown here

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      for (int j = 0; j < n; j++) {
    +           a[i][j] += b[i][j];
    +      }  
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -
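For contrast, a sketch (not part of the original code) of the same update written with a non-unit stride: the inner loop now accesses a[j][i] and b[j][i], i.e. with stride \( n \) in memory for row-major (C++) storage, which typically vectorizes poorly or not at all.

  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      a[j][i] += b[j][i];   // stride-n access pattern: bad for vectorization
    }
  }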

      @@ -632,29 +704,22 @@

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs051.html b/doc/pub/week9/html/._week9-bs051.html index cd6bb807..650c41f8 100644 --- a/doc/pub/week9/html/._week9-bs051.html +++ b/doc/pub/week9/html/._week9-bs051.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Today's situation of parallel computing

    -
    -
    -

    - +

    Memory management

    +

    The main memory contains the program data

    +
      +
    1. Cache memory contains a copy of the main memory data
    2. +
    3. Cache is faster but consumes more space and power. It is normally assumed to be much faster than main memory
    4. +
    5. Registers contain working data only
      • -
      • Distributed memory is the dominant hardware configuration. There is a large diversity in these machines, from MPP (massively parallel processing) systems to clusters of off-the-shelf PCs, which are very cost-effective.
      • -
      • Message-passing is a mature programming paradigm and widely accepted. It often provides an efficient match to the hardware. It is primarily used for the distributed memory systems, but can also be used on shared memory systems.
      • -
      • Modern nodes have nowadays several cores, which makes it interesting to use both shared memory (the given node) and distributed memory (several nodes with communication). This leads often to codes which use both MPI and OpenMP.
      • +
• Modern CPUs perform most or all operations only on data in registers
      +
    6. Multiple Cache memories contain a copy of the main memory data
    7. +
        +
      • Cache items accessed by their address in main memory
      • +
      • L1 cache is the fastest but has the least capacity
      • +
      • L2, L3 provide intermediate performance/size tradeoffs
      • +
      +
    +

    Loads and stores to memory can be as important as floating point operations when we measure performance.

    -Our lectures will focus on both MPI and OpenMP. - -

    -

    -
    - - -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs052.html b/doc/pub/week9/html/._week9-bs052.html index 495cdd56..3a54f757 100644 --- a/doc/pub/week9/html/._week9-bs052.html +++ b/doc/pub/week9/html/._week9-bs052.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - +

    Memory and communication

    -

    Overhead present in parallel computing

    -
    -
    -

    - +

      +
    1. Most communication in a computer is carried out in chunks, blocks of bytes of data that move together
    2. +
    3. In the memory hierarchy, data moves between memory and cache, and between different levels of cache, in groups called lines
      • -
      • Uneven load balance: not all the processors can perform useful work at all time.
      • -
      • Overhead of synchronization
      • -
      • Overhead of communication
      • -
      • Extra computation due to parallelization
      • +
      • Lines are typically 64-128 bytes, or 8-16 double precision words
      • +
      • Even if you do not use the data, it is moved and occupies space in the cache
      - -Due to the above overhead and that certain parts of a sequential -algorithm cannot be parallelized we may not achieve an optimal parallelization. -
    -
    - + +

    Many of these performance features are not captured in most programming languages.
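The small stand-alone sketch below (an illustration, not from the original slides; it assumes the array is large enough not to fit in cache and that a 64-byte cache line holds eight doubles) makes the point concrete: the strided loop performs only one eighth of the additions, yet it is typically far from eight times faster, because every cache line it touches is moved in full.

#include <iostream>
#include <ctime>

int main()
{
  const int n = 1 << 24;            // assumed large enough to exceed the caches
  double *a = new double[n];
  for (int i = 0; i < n; i++) a[i] = 1.0;

  // Unit stride: every word of each cache line is used.
  clock_t start = clock();
  double sum1 = 0.0;
  for (int i = 0; i < n; i++) sum1 += a[i];
  double t1 = double(clock() - start)/CLOCKS_PER_SEC;

  // Stride of 8 doubles (roughly one 64-byte line): only one word per line is used,
  // but the whole line still travels through the memory hierarchy.
  start = clock();
  double sum2 = 0.0;
  for (int i = 0; i < n; i += 8) sum2 += a[i];
  double t2 = double(clock() - start)/CLOCKS_PER_SEC;

  std::cout << "unit stride: " << t1 << " s, stride 8: " << t2
            << " s, sums " << sum1 << " " << sum2 << std::endl;
  delete[] a;
  return 0;
}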

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs053.html b/doc/pub/week9/html/._week9-bs053.html index c8342328..fe227f06 100644 --- a/doc/pub/week9/html/._week9-bs053.html +++ b/doc/pub/week9/html/._week9-bs053.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - +

    Measuring performance

    -

    Parallelizing a sequential algorithm

    -
    -
    -

    +

    How do we measure performance? What is wrong with this code to time a loop?

    -
      -
    • Identify the part(s) of a sequential algorithm that can be executed in parallel. This is the difficult part,
    • -
    • Distribute the global work and data among \( P \) processors.
    • -
    + +
    +
    +
    +
    +
    +
      clock_t start, finish;
    +  start = clock();
    +  for (int j = 0; j < i; j++) {
    +    a[j] = b[j]+b[j]*c[j];
    +  }
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs054.html b/doc/pub/week9/html/._week9-bs054.html index d075705c..3b7c0212 100644 --- a/doc/pub/week9/html/._week9-bs054.html +++ b/doc/pub/week9/html/._week9-bs054.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Strategies

    -
    -
    -

    - -

      -
    • Develop codes locally, run with some few processes and test your codes. Do benchmarking, timing and so forth on local nodes, for example your laptop or PC.
    • -
    • When you are convinced that your codes run correctly, you can start your production runs on available supercomputers.
    • -
    -
    -
    - - -

    +

    Problems with measuring time

    +
      +
    1. Timers are not infinitely accurate
    2. +
    3. All clocks have a granularity, the minimum time that they can measure
    4. +
    5. The error in a time measurement, even if everything is perfect, may be the size of this granularity (sometimes called a clock tick)
    6. +
    7. Always know what your clock granularity is
    8. +
    9. Ensure that your measurement is for a long enough duration (say 100 times the tick)
    10. +
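A minimal sketch (not from the original slides) for estimating the granularity of the clock function used in the timing example above: call it repeatedly until the returned value changes.

#include <iostream>
#include <ctime>

int main()
{
  // Estimate the tick of clock() by spinning until its value changes.
  clock_t t0 = clock();
  clock_t t1 = t0;
  while (t1 == t0) { t1 = clock(); }
  std::cout << "clock() granularity is roughly "
            << double(t1 - t0)/CLOCKS_PER_SEC << " seconds" << std::endl;
  return 0;
}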

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs055.html b/doc/pub/week9/html/._week9-bs055.html index 911c6bde..b5e0ffa4 100644 --- a/doc/pub/week9/html/._week9-bs055.html +++ b/doc/pub/week9/html/._week9-bs055.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    How do I run MPI on a PC/Laptop? MPI

    -
    -
    -

    -To install MPI is rather easy on hardware running unix/linux as operating systems, follow simply the instructions from the OpenMPI website. See also subsequent slides. -When you have made sure you have installed MPI on your PC/laptop, - -

      -
    • Compile with mpicxx/mpic++ or mpif90
    • -
    - -

    - - -

      # Compile and link
    -  mpic++ -O3 -o nameofprog.x nameofprog.cpp
    -  #  run code with for example 8 processes using mpirun/mpiexec
    -  mpiexec -n 8 ./nameofprog.x
    -
    -

    -

    -
    - - -

    +

    Problems with cold start

    + +

    What happens when the code is executed? The assumption is that the code is ready to +execute. But +

    +
      +
    1. Code may still be on disk, and not even read into memory.
    2. +
    3. Data may be in slow memory rather than fast (which may be wrong or right for what you are measuring)
    4. +
    5. Multiple tests often necessary to ensure that cold start effects are not present
    6. +
    7. Special effort often required to ensure data in the intended part of the memory hierarchy.
    8. +
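A common remedy, sketched here for the timing example shown earlier (a, b, c and n are assumed to be declared and initialized, and a C++ compiler with using namespace std is assumed): run the loop once without timing it to warm up the caches, then time several repetitions and report the time per repetition.

  // Untimed warm-up pass: brings code and data into the caches.
  for (int j = 0; j < n; j++) { a[j] = b[j] + b[j]*c[j]; }

  // Timed region: repeat the loop and report the time per repetition.
  const int repeats = 100;
  clock_t start = clock();
  for (int r = 0; r < repeats; r++) {
    for (int j = 0; j < n; j++) { a[j] = b[j] + b[j]*c[j]; }
  }
  double timeused = (double) (clock() - start)/CLOCKS_PER_SEC/repeats;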

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs056.html b/doc/pub/week9/html/._week9-bs056.html index 3d8bba7f..3e234143 100644 --- a/doc/pub/week9/html/._week9-bs056.html +++ b/doc/pub/week9/html/._week9-bs056.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Can I do it on my own PC/laptop? OpenMP installation

    -
    -
    -

    -If you wish to install MPI and OpenMP -on your laptop/PC, we recommend the following: - -

      -
    • For OpenMP, the compile option -fopenmp is included automatically in recent versions of the C++ compiler and Fortran compilers. For users of different Linux distributions, simply use the available C++ or Fortran compilers and add the above compiler instructions, see also code examples below.
    • -
    • For OS X users however, install libomp
    • -
    - -

    - - -

      brew install libomp
    -
    -

    -and compile and link as -

    - - -

    c++ -o <name executable> <name program.cpp>  -lomp
    -
    -

    -

    -
    - - -

    +

    Problems with smart compilers

    + +
      +
    1. If the result of the computation is not used, the compiler may eliminate the code
    2. +
    3. Performance will look impossibly fantastic
    4. +
5. Even worse, the compiler may eliminate only part of the code, so that the performance looks plausible
    6. +
    7. Ensure that the results are (or may be) used.
    8. +
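One simple way to make sure the result is used, sketched here for the same loop (a, b, c and n assumed declared and filled, with using namespace std as in the other examples): accumulate a checksum from the computed array and print it, so the compiler cannot remove the timed work as dead code.

  clock_t start = clock();
  for (int j = 0; j < n; j++) {
    a[j] = b[j] + b[j]*c[j];
  }
  clock_t finish = clock();
  // Using the result prevents the compiler from eliminating the loop.
  double checksum = 0.0;
  for (int j = 0; j < n; j++) checksum += a[j];
  cout << "time = " << (double) (finish - start)/CLOCKS_PER_SEC
       << " s, checksum = " << checksum << endl;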

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs057.html b/doc/pub/week9/html/._week9-bs057.html index 9d57c7e9..8c76800c 100644 --- a/doc/pub/week9/html/._week9-bs057.html +++ b/doc/pub/week9/html/._week9-bs057.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Installing MPI

    -
    -
    -

    -For linux/ubuntu users, you need to install two packages (alternatively use the synaptic package manager) -

    - - -

      sudo apt-get install libopenmpi-dev
    -  sudo apt-get install openmpi-bin
    -
    -

    -For OS X users, install brew (after having installed xcode and gcc, needed for the -gfortran compiler of openmpi) and then install with brew -

    - - -

       brew install openmpi
    -
    -

    -When running an executable (code.x), run as -

    - - -

      mpirun -n 10 ./code.x
    -
    -

    -where we indicate that we want the number of processes to be 10. - -

    -

    -
    - - -

    +

    Problems with interference

    +
      +
    1. Other activities are sharing your processor
    2. +
        +
• Operating system, system daemons, other users
      • +
      • Some parts of the hardware do not always perform with exactly the same performance
      • +
      +
    3. Make multiple tests and report
    4. +
    5. Easy choices include
    6. +
        +
      • Average tests represent what users might observe over time
      • +
      +

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs058.html b/doc/pub/week9/html/._week9-bs058.html index 6cc09ff9..716f9a3f 100644 --- a/doc/pub/week9/html/._week9-bs058.html +++ b/doc/pub/week9/html/._week9-bs058.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Installing MPI and using Qt

    -
    -
    -

    -With openmpi installed, when using Qt, add to your .pro file the instructions here - -

    -You may need to tell Qt where openmpi is stored. - -

    -

    -
    - - -

    +

    Problems with measuring performance

    +
      +
    1. Accurate, reproducible performance measurement is hard
    2. +
    3. Think carefully about your experiment:
    4. +
    5. What is it, precisely, that you want to measure?
    6. +
7. How representative is your test of the situation that you are trying to measure?
    8. +

      @@ -629,29 +678,22 @@

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs059.html b/doc/pub/week9/html/._week9-bs059.html index 4ed67e4b..17184cda 100644 --- a/doc/pub/week9/html/._week9-bs059.html +++ b/doc/pub/week9/html/._week9-bs059.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    What is Message Passing Interface (MPI)?

    +

    Thomas algorithm for tridiagonal linear algebra equations

    -

    - -

    -MPI is a library, not a language. It specifies the names, calling sequences and results of functions -or subroutines to be called from C/C++ or Fortran programs, and the classes and methods that make up the MPI C++ -library. The programs that users write in Fortran, C or C++ are compiled with ordinary compilers and linked -with the MPI library. - -

    -MPI programs should be able to run -on all possible machines and run all MPI implementetations without change. - -

    -An MPI computation is a collection of processes communicating with messages. - -

$$
\left( \begin{array}{ccccc}
 b_0 & c_0 & & & \\
 a_0 & b_1 & c_1 & & \\
 & & \ddots & & \\
 & & a_{m-3} & b_{m-2} & c_{m-2} \\
 & & & a_{m-2} & b_{m-1}
\end{array} \right)
\left( \begin{array}{c}
 x_0 \\ x_1 \\ \vdots \\ x_{m-2} \\ x_{m-1}
\end{array} \right)
=
\left( \begin{array}{c}
 f_0 \\ f_1 \\ \vdots \\ f_{m-2} \\ f_{m-1}
\end{array} \right)
$$

    +

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs060.html b/doc/pub/week9/html/._week9-bs060.html index ac087d68..e66e5ddf 100644 --- a/doc/pub/week9/html/._week9-bs060.html +++ b/doc/pub/week9/html/._week9-bs060.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Going Parallel with MPI

    +

    Thomas algorithm, forward substitution

    -

    -Task parallelism: the work of a global problem can be divided -into a number of independent tasks, which rarely need to synchronize. -Monte Carlo simulations or numerical integration are examples of this. - -

    -MPI is a message-passing library where all the routines -have corresponding C/C++-binding -

    - - -

       MPI_Command_name
    -
    -

    -and Fortran-binding (routine names are in uppercase, but can also be in lower case) -

    - - -

       MPI_COMMAND_NAME
    -
    -

    + +

    The first step is to multiply the first row by \( a_0/b_0 \) and subtract it from the second row. This is known as the forward substitution step. We obtain then

$$
a_i = 0,
$$

$$
b_i = b_i - \frac{a_{i-1}}{b_{i-1}}c_{i-1},
$$

    and

$$
f_i = f_i - \frac{a_{i-1}}{b_{i-1}}f_{i-1}.
$$

    At this point the simplified equation, with only an upper triangular matrix takes the form

$$
\left( \begin{array}{ccccc}
 b_0 & c_0 & & & \\
 & b_1 & c_1 & & \\
 & & \ddots & & \\
 & & & b_{m-2} & c_{m-2} \\
 & & & & b_{m-1}
\end{array} \right)
\left( \begin{array}{c}
 x_0 \\ x_1 \\ \vdots \\ x_{m-2} \\ x_{m-1}
\end{array} \right)
=
\left( \begin{array}{c}
 f_0 \\ f_1 \\ \vdots \\ f_{m-2} \\ f_{m-1}
\end{array} \right)
$$
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs061.html b/doc/pub/week9/html/._week9-bs061.html index 6b43b51c..49cf568e 100644 --- a/doc/pub/week9/html/._week9-bs061.html +++ b/doc/pub/week9/html/._week9-bs061.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    MPI is a library

    +

    Thomas algorithm, backward substitution

    -

    -MPI is a library specification for the message passing interface, -proposed as a standard. - -

      -
    • independent of hardware;
    • -
    • not a language or compiler specification;
    • -
    • not a specific implementation or product.
    • -
    - -A message passing standard for portability and ease-of-use. -Designed for high performance. - -

    -Insert communication and synchronization functions where necessary. - -

    + +

The next step is the backward substitution step. The last row is multiplied by \( c_{N-3}/b_{N-2} \) and subtracted from the second-to-last row, thus eliminating \( c_{N-3} \) from the second-to-last row. The general backward substitution procedure is

$$
c_i = 0,
$$

    and

$$
f_{i-1} = f_{i-1} - \frac{c_{i-1}}{b_i}f_i
$$

All that remains is to compute the solution, which follows from the very straightforward operation

$$
x_i = \frac{f_i}{b_i}
$$
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs062.html b/doc/pub/week9/html/._week9-bs062.html index 7bbd47db..dd041aa0 100644 --- a/doc/pub/week9/html/._week9-bs062.html +++ b/doc/pub/week9/html/._week9-bs062.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Bindings to MPI routines

    +

    Thomas algorithm and counting of operations (floating point and memory)

    -

    + -

    -MPI is a message-passing library where all the routines -have corresponding C/C++-binding -

    +

In this specific case we have the following floating-point and memory operations

    - -
       MPI_Command_name
    -
    -

    -and Fortran-binding (routine names are in uppercase, but can also be in lower case) -

    +

      +
    • Memory Reads: \( 14(N-2) \);
    • +
    • Memory Writes: \( 4(N-2) \);
    • +
    • Subtractions: \( 3(N-2) \);
    • +
    • Multiplications: \( 3(N-2) \);
    • +
    • Divisions: \( 4(N-2) \).
    • +
    +
    +
    - -
       MPI_COMMAND_NAME
    -
    -

    -The discussion in these slides focuses on the C++ binding. -

    +

    +
    + + + +
    +
    +
    +
    +
    +
    // Forward substitution    
    +// Note that we can simplify by precalculating a[i-1]/b[i-1]
    +  for (int i=1; i < n; i++) {
    +     b[i] = b[i] - (a[i-1]*c[i-1])/b[i-1];
+     f[i] = f[i] - (a[i-1]*f[i-1])/b[i-1];
    +  }
    +  x[n-1] = f[n-1] / b[n-1];
    +  // Backwards substitution                                                           
    +  for (int i = n-2; i >= 0; i--) {
    +     f[i] = f[i] - c[i]*f[i+1]/b[i+1];
    +     x[i] = f[i]/b[i];
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs063.html b/doc/pub/week9/html/._week9-bs063.html index 6e4b171d..81516243 100644 --- a/doc/pub/week9/html/._week9-bs063.html +++ b/doc/pub/week9/html/._week9-bs063.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    Example: Transpose of a matrix

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double **A, **B;
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n];
    +  for (int i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +  }
    +  // Set up values for matrix A
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      A[i][j] =  cos(i*1.0)*sin(j*3.0);
    +    }
    +  }
    +  clock_t start, finish;
    +  start = clock();
    +  // Then compute the transpose
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      B[i][j]= A[j][i];
    +    }
    +  }
     
    -

    Communicator

    -
    -
    -

    - -

      -
    • A group of MPI processes with a name (context).
    • -
    • Any process is identified by its rank. The rank is only meaningful within a particular communicator.
    • -
    • By default the communicator contains all the MPI processes.
    • -
    - -

+  finish = clock();
+  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
+  cout << setiosflags(ios::showpoint | ios::uppercase);
+  cout << setprecision(10) << setw(20) << "Time used for setting up transpose of matrix=" << timeused << endl;

      MPI_COMM_WORLD 
    -
    -
      -
    • Mechanism to identify subset of processes.
    • -
    • Promotes modular design of parallel libraries.
    • -
+  // Free up space
+  for (int i = 0; i < n; i++){
+    delete[] A[i];
+    delete[] B[i];
+  }
+  delete[] A;
+  delete[] B;
+  return 0;
+}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs064.html b/doc/pub/week9/html/._week9-bs064.html index 97f08b61..7d9d3169 100644 --- a/doc/pub/week9/html/._week9-bs064.html +++ b/doc/pub/week9/html/._week9-bs064.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    Matrix-matrix multiplication

    +

This is the matrix-matrix multiplication code with plain C++ memory allocation. At the end it computes the Frobenius norm.

    -

    Some of the most important MPI functions

    -
    -
    -

    -

      -
    • \( MPI\_Init \) - initiate an MPI computation
    • -
    • \( MPI\_Finalize \) - terminate the MPI computation and clean up
    • -
    • \( MPI\_Comm\_size \) - how many processes participate in a given MPI communicator?
    • -
    • \( MPI\_Comm\_rank \) - which one am I? (A number between 0 and size-1.)
    • -
    • \( MPI\_Send \) - send a message to a particular process within an MPI communicator
    • -
    • \( MPI\_Recv \) - receive a message from a particular process within an MPI communicator
    • -
    • \( MPI\_reduce \) or \( MPI\_Allreduce \), send and receive messages
    • -
    + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double s = 1.0/sqrt( (double) n);
    +  double **A, **B, **C;
    +  // Start timing
    +  clock_t start, finish;
    +  start = clock();
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n]; C = new double*[n];
    +  for (int i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +    C[i] = new double[n];
    +  }
    +  // Set up values for matrix A and B and zero matrix C
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      double angle = 2.0*M_PI*i*j/ (( double ) n);
    +      A[i][j] = s * ( sin ( angle ) + cos ( angle ) );
    +      B[j][i] =  A[i][j];
    +    }
    +  }
    +  // Then perform the matrix-matrix multiplication
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      double sum = 0.0;
    +       for (int k = 0; k < n; k++) {
    +           sum += B[i][k]*A[k][j];
    +       }
    +       C[i][j] = sum;
    +    }
    +  }
    +  // Compute now the Frobenius norm
    +  double Fsum = 0.0;
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      Fsum += C[i][j]*C[i][j];
    +    }
    +  }
    +  Fsum = sqrt(Fsum);
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for matrix-matrix multiplication=" << timeused  << endl;
    +  cout << "  Frobenius norm  = " << Fsum << endl;
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +    delete[] C[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  delete[] C;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs065.html b/doc/pub/week9/html/._week9-bs065.html index 04875f80..48608261 100644 --- a/doc/pub/week9/html/._week9-bs065.html +++ b/doc/pub/week9/html/._week9-bs065.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    The first MPI C/C++ program

    +

    How do we define speedup? Simplest form

    -

    - -

    -Let every process write "Hello world" (oh not this program again!!) on the standard output. -

    - - -

    using namespace std;
    -#include <mpi.h>
    -#include <iostream>
    -int main (int nargs, char* args[])
    -{
    -int numprocs, my_rank;
    -//   MPI initializations
    -MPI_Init (&nargs, &args);
    -MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    -MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    -cout << "Hello world, I have  rank " << my_rank << " out of " 
    -     << numprocs << endl;
    -//  End MPI
    -MPI_Finalize ();
    -
    -

    + +

      +
    • Speedup measures the ratio of performance between two objects
    • +
• Versions of the same code, with different numbers of processors
    • +
    • Serial and vector versions
    • +
• Try different programming languages, C++ and Fortran
    • +
    • Two algorithms computing the same result
    • +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs066.html b/doc/pub/week9/html/._week9-bs066.html index 9516c6a4..455f5be4 100644 --- a/doc/pub/week9/html/._week9-bs066.html +++ b/doc/pub/week9/html/._week9-bs066.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    The Fortran program

    +

    How do we define speedup? Correct baseline

    -

    -

    - - -

    PROGRAM hello
    -INCLUDE "mpif.h"
    -INTEGER:: size, my_rank, ierr
    -
    -CALL  MPI_INIT(ierr)
    -CALL MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
    -CALL MPI_COMM_RANK(MPI_COMM_WORLD, my_rank, ierr)
    -WRITE(*,*)"Hello world, I've rank ",my_rank," out of ",size
    -CALL MPI_FINALIZE(ierr)
    -
    -END PROGRAM hello
    -
    -

    + +

    The key is choosing the correct baseline for comparison

    +
      +
• For our serial vs. vectorization examples, using compiler-provided vectorization, the baseline is simple: the same code, with vectorization turned off
    • +
        +
      • For parallel applications, this is much harder:
      • +
          +
        • Choice of algorithm, decomposition, performance of baseline case etc.
        • +
        +
      +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs067.html b/doc/pub/week9/html/._week9-bs067.html index 62956a30..f231b190 100644 --- a/doc/pub/week9/html/._week9-bs067.html +++ b/doc/pub/week9/html/._week9-bs067.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Note 1

    +

    Parallel speedup

    -

    - + +

    For parallel applications, speedup is typically defined as

    +
      +
    • Speedup \( =T_1/T_p \)
    • +
    +

    Here \( T_1 \) is the time on one processor and \( T_p \) is the time using \( p \) processors.

      -
    • The output to screen is not ordered since all processes are trying to write to screen simultaneously.
    • -
    • It is the operating system which opts for an ordering.
    • -
    • If we wish to have an organized output, starting from the first process, we may rewrite our program as in the next example.
    • +
    • Can the speedup become larger than \( p \)? That means using \( p \) processors is more than \( p \) times faster than using one processor.
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs068.html b/doc/pub/week9/html/._week9-bs068.html index e9cde472..7f0fefe9 100644 --- a/doc/pub/week9/html/._week9-bs068.html +++ b/doc/pub/week9/html/._week9-bs068.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Ordered output with MPIBarrier

    +

    Speedup and memory

    -

    - -

    - - -

    int main (int nargs, char* args[])
    -{
    - int numprocs, my_rank, i;
    - MPI_Init (&nargs, &args);
    - MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    - MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    - for (i = 0; i < numprocs; i++) {}
    - MPI_Barrier (MPI_COMM_WORLD);
    - if (i == my_rank) {
    - cout << "Hello world, I have  rank " << my_rank << 
    -        " out of " << numprocs << endl;}
    -      MPI_Finalize ();
    -
    -

    + +

The speedup on \( p \) processors can be greater than \( p \) if memory usage is optimal! Consider the case of a memory-bound computation that uses \( M \) words of memory

    +
      +
    • If \( M/p \) fits into cache while \( M \) does not, the time to access memory will be different in the two cases:
    • +
    • \( T_1 \) uses the main memory bandwidth
    • +
    • \( T_p \) uses the appropriate cache bandwidth
    • +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs069.html b/doc/pub/week9/html/._week9-bs069.html index 6db01713..1ca12bc3 100644 --- a/doc/pub/week9/html/._week9-bs069.html +++ b/doc/pub/week9/html/._week9-bs069.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Note 2

    +

    Upper bounds on speedup

    -

    - + +

    Assume that almost all parts of a code are perfectly +parallelizable (fraction \( f \)). The remainder, +fraction \( (1-f) \) cannot be parallelized at all. +

    + +

    That is, there is work that takes time \( W \) on one process; a fraction \( f \) of that work will take +time \( Wf/p \) on \( p \) processors. +

      -
    • Here we have used the \( MPI\_Barrier \) function to ensure that that every process has completed its set of instructions in a particular order.
    • -
    • A barrier is a special collective operation that does not allow the processes to continue until all processes in the communicator (here \( MPI\_COMM\_WORLD \)) have called \( MPI\_Barrier \).
    • -
    • The barriers make sure that all processes have reached the same point in the code. Many of the collective operations like \( MPI\_ALLREDUCE \) to be discussed later, have the same property; that is, no process can exit the operation until all processes have started.
    • +
    • What is the maximum possible speedup as a function of \( f \)?
    - -However, this is slightly more time-consuming since the processes synchronize between themselves as many times as there -are processes. In the next Hello world example we use the send and receive functions in order to a have a synchronized -action. - -

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs070.html b/doc/pub/week9/html/._week9-bs070.html index 4d40beea..01108c04 100644 --- a/doc/pub/week9/html/._week9-bs070.html +++ b/doc/pub/week9/html/._week9-bs070.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Ordered output

    +

    Amdahl's law

    -

    - -

    - - -

    .....
    -int numprocs, my_rank, flag;
    -MPI_Status status;
    -MPI_Init (&nargs, &args);
    -MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    -MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    -if (my_rank > 0)
    -MPI_Recv (&flag, 1, MPI_INT, my_rank-1, 100, 
    -           MPI_COMM_WORLD, &status);
    -cout << "Hello world, I have  rank " << my_rank << " out of " 
    -<< numprocs << endl;
    -if (my_rank < numprocs-1)
    -MPI_Send (&my_rank, 1, MPI_INT, my_rank+1, 
    -          100, MPI_COMM_WORLD);
    -MPI_Finalize ();
    -
    -

    + +

    On one processor we have

$$
T_1 = (1-f)W + fW = W
$$

    On \( p \) processors we have

$$
T_p = (1-f)W + \frac{fW}{p},
$$

    resulting in a speedup of

$$
\frac{T_1}{T_p} = \frac{W}{(1-f)W+fW/p}
$$

    As \( p \) goes to infinity, \( fW/p \) goes to zero, and the maximum speedup is

$$
\frac{1}{1-f},
$$

meaning that if \( f = 0.99 \) (all but \( 1\% \) parallelizable), the maximum speedup is \( 1/(1-0.99)=100 \)!
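A small sketch (not part of the original slides) that tabulates Amdahl's law, \( 1/((1-f) + f/p) \), for a chosen parallel fraction \( f \):

#include <iostream>

int main()
{
  // Amdahl's law: speedup(p) = 1/((1-f) + f/p) for a parallel fraction f.
  double f = 0.99;                      // assumed parallel fraction
  for (int p = 1; p <= 1024; p *= 2) {
    double speedup = 1.0/((1.0 - f) + f/p);
    std::cout << "p = " << p << "  speedup = " << speedup << std::endl;
  }
  return 0;
}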

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs071.html b/doc/pub/week9/html/._week9-bs071.html index fec5d586..fed561cb 100644 --- a/doc/pub/week9/html/._week9-bs071.html +++ b/doc/pub/week9/html/._week9-bs071.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Note 3

    +

    How much is parallelizable

    -

    - -

    -The basic sending of messages is given by the function \( MPI\_SEND \), which in C/C++ -is defined as -

    - - -

    int MPI_Send(void *buf, int count, 
    -             MPI_Datatype datatype, 
    -             int dest, int tag, MPI_Comm comm)}
    -
    -

    -This single command allows the passing of any kind of variable, even a large array, to any group of tasks. -The variable buf is the variable we wish to send while count -is the number of variables we are passing. If we are passing only a single value, this should be 1. - -

    -If we transfer an array, it is the overall size of the array. -For example, if we want to send a 10 by 10 array, count would be \( 10\times 10=100 \) -since we are actually passing 100 values. - -

    + +

    If any non-parallel code slips into the +application, the parallel +performance is limited. +

    + +

    In many simulations, however, the fraction of non-parallelizable work +is \( 10^{-6} \) or less due to large arrays or objects that are perfectly parallelizable. +

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs072.html b/doc/pub/week9/html/._week9-bs072.html index b76d114e..8f094ac5 100644 --- a/doc/pub/week9/html/._week9-bs072.html +++ b/doc/pub/week9/html/._week9-bs072.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Note 4

    +

    Today's situation of parallel computing

    -

    - -

    -Once you have sent a message, you must receive it on another task. The function \( MPI\_RECV \) -is similar to the send call. -

    - - -

    int MPI_Recv( void *buf, int count, MPI_Datatype datatype, 
    -            int source, 
    -            int tag, MPI_Comm comm, MPI_Status *status )
    -
    -

    -The arguments that are different from those in MPI\_SEND are -buf which is the name of the variable where you will be storing the received data, -source which replaces the destination in the send command. This is the return ID of the sender. - -

    -Finally, we have used \( MPI\_Status\_status \), -where one can check if the receive was completed. - -

    -The output of this code is the same as the previous example, but now -process 0 sends a message to process 1, which forwards it further -to process 2, and so forth. + -

    +

      +
    • Distributed memory is the dominant hardware configuration. There is a large diversity in these machines, from MPP (massively parallel processing) systems to clusters of off-the-shelf PCs, which are very cost-effective.
    • +
    • Message-passing is a mature programming paradigm and widely accepted. It often provides an efficient match to the hardware. It is primarily used for the distributed memory systems, but can also be used on shared memory systems.
    • +
    • Modern nodes have nowadays several cores, which makes it interesting to use both shared memory (the given node) and distributed memory (several nodes with communication). This leads often to codes which use both MPI and OpenMP.
    • +
    +

    Our lectures will focus on both MPI and OpenMP.

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs073.html b/doc/pub/week9/html/._week9-bs073.html index b1b5ba3b..6acdfb79 100644 --- a/doc/pub/week9/html/._week9-bs073.html +++ b/doc/pub/week9/html/._week9-bs073.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Numerical integration in parallel

    +

    Overhead present in parallel computing

    -

    +

      -
    • The code example computes \( \pi \) using the trapezoidal rules.
    • -
    • The trapezoidal rule
    • +
• Uneven load balance: not all the processors can perform useful work at all times.
    • +
    • Overhead of synchronization
    • +
    • Overhead of communication
    • +
    • Extra computation due to parallelization
    - -$$ - I=\int_a^bf(x) dx\approx h\left(f(a)/2 + f(a+h) +f(a+2h)+\dots +f(b-h)+ f(b)/2\right). -$$ - -Click on this link for the full program. - -

    +

Due to the above overhead, and because certain parts of a sequential algorithm cannot be parallelized, we may not achieve an optimal parallelization.

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs074.html b/doc/pub/week9/html/._week9-bs074.html index d479d163..0d951384 100644 --- a/doc/pub/week9/html/._week9-bs074.html +++ b/doc/pub/week9/html/._week9-bs074.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Dissection of trapezoidal rule with \( MPI\_reduce \)

    +

    Parallelizing a sequential algorithm

    -

    - -

    - - -

    //    Trapezoidal rule and numerical integration usign MPI
    -using namespace std;
    -#include <mpi.h>
    -#include <iostream>
    +
     
    -//     Here we define various functions called by the main program
    -
    -double int_function(double );
    -double trapezoidal_rule(double , double , int , double (*)(double));
    -
    -//   Main function begins here
    -int main (int nargs, char* args[])
    -{
    -  int n, local_n, numprocs, my_rank; 
    -  double a, b, h, local_a, local_b, total_sum, local_sum;   
    -  double  time_start, time_end, total_time;
    -
    -

    +

      +
• Identify the part(s) of a sequential algorithm that can be executed in parallel. This is the difficult part.
    • +
    • Distribute the global work and data among \( P \) processors.
    • +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs075.html b/doc/pub/week9/html/._week9-bs075.html index 15d5355a..5266e16f 100644 --- a/doc/pub/week9/html/._week9-bs075.html +++ b/doc/pub/week9/html/._week9-bs075.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Dissection of trapezoidal rule

    +

    Strategies

    -

    - -

    - - -

      //  MPI initializations
    -  MPI_Init (&nargs, &args);
    -  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    -  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    -  time_start = MPI_Wtime();
    -  //  Fixed values for a, b and n 
    -  a = 0.0 ; b = 1.0;  n = 1000;
    -  h = (b-a)/n;    // h is the same for all processes 
    -  local_n = n/numprocs;  
    -  // make sure n > numprocs, else integer division gives zero
    -  // Length of each process' interval of
    -  // integration = local_n*h.  
    -  local_a = a + my_rank*local_n*h;
    -  local_b = local_a + local_n*h;
    -
    -

    + +

      +
• Develop codes locally, run with a few processes and test your codes. Do benchmarking, timing and so forth on local nodes, for example your laptop or PC.
    • +
    • When you are convinced that your codes run correctly, you can start your production runs on available supercomputers.
    • +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs076.html b/doc/pub/week9/html/._week9-bs076.html index 80fa547b..e5f746fb 100644 --- a/doc/pub/week9/html/._week9-bs076.html +++ b/doc/pub/week9/html/._week9-bs076.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Integrating with MPI

    +

    How do I run MPI on a PC/Laptop? MPI

    -

    - -

    + +

Installing MPI is rather easy on hardware running Unix/Linux as operating system; simply follow the instructions from the OpenMPI website (see also the subsequent slides). When you have made sure that MPI is installed on your PC/laptop,

    +
      +
    • Compile with mpicxx/mpic++ or mpif90
    • +
    -
      total_sum = 0.0;
    -  local_sum = trapezoidal_rule(local_a, local_b, local_n, 
    -                               &int_function); 
    -  MPI_Reduce(&local_sum, &total_sum, 1, MPI_DOUBLE, 
    -              MPI_SUM, 0, MPI_COMM_WORLD);
    -  time_end = MPI_Wtime();
    -  total_time = time_end-time_start;
    -  if ( my_rank == 0) {
    -    cout << "Trapezoidal rule = " <<  total_sum << endl;
    -    cout << "Time = " <<  total_time  
    -         << " on number of processors: "  << numprocs  << endl;
    -  }
    -  // End MPI
    -  MPI_Finalize ();  
    -  return 0;
    -}  // end of main program
    -
    -

    +

    +
    +
    +
    +
    +
      # Compile and link
    +  mpic++ -O3 -o nameofprog.x nameofprog.cpp
    +  #  run code with for example 8 processes using mpirun/mpiexec
    +  mpiexec -n 8 ./nameofprog.x
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs077.html b/doc/pub/week9/html/._week9-bs077.html index 90fe99bd..2d14eea9 100644 --- a/doc/pub/week9/html/._week9-bs077.html +++ b/doc/pub/week9/html/._week9-bs077.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    How do I use \( MPI\_reduce \)?

    +

    Can I do it on my own PC/laptop? OpenMP installation

    -

    - -

    -Here we have used -

    + +

    If you wish to install MPI and OpenMP +on your laptop/PC, we recommend the following: +

    + +
      +
    • For OpenMP, the compile option -fopenmp is included automatically in recent versions of the C++ compiler and Fortran compilers. For users of different Linux distributions, simply use the available C++ or Fortran compilers and add the above compiler instructions, see also code examples below.
    • +
    • For OS X users however, install libomp
    • +
    -
    MPI_reduce( void *senddata, void* resultdata, int count, 
    -     MPI_Datatype datatype, MPI_Op, int root, MPI_Comm comm)
    -
    -

    -The two variables \( senddata \) and \( resultdata \) are obvious, besides the fact that one sends the address -of the variable or the first element of an array. If they are arrays they need to have the same size. -The variable \( count \) represents the total dimensionality, 1 in case of just one variable, -while \( MPI\_Datatype \) -defines the type of variable which is sent and received. +

    +
    +
    +
    +
    +
      brew install libomp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -The new feature is \( MPI\_Op \). It defines the type -of operation we want to do. +

    and compile and link as

    + + +
    +
    +
    +
    +
    +
    c++ -o <name executable> <name program.cpp>  -lomp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs078.html b/doc/pub/week9/html/._week9-bs078.html index d9e91cc5..f3e68e76 100644 --- a/doc/pub/week9/html/._week9-bs078.html +++ b/doc/pub/week9/html/._week9-bs078.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    More on \( MPI\_Reduce \)

    +

    Installing MPI

    -

    -In our case, since we are summing -the rectangle contributions from every process we define \( MPI\_Op = MPI\_SUM \). -If we have an array or matrix we can search for the largest og smallest element by sending either \( MPI\_MAX \) or -\( MPI\_MIN \). If we want the location as well (which array element) we simply transfer -\( MPI\_MAXLOC \) or \( MPI\_MINOC \). If we want the product we write \( MPI\_PROD \). + +

    For linux/ubuntu users, you need to install two packages (alternatively use the synaptic package manager)

    -

    -\( MPI\_Allreduce \) is defined as -

    + +

    +
    +
    +
    +
    +
      sudo apt-get install libopenmpi-dev
    +  sudo apt-get install openmpi-bin
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    For OS X users, install brew (after having installed xcode and gcc, needed for the +gfortran compiler of openmpi) and then install with brew +

    -
    MPI_Allreduce( void *senddata, void* resultdata, int count, 
    -          MPI_Datatype datatype, MPI_Op, MPI_Comm comm)        
    -
    -

    +

    +
    +
    +
    +
    +
       brew install openmpi
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    When running an executable (code.x), run as

    + + +
    +
    +
    +
    +
    +
      mpirun -n 10 ./code.x
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    where we indicate that we want the number of processes to be 10.

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs079.html b/doc/pub/week9/html/._week9-bs079.html index 8f52b31a..9564c933 100644 --- a/doc/pub/week9/html/._week9-bs079.html +++ b/doc/pub/week9/html/._week9-bs079.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
Installing MPI and using Qt

With openmpi installed, when using Qt, add to your .pro file the instructions here.

You may need to tell Qt where openmpi is stored.
What is Message Passing Interface (MPI)?

MPI is a library, not a language. It specifies the names, calling sequences and results of functions or subroutines to be called from C/C++ or Fortran programs, and the classes and methods that make up the MPI C++ library. The programs that users write in Fortran, C or C++ are compiled with ordinary compilers and linked with the MPI library.

MPI programs should be able to run on all possible machines and with all MPI implementations without change.

An MPI computation is a collection of processes communicating with messages.
Going Parallel with MPI

Task parallelism: the work of a global problem can be divided into a number of independent tasks, which rarely need to synchronize. Monte Carlo simulations and numerical integration are examples of this.

MPI is a message-passing library where all the routines have a corresponding C/C++ binding

   MPI_Command_name

and a Fortran binding (routine names are in uppercase, but can also be in lower case)

   MPI_COMMAND_NAME
MPI is a library

MPI is a library specification for the message passing interface, proposed as a standard.

* independent of hardware;
* not a language or compiler specification;
* not a specific implementation or product.

A message passing standard for portability and ease-of-use. Designed for high performance.

Insert communication and synchronization functions where necessary.

What is OpenMP

* OpenMP provides high-level thread programming
* Multiple cooperating threads are allowed to run simultaneously
* Threads are created and destroyed dynamically in a fork-join pattern
  * An OpenMP program consists of a number of parallel regions
  * Between two parallel regions there is only one master thread
  * In the beginning of a parallel region, a team of new threads is spawned
* The newly spawned threads work simultaneously with the master thread
* At the end of a parallel region, the new threads are destroyed

Many good tutorials online and an excellent textbook:

1. Using OpenMP, by B. Chapman, G. Jost, and A. van der Pas
2. Many tutorials online, like the OpenMP official site
Bindings to MPI routines

MPI is a message-passing library where all the routines have a corresponding C/C++ binding

   MPI_Command_name

and a Fortran binding (routine names are in uppercase, but can also be in lower case)

   MPI_COMMAND_NAME

The discussion in these slides focuses on the C++ binding.

Getting started, things to remember

* Remember the header file

  #include <omp.h>

* Insert compiler directives in C++ syntax as

  #pragma omp...

* Compile with for example c++ -fopenmp code.cpp
* Execute
  * Remember to assign the environment variable OMP_NUM_THREADS
  * It specifies the total number of threads inside a parallel region, if not otherwise overwritten
OpenMP syntax

* Mostly directives

  #pragma omp construct [ clause ...]

* Some functions and types

  #include <omp.h>

* Most apply to a block of code
* Specifically, a structured block
* Enter at top, exit at bottom only; exit(), abort() permitted

Communicator

* A group of MPI processes with a name (context).
* Any process is identified by its rank. The rank is only meaningful within a particular communicator.
* By default the communicator contains all the MPI processes.

  MPI_COMM_WORLD

* Mechanism to identify a subset of processes.
* Promotes modular design of parallel libraries.
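The fact that a rank is only meaningful within its communicator becomes concrete when one creates sub-communicators. The following is a minimal sketch (our own illustration, not part of the lecture programs) using the standard function MPI_Comm_split to divide MPI_COMM_WORLD into two groups by even and odd rank; all variable names are our own choices.

#include <mpi.h>
#include <iostream>
using namespace std;

int main (int nargs, char* args[])
{
  int world_rank, world_size;
  MPI_Init (&nargs, &args);
  MPI_Comm_size (MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank (MPI_COMM_WORLD, &world_rank);
  // processes with the same color end up in the same new communicator
  int color = world_rank % 2;
  MPI_Comm subcomm;
  MPI_Comm_split (MPI_COMM_WORLD, color, world_rank, &subcomm);
  int sub_rank, sub_size;
  MPI_Comm_rank (subcomm, &sub_rank);
  MPI_Comm_size (subcomm, &sub_size);
  cout << "World rank " << world_rank << " has rank " << sub_rank
       << " in a sub-communicator of size " << sub_size << endl;
  MPI_Comm_free (&subcomm);
  MPI_Finalize ();
  return 0;
}

Each process now has two ranks: its rank in MPI_COMM_WORLD and its rank in the smaller communicator.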
Some of the most important MPI functions

* \( MPI\_Init \) - initiate an MPI computation
* \( MPI\_Finalize \) - terminate the MPI computation and clean up
* \( MPI\_Comm\_size \) - how many processes participate in a given MPI communicator?
* \( MPI\_Comm\_rank \) - which one am I? (A number between 0 and size-1.)
* \( MPI\_Send \) - send a message to a particular process within an MPI communicator
* \( MPI\_Recv \) - receive a message from a particular process within an MPI communicator
* \( MPI\_Reduce \) or \( MPI\_Allreduce \) - combine data from all processes with a reduction operation; the result is placed on the root process or on all processes, respectively

Different OpenMP styles of parallelism

OpenMP supports several different ways to specify thread parallelism

* General parallel regions: All threads execute the code, roughly as if you made a routine of that region and created a thread to run that code
* Parallel loops: Special case for loops, simplifies data parallel code
* Task parallelism, new in OpenMP 3 (see the sketch below)
* Several ways to manage thread coordination, including Master regions and Locks
* Memory model for shared data
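As a minimal sketch of the task-parallelism style mentioned in the list above (our own example, not from the lecture notes), two independent pieces of work are packaged as OpenMP tasks; funcA and funcB are placeholders for any independent work.

#include <omp.h>
#include <cstdio>

void funcA() { printf("Task A done by thread %d\n", omp_get_thread_num()); }
void funcB() { printf("Task B done by thread %d\n", omp_get_thread_num()); }

int main()
{
#pragma omp parallel
  {
#pragma omp single     // one thread creates the tasks
    {
#pragma omp task       // may run on any thread in the team
      funcA();
#pragma omp task
      funcB();
#pragma omp taskwait   // wait for both tasks to finish
    }
  }
  return 0;
}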
The first MPI C/C++ program

Let every process write "Hello world" (oh not this program again!!) on the standard output.

#include <mpi.h>
#include <iostream>
using namespace std;
int main (int nargs, char* args[])
{
  int numprocs, my_rank;
  //   MPI initializations
  MPI_Init (&nargs, &args);
  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
  cout << "Hello world, I have  rank " << my_rank << " out of "
       << numprocs << endl;
  //  End MPI
  MPI_Finalize ();
  return 0;
}

General code structure

#include <omp.h>
int main ()
{
  int var1, var2, var3;
  /* serial code */
  /* ... */
  /* start of a parallel region */
#pragma omp parallel private(var1, var2) shared(var3)
  {
    /* ... */
  }
  /* more serial code */
  /* ... */
  /* another parallel region */
#pragma omp parallel
  {
    /* ... */
  }
  return 0;
}
The Fortran program

PROGRAM hello
INCLUDE "mpif.h"
INTEGER:: size, my_rank, ierr

CALL  MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, my_rank, ierr)
WRITE(*,*)"Hello world, I've rank ",my_rank," out of ",size
CALL MPI_FINALIZE(ierr)

END PROGRAM hello

Parallel region

* A parallel region is a block of code that is executed by a team of threads
* The following compiler directive creates a parallel region

  #pragma omp parallel { ... }

* Clauses can be added at the end of the directive
* Most often used clauses:
  * default(shared) or default(none)
  * shared(list of variables)
  * private(list of variables)
Note 1

* The output to screen is not ordered since all processes are trying to write to screen simultaneously.
* It is the operating system which opts for an ordering.
* If we wish to have an organized output, starting from the first process, we may rewrite our program as in the next example.

Hello world, not again, please!

#include <omp.h>
#include <cstdio>
int main (int argc, char *argv[])
{
  int th_id, nthreads;
#pragma omp parallel private(th_id) shared(nthreads)
  {
    th_id = omp_get_thread_num();
    printf("Hello World from thread %d\n", th_id);
#pragma omp barrier
    if ( th_id == 0 ) {
      nthreads = omp_get_num_threads();
      printf("There are %d threads\n", nthreads);
    }
  }
  return 0;
}
Ordered output with MPI_Barrier

int main (int nargs, char* args[])
{
  int numprocs, my_rank, i;
  MPI_Init (&nargs, &args);
  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
  for (i = 0; i < numprocs; i++) {
    MPI_Barrier (MPI_COMM_WORLD);
    if (i == my_rank) {
      cout << "Hello world, I have  rank " << my_rank <<
              " out of " << numprocs << endl;
    }
  }
  MPI_Finalize ();
}

Hello world, yet another variant

#include <cstdio>
#include <iostream>
#include <omp.h>
using namespace std;
int main(int argc, char *argv[])
{
  omp_set_num_threads(4);
#pragma omp parallel
  {
    int id = omp_get_thread_num();
    int nproc = omp_get_num_threads();
    cout << "Hello world with id number " << id << " out of " << nproc << " threads" << endl;
  }
  return 0;
}

Variables declared outside of the parallel region are shared by all threads. If a variable like id had been declared outside of the

#pragma omp parallel,

it would have been shared by the various threads, possibly causing erroneous output.

* Why? What would go wrong? Why do we add possibly? (See the sketch below.)
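To make the question above concrete, here is a sketch of our own (not part of the lecture code) with id declared outside the parallel region: all threads now write to the same shared variable, so the printed numbers are unpredictable and may repeat.

// Illustration only: id is shared among the threads (declared outside the
// parallel region), so the assignments race with each other.
#include <omp.h>
#include <iostream>
using namespace std;

int main()
{
  int id;                        // shared by all threads
  omp_set_num_threads(4);
#pragma omp parallel
  {
    id = omp_get_thread_num();   // data race: all threads write to the same id
    cout << "Hello world with id number " << id << endl;  // may print the wrong id
  }
  return 0;
}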
Note 2

* Here we have used the \( MPI\_Barrier \) function to ensure that every process has completed its set of instructions in a particular order.
* A barrier is a special collective operation that does not allow the processes to continue until all processes in the communicator (here \( MPI\_COMM\_WORLD \)) have called \( MPI\_Barrier \).
* The barriers make sure that all processes have reached the same point in the code. Many of the collective operations, like \( MPI\_ALLREDUCE \) to be discussed later, have the same property; that is, no process can exit the operation until all processes have started.

However, this is slightly more time-consuming since the processes synchronize between themselves as many times as there are processes. In the next Hello world example we use the send and receive functions in order to have a synchronized action.

Important OpenMP library routines

* int omp_get_num_threads(), returns the number of threads inside a parallel region
* int omp_get_thread_num(), returns the thread number (id) for each thread inside a parallel region
* void omp_set_num_threads(int), sets the number of threads to be used
* void omp_set_nested(int), turns nested parallelism on/off
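A minimal sketch (our own, not from the lecture notes) exercising the four library routines listed above:

#include <omp.h>
#include <cstdio>

int main()
{
  omp_set_num_threads(4);      // request four threads for the next parallel region
  omp_set_nested(0);           // switch nested parallelism off
#pragma omp parallel
  {
    int id = omp_get_thread_num();        // id of this thread, 0,...,nthreads-1
    int nthreads = omp_get_num_threads(); // number of threads in the team
    printf("thread %d of %d\n", id, nthreads);
  }
  return 0;
}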
Ordered output

.....
int numprocs, my_rank, flag;
MPI_Status status;
MPI_Init (&nargs, &args);
MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
if (my_rank > 0)
  MPI_Recv (&flag, 1, MPI_INT, my_rank-1, 100,
            MPI_COMM_WORLD, &status);
cout << "Hello world, I have  rank " << my_rank << " out of "
     << numprocs << endl;
if (my_rank < numprocs-1)
  MPI_Send (&my_rank, 1, MPI_INT, my_rank+1,
            100, MPI_COMM_WORLD);
MPI_Finalize ();

Private variables

The private clause can be used to make thread-private versions of such variables:

int id;
#pragma omp parallel private(id)
{
  id = omp_get_thread_num();
  cout << "My thread num " << id << endl;
}

* What is their value on entry? Exit? (See the sketch below.)
* OpenMP provides ways to control that
* Can use default(none) to require the sharing of each variable to be described
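As a sketch of the entry/exit question above (our own example, not from the notes): a plain private variable is uninitialized on entry to the region and its value is discarded on exit, whereas the firstprivate and lastprivate clauses copy a value in and out, respectively.

#include <omp.h>
#include <cstdio>

int main()
{
  int x = 10;
  // firstprivate: each thread starts with its own copy initialized to 10
#pragma omp parallel for firstprivate(x)
  for (int i = 0; i < 4; i++) {
    x += i;                       // private copy, no race
    printf("i=%d x=%d\n", i, x);
  }

  int last = -1;
  // lastprivate: after the loop, last holds the value from the final iteration
#pragma omp parallel for lastprivate(last)
  for (int i = 0; i < 100; i++) {
    last = i;
  }
  printf("last = %d\n", last);    // prints 99
  return 0;
}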
Note 3

The basic sending of messages is given by the function \( MPI\_SEND \), which in C/C++ is defined as

int MPI_Send(void *buf, int count,
             MPI_Datatype datatype,
             int dest, int tag, MPI_Comm comm)

This single command allows the passing of any kind of variable, even a large array, to any group of tasks. The variable buf is the variable we wish to send while count is the number of variables we are passing. If we are passing only a single value, this should be 1.

If we transfer an array, it is the overall size of the array. For example, if we want to send a 10 by 10 array, count would be \( 10\times 10=100 \) since we are actually passing 100 values.

Master region

It is often useful to have only one thread execute some of the code in a parallel region. I/O statements are a common example.

#pragma omp parallel
{
  #pragma omp master
  {
    int id = omp_get_thread_num();
    cout << "My thread num " << id << endl;
  }
}
Parallel for loop

* Inside a parallel region, the following compiler directive can be used to parallelize a for-loop:

  #pragma omp for

* Clauses can be added, such as
  * schedule(static, chunk size)
  * schedule(dynamic, chunk size)
  * schedule(guided, chunk size) (non-deterministic allocation)
  * schedule(runtime)
  * private(list of variables)
  * reduction(operator:variable)
  * nowait

Note 4

Once you have sent a message, you must receive it on another task. The function \( MPI\_RECV \) is similar to the send call.

int MPI_Recv( void *buf, int count, MPI_Datatype datatype,
              int source,
              int tag, MPI_Comm comm, MPI_Status *status )

The arguments that are different from those in MPI\_SEND are buf, which is the name of the variable where you will be storing the received data, and source, which replaces the destination in the send command and is the rank of the sender.

Finally, we have the argument of type \( MPI\_Status \), where one can check whether the receive was completed.

The output of this code is the same as the previous example, but now process 0 sends a message to process 1, which forwards it further to process 2, and so forth.
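A minimal sketch of our own (not part of the lecture programs) tying Notes 3 and 4 together: rank 0 sends a \( 10\times 10 \) array with count equal to 100, and rank 1 inspects the status object and MPI_Get_count after the receive.

#include <mpi.h>
#include <iostream>
using namespace std;

int main (int nargs, char* args[])
{
  int numprocs, my_rank;
  MPI_Init (&nargs, &args);
  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
  const int n = 10;
  double a[n][n];                      // 10 x 10 array, count = 100
  if (my_rank == 0 && numprocs > 1) {
    for (int i = 0; i < n; i++)
      for (int j = 0; j < n; j++) a[i][j] = i + 0.01*j;
    MPI_Send (&a[0][0], n*n, MPI_DOUBLE, 1, 100, MPI_COMM_WORLD);
  }
  else if (my_rank == 1) {
    MPI_Status status;
    MPI_Recv (&a[0][0], n*n, MPI_DOUBLE, 0, 100, MPI_COMM_WORLD, &status);
    int received;
    MPI_Get_count (&status, MPI_DOUBLE, &received);
    cout << "Got " << received << " doubles from rank " << status.MPI_SOURCE
         << " with tag " << status.MPI_TAG << endl;
  }
  MPI_Finalize ();
  return 0;
}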
Parallel computations and loops

OpenMP provides an easy way to parallelize a loop

#pragma omp parallel for
  for (i=0; i<n; i++) c[i] = a[i];

OpenMP handles the index variable (no need to declare it in the for loop or make it private).

Which thread does which values? Several options.

Numerical integration in parallel

* The code example computes \( \pi \) using the trapezoidal rule.
* The trapezoidal rule

$$
I=\int_a^bf(x) dx\approx h\left(f(a)/2 + f(a+h) +f(a+2h)+\dots +f(b-h)+ f(b)/2\right).
$$

Click on this link for the full program.
Scheduling of loop computations

We can let the OpenMP runtime decide. The decision is about how the loop iterations are scheduled, and OpenMP defines three choices of loop scheduling:

1. Static: Predefined at compile time. Lowest overhead, predictable
2. Dynamic: Selection made at runtime
3. Guided: Special case of dynamic; attempts to reduce overhead

Dissection of trapezoidal rule with \( MPI\_reduce \)

//    Trapezoidal rule and numerical integration using MPI
#include <mpi.h>
#include <iostream>
using namespace std;

//     Here we define various functions called by the main program

double int_function(double );
double trapezoidal_rule(double , double , int , double (*)(double));

//   Main function begins here
int main (int nargs, char* args[])
{
  int n, local_n, numprocs, my_rank;
  double a, b, h, local_a, local_b, total_sum, local_sum;
  double  time_start, time_end, total_time;
Example code for loop scheduling

#include <omp.h>
#define CHUNKSIZE 100
#define N 1000
int main (int argc, char *argv[])
{
  int i, chunk;
  float a[N], b[N], c[N];
  for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
  chunk = CHUNKSIZE;
#pragma omp parallel shared(a,b,c,chunk) private(i)
  {
#pragma omp for schedule(dynamic,chunk)
    for (i=0; i < N; i++) c[i] = a[i] + b[i];
  } /* end of parallel region */
}

Dissection of trapezoidal rule

  //  MPI initializations
  MPI_Init (&nargs, &args);
  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
  time_start = MPI_Wtime();
  //  Fixed values for a, b and n
  a = 0.0 ; b = 1.0;  n = 1000;
  h = (b-a)/n;    // h is the same for all processes
  local_n = n/numprocs;
  // make sure n > numprocs, else integer division gives zero
  // Length of each process' interval of
  // integration = local_n*h.
  local_a = a + my_rank*local_n*h;
  local_b = local_a + local_n*h;
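As a small worked example of this decomposition (our own numbers, using the values already fixed in the code): with \( a=0 \), \( b=1 \), \( n=1000 \) and 4 processes we get \( h=0.001 \) and \( local\_n=250 \), so process 2 integrates from \( local\_a = 0 + 2\cdot 250\cdot 0.001 = 0.5 \) to \( local\_b = 0.75 \). Note that if \( n \) is not divisible by the number of processes, the integer division leaves the last \( n \bmod numprocs \) slices uncovered, so one should either choose \( n \) accordingly or let the last process take the remainder.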
Example code for loop scheduling, guided instead of dynamic

#include <omp.h>
#define CHUNKSIZE 100
#define N 1000
int main (int argc, char *argv[])
{
  int i, chunk;
  float a[N], b[N], c[N];
  for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
  chunk = CHUNKSIZE;
#pragma omp parallel shared(a,b,c,chunk) private(i)
  {
#pragma omp for schedule(guided,chunk)
    for (i=0; i < N; i++) c[i] = a[i] + b[i];
  } /* end of parallel region */
}

Integrating with MPI

  total_sum = 0.0;
  local_sum = trapezoidal_rule(local_a, local_b, local_n,
                               &int_function);
  MPI_Reduce(&local_sum, &total_sum, 1, MPI_DOUBLE,
             MPI_SUM, 0, MPI_COMM_WORLD);
  time_end = MPI_Wtime();
  total_time = time_end-time_start;
  if ( my_rank == 0) {
    cout << "Trapezoidal rule = " <<  total_sum << endl;
    cout << "Time = " <<  total_time
         << " on number of processors: "  << numprocs  << endl;
  }
  // End MPI
  MPI_Finalize ();
  return 0;
}  // end of main program
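The program above fixes \( n=1000 \) on every process. A natural extension, sketched here under our own assumptions (this is not part of the lecture code), is to let the master process read \( n \) from the command line and distribute it with MPI_Bcast before the decomposition; atoi requires the header cstdlib.

  // Hedged sketch: rank 0 reads n from the command line, then broadcasts it
  // so that all processes agree on the number of integration points.
  if (my_rank == 0 && nargs > 1) n = atoi(args[1]);
  else if (my_rank == 0) n = 1000;            // fall back to the default
  MPI_Bcast (&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
  h = (b-a)/n;
  local_n = n/numprocs;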
How do I use \( MPI\_reduce \)?

Here we have used

MPI_Reduce( void *senddata, void* resultdata, int count,
            MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm)

The two variables \( senddata \) and \( resultdata \) are obvious, besides the fact that one sends the address of the variable or the first element of an array. If they are arrays they need to have the same size. The variable \( count \) represents the total dimensionality, 1 in case of just one variable, while \( MPI\_Datatype \) defines the type of variable which is sent and received.

The new feature is \( MPI\_Op \). It defines the type of operation we want to do.

More on Parallel for loop

* The number of loop iterations cannot be non-deterministic; break, return, exit, goto are not allowed inside the for-loop
* The loop index is private to each thread
* A reduction variable is special
  * During the for-loop there is a local private copy in each thread
  * At the end of the for-loop, all the local copies are combined together by the reduction operation
* Unless the nowait clause is used, an implicit barrier synchronization will be added at the end by the compiler

// #pragma omp parallel and #pragma omp for

can be combined into

#pragma omp parallel for
What can happen with this loop?

What happens with code like this?

#pragma omp parallel for
for (i=0; i<n; i++) sum += a[i]*a[i];

All threads can access the sum variable, but the addition is not atomic! It is important to avoid races between threads. So-called reductions in OpenMP are thus important for performance and for obtaining correct results. OpenMP lets us indicate that a variable is used for a reduction with a particular operator. The above code becomes

sum = 0.0;
#pragma omp parallel for reduction(+:sum)
for (i=0; i<n; i++) sum += a[i]*a[i];

More on \( MPI\_Reduce \)

In our case, since we are summing the rectangle contributions from every process, we define \( MPI\_Op = MPI\_SUM \). If we have an array or matrix we can search for the largest or smallest element by sending either \( MPI\_MAX \) or \( MPI\_MIN \). If we want the location as well (which array element) we simply transfer \( MPI\_MAXLOC \) or \( MPI\_MINLOC \). If we want the product we write \( MPI\_PROD \).

\( MPI\_Allreduce \) is defined as

MPI_Allreduce( void *senddata, void* resultdata, int count,
               MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
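A minimal sketch of our own (not from the lecture programs) of these reduction operations: every process contributes one number, MPI_Allreduce with MPI_MAX makes the global maximum available on all processes, and MPI_MAXLOC on a (value, rank) pair of type MPI_DOUBLE_INT also tells us which rank owns it.

#include <mpi.h>
#include <iostream>
using namespace std;

int main (int nargs, char* args[])
{
  int numprocs, my_rank;
  MPI_Init (&nargs, &args);
  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
  // each process owns one local value, here just a function of the rank
  double local_value = 1.0 + 0.5*my_rank;

  // global maximum, made available on every process with MPI_Allreduce
  double global_max;
  MPI_Allreduce (&local_value, &global_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

  // maximum together with the rank that owns it: MPI_MAXLOC on a (value, rank) pair
  struct { double value; int rank; } in, out;
  in.value = local_value;
  in.rank  = my_rank;
  MPI_Allreduce (&in, &out, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);

  if (my_rank == 0)
    cout << "max = " << global_max << " owned by rank " << out.rank << endl;
  MPI_Finalize ();
  return 0;
}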
Dissection of trapezoidal rule

We use \( MPI\_Reduce \) to collect data from each process. Note also the use of the function \( MPI\_Wtime \).

//  this function defines the function to integrate
double int_function(double x)
{
  double value = 4./(1.+x*x);
  return value;
} // end of function to evaluate

Inner product

$$
\sum_{i=0}^{n-1} a_ib_i
$$

int i;
double sum = 0.;
/* allocating and initializing arrays */
/* ... */
#pragma omp parallel for default(shared) private(i) reduction(+:sum)
for (i=0; i<N; i++) sum += a[i]*b[i];
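For comparison with the OpenMP reduction above, here is a sketch of our own (not from the lecture notes) of the same inner product with MPI: each process sums its own slice of the vectors and MPI_Allreduce combines the partial sums.

#include <mpi.h>
#include <vector>
#include <iostream>
using namespace std;

int main (int nargs, char* args[])
{
  int numprocs, my_rank;
  MPI_Init (&nargs, &args);
  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
  const int N = 1000;                    // assume N is divisible by numprocs
  const int local_N = N/numprocs;
  // each process fills only its own slice of the (conceptual) global vectors
  vector<double> a(local_N), b(local_N);
  for (int i = 0; i < local_N; i++) {
    int global_i = my_rank*local_N + i;
    a[i] = global_i;  b[i] = 1.0/(global_i + 1.0);
  }
  double local_sum = 0.0, sum = 0.0;
  for (int i = 0; i < local_N; i++) local_sum += a[i]*b[i];
  MPI_Allreduce (&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  if (my_rank == 0) cout << "inner product = " << sum << endl;
  MPI_Finalize ();
  return 0;
}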
Dissection of trapezoidal rule

//  this function defines the trapezoidal rule
double trapezoidal_rule(double a, double b, int n,
                        double (*func)(double))
{
  double trapez_sum;
  double fa, fb, x, step;
  int    j;
  step=(b-a)/((double) n);
  fa=(*func)(a)/2. ;
  fb=(*func)(b)/2. ;
  trapez_sum=0.;
  for (j=1; j <= n-1; j++){
    x=j*step+a;
    trapez_sum+=(*func)(x);
  }
  trapez_sum=(trapez_sum+fb+fa)*step;
  return trapez_sum;
}  // end trapezoidal_rule

Different threads do different tasks

Different threads do different tasks independently; each section is executed by one thread.

#pragma omp parallel
{
#pragma omp sections
  {
#pragma omp section
    funcA ();
#pragma omp section
    funcB ();
#pragma omp section
    funcC ();
  }
}
Single execution

#pragma omp single { ... }

The code is executed by one thread only, no guarantee which thread.

Can introduce an implicit barrier at the end.

#pragma omp master { ... }

Code executed by the master thread, guaranteed and no implicit barrier at the end.

The quantum dot program for two electrons

// Variational Monte Carlo for atoms with importance sampling, slater det
// Test case for 2-electron quantum dot, no classes using Mersenne-Twister RNG
#include "mpi.h"
#include <cmath>
#include <random>
#include <string>
#include <iostream>
#include <fstream>
#include <iomanip>
#include "vectormatrixclass.h"

using namespace std;
// output file as global variable
ofstream ofile;
// the step length and its squared inverse for the second derivative
// Here we define global variables used in various functions
// These can be changed by using classes
int Dimension = 2;
int NumberParticles = 2;  // we fix also the number of electrons to be 2

// declaration of functions

// The MC sampling for the variational Monte Carlo
void MonteCarloSampling(int, double &, double &, Vector &);

// The variational wave function
double WaveFunction(Matrix &, Vector &);

// The local energy
double LocalEnergy(Matrix &, Vector &);

// The quantum force
void QuantumForce(Matrix &, Matrix &, Vector &);

// inline function for single-particle wave function
inline double SPwavefunction(double r, double alpha) {
  return exp(-alpha*r*0.5);
}

// inline function for derivative of single-particle wave function
inline double DerivativeSPwavefunction(double r, double alpha) {
  return -r*alpha;
}

// function for absolute value of relative distance
double RelativeDistance(Matrix &r, int i, int j) {
  double r_ij = 0;
  for (int k = 0; k < Dimension; k++) {
    r_ij += (r(i,k)-r(j,k))*(r(i,k)-r(j,k));
  }
  return sqrt(r_ij);
}

// inline function for derivative of Jastrow factor
inline double JastrowDerivative(Matrix &r, double beta, int i, int j, int k){
  return (r(i,k)-r(j,k))/(RelativeDistance(r, i, j)*pow(1.0+beta*RelativeDistance(r, i, j),2));
}

// function for square of position of single particle
double singleparticle_pos2(Matrix &r, int i) {
  double r_single_particle = 0;
  for (int j = 0; j < Dimension; j++) {
    r_single_particle += r(i,j)*r(i,j);
  }
  return r_single_particle;
}

void lnsrch(int n, Vector &xold, double fold, Vector &g, Vector &p, Vector &x,
            double *f, double stpmax, int *check, double (*func)(Vector &p));

void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret,
            double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g));

static double sqrarg;
#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)

static double maxarg1,maxarg2;
#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ?\
        (maxarg1) : (maxarg2))


// Begin of main program

int main(int argc, char* argv[])
{
  // MPI initializations
  int NumberProcesses, MyRank, NumberMCsamples;
  MPI_Init (&argc, &argv);
  MPI_Comm_size (MPI_COMM_WORLD, &NumberProcesses);
  MPI_Comm_rank (MPI_COMM_WORLD, &MyRank);
  double StartTime = MPI_Wtime();
  if (MyRank == 0 && argc <= 1) {
    cout << "Bad Usage: " << argv[0] <<
      " Read also output file on same line and number of Monte Carlo cycles" << endl;
  }
  // Read filename and number of Monte Carlo cycles from the command line
  if (MyRank == 0 && argc > 2) {
    string filename = argv[1]; // first command line argument after name of program
    NumberMCsamples = atoi(argv[2]);
    string fileout = filename;
    string argument = to_string(NumberMCsamples);
    // Final filename as filename+NumberMCsamples
    fileout.append(argument);
    ofile.open(fileout);
  }
  // broadcast the number of Monte Carlo samples
  MPI_Bcast (&NumberMCsamples, 1, MPI_INT, 0, MPI_COMM_WORLD);
  // Two variational parameters only
  Vector VariationalParameters(2);
  int TotalNumberMCsamples = NumberMCsamples*NumberProcesses;
  // Loop over variational parameters
  for (double alpha = 0.5; alpha <= 1.5; alpha +=0.1){
    for (double beta = 0.1; beta <= 0.5; beta +=0.05){
      VariationalParameters(0) = alpha;  // value of alpha
      VariationalParameters(1) = beta;   // value of beta
      // Do the MC sampling and accumulate data with MPI_Reduce
      double TotalEnergy, TotalEnergySquared, LocalProcessEnergy, LocalProcessEnergy2;
      LocalProcessEnergy = LocalProcessEnergy2 = 0.0;
      MonteCarloSampling(NumberMCsamples, LocalProcessEnergy, LocalProcessEnergy2, VariationalParameters);
      // Collect data in total averages
      MPI_Reduce(&LocalProcessEnergy, &TotalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
      MPI_Reduce(&LocalProcessEnergy2, &TotalEnergySquared, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
      // Print out results in case of Master node, set to MyRank = 0
      if ( MyRank == 0) {
        double Energy = TotalEnergy/( (double)NumberProcesses);
        double Variance = TotalEnergySquared/( (double)NumberProcesses)-Energy*Energy;
        double StandardDeviation = sqrt(Variance/((double)TotalNumberMCsamples)); // over optimistic error
        ofile << setiosflags(ios::showpoint | ios::uppercase);
        ofile << setw(15) << setprecision(8) << VariationalParameters(0);
        ofile << setw(15) << setprecision(8) << VariationalParameters(1);
        ofile << setw(15) << setprecision(8) << Energy;
        ofile << setw(15) << setprecision(8) << Variance;
        ofile << setw(15) << setprecision(8) << StandardDeviation << endl;
      }
    }
  }
  double EndTime = MPI_Wtime();
  double TotalTime = EndTime-StartTime;
  if ( MyRank == 0 )  cout << "Time = " <<  TotalTime  << " on number of processors: "  << NumberProcesses  << endl;
  if (MyRank == 0) ofile.close();  // close output file
  // End MPI
  MPI_Finalize ();
  return 0;
}  // end of main function


// Monte Carlo sampling with the Metropolis algorithm

void MonteCarloSampling(int NumberMCsamples, double &cumulative_e, double &cumulative_e2, Vector &VariationalParameters)
{
  // Initialize the seed and call the Mersenne algo
  std::random_device rd;
  std::mt19937_64 gen(rd());
  // Set up the uniform distribution for x \in [0, 1]
  std::uniform_real_distribution<double> UniformNumberGenerator(0.0,1.0);
  std::normal_distribution<double> Normaldistribution(0.0,1.0);
  // diffusion constant from Schroedinger equation
  double D = 0.5;
  double timestep = 0.05;  //  we fix the time step for the gaussian deviate
  // allocate matrices which contain the position of the particles
  Matrix OldPosition( NumberParticles, Dimension), NewPosition( NumberParticles, Dimension);
  Matrix OldQuantumForce(NumberParticles, Dimension), NewQuantumForce(NumberParticles, Dimension);
  double Energy = 0.0; double EnergySquared = 0.0; double DeltaE = 0.0;
  //  initial trial positions
  for (int i = 0; i < NumberParticles; i++) {
    for (int j = 0; j < Dimension; j++) {
      OldPosition(i,j) = Normaldistribution(gen)*sqrt(timestep);
    }
  }
  double OldWaveFunction = WaveFunction(OldPosition, VariationalParameters);
  QuantumForce(OldPosition, OldQuantumForce, VariationalParameters);
  // loop over monte carlo cycles
  for (int cycles = 1; cycles <= NumberMCsamples; cycles++){
    // new position
    for (int i = 0; i < NumberParticles; i++) {
      for (int j = 0; j < Dimension; j++) {
        // gaussian deviate to compute new positions using a given timestep
        NewPosition(i,j) = OldPosition(i,j) + Normaldistribution(gen)*sqrt(timestep)+OldQuantumForce(i,j)*timestep*D;
      }
      //  for the other particles we need to set the position to the old position since
      //  we move only one particle at the time
      for (int k = 0; k < NumberParticles; k++) {
        if ( k != i) {
          for (int j = 0; j < Dimension; j++) {
            NewPosition(k,j) = OldPosition(k,j);
          }
        }
      }
      double NewWaveFunction = WaveFunction(NewPosition, VariationalParameters);
      QuantumForce(NewPosition, NewQuantumForce, VariationalParameters);
      //  we compute the log of the ratio of the greens functions to be used in the
      //  Metropolis-Hastings algorithm
      double GreensFunction = 0.0;
      for (int j = 0; j < Dimension; j++) {
        GreensFunction += 0.5*(OldQuantumForce(i,j)+NewQuantumForce(i,j))*
          (D*timestep*0.5*(OldQuantumForce(i,j)-NewQuantumForce(i,j))-NewPosition(i,j)+OldPosition(i,j));
      }
      GreensFunction = exp(GreensFunction);
      // The Metropolis test is performed by moving one particle at the time
      if(UniformNumberGenerator(gen) <= GreensFunction*NewWaveFunction*NewWaveFunction/OldWaveFunction/OldWaveFunction ) {
        for (int j = 0; j < Dimension; j++) {
          OldPosition(i,j) = NewPosition(i,j);
          OldQuantumForce(i,j) = NewQuantumForce(i,j);
        }
        OldWaveFunction = NewWaveFunction;
      }
    }  //  end of loop over particles
    // compute local energy
    double DeltaE = LocalEnergy(OldPosition, VariationalParameters);
    // update energies
    Energy += DeltaE;
    EnergySquared += DeltaE*DeltaE;
  }   // end of loop over MC trials
  // update the energy average and its squared
  cumulative_e = Energy/NumberMCsamples;
  cumulative_e2 = EnergySquared/NumberMCsamples;
}   // end MonteCarloSampling function


// Function to compute the squared wave function and the quantum force

double WaveFunction(Matrix &r, Vector &VariationalParameters)
{
  double wf = 0.0;
  // full Slater determinant for two particles, replace with Slater det for more particles
  wf = SPwavefunction(singleparticle_pos2(r, 0), VariationalParameters(0))*SPwavefunction(singleparticle_pos2(r, 1),VariationalParameters(0));
  // contribution from Jastrow factor
  for (int i = 0; i < NumberParticles-1; i++) {
    for (int j = i+1; j < NumberParticles; j++) {
      wf *= exp(RelativeDistance(r, i, j)/((1.0+VariationalParameters(1)*RelativeDistance(r, i, j))));
    }
  }
  return wf;
}

// Function
to calculate the local energy without numerical derivation of kinetic energy + +double LocalEnergy(Matrix &r, Vector &VariationalParameters) +{ + + // compute the kinetic and potential energy from the single-particle part + // for a many-electron system this has to be replaced by a Slater determinant + // The absolute value of the interparticle length + Matrix length( NumberParticles, NumberParticles); + // Set up interparticle distance + for (int i = 0; i < NumberParticles-1; i++) { + for(int j = i+1; j < NumberParticles; j++){ + length(i,j) = RelativeDistance(r, i, j); + length(j,i) = length(i,j); + } + } + double KineticEnergy = 0.0; + // Set up kinetic energy from Slater and Jastrow terms + for (int i = 0; i < NumberParticles; i++) { + for (int k = 0; k < Dimension; k++) { + double sum1 = 0.0; + for(int j = 0; j < NumberParticles; j++){ + if ( j != i) { + sum1 += JastrowDerivative(r, VariationalParameters(1), i, j, k); + } + } + KineticEnergy += (sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0)))*(sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0))); + } + } + KineticEnergy += -2*VariationalParameters(0)*NumberParticles; + for (int i = 0; i < NumberParticles-1; i++) { + for (int j = i+1; j < NumberParticles; j++) { + KineticEnergy += 2.0/(pow(1.0 + VariationalParameters(1)*length(i,j),2))*(1.0/length(i,j)-2*VariationalParameters(1)/(1+VariationalParameters(1)*length(i,j)) ); + } + } + KineticEnergy *= -0.5; + // Set up potential energy, external potential + eventual electron-electron repulsion + double PotentialEnergy = 0; + for (int i = 0; i < NumberParticles; i++) { + double DistanceSquared = singleparticle_pos2(r, i); + PotentialEnergy += 0.5*DistanceSquared; // sp energy HO part, note it has the oscillator frequency set to 1! 
+ } + // Add the electron-electron repulsion + for (int i = 0; i < NumberParticles-1; i++) { + for (int j = i+1; j < NumberParticles; j++) { + PotentialEnergy += 1.0/length(i,j); + } + } + double LocalE = KineticEnergy+PotentialEnergy; + return LocalE; +} + +// Compute the analytical expression for the quantum force +void QuantumForce(Matrix &r, Matrix &qforce, Vector &VariationalParameters) +{ + // compute the first derivative + for (int i = 0; i < NumberParticles; i++) { + for (int k = 0; k < Dimension; k++) { + // single-particle part, replace with Slater det for larger systems + double sppart = DerivativeSPwavefunction(r(i,k),VariationalParameters(0)); + // Jastrow factor contribution + double Jsum = 0.0; + for (int j = 0; j < NumberParticles; j++) { + if ( j != i) { + Jsum += JastrowDerivative(r, VariationalParameters(1), i, j, k); + } + } + qforce(i,k) = 2.0*(Jsum+sppart); + } + } +} // end of QuantumForce function + + +#define ITMAX 200 +#define EPS 3.0e-8 +#define TOLX (4*EPS) +#define STPMX 100.0 + +void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret, + double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g)) +{ + + int check,i,its,j; + double den,fac,fad,fae,fp,stpmax,sum=0.0,sumdg,sumxi,temp,test; + Vector dg(n), g(n), hdg(n), pnew(n), xi(n); + Matrix hessian(n,n); + + fp=(*func)(p); + (*dfunc)(p,g); + for (i = 0;i < n;i++) { + for (j = 0; j< n;j++) hessian(i,j)=0.0; + hessian(i,i)=1.0; + xi(i) = -g(i); + sum += p(i)*p(i); + } + stpmax=STPMX*FMAX(sqrt(sum),(double)n); + for (its=1;its<=ITMAX;its++) { + *iter=its; + lnsrch(n,p,fp,g,xi,pnew,fret,stpmax,&check,func); + fp = *fret; + for (i = 0; i< n;i++) { + xi(i)=pnew(i)-p(i); + p(i)=pnew(i); + } + test=0.0; + for (i = 0;i< n;i++) { + temp=fabs(xi(i))/FMAX(fabs(p(i)),1.0); + if (temp > test) test=temp; + } + if (test < TOLX) { + return; + } + for (i=0;i<n;i++) dg(i)=g(i); + (*dfunc)(p,g); + test=0.0; + den=FMAX(*fret,1.0); + for (i=0;i<n;i++) { + temp=fabs(g(i))*FMAX(fabs(p(i)),1.0)/den; + if (temp > test) test=temp; + } + if (test < gtol) { + return; + } + for (i=0;i<n;i++) dg(i)=g(i)-dg(i); + for (i=0;i<n;i++) { + hdg(i)=0.0; + for (j=0;j<n;j++) hdg(i) += hessian(i,j)*dg(j); + } + fac=fae=sumdg=sumxi=0.0; + for (i=0;i<n;i++) { + fac += dg(i)*xi(i); + fae += dg(i)*hdg(i); + sumdg += SQR(dg(i)); + sumxi += SQR(xi(i)); + } + if (fac*fac > EPS*sumdg*sumxi) { + fac=1.0/fac; + fad=1.0/fae; + for (i=0;i<n;i++) dg(i)=fac*xi(i)-fad*hdg(i); + for (i=0;i<n;i++) { + for (j=0;j<n;j++) { + hessian(i,j) += fac*xi(i)*xi(j) + -fad*hdg(i)*hdg(j)+fae*dg(i)*dg(j); + } + } + } + for (i=0;i<n;i++) { + xi(i)=0.0; + for (j=0;j<n;j++) xi(i) -= hessian(i,j)*g(j); + } + } + cout << "too many iterations in dfpmin" << endl; +} +#undef ITMAX +#undef EPS +#undef TOLX +#undef STPMX + +#define ALF 1.0e-4 +#define TOLX 1.0e-7 + +void lnsrch(int n, Vector &xold, double fold, Vector &g, Vector &p, Vector &x, + double *f, double stpmax, int *check, double (*func)(Vector &p)) +{ + int i; + double a,alam,alam2,alamin,b,disc,f2,fold2,rhs1,rhs2,slope,sum,temp, + test,tmplam; + + *check=0; + for (sum=0.0,i=0;i<n;i++) sum += p(i)*p(i); + sum=sqrt(sum); + if (sum > stpmax) + for (i=0;i<n;i++) p(i) *= stpmax/sum; + for (slope=0.0,i=0;i<n;i++) + slope += g(i)*p(i); + test=0.0; + for (i=0;i<n;i++) { + temp=fabs(p(i))/FMAX(fabs(xold(i)),1.0); + if (temp > test) test=temp; + } + alamin=TOLX/test; + alam=1.0; + for (;;) { + for (i=0;i<n;i++) x(i)=xold(i)+alam*p(i); + *f=(*func)(x); + if (alam < alamin) { + for (i=0;i<n;i++) x(i)=xold(i); + *check=1; + 
return; + } else if (*f <= fold+ALF*alam*slope) return; + else { + if (alam == 1.0) + tmplam = -slope/(2.0*(*f-fold-slope)); + else { + rhs1 = *f-fold-alam*slope; + rhs2=f2-fold2-alam2*slope; + a=(rhs1/(alam*alam)-rhs2/(alam2*alam2))/(alam-alam2); + b=(-alam2*rhs1/(alam*alam)+alam*rhs2/(alam2*alam2))/(alam-alam2); + if (a == 0.0) tmplam = -slope/(2.0*b); + else { + disc=b*b-3.0*a*slope; + if (disc<0.0) cout << "Roundoff problem in lnsrch." << endl; + else tmplam=(-b+sqrt(disc))/(3.0*a); + } + if (tmplam>0.5*alam) + tmplam=0.5*alam; + } + } + alam2=alam; + f2 = *f; + fold2=fold; + alam=FMAX(tmplam,0.1*alam); + } +} +#undef ALF +#undef TOLX +
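The listing above declares the BFGS routines dfpmin and lnsrch, but main never calls them. A minimal sketch of how dfpmin could be hooked up to optimize the two variational parameters is given here; the functions E and dE are hypothetical placeholders for an energy evaluation and its gradient, not part of the lecture code.

// Sketch only: E and dE are hypothetical user-supplied functions returning the
// energy and its gradient with respect to the variational parameters.
double E(Vector &p);
void   dE(Vector &p, Vector &g);

void OptimizeParameters()
{
  Vector p(2);
  p(0) = 1.0;   // starting guess for alpha
  p(1) = 0.3;   // starting guess for beta
  int iter;  double fret;
  dfpmin(p, 2, 1.0e-6, &iter, &fret, E, dE);
  // p now holds the optimized parameters and fret the corresponding energy
}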

    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs103.html b/doc/pub/week9/html/._week9-bs103.html index 295e8307..8fac6fa4 100644 --- a/doc/pub/week9/html/._week9-bs103.html +++ b/doc/pub/week9/html/._week9-bs103.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Coordination and synchronization

    +

    What is OpenMP

    -

    -

    - - -

    #pragma omp barrier
    -
    -

    -Synchronization, must be encountered by all threads in a team (or none) -

    - - -

    #pragma omp ordered { a block of codes }
    -
    -

    -is another form of synchronization (in sequential order). -The form -

    - - -

    #pragma omp critical { a block of codes }
    -
    -

    -and -

    - - -

    #pragma omp atomic { single assignment statement }
    -
    -

    -is more efficient than -

    - - -

    #pragma omp critical
    -
    -

    + +

      +
    • OpenMP provides high-level thread programming
    • +
    • Multiple cooperating threads are allowed to run simultaneously
    • +
    • Threads are created and destroyed dynamically in a fork-join pattern
    • +
        +
      • An OpenMP program consists of a number of parallel regions
      • +
      • Between two parallel regions there is only one master thread
      • +
      • In the beginning of a parallel region, a team of new threads is spawned
      • +
      +
    • The newly spawned threads work simultaneously with the master thread
    • +
    • At the end of a parallel region, the new threads are destroyed
    • +
    +

Many good tutorials online and an excellent textbook

    +
      +
1. Using OpenMP, by B. Chapman, G. Jost, and R. van der Pas
    2. +
    3. Many tutorials online like OpenMP official site
    4. +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs104.html b/doc/pub/week9/html/._week9-bs104.html index 549d7a3d..6e219014 100644 --- a/doc/pub/week9/html/._week9-bs104.html +++ b/doc/pub/week9/html/._week9-bs104.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Data scope

    +

    Getting started, things to remember

    -

    - -

      -
    • OpenMP data scope attribute clauses:
    • - +
        -
      • shared
      • -
      • private
      • -
      • firstprivate
      • -
      • lastprivate
      • -
      • reduction
      • +
      • Remember the header file
      + +
      +
      +
      +
      +
      +
      #include <omp.h>
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      + +
        +
      • Insert compiler directives in C++ syntax as
      -What are the purposes of these attributes + +
      +
      +
      +
      +
      +
      #pragma omp...
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
        -
      • define how and which variables are transferred to a parallel region (and back)
      • -
      • define which variables are visible to all threads in a parallel region, and which variables are privately allocated to each thread
      • +
      • Compile with for example c++ -fopenmp code.cpp
      • +
      • Execute
      • +
          +
• Remember to assign the environment variable OMP_NUM_THREADS (see the small check below)
        • +
        • It specifies the total number of threads inside a parallel region, if not otherwise overwritten
        • +
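A small sketch (not part of the lecture notes) to verify the setup; the file name and run line with OMP_NUM_THREADS set are assumed for illustration only.

// Minimal check of the thread setup (a sketch, not part of the lecture code).
// Compile with c++ -fopenmp check.cpp and run with, e.g., OMP_NUM_THREADS=4 ./a.out
#include <omp.h>
#include <cstdio>

int main()
{
  printf("Max threads available: %d\n", omp_get_max_threads());
#pragma omp parallel
  {
#pragma omp single
    printf("Threads in this parallel region: %d\n", omp_get_num_threads());
  }
  return 0;
}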
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs105.html b/doc/pub/week9/html/._week9-bs105.html index 6443e423..13aecc4f 100644 --- a/doc/pub/week9/html/._week9-bs105.html +++ b/doc/pub/week9/html/._week9-bs105.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    OpenMP syntax

    + -

    Some remarks

    -
    -
    -

    + +

    +
    +
    +
    +
    +
    #pragma omp construct [ clause ...]
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
      -
    • When entering a parallel region, the private clause ensures each thread having its own new variable instances. The new variables are assumed to be uninitialized.
    • -
    • A shared variable exists in only one memory location and all threads can read and write to that address. It is the programmer's responsibility to ensure that multiple threads properly access a shared variable.
    • -
    • The firstprivate clause combines the behavior of the private clause with automatic initialization.
    • -
    • The lastprivate clause combines the behavior of the private clause with a copy back (from the last loop iteration or section) to the original variable outside the parallel region.
    • +
    • Some functions and types
    + + +
    +
    +
    +
    +
    +
    #include <omp.h>
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    - -

    +

      +
    • Most apply to a block of code
    • +
    • Specifically, a structured block
    • +
    • Enter at top, exit at bottom only, exit(), abort() permitted
    • +

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs106.html b/doc/pub/week9/html/._week9-bs106.html index 8a22e844..85a3adf4 100644 --- a/doc/pub/week9/html/._week9-bs106.html +++ b/doc/pub/week9/html/._week9-bs106.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - +

    Different OpenMP styles of parallelism

    +

    OpenMP supports several different ways to specify thread parallelism

    -

    Parallelizing nested for-loops

    -
    -
    -

    - -

      -
    • Serial code
    • -
    - -

    - - -

    for (i=0; i<100; i++)
    -    for (j=0; j<100; j++)
    -        a[i][j] = b[i][j] + c[i][j];
    -    }
    -}
    -
      -
    • Parallelization
    • +
    • General parallel regions: All threads execute the code, roughly as if you made a routine of that region and created a thread to run that code
    • +
    • Parallel loops: Special case for loops, simplifies data parallel code
    • +
    • Task parallelism, new in OpenMP 3
    • +
    • Several ways to manage thread coordination, including Master regions and Locks
    • +
    • Memory model for shared data
    - -

    - - -

    #pragma omp parallel for private(j)
    -for (i=0; i<100; i++)
    -    for (j=0; j<100; j++)
    -       a[i][j] = b[i][j] + c[i][j];
    -    }
    -}
    -
    -
      -
    • Why not parallelize the inner loop? to save overhead of repeated thread forks-joins
    • -
    • Why must j be private? To avoid race condition among the threads
    • -
    -
    -
    - - -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs107.html b/doc/pub/week9/html/._week9-bs107.html index a6f36bb5..83cdccb7 100644 --- a/doc/pub/week9/html/._week9-bs107.html +++ b/doc/pub/week9/html/._week9-bs107.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - -
    -
    -

     

     

     

    - - -

    Nested parallelism

    +

    General code structure

    -

    -When a thread in a parallel region encounters another parallel construct, it -may create a new team of threads and become the master of the new -team. -

    + -

    #pragma omp parallel num_threads(4)
    -{
    -/* .... */
    -#pragma omp parallel num_threads(2)
    -{
    -//  
    -}
    -}
    -
    -

    +

    +
    +
    +
    +
    +
    #include <omp.h>
    +main ()
    +{
    +int var1, var2, var3;
    +/* serial code */
    +/* ... */
    +/* start of a parallel region */
    +#pragma omp parallel private(var1, var2) shared(var3)
    +{
    +/* ... */
    +}
    +/* more serial code */
    +/* ... */
    +/* another parallel region */
    +#pragma omp parallel
    +{
    +/* ... */
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs108.html b/doc/pub/week9/html/._week9-bs108.html index 1f75de14..8302350d 100644 --- a/doc/pub/week9/html/._week9-bs108.html +++ b/doc/pub/week9/html/._week9-bs108.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Parallel tasks

    +

    Parallel region

    -

    -

    + +

      +
    • A parallel region is a block of code that is executed by a team of threads
    • +
    • The following compiler directive creates a parallel region
    • +
    -
    #pragma omp task 
    -#pragma omp parallel shared(p_vec) private(i)
    -{
    -#pragma omp single
    -{
    -for (i=0; i<N; i++) {
    -  double r = random_number();
    -  if (p_vec[i] > r) {
    -#pragma omp task
    -   do_work (p_vec[i]);
    -
    -

    +

    +
    +
    +
    +
    +
    #pragma omp parallel { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    • Clauses can be added at the end of the directive
    • +
    • Most often used clauses:
    • +
        +
      • default(shared) or default(none)
      • +
• shared(list of variables)
      • +
      • private(list of variables)
      • +
      +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs109.html b/doc/pub/week9/html/._week9-bs109.html index 8b373337..0535ac7e 100644 --- a/doc/pub/week9/html/._week9-bs109.html +++ b/doc/pub/week9/html/._week9-bs109.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Common mistakes

    +

    Hello world, not again, please!

    -

    -Race condition -

    + -

    int nthreads;
    -#pragma omp parallel shared(nthreads)
    -{
    -nthreads = omp_get_num_threads();
    -}
    -
    -

    -Deadlock -

    - - -

    #pragma omp parallel
    -{
    -...
    -#pragma omp critical
    -{
    -...
    +
    +
    +
    +
    +
    +
    #include <omp.h>
    +#include <cstdio>
    +int main (int argc, char *argv[])
    +{
    +int th_id, nthreads;
    +#pragma omp parallel private(th_id) shared(nthreads)
    +{
    +th_id = omp_get_thread_num();
    +printf("Hello World from thread %d\n", th_id);
     #pragma omp barrier
    -}
    -}
    -
    -

    +if ( th_id == 0 ) { +nthreads = omp_get_num_threads(); +printf("There are %d threads\n",nthreads); +} +} +return 0; +} +

    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + -

    - - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs110.html b/doc/pub/week9/html/._week9-bs110.html index e74a6aad..9be952fd 100644 --- a/doc/pub/week9/html/._week9-bs110.html +++ b/doc/pub/week9/html/._week9-bs110.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - - -

    Not all computations are simple

    + +

    Hello world, yet another variant

    -

    -Not all computations are simple loops where the data can be evenly -divided among threads without any dependencies between threads + -

    -An example is finding the location and value of the largest element in an array -

    + +

    +
    +
    +
    +
    +
    #include <cstdio>
    +#include <omp.h>
    +int main(int argc, char *argv[]) 
    +{
    + omp_set_num_threads(4); 
    +#pragma omp parallel
    + {
    +   int id = omp_get_thread_num();
    +   int nproc = omp_get_num_threads(); 
    +   cout << "Hello world with id number and processes " <<  id <<  nproc << endl;
    + } 
    +return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Variables declared outside of the parallel region are shared by all threads. If a variable like id is declared outside of the

    -
    for (i=0; i<n; i++) { 
    -   if (x[i] > maxval) {
    -      maxval = x[i];
    -      maxloc = i; 
    -   }
    -}
    -
    -

    +

    +
    +
    +
    +
    +
    #pragma omp parallel, 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

it would have been shared among the threads, possibly causing erroneous output

    +
      +
    • Why? What would go wrong? Why do we add possibly?
    • +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs111.html b/doc/pub/week9/html/._week9-bs111.html index 14c46bf5..f62dcabf 100644 --- a/doc/pub/week9/html/._week9-bs111.html +++ b/doc/pub/week9/html/._week9-bs111.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - - -

    Not all computations are simple, competing threads

    + +

    Important OpenMP library routines

    -

    -All threads are potentially accessing and changing the same values, maxloc and maxval. - -

      -
    1. OpenMP provides several ways to coordinate access to shared values
    2. -
    - -

    - - -

    #pragma omp atomic
    -
    -
      -
    1. Only one thread at a time can execute the following statement (not block). We can use the critical option
    2. -
    + -

    - - -

    #pragma omp critical
    -
    -
      -
    1. Only one thread at a time can execute the following block
    2. -
    - -Atomic may be faster than critical but depends on hardware +
      +
• int omp_get_num_threads(), returns the number of threads inside a parallel region
    • +
• int omp_get_thread_num(), returns the thread number for each thread inside a parallel region
    • +
• void omp_set_num_threads(int), sets the number of threads to be used
    • +
• void omp_set_nested(int), turns nested parallelism on/off (see the sketch below)
    • +
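A sketch (assumed example, not from the notes) combining the routines listed above; nested parallelism must be switched on with omp_set_nested before an inner parallel region can fork its own team.

// Sketch using the library routines above.
#include <omp.h>
#include <cstdio>

int main()
{
  omp_set_num_threads(2);
  omp_set_nested(1);                     // allow nested parallel regions
#pragma omp parallel
  {
    int outer = omp_get_thread_num();
#pragma omp parallel num_threads(2)
    {
      printf("outer thread %d, inner thread %d of %d\n",
             outer, omp_get_thread_num(), omp_get_num_threads());
    }
  }
  return 0;
}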
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs112.html b/doc/pub/week9/html/._week9-bs112.html index c5ef5c53..dc1368f0 100644 --- a/doc/pub/week9/html/._week9-bs112.html +++ b/doc/pub/week9/html/._week9-bs112.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    How to find the max value using OpenMP

    +

    Private variables

    -

    -Write down the simplest algorithm and look carefully for race conditions. How would you handle them? -The first step would be to parallelize as -

    + +

The private clause can be used to make thread-private versions of such variables:

    -
    #pragma omp parallel for
    - for (i=0; i<n; i++) {
    -    if (x[i] > maxval) {
    -      maxval = x[i];
    -      maxloc = i; 
    -    }
    -}
    -
    -

    +

    +
    +
    +
    +
    +
int id;
#pragma omp parallel private(id)
+{
+ id = omp_get_thread_num();
+ cout << "My thread num " << id << endl; 
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    • What is their value on entry? Exit?
    • +
    • OpenMP provides ways to control that
    • +
• Can use default(none) to require the sharing of each variable to be described (see the sketch below)
    • +
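A minimal sketch (not part of the lecture code) of how these clauses control the value on entry and exit: firstprivate initializes each thread's copy from the value outside, lastprivate copies the value from the last iteration back out, and default(none) forces every variable's sharing to be declared.

// Sketch illustrating firstprivate, lastprivate and default(none).
#include <omp.h>
#include <cstdio>

int main()
{
  int offset = 10;   // copied into each thread by firstprivate
  int last = 0;      // written back from the last iteration by lastprivate
  int n = 8;
#pragma omp parallel for default(none) firstprivate(offset) lastprivate(last) shared(n)
  for (int i = 0; i < n; i++) {
    last = i + offset;            // each thread works on its own copies
  }
  printf("last = %d\n", last);    // holds the value from iteration i = n-1
  return 0;
}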
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs113.html b/doc/pub/week9/html/._week9-bs113.html index 4b6629e9..5786409f 100644 --- a/doc/pub/week9/html/._week9-bs113.html +++ b/doc/pub/week9/html/._week9-bs113.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Then deal with the race conditions

    +

    Master region

    -

    -Write down the simplest algorithm and look carefully for race conditions. How would you handle them? -The first step would be to parallelize as -

    + +

    It is often useful to have only one thread execute some of the code in a parallel region. I/O statements are a common example

    -
    #pragma omp parallel for
    - for (i=0; i<n; i++) {
    -#pragma omp critical
    -  {
    -     if (x[i] > maxval) {
    -       maxval = x[i];
    -       maxloc = i; 
    -     }
    -  }
    -} 
    -
    -

    -Exercise: write a code which implements this and give an estimate on performance. Perform several runs, -with a serial code only with and without vectorization and compare the serial code with the one that uses OpenMP. Run on different archictectures if you can. +

    +
    +
    +
    +
    +
    #pragma omp parallel 
    +{
    +  #pragma omp master
    +   {
    +      int id = omp_get_thread_num();
    +      cout << "My thread num" << id << endl; 
    +   } 
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs114.html b/doc/pub/week9/html/._week9-bs114.html index 234e9038..8de6d5b0 100644 --- a/doc/pub/week9/html/._week9-bs114.html +++ b/doc/pub/week9/html/._week9-bs114.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    Parallel for loop

    +
    +
    + +
      +
    • Inside a parallel region, the following compiler directive can be used to parallelize a for-loop:
    • +
    + + +
    +
    +
    +
    +
    +
    #pragma omp for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    • Clauses can be added, such as
    • +
        +
      • schedule(static, chunk size)
      • +
      • schedule(dynamic, chunk size)
      • +
      • schedule(guided, chunk size) (non-deterministic allocation)
      • +
      • schedule(runtime)
      • +
      • private(list of variables)
      • +
      • reduction(operator:variable)
      • +
• nowait (see the sketch after this list)
      • +
      +
    +
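A sketch of the nowait clause (assumed example, not from the notes): the two loops touch independent arrays, so threads may start the second loop without waiting for the first one to finish.

// Sketch of nowait between two independent work-shared loops.
#include <omp.h>

void update(double *a, const double *b, double *c, const double *d, int n)
{
#pragma omp parallel
  {
#pragma omp for nowait
    for (int i = 0; i < n; i++) a[i] = 2.0*b[i];   // no implicit barrier here
#pragma omp for
    for (int i = 0; i < n; i++) c[i] = 3.0*d[i];
  }
}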
    +
    -

    What can slow down OpenMP performance?

    -Give it a thought! -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs115.html b/doc/pub/week9/html/._week9-bs115.html index 75fcbf79..5ed23399 100644 --- a/doc/pub/week9/html/._week9-bs115.html +++ b/doc/pub/week9/html/._week9-bs115.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    Parallel computations and loops

    -

    What can slow down OpenMP performance?

    -

    -Performance poor because we insisted on keeping track of the maxval and location during the execution of the loop. - -

      -
    • We do not care about the value during the execution of the loop, just the value at the end.
    • -
    + +

    OpenMP provides an easy way to parallelize a loop

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +  for (i=0; i<n; i++) c[i] = a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -This is a common source of performance issues, namely the description of the method used to compute a value imposes additional, unnecessary requirements or properties +

OpenMP handles the index variable (no need to declare it in the for loop or make it private)

    -

    -Idea: Have each thread find the maxloc in its own data, then combine and use temporary arrays indexed by thread number to hold the values found by each thread +

Which thread handles which iterations? Several options.

    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs116.html b/doc/pub/week9/html/._week9-bs116.html index a0ac4527..604c97f6 100644 --- a/doc/pub/week9/html/._week9-bs116.html +++ b/doc/pub/week9/html/._week9-bs116.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    Scheduling of loop computations

    -

    Find the max location for each thread

    -

    -

    - - -

    int maxloc[MAX_THREADS], mloc;
    -double maxval[MAX_THREADS], mval; 
    -#pragma omp parallel shared(maxval,maxloc)
    -{
    -  int id = omp_get_thread_num(); 
    -  maxval[id] = -1.0e30;
    -#pragma omp for
    -   for (int i=0; i<n; i++) {
    -       if (x[i] > maxval[id]) { 
    -           maxloc[id] = i;
    -           maxval[id] = x[i]; 
    -       }
    -    }
    -}
    -
    -

    + +

We can let the OpenMP runtime decide. The decision is about how the loop iterations are scheduled, and OpenMP defines three choices of loop scheduling:

    +
      +
    1. Static: Predefined at compile time. Lowest overhead, predictable
    2. +
    3. Dynamic: Selection made at runtime
    4. +
    5. Guided: Special case of dynamic; attempts to reduce overhead
    6. +
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs117.html b/doc/pub/week9/html/._week9-bs117.html index 3fb55255..7ff2f4e8 100644 --- a/doc/pub/week9/html/._week9-bs117.html +++ b/doc/pub/week9/html/._week9-bs117.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - - -

    Combine the values from each thread

    +

    Example code for loop scheduling

    -

    -

    + -

    #pragma omp flush (maxloc,maxval)
    -#pragma omp master
    -  {
    -    int nt = omp_get_num_threads(); 
    -    mloc = maxloc[0]; 
    -    mval = maxval[0]; 
    -    for (int i=1; i<nt; i++) {
    -        if (maxval[i] > mval) { 
    -           mval = maxval[i]; 
    -           mloc = maxloc[i];
    -        } 
    -     }
    -   }
    -
    -

    -Note that we let the master process perform the last operation. +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#define CHUNKSIZE 100
    +#define N 1000
    +int main (int argc, char *argv[])
    +{
    +int i, chunk;
    +float a[N], b[N], c[N];
    +for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
    +chunk = CHUNKSIZE;
    +#pragma omp parallel shared(a,b,c,chunk) private(i)
    +{
    +#pragma omp for schedule(dynamic,chunk)
    +for (i=0; i < N; i++) c[i] = a[i] + b[i];
    +} /* end of parallel region */
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs118.html b/doc/pub/week9/html/._week9-bs118.html index 6e1e20ac..65988f01 100644 --- a/doc/pub/week9/html/._week9-bs118.html +++ b/doc/pub/week9/html/._week9-bs118.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    Example code for loop scheduling, guided instead of dynamic

    +
    +
    + -

    Matrix-matrix multiplication

    -This code computes the norm of a vector using OpenMp -

    - - -

    //  OpenMP program to compute vector norm by adding two other vectors
    -#include <cstdlib>
    -#include <iostream>
    -#include <cmath>
    -#include <iomanip>
    -#include  <omp.h>
    -# include <ctime>
    +
    +
    +
    +
    +
    +
    +
    #include <omp.h>
    +#define CHUNKSIZE 100
    +#define N 1000
    +int main (int argc, char *argv[])
    +{
    +int i, chunk;
    +float a[N], b[N], c[N];
    +for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
    +chunk = CHUNKSIZE;
    +#pragma omp parallel shared(a,b,c,chunk) private(i)
    +{
    +#pragma omp for schedule(guided,chunk)
    +for (i=0; i < N; i++) c[i] = a[i] + b[i];
    +} /* end of parallel region */
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -using namespace std; // note use of namespace -int main (int argc, char* argv[]) -{ - // read in dimension of vector - int n = atoi(argv[1]); - double *a, *b, *c; - int i; - int thread_num; - double wtime, Norm2, s, angle; - cout << " Perform addition of two vectors and compute the norm-2." << endl; - omp_set_num_threads(4); - thread_num = omp_get_max_threads (); - cout << " The number of processors available = " << omp_get_num_procs () << endl ; - cout << " The number of threads available = " << thread_num << endl; - cout << " The matrix order n = " << n << endl; - s = 1.0/sqrt( (double) n); - wtime = omp_get_wtime ( ); - // Allocate space for the vectors to be used - a = new double [n]; b = new double [n]; c = new double [n]; - // Define parallel region -# pragma omp parallel for default(shared) private (angle, i) reduction(+:Norm2) - // Set up values for vectors a and b - for (i = 0; i < n; i++){ - angle = 2.0*M_PI*i/ (( double ) n); - a[i] = s*(sin(angle) + cos(angle)); - b[i] = s*sin(2.0*angle); - c[i] = 0.0; - } - // Then perform the vector addition - for (i = 0; i < n; i++){ - c[i] += a[i]+b[i]; - } - // Compute now the norm-2 - Norm2 = 0.0; - for (i = 0; i < n; i++){ - Norm2 += c[i]*c[i]; - } -// end parallel region - wtime = omp_get_wtime ( ) - wtime; - cout << setiosflags(ios::showpoint | ios::uppercase); - cout << setprecision(10) << setw(20) << "Time used for norm-2 computation=" << wtime << endl; - cout << " Norm-2 = " << Norm2 << endl; - // Free up space - delete[] a; - delete[] b; - delete[] c; - return 0; -} -
    -

    -
    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs119.html b/doc/pub/week9/html/._week9-bs119.html index 53fddbf5..94d0fadf 100644 --- a/doc/pub/week9/html/._week9-bs119.html +++ b/doc/pub/week9/html/._week9-bs119.html @@ -1,31 +1,28 @@ - - -Week 10 March 8-12: Object-orientation strategies and Parallelization - + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + - - - -
    -

     

     

     

    - +

    More on Parallel for loop

    +
    +
    + +
      +
    • The number of loop iterations cannot be non-deterministic; break, return, exit, goto not allowed inside the for-loop
    • +
    • The loop index is private to each thread
    • +
    • A reduction variable is special
    • +
        +
      • During the for-loop there is a local private copy in each thread
      • +
      • At the end of the for-loop, all the local copies are combined together by the reduction operation
      • +
      +
    • Unless the nowait clause is used, an implicit barrier synchronization will be added at the end by the compiler
    • +
    -

    Matrix-matrix multiplication

    -This the matrix-matrix multiplication code with plain c++ memory allocation using OpenMP + +
    +
    +
    +
    +
    +
    // #pragma omp parallel and #pragma omp for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -

    +

    can be combined into

    - -
    //  Matrix-matrix multiplication and Frobenius norm of a matrix with OpenMP
    -#include <cstdlib>
    -#include <iostream>
    -#include <cmath>
    -#include <iomanip>
    -#include  <omp.h>
    -# include <ctime>
    +
    +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    -using namespace std; // note use of namespace -int main (int argc, char* argv[]) -{ - // read in dimension of square matrix - int n = atoi(argv[1]); - double **A, **B, **C; - int i, j, k; - int thread_num; - double wtime, Fsum, s, angle; - cout << " Compute matrix product C = A * B and Frobenius norm." << endl; - omp_set_num_threads(4); - thread_num = omp_get_max_threads (); - cout << " The number of processors available = " << omp_get_num_procs () << endl ; - cout << " The number of threads available = " << thread_num << endl; - cout << " The matrix order n = " << n << endl; - - s = 1.0/sqrt( (double) n); - wtime = omp_get_wtime ( ); - // Allocate space for the two matrices - A = new double*[n]; B = new double*[n]; C = new double*[n]; - for (i = 0; i < n; i++){ - A[i] = new double[n]; - B[i] = new double[n]; - C[i] = new double[n]; - } - // Define parallel region -# pragma omp parallel for default(shared) private (angle, i, j, k) reduction(+:Fsum) - // Set up values for matrix A and B and zero matrix C - for (i = 0; i < n; i++){ - for (j = 0; j < n; j++) { - angle = 2.0*M_PI*i*j/ (( double ) n); - A[i][j] = s * ( sin ( angle ) + cos ( angle ) ); - B[j][i] = A[i][j]; - } - } - // Then perform the matrix-matrix multiplication - for (i = 0; i < n; i++){ - for (j = 0; j < n; j++) { - C[i][j] = 0.0; - for (k = 0; k < n; k++) { - C[i][j] += A[i][k]*B[k][j]; - } - } - } - // Compute now the Frobenius norm - Fsum = 0.0; - for (i = 0; i < n; i++){ - for (j = 0; j < n; j++) { - Fsum += C[i][j]*C[i][j]; - } - } - Fsum = sqrt(Fsum); -// end parallel region and letting only one thread perform I/O - wtime = omp_get_wtime ( ) - wtime; - cout << setiosflags(ios::showpoint | ios::uppercase); - cout << setprecision(10) << setw(20) << "Time used for matrix-matrix multiplication=" << wtime << endl; - cout << " Frobenius norm = " << Fsum << endl; - // Free up space - for (int i = 0; i < n; i++){ - delete[] A[i]; - delete[] B[i]; - delete[] C[i]; - } - delete[] A; - delete[] B; - delete[] C; - return 0; -} -
    -

    @@ -689,27 +725,32 @@


    - - -
    - - - diff --git a/doc/pub/week9/html/._week9-bs120.html b/doc/pub/week9/html/._week9-bs120.html new file mode 100644 index 00000000..e3df3b26 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs120.html @@ -0,0 +1,751 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    What can happen with this loop?

    + +
    +
    + +

    What happens with code like this

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +for (i=0; i<n; i++) sum += a[i]*a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

All threads can access the sum variable, but the addition is not atomic! It is important to avoid a race condition between threads. So-called reductions in OpenMP are thus important for performance and for obtaining correct results. OpenMP lets us indicate that a variable is used for a reduction with a particular operator. The above code becomes

    + + +
    +
    +
    +
    +
    +
    sum = 0.0;
    +#pragma omp parallel for reduction(+:sum)
    +for (i=0; i<n; i++) sum += a[i]*a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs121.html b/doc/pub/week9/html/._week9-bs121.html new file mode 100644 index 00000000..04dff569 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs121.html @@ -0,0 +1,732 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Inner product

    +
    +
    + +$$ +\sum_{i=0}^{n-1} a_ib_i +$$ + + + +
    +
    +
    +
    +
    +
    int i;
    +double sum = 0.;
    +/* allocating and initializing arrays */
    +/* ... */
    +#pragma omp parallel for default(shared) private(i) reduction(+:sum)
    + for (i=0; i<N; i++) sum += a[i]*b[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs122.html b/doc/pub/week9/html/._week9-bs122.html new file mode 100644 index 00000000..f39aa7a7 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs122.html @@ -0,0 +1,735 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Different threads do different tasks

    +
    +
    + + +

Different threads do different tasks independently; each section is executed by one thread.

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel
    +{
    +#pragma omp sections
    +{
    +#pragma omp section
    +funcA ();
    +#pragma omp section
    +funcB ();
    +#pragma omp section
    +funcC ();
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs123.html b/doc/pub/week9/html/._week9-bs123.html new file mode 100644 index 00000000..a6542487 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs123.html @@ -0,0 +1,750 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Single execution

    +
    +
    + + + +
    +
    +
    +
    +
    +
    #pragma omp single { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The code is executed by one thread only, no guarantee which thread

    + +

    Can introduce an implicit barrier at the end

    + + +
    +
    +
    +
    +
    +
    #pragma omp master { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Code executed by the master thread, guaranteed and no implicit barrier at the end.

    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs124.html b/doc/pub/week9/html/._week9-bs124.html new file mode 100644 index 00000000..7fafa115 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs124.html @@ -0,0 +1,820 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Coordination and synchronization

    +
    +
    + + + +
    +
    +
    +
    +
    +
    #pragma omp barrier
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Synchronization, must be encountered by all threads in a team (or none)

    + + +
    +
    +
    +
    +
    +
    #pragma omp ordered { a block of codes }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    is another form of synchronization (in sequential order). +The form +

    + + +
    +
    +
    +
    +
    +
    #pragma omp critical { a block of codes }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and

    + + +
    +
    +
    +
    +
    +
    #pragma omp atomic { single assignment statement }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    is more efficient than

    + + +
    +
    +
    +
    +
    +
    #pragma omp critical
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
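A sketch (assumed example, not from the notes) contrasting the two forms on a shared counter: atomic protects a single update statement, while critical protects a general block of code at higher overhead.

// Sketch contrasting atomic and critical for a shared counter update.
#include <omp.h>
#include <cstdio>

int main()
{
  long counter_atomic = 0, counter_critical = 0;
  const int n = 1000000;
#pragma omp parallel for
  for (int i = 0; i < n; i++) {
#pragma omp atomic
    counter_atomic++;            // may map directly to a hardware atomic instruction
  }
#pragma omp parallel for
  for (int i = 0; i < n; i++) {
#pragma omp critical
    { counter_critical++; }      // general mutual exclusion, higher overhead
  }
  printf("%ld %ld\n", counter_atomic, counter_critical);
  return 0;
}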
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs125.html b/doc/pub/week9/html/._week9-bs125.html new file mode 100644 index 00000000..862c4125 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs125.html @@ -0,0 +1,715 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Data scope

    +
    +
    + +
      +
    • OpenMP data scope attribute clauses:
    • +
        +
      • shared
      • +
      • private
      • +
      • firstprivate
      • +
      • lastprivate
      • +
      • reduction
      • +
      +
    +

What are the purposes of these attributes?

    +
      +
    • define how and which variables are transferred to a parallel region (and back)
    • +
    • define which variables are visible to all threads in a parallel region, and which variables are privately allocated to each thread
    • +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs126.html b/doc/pub/week9/html/._week9-bs126.html new file mode 100644 index 00000000..68256fe4 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs126.html @@ -0,0 +1,707 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Some remarks

    +
    +
    + + +
      +
    • When entering a parallel region, the private clause ensures each thread having its own new variable instances. The new variables are assumed to be uninitialized.
    • +
    • A shared variable exists in only one memory location and all threads can read and write to that address. It is the programmer's responsibility to ensure that multiple threads properly access a shared variable.
    • +
    • The firstprivate clause combines the behavior of the private clause with automatic initialization.
    • +
    • The lastprivate clause combines the behavior of the private clause with a copy back (from the last loop iteration or section) to the original variable outside the parallel region.
    • +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs127.html b/doc/pub/week9/html/._week9-bs127.html new file mode 100644 index 00000000..cd237ae7 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs127.html @@ -0,0 +1,768 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Parallelizing nested for-loops

    +
    +
    + + +
      +
    • Serial code
    • +
    + + +
    +
    +
    +
    +
    +
    for (i=0; i<100; i++)
    +    for (j=0; j<100; j++)
    +        a[i][j] = b[i][j] + c[i][j];
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
      +
    • Parallelization
    • +
    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for private(j)
    +for (i=0; i<100; i++)
    +    for (j=0; j<100; j++)
    +       a[i][j] = b[i][j] + c[i][j];
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
      +
    • Why not parallelize the inner loop? to save overhead of repeated thread forks-joins
    • +
    • Why must j be private? To avoid race condition among the threads
    • +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs128.html b/doc/pub/week9/html/._week9-bs128.html new file mode 100644 index 00000000..bbde9f39 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs128.html @@ -0,0 +1,733 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Nested parallelism

    +
    +
    + +

    When a thread in a parallel region encounters another parallel construct, it +may create a new team of threads and become the master of the new +team. +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel num_threads(4)
    +{
    +/* .... */
    +#pragma omp parallel num_threads(2)
    +{
    +//  
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs129.html b/doc/pub/week9/html/._week9-bs129.html new file mode 100644 index 00000000..d6c9e935 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs129.html @@ -0,0 +1,731 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Parallel tasks

    +
    +
    + + + +
    +
    +
    +
    +
    +
    #pragma omp task 
    +#pragma omp parallel shared(p_vec) private(i)
    +{
    +#pragma omp single
    +{
    +for (i=0; i<N; i++) {
    +  double r = random_number();
    +  if (p_vec[i] > r) {
    +#pragma omp task
+   do_work (p_vec[i]);
+  }   // end if
+ }    // end for loop
+}     // end single region
+}     // end parallel region
+
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs130.html b/doc/pub/week9/html/._week9-bs130.html new file mode 100644 index 00000000..3d44e13a --- /dev/null +++ b/doc/pub/week9/html/._week9-bs130.html @@ -0,0 +1,759 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Common mistakes

    +
    +
    + +

    Race condition

    + + +
    +
    +
    +
    +
    +
    int nthreads;
    +#pragma omp parallel shared(nthreads)
    +{
    +nthreads = omp_get_num_threads();
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Deadlock

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel
    +{
    +...
    +#pragma omp critical
    +{
    +...
    +#pragma omp barrier
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
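Possible fixes, as a sketch (our assumptions, not from the notes): let only one thread write the shared variable, and never place a barrier inside a critical region.

// Sketched fixes for the race condition and the deadlock shown above.
#include <omp.h>

int nthreads;

void fixed_race()
{
#pragma omp parallel shared(nthreads)
  {
#pragma omp single
    nthreads = omp_get_num_threads();   // only one thread writes
  }
}

void fixed_deadlock()
{
#pragma omp parallel
  {
#pragma omp critical
    {
      // ... protected update only, no barrier in here ...
    }
#pragma omp barrier                     // barrier outside, reached by all threads
  }
}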

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs131.html b/doc/pub/week9/html/._week9-bs131.html new file mode 100644 index 00000000..f1d5da69 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs131.html @@ -0,0 +1,730 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Not all computations are simple

    +
    +
    + +

    Not all computations are simple loops where the data can be evenly +divided among threads without any dependencies between threads +

    + +

    An example is finding the location and value of the largest element in an array

    + + +
    +
    +
    +
    +
    +
    for (i=0; i<n; i++) { 
    +   if (x[i] > maxval) {
    +      maxval = x[i];
    +      maxloc = i; 
    +   }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs132.html b/doc/pub/week9/html/._week9-bs132.html new file mode 100644 index 00000000..26b84107 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs132.html @@ -0,0 +1,754 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Not all computations are simple, competing threads

    +
    +
    + +

    All threads are potentially accessing and changing the same values, maxloc and maxval.

    +
      +
    1. OpenMP provides several ways to coordinate access to shared values
    2. +
    + + +
    +
    +
    +
    +
    +
    #pragma omp atomic
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    1. Only one thread at a time can execute the following statement (not block). We can use the critical option
    2. +
    + + +
    +
    +
    +
    +
    +
    #pragma omp critical
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    1. Only one thread at a time can execute the following block
    2. +
    +

Atomic may be faster than critical, but this depends on the hardware.

    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs133.html b/doc/pub/week9/html/._week9-bs133.html new file mode 100644 index 00000000..b6b41ca9 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs133.html @@ -0,0 +1,727 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    How to find the max value using OpenMP

    +
    +
    + +

    Write down the simplest algorithm and look carefully for race conditions. How would you handle them? +The first step would be to parallelize as +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    + for (i=0; i<n; i++) {
    +    if (x[i] > maxval) {
    +      maxval = x[i];
    +      maxloc = i; 
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs134.html b/doc/pub/week9/html/._week9-bs134.html new file mode 100644 index 00000000..b3373106 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs134.html @@ -0,0 +1,732 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Then deal with the race conditions

    +
    +
    + +

    Write down the simplest algorithm and look carefully for race conditions. How would you handle them? +The first step would be to parallelize as +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    + for (i=0; i<n; i++) {
    +#pragma omp critical
    +  {
    +     if (x[i] > maxval) {
    +       maxval = x[i];
    +       maxloc = i; 
    +     }
    +  }
    +} 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Exercise: write a code which implements this and give an estimate of the performance. Perform several runs with a serial code, both with and without vectorization, and compare the serial code with the one that uses OpenMP. Run on different architectures if you can.

    +
    +
    + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs135.html b/doc/pub/week9/html/._week9-bs135.html new file mode 100644 index 00000000..ecd0d861 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs135.html @@ -0,0 +1,689 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    What can slow down OpenMP performance?

    +

    Give it a thought!

    + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs136.html b/doc/pub/week9/html/._week9-bs136.html new file mode 100644 index 00000000..599b6c70 --- /dev/null +++ b/doc/pub/week9/html/._week9-bs136.html @@ -0,0 +1,700 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    What can slow down OpenMP performance?

    +
    +
    + +

Performance is poor because we insisted on keeping track of the maximum value and its location during the execution of the loop.

    +
      +
    • We do not care about the value during the execution of the loop, just the value at the end.
    • +
    +

This is a common source of performance problems: the description of the method used to compute a value imposes additional, unnecessary requirements or constraints.

Idea: have each thread find the maximum and its location in its own part of the data, then combine the per-thread results. Use temporary arrays indexed by the thread number to hold the values found by each thread.
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs137.html b/doc/pub/week9/html/._week9-bs137.html new file mode 100644 index 00000000..3740456b --- /dev/null +++ b/doc/pub/week9/html/._week9-bs137.html @@ -0,0 +1,727 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Find the max location for each thread

    +
    +
    + + + +
    +
    +
    +
    +
    +
    int maxloc[MAX_THREADS], mloc;
    +double maxval[MAX_THREADS], mval; 
    +#pragma omp parallel shared(maxval,maxloc)
    +{
    +  int id = omp_get_thread_num(); 
    +  maxval[id] = -1.0e30;
    +#pragma omp for
    +   for (int i=0; i<n; i++) {
    +       if (x[i] > maxval[id]) { 
    +           maxloc[id] = i;
    +           maxval[id] = x[i]; 
    +       }
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs138.html b/doc/pub/week9/html/._week9-bs138.html new file mode 100644 index 00000000..060ae5da --- /dev/null +++ b/doc/pub/week9/html/._week9-bs138.html @@ -0,0 +1,726 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Combine the values from each thread

    +
    +
    + + + +
    +
    +
    +
    +
    +
    #pragma omp flush (maxloc,maxval)
    +#pragma omp master
    +  {
    +    int nt = omp_get_num_threads(); 
    +    mloc = maxloc[0]; 
    +    mval = maxval[0]; 
    +    for (int i=1; i<nt; i++) {
    +        if (maxval[i] > mval) { 
    +           mval = maxval[i]; 
    +           mloc = maxloc[i];
    +        } 
    +     }
    +   }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Note that we let the master thread perform the final combination of the per-thread results.
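With OpenMP 4.0 or newer, the combination step can also be expressed as a user-defined reduction; the sketch below is our own variant, not part of the lecture programs.

// Sketch: max value and its location via a declared reduction (OpenMP 4.0+).
#include <omp.h>
#include <cmath>
#include <cstdio>

struct MaxLoc { double val; int loc; };

#pragma omp declare reduction(maxloc : MaxLoc : \
    omp_out = (omp_in.val > omp_out.val ? omp_in : omp_out)) \
    initializer(omp_priv = MaxLoc{-1.0e30, -1})

int main()
{
  const int n = 1000000;
  double *x = new double[n];
  for (int i = 0; i < n; i++) x[i] = std::sin(i*0.001);
  MaxLoc m = {-1.0e30, -1};
#pragma omp parallel for reduction(maxloc : m)
  for (int i = 0; i < n; i++)
    if (x[i] > m.val) { m.val = x[i]; m.loc = i; }
  printf("max = %g at i = %d\n", m.val, m.loc);
  delete[] x;
  return 0;
}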

    +
    +
    + +

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs139.html b/doc/pub/week9/html/._week9-bs139.html new file mode 100644 index 00000000..5214c06c --- /dev/null +++ b/doc/pub/week9/html/._week9-bs139.html @@ -0,0 +1,764 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Matrix-matrix multiplication

    +

This code computes the norm of a vector using OpenMP.

    + + +
    +
    +
    +
    +
    +
    //  OpenMP program to compute vector norm by adding two other vectors
    +#include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include  <omp.h>
    +# include <ctime>
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of vector
    +  int n = atoi(argv[1]);
    +  double *a, *b, *c;
    +  int i;
    +  int thread_num;
    +  double wtime, Norm2, s, angle;
    +  cout << "  Perform addition of two vectors and compute the norm-2." << endl;
    +  omp_set_num_threads(4);
    +  thread_num = omp_get_max_threads ();
    +  cout << "  The number of processors available = " << omp_get_num_procs () << endl ;
    +  cout << "  The number of threads available    = " << thread_num <<  endl;
    +  cout << "  The matrix order n                 = " << n << endl;
    +
    +  s = 1.0/sqrt( (double) n);
    +  wtime = omp_get_wtime ( );
    +  // Allocate space for the vectors to be used
    +  a = new double [n]; b = new double [n]; c = new double [n];
    +  // Define parallel region
    +# pragma omp parallel for default(shared) private (angle, i) reduction(+:Norm2)
    +  // Set up values for vectors  a and b
    +  for (i = 0; i < n; i++){
    +      angle = 2.0*M_PI*i/ (( double ) n);
    +      a[i] = s*(sin(angle) + cos(angle));
    +      b[i] =  s*sin(2.0*angle);
    +      c[i] = 0.0;
    +  }
    +  // Then perform the vector addition
    +  for (i = 0; i < n; i++){
    +     c[i] += a[i]+b[i];
    +  }
    +  // Compute now the norm-2
    +  Norm2 = 0.0;
    +  for (i = 0; i < n; i++){
    +     Norm2  += c[i]*c[i];
    +  }
    +// end parallel region
    +  wtime = omp_get_wtime ( ) - wtime;
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for norm-2 computation=" << wtime  << endl;
    +  cout << " Norm-2  = " << Norm2 << endl;
    +  // Free up space
    +  delete[] a;
    +  delete[] b;
    +  delete[] c;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
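Note that a parallel for directive applies only to the loop that immediately follows it; in the listing above that is the initialization loop. To enable OpenMP at all, the code must be compiled with the OpenMP flag; with the GNU or Clang toolchains a typical command line (file name as in the repository link) would be

c++ -O3 -fopenmp -o norm.x OpenMPvectornorm.cpp
./norm.x 10000000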

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/._week9-bs140.html b/doc/pub/week9/html/._week9-bs140.html new file mode 100644 index 00000000..7688209c --- /dev/null +++ b/doc/pub/week9/html/._week9-bs140.html @@ -0,0 +1,782 @@ + + + + + + + +Week 11, March 11-15: Resampling Techniques, Bootstrap and Blocking + + + + + + + + + + + + + + + + + + + + +
    +

     

     

     

    + + +

    Matrix-matrix multiplication

    +

This is the matrix-matrix multiplication code with plain C++ memory allocation, using OpenMP.

    + + + +
    +
    +
    +
    +
    +
    //  Matrix-matrix multiplication and Frobenius norm of a matrix with OpenMP
    +#include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include  <omp.h>
    +# include <ctime>
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double **A, **B, **C;
    +  int i, j, k;
    +  int thread_num;
    +  double wtime, Fsum, s, angle;
    +  cout << "  Compute matrix product C = A * B and Frobenius norm." << endl;
    +  omp_set_num_threads(4);
    +  thread_num = omp_get_max_threads ();
    +  cout << "  The number of processors available = " << omp_get_num_procs () << endl ;
    +  cout << "  The number of threads available    = " << thread_num <<  endl;
    +  cout << "  The matrix order n                 = " << n << endl;
    +
    +  s = 1.0/sqrt( (double) n);
    +  wtime = omp_get_wtime ( );
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n]; C = new double*[n];
    +  for (i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +    C[i] = new double[n];
    +  }
    +  // Define parallel region
    +# pragma omp parallel for default(shared) private (angle, i, j, k) reduction(+:Fsum)
    +  // Set up values for matrix A and B and zero matrix C
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +      angle = 2.0*M_PI*i*j/ (( double ) n);
    +      A[i][j] = s * ( sin ( angle ) + cos ( angle ) );
    +      B[j][i] =  A[i][j];
    +    }
    +  }
    +  // Then perform the matrix-matrix multiplication
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +       C[i][j] =  0.0;    
    +       for (k = 0; k < n; k++) {
    +            C[i][j] += A[i][k]*B[k][j];
    +       }
    +    }
    +  }
    +  // Compute now the Frobenius norm
    +  Fsum = 0.0;
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +      Fsum += C[i][j]*C[i][j];
    +    }
    +  }
    +  Fsum = sqrt(Fsum);
    +// end parallel region and letting only one thread perform I/O
    +  wtime = omp_get_wtime ( ) - wtime;
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for matrix-matrix multiplication=" << wtime  << endl;
    +  cout << "  Frobenius norm  = " << Fsum << endl;
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +    delete[] C[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  delete[] C;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
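Here as well, the parallel for directive binds only to the loop immediately after it. A variant (our own sketch, not from the lecture program) that parallelizes the expensive triple loop directly, using the collapse clause available since OpenMP 3.0, could read

// Drop-in replacement for the multiplication loop in the listing above.
// collapse(2) merges the i and j loops into one iteration space
// that is distributed over the threads.
void matmult(double **A, double **B, double **C, int n)
{
#pragma omp parallel for collapse(2)
  for (int i = 0; i < n; i++){
    for (int j = 0; j < n; j++) {
      double sum = 0.0;
      for (int k = 0; k < n; k++) {
        sum += A[i][k]*B[k][j];
      }
      C[i][j] = sum;
    }
  }
}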

    + +

    + +
    + + + + +
    + +
    + + + diff --git a/doc/pub/week9/html/week9-bs.html b/doc/pub/week9/html/week9-bs.html index 0ee9f022..eb84da37 100644 --- a/doc/pub/week9/html/week9-bs.html +++ b/doc/pub/week9/html/week9-bs.html @@ -79,11 +79,388 @@ 2, None, 'blocking-transformations-final-expressions'), + ('More on the blocking method', + 2, + None, + 'more-on-the-blocking-method'), ('Example code form last week', 2, None, 'example-code-form-last-week'), - ('Resampling analysis', 2, None, 'resampling-analysis')]} + ('Resampling analysis', 2, None, 'resampling-analysis'), + ('Content', 2, None, 'content'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('More on optimization', 2, None, 'more-on-optimization'), + ('Optimization and profiling', + 2, + None, + 'optimization-and-profiling'), + ('Optimization and debugging', + 2, + None, + 'optimization-and-debugging'), + ('Other hints', 2, None, 'other-hints'), + ('Vectorization and the basic idea behind parallel computing', + 2, + None, + 'vectorization-and-the-basic-idea-behind-parallel-computing'), + ('A rough classification of hardware models', + 2, + None, + 'a-rough-classification-of-hardware-models'), + ('Shared memory and distributed memory', + 2, + None, + 'shared-memory-and-distributed-memory'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('Different parallel programming paradigms', + 2, + None, + 'different-parallel-programming-paradigms'), + ('What is vectorization?', 2, None, 'what-is-vectorization'), + ('Number of elements that can acted upon', + 2, + None, + 'number-of-elements-that-can-acted-upon'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Operation counts for scalar operation', + 2, + None, + 'operation-counts-for-scalar-operation'), + ('Number of elements that can acted upon, examples', + 2, + None, + 'number-of-elements-that-can-acted-upon-examples'), + ('Number of operations when vectorized', + 2, + None, + 'number-of-operations-when-vectorized'), + ('"A simple test case with and without ' + 'vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"', + 2, + None, + 'a-simple-test-case-with-and-without-vectorization-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program7-cpp'), + ('Compiling with and without vectorization', + 2, + None, + 'compiling-with-and-without-vectorization'), + ('Compiling with and without vectorization using clang', + 2, + None, + 'compiling-with-and-without-vectorization-using-clang'), + ('Automatic vectorization and vectorization inhibitors, criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-criteria'), + ('Automatic vectorization and vectorization inhibitors, exit ' + 'criteria', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-exit-criteria'), + ('Automatic vectorization and vectorization inhibitors, ' + 'straight-line code', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-straight-line-code'), + ('Automatic vectorization and vectorization inhibitors, nested ' + 'loops', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-nested-loops'), + ('Automatic vectorization and vectorization inhibitors, function ' + 'calls', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-function-calls'), + 
('Automatic vectorization and vectorization inhibitors, data ' + 'dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, more ' + 'data dependencies', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-more-data-dependencies'), + ('Automatic vectorization and vectorization inhibitors, memory ' + 'stride', + 2, + None, + 'automatic-vectorization-and-vectorization-inhibitors-memory-stride'), + ('Memory management', 2, None, 'memory-management'), + ('Memory and communication', 2, None, 'memory-and-communication'), + ('Measuring performance', 2, None, 'measuring-performance'), + ('Problems with measuring time', + 2, + None, + 'problems-with-measuring-time'), + ('Problems with cold start', 2, None, 'problems-with-cold-start'), + ('Problems with smart compilers', + 2, + None, + 'problems-with-smart-compilers'), + ('Problems with interference', + 2, + None, + 'problems-with-interference'), + ('Problems with measuring performance', + 2, + None, + 'problems-with-measuring-performance'), + ('Thomas algorithm for tridiagonal linear algebra equations', + 2, + None, + 'thomas-algorithm-for-tridiagonal-linear-algebra-equations'), + ('Thomas algorithm, forward substitution', + 2, + None, + 'thomas-algorithm-forward-substitution'), + ('Thomas algorithm, backward substitution', + 2, + None, + 'thomas-algorithm-backward-substitution'), + ('Thomas algorithm and counting of operations (floating point ' + 'and memory)', + 2, + None, + 'thomas-algorithm-and-counting-of-operations-floating-point-and-memory'), + ('"Example: Transpose of a ' + 'matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"', + 2, + None, + 'example-transpose-of-a-matrix-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program8-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-lectureprograms-programs-classes-cpp-program9-cpp'), + ('How do we define speedup? Simplest form', + 2, + None, + 'how-do-we-define-speedup-simplest-form'), + ('How do we define speedup? Correct baseline', + 2, + None, + 'how-do-we-define-speedup-correct-baseline'), + ('Parallel speedup', 2, None, 'parallel-speedup'), + ('Speedup and memory', 2, None, 'speedup-and-memory'), + ('Upper bounds on speedup', 2, None, 'upper-bounds-on-speedup'), + ("Amdahl's law", 2, None, 'amdahl-s-law'), + ('How much is parallelizable', + 2, + None, + 'how-much-is-parallelizable'), + ("Today's situation of parallel computing", + 2, + None, + 'today-s-situation-of-parallel-computing'), + ('Overhead present in parallel computing', + 2, + None, + 'overhead-present-in-parallel-computing'), + ('Parallelizing a sequential algorithm', + 2, + None, + 'parallelizing-a-sequential-algorithm'), + ('Strategies', 2, None, 'strategies'), + ('How do I run MPI on a PC/Laptop? MPI', + 2, + None, + 'how-do-i-run-mpi-on-a-pc-laptop-mpi'), + ('Can I do it on my own PC/laptop? 
OpenMP installation', + 2, + None, + 'can-i-do-it-on-my-own-pc-laptop-openmp-installation'), + ('Installing MPI', 2, None, 'installing-mpi'), + ('Installing MPI and using Qt', + 2, + None, + 'installing-mpi-and-using-qt'), + ('What is Message Passing Interface (MPI)?', + 2, + None, + 'what-is-message-passing-interface-mpi'), + ('Going Parallel with MPI', 2, None, 'going-parallel-with-mpi'), + ('MPI is a library', 2, None, 'mpi-is-a-library'), + ('Bindings to MPI routines', 2, None, 'bindings-to-mpi-routines'), + ('Communicator', 2, None, 'communicator'), + ('Some of the most important MPI functions', + 2, + None, + 'some-of-the-most-important-mpi-functions'), + ('"The first MPI C/C++ ' + 'program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"', + 2, + None, + 'the-first-mpi-c-c-program-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program2-cpp'), + ('The Fortran program', 2, None, 'the-fortran-program'), + ('Note 1', 2, None, 'note-1'), + ('"Ordered output with ' + 'MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"', + 2, + None, + 'ordered-output-with-mpibarrier-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program3-cpp'), + ('Note 2', 2, None, 'note-2'), + ('"Ordered ' + 'output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"', + 2, + None, + 'ordered-output-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program4-cpp'), + ('Note 3', 2, None, 'note-3'), + ('Note 4', 2, None, 'note-4'), + ('"Numerical integration in ' + 'parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"', + 2, + None, + 'numerical-integration-in-parallel-https-github-com-compphysics-computationalphysics2-blob-gh-pages-doc-programs-lectureprograms-programs-mpi-chapter07-program6-cpp'), + ('Dissection of trapezoidal rule with $MPI\\_reduce$', + 2, + None, + 'dissection-of-trapezoidal-rule-with-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Integrating with _MPI_', 2, None, 'integrating-with-mpi'), + ('How do I use $MPI\\_reduce$?', + 2, + None, + 'how-do-i-use-mpi-reduce'), + ('More on $MPI\\_Reduce$', 2, None, 'more-on-mpi-reduce'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('Dissection of trapezoidal rule', + 2, + None, + 'dissection-of-trapezoidal-rule'), + ('"The quantum dot program for two ' + 'electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"', + 2, + None, + 'the-quantum-dot-program-for-two-electrons-https-github-com-compphysics-computationalphysics2-blob-master-doc-programs-parallelizationmpi-mpivmcqdot-cpp'), + ('What is OpenMP', 2, None, 'what-is-openmp'), + ('Getting started, things to remember', + 2, + None, + 'getting-started-things-to-remember'), + ('OpenMP syntax', 2, None, 'openmp-syntax'), + ('Different OpenMP styles of parallelism', + 2, + None, + 'different-openmp-styles-of-parallelism'), + ('General code structure', 2, None, 'general-code-structure'), + 
('Parallel region', 2, None, 'parallel-region'), + ('Hello world, not again, please!', + 2, + None, + 'hello-world-not-again-please'), + ('Hello world, yet another variant', + 2, + None, + 'hello-world-yet-another-variant'), + ('Important OpenMP library routines', + 2, + None, + 'important-openmp-library-routines'), + ('Private variables', 2, None, 'private-variables'), + ('Master region', 2, None, 'master-region'), + ('Parallel for loop', 2, None, 'parallel-for-loop'), + ('Parallel computations and loops', + 2, + None, + 'parallel-computations-and-loops'), + ('Scheduling of loop computations', + 2, + None, + 'scheduling-of-loop-computations'), + ('Example code for loop scheduling', + 2, + None, + 'example-code-for-loop-scheduling'), + ('Example code for loop scheduling, guided instead of dynamic', + 2, + None, + 'example-code-for-loop-scheduling-guided-instead-of-dynamic'), + ('More on Parallel for loop', + 2, + None, + 'more-on-parallel-for-loop'), + ('What can happen with this loop?', + 2, + None, + 'what-can-happen-with-this-loop'), + ('Inner product', 2, None, 'inner-product'), + ('Different threads do different tasks', + 2, + None, + 'different-threads-do-different-tasks'), + ('Single execution', 2, None, 'single-execution'), + ('Coordination and synchronization', + 2, + None, + 'coordination-and-synchronization'), + ('Data scope', 2, None, 'data-scope'), + ('Some remarks', 2, None, 'some-remarks'), + ('Parallelizing nested for-loops', + 2, + None, + 'parallelizing-nested-for-loops'), + ('Nested parallelism', 2, None, 'nested-parallelism'), + ('Parallel tasks', 2, None, 'parallel-tasks'), + ('Common mistakes', 2, None, 'common-mistakes'), + ('Not all computations are simple', + 2, + None, + 'not-all-computations-are-simple'), + ('Not all computations are simple, competing threads', + 2, + None, + 'not-all-computations-are-simple-competing-threads'), + ('How to find the max value using OpenMP', + 2, + None, + 'how-to-find-the-max-value-using-openmp'), + ('Then deal with the race conditions', + 2, + None, + 'then-deal-with-the-race-conditions'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('What can slow down OpenMP performance?', + 2, + None, + 'what-can-slow-down-openmp-performance'), + ('Find the max location for each thread', + 2, + None, + 'find-the-max-location-for-each-thread'), + ('Combine the values from each thread', + 2, + None, + 'combine-the-values-from-each-thread'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpvectornorm-cpp'), + ('"Matrix-matrix ' + 'multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"', + 2, + None, + 'matrix-matrix-multiplication-https-github-com-compphysics-computationalphysicsmsu-blob-master-doc-programs-parallelizationopenmp-openmpmatrixmatrixmult-cpp')]} end of tocinfo --> @@ -137,8 +514,127 @@
  • Blocking Transformations
  • Blocking Transformations, getting there
  • Blocking Transformations, final expressions
  • -
  • Example code form last week
  • -
  • Resampling analysis
  • +
  • More on the blocking method
  • +
  • Example code form last week
  • +
  • Resampling analysis
  • +
  • Content
  • +
  • Optimization and profiling
  • +
  • More on optimization
  • +
  • Optimization and profiling
  • +
  • Optimization and debugging
  • +
  • Other hints
  • +
  • Vectorization and the basic idea behind parallel computing
  • +
  • A rough classification of hardware models
  • +
  • Shared memory and distributed memory
  • +
  • Different parallel programming paradigms
  • +
  • Different parallel programming paradigms
  • +
  • What is vectorization?
  • +
  • Number of elements that can acted upon
  • +
  • Number of elements that can acted upon, examples
  • +
  • Operation counts for scalar operation
  • +
  • Number of elements that can acted upon, examples
  • +
  • Number of operations when vectorized
  • +
  • "A simple test case with and without vectorization":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp"
  • +
  • Compiling with and without vectorization
  • +
  • Compiling with and without vectorization using clang
  • +
  • Automatic vectorization and vectorization inhibitors, criteria
  • +
  • Automatic vectorization and vectorization inhibitors, exit criteria
  • +
  • Automatic vectorization and vectorization inhibitors, straight-line code
  • +
  • Automatic vectorization and vectorization inhibitors, nested loops
  • +
  • Automatic vectorization and vectorization inhibitors, function calls
  • +
  • Automatic vectorization and vectorization inhibitors, data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, more data dependencies
  • +
  • Automatic vectorization and vectorization inhibitors, memory stride
  • +
  • Memory management
  • +
  • Memory and communication
  • +
  • Measuring performance
  • +
  • Problems with measuring time
  • +
  • Problems with cold start
  • +
  • Problems with smart compilers
  • +
  • Problems with interference
  • +
  • Problems with measuring performance
  • +
  • Thomas algorithm for tridiagonal linear algebra equations
  • +
  • Thomas algorithm, forward substitution
  • +
  • Thomas algorithm, backward substitution
  • +
  • Thomas algorithm and counting of operations (floating point and memory)
  • +
  • "Example: Transpose of a matrix":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp"
  • +
  • How do we define speedup? Simplest form
  • +
  • How do we define speedup? Correct baseline
  • +
  • Parallel speedup
  • +
  • Speedup and memory
  • +
  • Upper bounds on speedup
  • +
  • Amdahl's law
  • +
  • How much is parallelizable
  • +
  • Today's situation of parallel computing
  • +
  • Overhead present in parallel computing
  • +
  • Parallelizing a sequential algorithm
  • +
  • Strategies
  • +
  • How do I run MPI on a PC/Laptop? MPI
  • +
  • Can I do it on my own PC/laptop? OpenMP installation
  • +
  • Installing MPI
  • +
  • Installing MPI and using Qt
  • +
  • What is Message Passing Interface (MPI)?
  • +
  • Going Parallel with MPI
  • +
  • MPI is a library
  • +
  • Bindings to MPI routines
  • +
  • Communicator
  • +
  • Some of the most important MPI functions
  • +
  • "The first MPI C/C++ program":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp"
  • +
  • The Fortran program
  • +
  • Note 1
  • +
  • "Ordered output with MPIBarrier":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp"
  • +
  • Note 2
  • +
  • "Ordered output":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp"
  • +
  • Note 3
  • +
  • Note 4
  • +
  • "Numerical integration in parallel":"https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp"
  • +
  • Dissection of trapezoidal rule with \( MPI\_reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Integrating with MPI
  • +
  • How do I use \( MPI\_reduce \)?
  • +
  • More on \( MPI\_Reduce \)
  • +
  • Dissection of trapezoidal rule
  • +
  • Dissection of trapezoidal rule
  • +
  • "The quantum dot program for two electrons":"https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp"
  • +
  • What is OpenMP
  • +
  • Getting started, things to remember
  • +
  • OpenMP syntax
  • +
  • Different OpenMP styles of parallelism
  • +
  • General code structure
  • +
  • Parallel region
  • +
  • Hello world, not again, please!
  • +
  • Hello world, yet another variant
  • +
  • Important OpenMP library routines
  • +
  • Private variables
  • +
  • Master region
  • +
  • Parallel for loop
  • +
  • Parallel computations and loops
  • +
  • Scheduling of loop computations
  • +
  • Example code for loop scheduling
  • +
  • Example code for loop scheduling, guided instead of dynamic
  • +
  • More on Parallel for loop
  • +
  • What can happen with this loop?
  • +
  • Inner product
  • +
  • Different threads do different tasks
  • +
  • Single execution
  • +
  • Coordination and synchronization
  • +
  • Data scope
  • +
  • Some remarks
  • +
  • Parallelizing nested for-loops
  • +
  • Nested parallelism
  • +
  • Parallel tasks
  • +
  • Common mistakes
  • +
  • Not all computations are simple
  • +
  • Not all computations are simple, competing threads
  • +
  • How to find the max value using OpenMP
  • +
  • Then deal with the race conditions
  • +
  • What can slow down OpenMP performance?
  • +
  • What can slow down OpenMP performance?
  • +
  • Find the max location for each thread
  • +
  • Combine the values from each thread
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp"
  • +
  • "Matrix-matrix multiplication":"https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp"
  • @@ -193,7 +689,7 @@

    March 11-15

  • 9
  • 10
  • ...
  • -
  • 22
  • +
  • 141
  • »
  • diff --git a/doc/pub/week9/html/week9-reveal.html b/doc/pub/week9/html/week9-reveal.html index e6b89bc5..fcf34527 100644 --- a/doc/pub/week9/html/week9-reveal.html +++ b/doc/pub/week9/html/week9-reveal.html @@ -209,15 +209,7 @@

    Overview of week 11, March 11-15

    - -
    -Teaching Material, videos and written material -

    -

    -
    +

Note that these notes contain additional material on optimization and parallelization. Parts of this material will be discussed this week.

    @@ -437,9 +429,9 @@

    Introducing the correlation functi

    Resampling methods: Blocking

The blocking method was made popular by Flyvbjerg and Petersen (1989) -and has become one of the standard ways to estimate -\( V(\widehat{\theta}) \) for exactly one \( \widehat{\theta} \), namely -\( \widehat{\theta} = \overline{X} \). +and has become one of the standard ways to estimate the variance +\( \mathrm{var}(\widehat{\theta}) \) for exactly one estimator \( \widehat{\theta} \), namely +\( \widehat{\theta} = \overline{X} \), the mean value.

    Assume \( n = 2^d \) for some integer \( d>1 \) and \( X_1,X_2,\cdots, X_n \) is a stationary time series to begin with. @@ -579,11 +571,15 @@

    Blocking Transformations, fi \end{align} $$

     
    +

    + +
    +

    More on the blocking method

    Flyvbjerg and Petersen demonstrated that the sequence \( \{e_k\}_{k=0}^{d-1} \) is decreasing, and conjecture that the term \( e_k \) can be made as small as we would like by making \( k \) (and hence -\( d \)) sufficiently large. The sequence is decreasing (Master of Science thesis by Marius Jonsson, UiO 2018). +\( d \)) sufficiently large. The sequence is decreasing. It means we can apply blocking transformations until \( e_k \) is sufficiently small, and then estimate \( \mathrm{var}(\overline{X}) \) by \( \widehat{\sigma}^2_k/n_k \). @@ -919,6 +915,5206 @@

    Resampling analysis

    +
    +

    Content

    + +
    + +
    +

    Optimization and profiling

    +
    + +

    + +

Until now we have not paid much attention to speed and to the optimization options inherent in the various compilers. We have compiled and linked as

    + + +
    +
    +
    +
    +
    +
    c++  -c  mycode.cpp
    +c++  -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

For Fortran, replace the compiler with, for example, gfortran or ifort. This is what we call a flat compiler option and it should be used when we develop the code. It normally produces a large and slow executable when translated to machine instructions. We use this option for debugging and for establishing the correct program output, because every operation is done precisely as the user specified it.

    + +

    It is instructive to look up the compiler manual for further instructions by writing

    + + +
    +
    +
    +
    +
    +
    man c++
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    More on optimization

    +
    + +

    +

    We have additional compiler options for optimization. These may include procedure inlining where +performance may be improved, moving constants inside loops outside the loop, +identify potential parallelism, include automatic vectorization or replace a division with a reciprocal +and a multiplication if this speeds up the code. +

    + + +
    +
    +
    +
    +
    +
    c++  -O3 -c  mycode.cpp
    +c++  -O3 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    This (other options are -O2 or -Ofast) is the recommended option.

    +
    +
    + +
    +

    Optimization and profiling

    +
    + +

    +

    It is also useful to profile your program under the development stage. +You would then compile with +

    + + +
    +
    +
    +
    +
    +
    c++  -pg -O3 -c  mycode.cpp
    +c++  -pg -O3 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    After you have run the code you can obtain the profiling information via

    + + +
    +
    +
    +
    +
    +
    gprof mycode.exe >  ProfileOutput
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

When you have profiled your code properly, you must take out this option, as it slows down performance. For memory tests, use valgrind. An excellent environment for all these aspects, and much more, is Qt Creator.
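For the memory tests mentioned above, typical Valgrind invocations (the executable name is just an example) are

valgrind --tool=memcheck --leak-check=full ./mycode.exe
valgrind --tool=callgrind ./mycode.exe   # call-graph profile, inspect with kcachegrind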

    +
    +
    + +
    +

    Optimization and debugging

    +
    + +

    +

    Adding debugging options is a very useful alternative under the development stage of a program. +You would then compile with +

    + + +
    +
    +
    +
    +
    +
    c++  -g -O0 -c  mycode.cpp
    +c++  -g -O0 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

This option generates debugging information, allowing you to trace, for example, whether an array is properly allocated. Some compilers work best with the no-optimization option -O0.
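With the debugging information in place you can run the program under a debugger such as gdb; a minimal session (commands only, output omitted) looks like

gdb ./mycode.exe
(gdb) run
(gdb) backtrace    # where did the program stop or crash?
(gdb) quit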

    +
    + +
    +Other optimization flags +

    +

Depending on the compiler, one can add flags which generate code that catches integer overflow errors. The flag -ftrapv does this for the Clang compiler on OS X.
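Closely related are the sanitizers available in recent GCC and Clang releases, which instrument the executable to catch memory errors and undefined behaviour at run time, for example

c++ -g -O1 -fsanitize=address,undefined -o mycode.exe mycode.cpp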

    +
    +
    + +
    +

    Other hints

    +
    + +

    +

    In general, irrespective of compiler options, it is useful to

    + +

    +

    Here is an example of a part of a program where specific operations lead to a slower code

    + + +
    +
    +
    +
    +
    +
    k = n-1;
    +for (i = 0; i < n; i++){
    +    a[i] = b[i] +c*d;
    +    e = g[k];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    A better code is

    + + +
    +
    +
    +
    +
    +
    temp = c*d;
    +for (i = 0; i < n; i++){
    +    a[i] = b[i] + temp;
    +}
    +e = g[n-1];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Here we avoid a repeated multiplication inside a loop. +Most compilers, depending on compiler flags, identify and optimize such bottlenecks on their own, without requiring any particular action by the programmer. However, it is always useful to single out and avoid code examples like the first one discussed here. +

    +
    +
    + +
    +

    Vectorization and the basic idea behind parallel computing

    +
    + +

    +

    Present CPUs are highly parallel processors with varying levels of parallelism. The typical situation can be described via the following three statements.

    + +

    +

    Before we proceed with a more detailed discussion of topics like vectorization and parallelization, we need to remind ourselves about some basic features of different hardware models.

    +
    +
    + +
    +

    A rough classification of hardware models

    +
    + +

    + +

    +
    +
    + +
    +

    Shared memory and distributed memory

    +
    + +

    +

    One way of categorizing modern parallel computers is to look at the memory configuration.

    + +

    +

    The CPUs are connected by some network and may exchange messages.

    +
    +
    + +
    +

    Different parallel programming paradigms

    +
    + +

    + +

    +
    +
    + +
    +

    Different parallel programming paradigms

    +
    + +

    + +

    +
    +
    + +
    +

    What is vectorization?

    +

Vectorization is a special case of Single Instruction Multiple Data (SIMD) processing, denoting a single instruction stream capable of operating on multiple data elements in parallel. We can think of vectorization as the unrolling of loops accompanied by SIMD instructions.

    + +

Vectorization is the process of converting an algorithm that performs scalar operations (typically one operation at a time) to vector operations, where a single operation acts on many data elements simultaneously. Consider the following example

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

If the code is not vectorized, the compiler will simply start with the first element and then perform subsequent additions operating on one address in memory at a time.

    +
    + +
    +

Number of elements that can be acted upon

    +

    A SIMD instruction can operate on multiple data elements in one single instruction. +It uses the so-called 128-bit SIMD floating-point register. +In this sense, vectorization adds some form of parallelism since one instruction is applied +to many parts of say a vector. +

    + +

The number of elements which can be operated on in parallel ranges from four single-precision floating-point data elements in the so-called Streaming SIMD Extensions and two double-precision floating-point data elements in Streaming SIMD Extensions 2, to sixteen byte operations in a 128-bit register in Streaming SIMD Extensions 2. Thus, the vector length ranges from 2 to 16, depending on the instruction extensions used and on the data type.

    + +

In summary, our instructions operate on 128-bit (16-byte) operands.
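As a small illustration of what such an instruction does (our own example, using SSE intrinsics rather than relying on the auto-vectorizer), four single-precision numbers in a 128-bit register are added in one instruction:

// Four float additions with one SSE instruction.
#include <xmmintrin.h>   // SSE intrinsics
#include <cstdio>

int main()
{
  float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float b[4] = {10.0f, 20.0f, 30.0f, 40.0f};
  float c[4];
  __m128 va = _mm_loadu_ps(a);     // load four floats into a 128-bit register
  __m128 vb = _mm_loadu_ps(b);
  __m128 vc = _mm_add_ps(va, vb);  // one instruction, four additions
  _mm_storeu_ps(c, vc);
  printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);
  return 0;
}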

    + +
    + +
    +

Number of elements that can be acted upon, examples

    +

    We start with the simple scalar operations given by

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

If the code is not vectorized and we have a 128-bit register to store a 32-bit floating-point number, it means that we have \( 3\times 32 \) bits that are not used.

    + +

    We have thus unused space in our SIMD registers. These registers could hold three additional integers.

    +
    + +
    +

    Operation counts for scalar operation

    +

    The code

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    has for \( n \) repeats

    +
      +

    1. one load for \( c[i] \) in address 1
    2. +

    3. one load for \( b[i] \) in address 2
    4. +

    5. add \( c[i] \) and \( b[i] \) to give \( a[i] \)
    6. +

    7. store \( a[i] \) in address 2
    8. +
    +
    + +
    +

Number of elements that can be acted upon, examples

    +

    If we vectorize the code, we can perform, with a 128-bit register four simultaneous operations, that is +we have +

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i+=4){
    +    a[i] = b[i] + c[i];
    +    a[i+1] = b[i+1] + c[i+1];
    +    a[i+2] = b[i+2] + c[i+2];
    +    a[i+3] = b[i+3] + c[i+3];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Four additions are now done in a single step.

    +
    + +
    +

    Number of operations when vectorized

    +

    For \( n/4 \) repeats assuming floats or integers

    +
      +

    1. one vector load for \( c[i] \) in address 1
    2. +

    3. one load for \( b[i] \) in address 2
    4. +

    5. add \( c[i] \) and \( b[i] \) to give \( a[i] \)
    6. +

    7. store \( a[i] \) in address 2
    8. +
    +
    + +
    +

    A simple test case with and without vectorization

    +

We implement these operations in a simple C++ program that, at the end, computes the norm of a vector.

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double s = 1.0/sqrt( (double) n);
    +  double *a, *b, *c;
    +  // Start timing
    +  clock_t start, finish;
    +  start = clock();
    +// Allocate space for the vectors to be used
    +    a = new double [n]; b = new double [n]; c = new double [n];
    +  // Define parallel region
    +  // Set up values for vectors  a and b
    +  for (int i = 0; i < n; i++){
    +    double angle = 2.0*M_PI*i/ (( double ) n);
    +    a[i] = s*(sin(angle) + cos(angle));
    +    b[i] =  s*sin(2.0*angle);
    +    c[i] = 0.0;
    +  }
    +  // Then perform the vector addition
    +  for (int i = 0; i < n; i++){
    +    c[i] += a[i]+b[i];
    +  }
    +  // Compute now the norm-2
    +  double Norm2 = 0.0;
    +  for (int i = 0; i < n; i++){
    +    Norm2  += c[i]*c[i];
    +  }
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for norm computation=" << timeused  << endl;
    +  cout << "  Norm-2  = " << Norm2 << endl;
    +  // Free up space
    +  delete[] a;
    +  delete[] b;
    +  delete[] c;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Compiling with and without vectorization

    +

We can compile and link without vectorization using the Clang C++ compiler

    + + +
    +
    +
    +
    +
    +
clang++ -o novec.x vecexample.cpp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and with vectorization (and additional optimizations)

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

The speedup depends on the size of the vectors. In the example here we have run with \( 10^7 \) elements. The runs were performed on an iMac17.1 with OS X El Capitan (10.11.4) as operating system and an Intel i5 3.3 GHz CPU.

    + + +
    +
    +
    +
    +
    +
    Compphys:~ hjensen$ ./vec.x 10000000
    +Time used  for norm computation=0.04720500000
    +Compphys:~ hjensen$ ./novec.x 10000000
    +Time used  for norm computation=0.03311700000
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

This particular C++ compiler speeds up the above loop operations by a factor of 1.5. Performing the same operations for \( 10^9 \) elements results in a smaller speedup, since reading from main memory is required; there the non-vectorized code is seemingly faster.

    + + +
    +
    +
    +
    +
    +
    Compphys:~ hjensen$ ./vec.x 1000000000
    +Time used  for norm computation=58.41391100
    +Compphys:~ hjensen$ ./novec.x 1000000000
    +Time used  for norm computation=46.51295300
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    We will discuss these issues further in the next slides.

    +
    + +
    +

    Compiling with and without vectorization using clang

    +

    We can compile and link without vectorization with clang compiler

    + + +
    +
    +
    +
    +
    +
clang++ -O3 -fno-vectorize -o novec.x vecexample.cpp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and with vectorization

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    We can also add vectorization analysis, see for example

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass-analysis=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    or figure out if vectorization was missed

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass-missed=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Automatic vectorization and vectorization inhibitors, criteria

    + +

    Not all loops can be vectorized, as discussed in Intel's guide to vectorization

    + +

An important criterion is that the loop counter \( n \) is known at the entry of the loop.

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    a[j] = cos(j*1.0);
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

The variable \( n \) does not need to be known at compile time. However, it must stay the same for the entire duration of the loop. This implies that an exit statement inside the loop cannot be data dependent.

    +
    + +
    +

    Automatic vectorization and vectorization inhibitors, exit criteria

    + +

    An exit statement should in general be avoided. +If the exit statement contains data-dependent conditions, the loop cannot be vectorized. +The following is an example of a non-vectorizable loop +

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    a[j] = cos(j*1.0);
    +    if (a[j] < 0 ) break;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Avoid data-dependent loop termination conditions and opt for a single loop counter \( n \). The lower and upper bounds have to be kept fixed within the loop.

    +
    + +
    +

    Automatic vectorization and vectorization inhibitors, straight-line code

    + +

SIMD instructions perform the same type of operation multiple times. A switch statement thus leads to a non-vectorizable loop, since different iterations cannot branch to different statements. The following code can however be vectorized, since the if statement is implemented as a masked assignment.

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    double x  = cos(j*1.0);
    +    if (x > 0 ) {
    +       a[j] =  x*sin(j*2.0); 
    +    }
    +    else {
    +       a[j] = 0.0;
    +    }
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

These operations can be performed for all data elements, but only those elements for which the mask evaluates as true are stored. In general, one should avoid branches such as switch, goto, or return statements, or if constructs that cannot be treated as masked assignments.

    +
    + +
    +

    Automatic vectorization and vectorization inhibitors, nested loops

    + +

    Only the innermost loop of the following example is vectorized

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      for (int j = 0; j < n; j++) {
    +           a[i][j] += b[i][j];
    +      }  
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The exception is if an original outer loop is transformed into an inner loop as the result of compiler optimizations.

    +
    + +
    +

    Automatic vectorization and vectorization inhibitors, function calls

    + +

Calls to programmer-defined functions ruin vectorization. However, calls to intrinsic functions like \( \sin{x} \), \( \cos{x} \), \( \exp{x} \) etc. are allowed since they are normally efficiently vectorized. The following example is fully vectorizable

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      a[i] = log10(i)*cos(i);
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Similarly, inline functions defined by the programmer allow for vectorization, since the function statements are glued into the actual place where the function is called.
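A minimal sketch of the latter point (our own example): once square is inlined, the loop body is straight-line code that the compiler can vectorize.

// The call is inlined, so no function-call overhead blocks vectorization.
inline double square(double x) { return x*x + 1.0; }

void fill(double *a, const double *b, int n)
{
  for (int i = 0; i < n; i++) {
    a[i] = square(b[i]);
  }
}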

    +
    + +
    +

    Automatic vectorization and vectorization inhibitors, data dependencies

    + +

    One has to keep in mind that vectorization changes the order of operations inside a loop. A so-called +read-after-write statement with an explicit flow dependency cannot be vectorized. The following code +

    + + +
    +
    +
    +
    +
    +
      double b = 15.;
    +  for (int i = 1; i < n; i++) {
    +      a[i] = a[i-1] + b;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

is an example of a flow dependency and gives wrong numerical results if vectorized. For a scalar operation, the value \( a[i-1] \) computed in the previous iteration is loaded into the right-hand side and the results are fine. In vector mode however, with a vector length of four, the values \( a[0] \), \( a[1] \), \( a[2] \) and \( a[3] \) from before the update will be loaded into the right-hand side and produce wrong results. That is, we have

    + + +
    +
    +
    +
    +
    +
       a[1] = a[0] + b;
    +   a[2] = a[1] + b;
    +   a[3] = a[2] + b;
    +   a[4] = a[3] + b;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

and if the first two iterations are executed at the same time by the SIMD instruction, the value of say \( a[1] \) could be used by the second iteration before it has been calculated by the first iteration, thereby leading to wrong results.

    +
    + +
    +

    Automatic vectorization and vectorization inhibitors, more data dependencies

    + +

    On the other hand, a so-called +write-after-read statement can be vectorized. The following code +

    + + +
    +
    +
    +
    +
    +
      double b = 15.;
    +  for (int i = 1; i < n; i++) {
    +      a[i-1] = a[i] + b;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

is an example of an anti-dependency (write-after-read) and can be vectorized, since no iteration with a higher value of \( i \) can complete before an iteration with a lower value of \( i \). However, such code leads to problems with parallelization.

    +
    + +
    +

    Automatic vectorization and vectorization inhibitors, memory stride

    + +

For C++ programmers it is also worth keeping in mind that array notation is preferred to the more compact use of pointers to access array elements, since with pointers the compiler can often not tell whether it is safe to vectorize the code.

    + +

When dealing with arrays, you should also avoid strided memory access, since this slows down vectorization considerably. When you access array elements, write the inner loop with unit stride, that is, access successive array elements in memory, as shown here

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      for (int j = 0; j < n; j++) {
    +           a[i][j] += b[i][j];
    +      }  
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
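For contrast, the following variant (our own example) runs over the same data with a non-unit stride in the inner loop; each inner iteration then jumps a whole row ahead in memory, which hampers both vectorization and cache reuse.

// Column-wise (strided) access: avoid this ordering for row-major C++ arrays.
void add_strided(double **a, double **b, int n)
{
  for (int j = 0; j < n; j++) {
    for (int i = 0; i < n; i++) {
      a[i][j] += b[i][j];
    }
  }
}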
    +

    Memory management

    +

    The main memory contains the program data

    +
      +

    1. Cache memory contains a copy of the main memory data
    2. +

    3. Cache is faster but consumes more space and power. It is normally assumed to be much faster than main memory
    4. +

    5. Registers contain working data only
    6. + +

      +

    7. Multiple Cache memories contain a copy of the main memory data
    8. + +

      +

    +

    +

    Loads and stores to memory can be as important as floating point operations when we measure performance.

    +
    + +
    +

    Memory and communication

    + +
      +

    1. Most communication in a computer is carried out in chunks, blocks of bytes of data that move together
    2. +

    3. In the memory hierarchy, data moves between memory and cache, and between different levels of cache, in groups called lines
    4. + +

      +

    +

    +

    Many of these performance features are not captured in most programming languages.

    +
    + +
    +

    Measuring performance

    + +

    How do we measure performance? What is wrong with this code to time a loop?

    + + +
    +
    +
    +
    +
    +
      clock_t start, finish;
    +  start = clock();
    +  for (int j = 0; j < i; j++) {
    +    a[j] = b[j]+b[j]*c[j];
    +  }
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Problems with measuring time

    +
      +

    1. Timers are not infinitely accurate
    2. +

    3. All clocks have a granularity, the minimum time that they can measure
    4. +

    5. The error in a time measurement, even if everything is perfect, may be the size of this granularity (sometimes called a clock tick)
    6. +

    7. Always know what your clock granularity is
    8. +

    9. Ensure that your measurement is for a long enough duration (say 100 times the tick)
    10. +
    +
    + +
    +

    Problems with cold start

    + +

    What happens when the code is executed? The assumption is that the code is ready to +execute. But +

    +
      +

    1. Code may still be on disk, and not even read into memory.
    2. +

    3. Data may be in slow memory rather than fast (which may be wrong or right for what you are measuring)
    4. +

    5. Multiple tests often necessary to ensure that cold start effects are not present
    6. +

    7. Special effort often required to ensure data in the intended part of the memory hierarchy.
    8. +
    +
    + +
    +

    Problems with smart compilers

    + +
      +

    1. If the result of the computation is not used, the compiler may eliminate the code
    2. +

    3. Performance will look impossibly fantastic
    4. +

5. Even worse, the compiler may eliminate only some of the code, so the performance looks plausible
    6. +

    7. Ensure that the results are (or may be) used.
    8. +
    +
    + +
    +

    Problems with interference

    +
      +

    1. Other activities are sharing your processor
    2. + +

      +

    3. Make multiple tests and report
    4. +

    5. Easy choices include
    6. + +

      +

    +
    + +
    +

    Problems with measuring performance

    +
      +

    1. Accurate, reproducible performance measurement is hard
    2. +

    3. Think carefully about your experiment:
    4. +

    5. What is it, precisely, that you want to measure?
    6. +

7. How representative is your test of the situation that you are trying to measure? (A sketch of a more careful timing setup follows below.)
    8. +
    +
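A sketch of a more careful timing setup (our own example, using std::chrono) that addresses several of the points above: warm up first, repeat the measurement, and use the result so the compiler cannot remove the computation.

// Repeated, warmed-up timing of a simple loop.
#include <chrono>
#include <cstdio>
#include <vector>

int main()
{
  const int n = 1000000, repeats = 10;
  std::vector<double> a(n), b(n, 1.5), c(n, 2.5);
  // Warm-up pass to reduce cold-start effects (page faults, cold caches)
  for (int j = 0; j < n; j++) a[j] = b[j] + b[j]*c[j];
  double best = 1.0e30, sum = 0.0;
  for (int r = 0; r < repeats; r++) {
    auto start = std::chrono::steady_clock::now();
    for (int j = 0; j < n; j++) a[j] = b[j] + b[j]*c[j];
    auto finish = std::chrono::steady_clock::now();
    double t = std::chrono::duration<double>(finish - start).count();
    if (t < best) best = t;
    sum += t;
  }
  // Using a[0] in the output prevents the compiler from removing the loops
  printf("best = %g s, average = %g s, a[0] = %g\n", best, sum/repeats, a[0]);
  return 0;
}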
    + +
    +

    Thomas algorithm for tridiagonal linear algebra equations

    +
    + +

    +

     
    +$$ +\left( \begin{array}{ccccc} + b_0 & c_0 & & & \\ + a_0 & b_1 & c_1 & & \\ + & & \ddots & & \\ + & & a_{m-3} & b_{m-2} & c_{m-2} \\ + & & & a_{m-2} & b_{m-1} + \end{array} \right) +\left( \begin{array}{c} + x_0 \\ + x_1 \\ + \vdots \\ + x_{m-2} \\ + x_{m-1} + \end{array} \right)=\left( \begin{array}{c} + f_0 \\ + f_1 \\ + \vdots \\ + f_{m-2} \\ + f_{m-1} \\ + \end{array} \right) +$$ +

     
    +

    +
    + +
    +

    Thomas algorithm, forward substitution

    +
    + +

    +

    The first step is to multiply the first row by \( a_0/b_0 \) and subtract it from the second row. This is known as the forward substitution step. We then obtain

    +

     
    +$$ + a_i = 0, +$$ +

     
    + +

     
    +$$ + b_i = b_i - \frac{a_{i-1}}{b_{i-1}}c_{i-1}, +$$ +

     
    + +

    and

    +

     
    +$$ + f_i = f_i - \frac{a_{i-1}}{b_{i-1}}f_{i-1}. +$$ +

     
    + +

    At this point the simplified equation, with only an upper triangular matrix takes the form

    +

     
    +$$ +\left( \begin{array}{ccccc} + b_0 & c_0 & & & \\ + & b_1 & c_1 & & \\ + & & \ddots & & \\ + & & & b_{m-2} & c_{m-2} \\ + & & & & b_{m-1} + \end{array} \right)\left( \begin{array}{c} + x_0 \\ + x_1 \\ + \vdots \\ + x_{m-2} \\ + x_{m-1} + \end{array} \right)=\left( \begin{array}{c} + f_0 \\ + f_1 \\ + \vdots \\ + f_{m-2} \\ + f_{m-1} \\ + \end{array} \right) +$$ +

     
    +

    +
    + +
    +

    Thomas algorithm, backward substitution

    +
    + +

    +

    The next step is the backward substitution step. The last row is multiplied by \( c_{m-2}/b_{m-1} \) and subtracted from the second-to-last row, thus eliminating \( c_{m-2} \) from the second-to-last row. The general backward substitution procedure is

    +

     
    +$$ + c_i = 0, +$$ +

     
    + +

    and

    +

     
    +$$ + f_{i-1} = f_{i-1} - \frac{c_{i-1}}{b_i}f_i +$$ +

     
    + +

    All that remains to be computed is the solution, which is obtained by the very straightforward process of

    +

     
    +$$ +x_i = \frac{f_i}{b_i} +$$ +

     
    +

    +
    + +
    +

    Thomas algorithm and counting of operations (floating point and memory)

    +
    + +

    + +

    For this specific case we have the following floating-point operations (see the code below)

    + + +
    + + +
    + +

    + + +

    +
    +
    +
    +
    +
    // Forward substitution    
    +// Note that we can simplify by precalculating a[i-1]/b[i-1]
    +  for (int i=1; i < n; i++) {
    +     b[i] = b[i] - (a[i-1]*c[i-1])/b[i-1];
    +     f[i] = f[i] - (a[i-1]*f[i-1])/b[i-1];
    +  }
    +  x[n-1] = f[n-1] / b[n-1];
    +  // Backwards substitution                                                           
    +  for (int i = n-2; i >= 0; i--) {
    +     f[i] = f[i] - c[i]*f[i+1]/b[i+1];
    +     x[i] = f[i]/b[i];
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Example: Transpose of a matrix

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double **A, **B;
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n];
    +  for (int i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +  }
    +  // Set up values for matrix A
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      A[i][j] =  cos(i*1.0)*sin(j*3.0);
    +    }
    +  }
    +  clock_t start, finish;
    +  start = clock();
    +  // Then compute the transpose
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      B[i][j]= A[j][i];
    +    }
    +  }
    +
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for setting up transpose of matrix=" << timeused  << endl;
    +
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Matrix-matrix multiplication

    +

    This is the matrix-matrix multiplication code with plain C++ memory allocation. At the end it computes the Frobenius norm.

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double s = 1.0/sqrt( (double) n);
    +  double **A, **B, **C;
    +  // Start timing
    +  clock_t start, finish;
    +  start = clock();
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n]; C = new double*[n];
    +  for (int i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +    C[i] = new double[n];
    +  }
    +  // Set up values for matrix A and B and zero matrix C
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      double angle = 2.0*M_PI*i*j/ (( double ) n);
    +      A[i][j] = s * ( sin ( angle ) + cos ( angle ) );
    +      B[j][i] =  A[i][j];
    +    }
    +  }
    +  // Then perform the matrix-matrix multiplication
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      double sum = 0.0;
    +       for (int k = 0; k < n; k++) {
    +           sum += B[i][k]*A[k][j];
    +       }
    +       C[i][j] = sum;
    +    }
    +  }
    +  // Compute now the Frobenius norm
    +  double Fsum = 0.0;
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      Fsum += C[i][j]*C[i][j];
    +    }
    +  }
    +  Fsum = sqrt(Fsum);
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for matrix-matrix multiplication=" << timeused  << endl;
    +  cout << "  Frobenius norm  = " << Fsum << endl;
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +    delete[] C[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  delete[] C;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    How do we define speedup? Simplest form

    +
    + +

    +

    +
    +
    + +
    +

    How do we define speedup? Correct baseline

    +
    + +

    +

    The key is choosing the correct baseline for comparison

    + +
    +
    + +
    +

    Parallel speedup

    +
    + +

    +

    For parallel applications, speedup is typically defined as

    + +

    +

    Here \( T_1 \) is the time on one processor and \( T_p \) is the time using \( p \) processors.

    + +
    +
    + +
    +

    Speedup and memory

    +
    + +

    +

    The speedup on \( p \) processors can be greater than \( p \) if memory usage is optimal! Consider the case of a memory-bound computation with \( M \) words of memory.

    + +
    +
    + +
    +

    Upper bounds on speedup

    +
    + +

    +

    Assume that almost all parts of a code are perfectly parallelizable (fraction \( f \)). The remainder, fraction \( (1-f) \), cannot be parallelized at all.

    + +

    That is, there is work that takes time \( W \) on one process; a fraction \( f \) of that work will take +time \( Wf/p \) on \( p \) processors. +

    + +
    +
    + +
    +

    Amdahl's law

    +
    + +

    +

    On one processor we have

    +

     
    +$$ +T_1 = (1-f)W + fW = W +$$ +

     
    + +

    On \( p \) processors we have

    +

     
    +$$ +T_p = (1-f)W + \frac{fW}{p}, +$$ +

     
    + +

    resulting in a speedup of

    +

     
    +$$ +\frac{T_1}{T_p} = \frac{W}{(1-f)W+fW/p} +$$ +

     
    + +

    As \( p \) goes to infinity, \( fW/p \) goes to zero, and the maximum speedup is

    +

     
    +$$ +\frac{1}{1-f}, +$$ +

     
    + +

    meaning that if \( f = 0.99 \) (all but \( 1\% \) parallelizable), the maximum speedup is \( 1/(1-0.99)=100 \)!
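As a small numerical check of Amdahl's law (our own illustration, not part of the lecture programs), the following program prints the speedup \( 1/((1-f)+f/p) \) for \( f=0.99 \) and increasing \( p \); it approaches, but never exceeds, \( 1/(1-f)=100 \):

#include <iostream>

int main()
{
  double f = 0.99;                       // parallelizable fraction
  for (int p = 1; p <= 1024; p *= 2) {
    double speedup = 1.0/((1.0 - f) + f/p);
    std::cout << "p = " << p << "  speedup = " << speedup << std::endl;
  }
  return 0;
}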

    +
    +
    + +
    +

    How much is parallelizable

    +
    + +

    +

    If any non-parallel code slips into the +application, the parallel +performance is limited. +

    + +

    In many simulations, however, the fraction of non-parallelizable work +is \( 10^{-6} \) or less due to large arrays or objects that are perfectly parallelizable. +

    +
    +
    + +
    +

    Today's situation of parallel computing

    +
    + +

    + +

    +

    +

    Our lectures will focus on both MPI and OpenMP.

    +
    +
    + +
    +

    Overhead present in parallel computing

    +
    + +

    + +

    +

    +

    Due to the above overhead, and because certain parts of a sequential algorithm cannot be parallelized, we may not achieve an optimal parallelization.

    +
    +
    + +
    +

    Parallelizing a sequential algorithm

    +
    + +

    + +

    +
    +
    + +
    +

    Strategies

    +
    + +

    +

    +
    +
    + +
    +

    How do I run MPI on a PC/Laptop? MPI

    +
    + +

    +

    Installing MPI is rather easy on hardware running Unix/Linux as operating system; simply follow the instructions on the OpenMPI website. See also the subsequent slides. When you have made sure that MPI is installed on your PC/laptop, compile and run as follows.

    + +

    + + +

    +
    +
    +
    +
    +
      # Compile and link
    +  mpic++ -O3 -o nameofprog.x nameofprog.cpp
    +  #  run code with for example 8 processes using mpirun/mpiexec
    +  mpiexec -n 8 ./nameofprog.x
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Can I do it on my own PC/laptop? OpenMP installation

    +
    + +

    +

    If you wish to install MPI and OpenMP +on your laptop/PC, we recommend the following: +

    + + +

    + + +

    +
    +
    +
    +
    +
      brew install libomp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and compile and link as

    + + +
    +
    +
    +
    +
    +
    c++ -o <name executable> <name program.cpp>  -lomp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Installing MPI

    +
    + +

    +

    For Linux/Ubuntu users, you need to install two packages (alternatively, use the Synaptic package manager)

    + + +
    +
    +
    +
    +
    +
      sudo apt-get install libopenmpi-dev
    +  sudo apt-get install openmpi-bin
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    For OS X users, install brew (after having installed Xcode and gcc, needed for the gfortran compiler of OpenMPI) and then install OpenMPI with brew

    + + +
    +
    +
    +
    +
    +
       brew install openmpi
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    When running an executable (code.x), run as

    + + +
    +
    +
    +
    +
    +
      mpirun -n 10 ./code.x
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    where we indicate that we want the number of processes to be 10.

    +
    +
    + +
    +

    Installing MPI and using Qt

    +
    + +

    +

    With openmpi installed, when using Qt, add to your .pro file the instructions here

    + +

    You may need to tell Qt where openmpi is stored.

    +
    +
    + +
    +

    What is Message Passing Interface (MPI)?

    +
    + +

    + +

    MPI is a library, not a language. It specifies the names, calling sequences and results of functions +or subroutines to be called from C/C++ or Fortran programs, and the classes and methods that make up the MPI C++ +library. The programs that users write in Fortran, C or C++ are compiled with ordinary compilers and linked +with the MPI library. +

    + +

    MPI programs should be able to run on all possible machines and with all MPI implementations without change.

    + +

    An MPI computation is a collection of processes communicating with messages.

    +
    +
    + +
    +

    Going Parallel with MPI

    +
    + +

    +

    Task parallelism: the work of a global problem can be divided +into a number of independent tasks, which rarely need to synchronize. +Monte Carlo simulations or numerical integration are examples of this. +

    + +

    MPI is a message-passing library where all the routines +have corresponding C/C++-binding +

    + + +
    +
    +
    +
    +
    +
       MPI_Command_name
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and Fortran-binding (routine names are in uppercase, but can also be in lower case)

    + + +
    +
    +
    +
    +
    +
       MPI_COMMAND_NAME
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    MPI is a library

    +
    + +

    +

    MPI is a library specification for the message passing interface, +proposed as a standard. +

    + + +

    +

    A message passing standard for portability and ease-of-use. +Designed for high performance. +

    + +

    Insert communication and synchronization functions where necessary.

    +
    +
    + +
    +

    Bindings to MPI routines

    +
    + +

    + +

    MPI is a message-passing library where all the routines +have corresponding C/C++-binding +

    + + +
    +
    +
    +
    +
    +
       MPI_Command_name
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and Fortran-binding (routine names are in uppercase, but can also be in lower case)

    + + +
    +
    +
    +
    +
    +
       MPI_COMMAND_NAME
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The discussion in these slides focuses on the C++ binding.

    +
    +
    + +
    +

    Communicator

    +
    + +

    +

    +

    + + +

    +
    +
    +
    +
    +
      MPI_COMM_WORLD 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    + +
    +

    Some of the most important MPI functions

    +
    + +

    + +

    +
    +
    + +
    +

    The first MPI C/C++ program

    +
    + +

    + +

    Let every process write "Hello world" (oh not this program again!!) on the standard output.

    + + +
    +
    +
    +
    +
    +
    using namespace std;
    +#include <mpi.h>
    +#include <iostream>
    +int main (int nargs, char* args[])
    +{
    +int numprocs, my_rank;
    +//   MPI initializations
    +MPI_Init (&nargs, &args);
    +MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    +MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    +cout << "Hello world, I have  rank " << my_rank << " out of " 
    +     << numprocs << endl;
    +//  End MPI
    +MPI_Finalize ();
    +return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    The Fortran program

    +
    + +

    + + +

    +
    +
    +
    +
    +
    PROGRAM hello
    +INCLUDE "mpif.h"
    +INTEGER:: size, my_rank, ierr
    +
    +CALL  MPI_INIT(ierr)
    +CALL MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
    +CALL MPI_COMM_RANK(MPI_COMM_WORLD, my_rank, ierr)
    +WRITE(*,*)"Hello world, I've rank ",my_rank," out of ",size
    +CALL MPI_FINALIZE(ierr)
    +
    +END PROGRAM hello
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Note 1

    +
    + +

    + +

    +
    +
    + +
    +

    Ordered output with MPI_Barrier

    +
    + +

    + + + +

    +
    +
    +
    +
    +
    int main (int nargs, char* args[])
    +{
    + int numprocs, my_rank, i;
    + MPI_Init (&nargs, &args);
    + MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    + MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    + for (i = 0; i < numprocs; i++) {
    +   MPI_Barrier (MPI_COMM_WORLD);
    +   if (i == my_rank) {
    +     cout << "Hello world, I have  rank " << my_rank <<
    +          " out of " << numprocs << endl;
    +   }
    + }
    + MPI_Finalize ();
    + return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Note 2

    +
    + +

    +

    +

    +

    However, this is slightly more time-consuming since the processes synchronize between themselves as many times as there are processes. In the next Hello world example we use the send and receive functions in order to have a synchronized action.

    +
    +
    + +
    +

    Ordered output

    +
    + +

    + + + +

    +
    +
    +
    +
    +
    .....
    +int numprocs, my_rank, flag;
    +MPI_Status status;
    +MPI_Init (&nargs, &args);
    +MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    +MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    +if (my_rank > 0)
    +MPI_Recv (&flag, 1, MPI_INT, my_rank-1, 100, 
    +           MPI_COMM_WORLD, &status);
    +cout << "Hello world, I have  rank " << my_rank << " out of " 
    +<< numprocs << endl;
    +if (my_rank < numprocs-1)
    +MPI_Send (&my_rank, 1, MPI_INT, my_rank+1, 
    +          100, MPI_COMM_WORLD);
    +MPI_Finalize ();
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Note 3

    +
    + +

    + +

    The basic sending of messages is given by the function \( MPI\_SEND \), which in C/C++ +is defined as +

    + + +
    +
    +
    +
    +
    +
    int MPI_Send(void *buf, int count, 
    +             MPI_Datatype datatype, 
    +             int dest, int tag, MPI_Comm comm)
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    This single command allows the passing of any kind of variable, even a large array, to any group of tasks. +The variable buf is the variable we wish to send while count +is the number of variables we are passing. If we are passing only a single value, this should be 1. +

    + +

    If we transfer an array, it is the overall size of the array. +For example, if we want to send a 10 by 10 array, count would be \( 10\times 10=100 \) +since we are actually passing 100 values. +
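As a concrete (hypothetical) example of the paragraph above, sending a contiguous \( 10\times 10 \) array of doubles from rank 0 to rank 1 uses count equal to 100; the receiving side, discussed on the next slide, uses the same count. Here my_rank is assumed set by MPI_Comm_rank as in the earlier examples, and the tag 100 matches the ordered-output example.

double buffer[100];               // a 10 x 10 array stored contiguously
// ... fill buffer on rank 0 ...
if (my_rank == 0) {
  MPI_Send(buffer, 100, MPI_DOUBLE, 1, 100, MPI_COMM_WORLD);
} else if (my_rank == 1) {
  MPI_Status status;
  MPI_Recv(buffer, 100, MPI_DOUBLE, 0, 100, MPI_COMM_WORLD, &status);
}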

    +
    +
    + +
    +

    Note 4

    +
    + +

    + +

    Once you have sent a message, you must receive it on another task. The function \( MPI\_RECV \) +is similar to the send call. +

    + + +
    +
    +
    +
    +
    +
    int MPI_Recv( void *buf, int count, MPI_Datatype datatype, 
    +            int source, 
    +            int tag, MPI_Comm comm, MPI_Status *status )
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The arguments that are different from those in MPI\_SEND are buf, which is the name of the variable where the received data will be stored, and source, which replaces the destination in the send command and is the rank (ID) of the sender.

    + +

    Finally, we have the \( MPI\_Status \) argument status, where one can check whether the receive was completed.

    + +

    The output of this code is the same as the previous example, but now +process 0 sends a message to process 1, which forwards it further +to process 2, and so forth. +

    +
    +
    + +
    +

    Numerical integration in parallel

    +
    +Integrating \( \pi \) +

    + +

    +

    +

     
    +$$ + I=\int_a^bf(x) dx\approx h\left(f(a)/2 + f(a+h) +f(a+2h)+\dots +f(b-h)+ f(b)/2\right). +$$ +

     
    + +

    Click on this link for the full program.

    +
    +
    + +
    +

    Dissection of trapezoidal rule with \( MPI\_reduce \)

    +
    + +

    + + + +

    +
    +
    +
    +
    +
    //    Trapezoidal rule and numerical integration using MPI
    +using namespace std;
    +#include <mpi.h>
    +#include <iostream>
    +
    +//     Here we define various functions called by the main program
    +
    +double int_function(double );
    +double trapezoidal_rule(double , double , int , double (*)(double));
    +
    +//   Main function begins here
    +int main (int nargs, char* args[])
    +{
    +  int n, local_n, numprocs, my_rank; 
    +  double a, b, h, local_a, local_b, total_sum, local_sum;   
    +  double  time_start, time_end, total_time;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Dissection of trapezoidal rule

    +
    + +

    + + + +

    +
    +
    +
    +
    +
      //  MPI initializations
    +  MPI_Init (&nargs, &args);
    +  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    +  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    +  time_start = MPI_Wtime();
    +  //  Fixed values for a, b and n 
    +  a = 0.0 ; b = 1.0;  n = 1000;
    +  h = (b-a)/n;    // h is the same for all processes 
    +  local_n = n/numprocs;  
    +  // make sure n > numprocs, else integer division gives zero
    +  // Length of each process' interval of
    +  // integration = local_n*h.  
    +  local_a = a + my_rank*local_n*h;
    +  local_b = local_a + local_n*h;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Integrating with MPI

    +
    + +

    + + + +

    +
    +
    +
    +
    +
      total_sum = 0.0;
    +  local_sum = trapezoidal_rule(local_a, local_b, local_n, 
    +                               &int_function); 
    +  MPI_Reduce(&local_sum, &total_sum, 1, MPI_DOUBLE, 
    +              MPI_SUM, 0, MPI_COMM_WORLD);
    +  time_end = MPI_Wtime();
    +  total_time = time_end-time_start;
    +  if ( my_rank == 0) {
    +    cout << "Trapezoidal rule = " <<  total_sum << endl;
    +    cout << "Time = " <<  total_time  
    +         << " on number of processors: "  << numprocs  << endl;
    +  }
    +  // End MPI
    +  MPI_Finalize ();  
    +  return 0;
    +}  // end of main program
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    How do I use \( MPI\_reduce \)?

    +
    + +

    + +

    Here we have used

    + + +
    +
    +
    +
    +
    +
    MPI_Reduce( void *senddata, void* resultdata, int count, 
    +     MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm)
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The two variables \( senddata \) and \( resultdata \) are obvious, besides the fact that one sends the address +of the variable or the first element of an array. If they are arrays they need to have the same size. +The variable \( count \) represents the total dimensionality, 1 in case of just one variable, +while \( MPI\_Datatype \) +defines the type of variable which is sent and received. +

    + +

    The new feature is \( MPI\_Op \). It defines the type +of operation we want to do. +

    +
    +
    + +
    +

    More on \( MPI\_Reduce \)

    +
    + +

    +

    In our case, since we are summing the rectangle contributions from every process, we define \( MPI\_Op = MPI\_SUM \). If we have an array or matrix we can search for the largest or smallest element by sending either \( MPI\_MAX \) or \( MPI\_MIN \). If we want the location as well (which array element) we simply transfer \( MPI\_MAXLOC \) or \( MPI\_MINLOC \). If we want the product we write \( MPI\_PROD \).

    + +

    \( MPI\_Allreduce \) is defined as

    + + +
    +
    +
    +
    +
    +
    MPI_Allreduce( void *senddata, void* resultdata, int count, 
    +          MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Dissection of trapezoidal rule

    +
    + +

    + +

    We use \( MPI\_reduce \) to collect data from each process. Note also the use of the function +\( MPI\_Wtime \). +

    + + +
    +
    +
    +
    +
    +
    //  this function defines the function to integrate
    +double int_function(double x)
    +{
    +  double value = 4./(1.+x*x);
    +  return value;
    +} // end of function to evaluate
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Dissection of trapezoidal rule

    +
    + +

    + + +

    +
    +
    +
    +
    +
    //  this function defines the trapezoidal rule
    +double trapezoidal_rule(double a, double b, int n, 
    +                         double (*func)(double))
    +{
    +  double trapez_sum;
    +  double fa, fb, x, step;
    +  int    j;
    +  step=(b-a)/((double) n);
    +  fa=(*func)(a)/2. ;
    +  fb=(*func)(b)/2. ;
    +  trapez_sum=0.;
    +  for (j=1; j <= n-1; j++){
    +    x=j*step+a;
    +    trapez_sum+=(*func)(x);
    +  }
    +  trapez_sum=(trapez_sum+fb+fa)*step;
    +  return trapez_sum;
    +}  // end trapezoidal_rule 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    The quantum dot program for two electrons

    +
    + +

    + + +

    +
    +
    +
    +
    +
    // Variational Monte Carlo for atoms with importance sampling, slater det
    +// Test case for 2-electron quantum dot, no classes using Mersenne-Twister RNG
    +#include "mpi.h"
    +#include <cmath>
    +#include <random>
    +#include <string>
    +#include <iostream>
    +#include <fstream>
    +#include <iomanip>
    +#include "vectormatrixclass.h"
    +
    +using namespace  std;
    +// output file as global variable
    +ofstream ofile;  
    +// the step length and its squared inverse for the second derivative 
    +//  Here we define global variables  used in various functions
    +//  These can be changed by using classes
    +int Dimension = 2; 
    +int NumberParticles  = 2;  //  we fix also the number of electrons to be 2
    +
    +// declaration of functions 
    +
    +// The Mc sampling for the variational Monte Carlo 
    +void  MonteCarloSampling(int, double &, double &, Vector &);
    +
    +// The variational wave function
    +double  WaveFunction(Matrix &, Vector &);
    +
    +// The local energy 
    +double  LocalEnergy(Matrix &, Vector &);
    +
    +// The quantum force
    +void  QuantumForce(Matrix &, Matrix &, Vector &);
    +
    +
    +// inline function for single-particle wave function
    +inline double SPwavefunction(double r, double alpha) { 
    +   return exp(-alpha*r*0.5);
    +}
    +
    +// inline function for derivative of single-particle wave function
    +inline double DerivativeSPwavefunction(double r, double alpha) { 
    +  return -r*alpha;
    +}
    +
    +// function for absolute value of relative distance
    +double RelativeDistance(Matrix &r, int i, int j) { 
    +      double r_ij = 0;  
    +      for (int k = 0; k < Dimension; k++) { 
    +	r_ij += (r(i,k)-r(j,k))*(r(i,k)-r(j,k));
    +      }
    +      return sqrt(r_ij); 
    +}
    +
    +// inline function for derivative of Jastrow factor
    +inline double JastrowDerivative(Matrix &r, double beta, int i, int j, int k){
    +  return (r(i,k)-r(j,k))/(RelativeDistance(r, i, j)*pow(1.0+beta*RelativeDistance(r, i, j),2));
    +}
    +
    +// function for square of position of single particle
    +double singleparticle_pos2(Matrix &r, int i) { 
    +    double r_single_particle = 0;
    +    for (int j = 0; j < Dimension; j++) { 
    +      r_single_particle  += r(i,j)*r(i,j);
    +    }
    +    return r_single_particle;
    +}
    +
    +void lnsrch(int n, Vector &xold, double fold, Vector &g, Vector &p, Vector &x,
    +		 double *f, double stpmax, int *check, double (*func)(Vector &p));
    +
    +void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret,
    +	    double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g));
    +
    +static double sqrarg;
    +#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
    +
    +
    +static double maxarg1,maxarg2;
    +#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ?\
    +        (maxarg1) : (maxarg2))
    +
    +
    +// Begin of main program   
    +
    +int main(int argc, char* argv[])
    +{
    +
    +  //  MPI initializations
    +  int NumberProcesses, MyRank, NumberMCsamples;
    +  MPI_Init (&argc, &argv);
    +  MPI_Comm_size (MPI_COMM_WORLD, &NumberProcesses);
    +  MPI_Comm_rank (MPI_COMM_WORLD, &MyRank);
    +  double StartTime = MPI_Wtime();
    +  if (MyRank == 0 && argc <= 1) {
    +    cout << "Bad Usage: " << argv[0] << 
    +      " Read also output file on same line and number of Monte Carlo cycles" << endl;
    +  }
    +  // Read filename and number of Monte Carlo cycles from the command line
    +  if (MyRank == 0 && argc > 2) {
    +    string filename = argv[1]; // first command line argument after name of program
    +    NumberMCsamples  = atoi(argv[2]);
    +    string fileout = filename;
    +    string argument = to_string(NumberMCsamples);
    +    // Final filename as filename+NumberMCsamples
    +    fileout.append(argument);
    +    ofile.open(fileout);
    +  }
    +  // broadcast the number of  Monte Carlo samples
    +  MPI_Bcast (&NumberMCsamples, 1, MPI_INT, 0, MPI_COMM_WORLD);
    +  // Two variational parameters only
    +  Vector VariationalParameters(2);
    +  int TotalNumberMCsamples = NumberMCsamples*NumberProcesses; 
    +  // Loop over variational parameters
    +  for (double alpha = 0.5; alpha <= 1.5; alpha +=0.1){
    +    for (double beta = 0.1; beta <= 0.5; beta +=0.05){
    +      VariationalParameters(0) = alpha;  // value of alpha
    +      VariationalParameters(1) = beta;  // value of beta
    +      //  Do the mc sampling  and accumulate data with MPI_Reduce
    +      double TotalEnergy, TotalEnergySquared, LocalProcessEnergy, LocalProcessEnergy2;
    +      LocalProcessEnergy = LocalProcessEnergy2 = 0.0;
    +      MonteCarloSampling(NumberMCsamples, LocalProcessEnergy, LocalProcessEnergy2, VariationalParameters);
    +      //  Collect data in total averages
    +      MPI_Reduce(&LocalProcessEnergy, &TotalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    +      MPI_Reduce(&LocalProcessEnergy2, &TotalEnergySquared, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    +      // Print out results  in case of Master node, set to MyRank = 0
    +      if ( MyRank == 0) {
    +	double Energy = TotalEnergy/( (double)NumberProcesses);
    +	double Variance = TotalEnergySquared/( (double)NumberProcesses)-Energy*Energy;
    +	double StandardDeviation = sqrt(Variance/((double)TotalNumberMCsamples)); // over optimistic error
    +	ofile << setiosflags(ios::showpoint | ios::uppercase);
    +	ofile << setw(15) << setprecision(8) << VariationalParameters(0);
    +	ofile << setw(15) << setprecision(8) << VariationalParameters(1);
    +	ofile << setw(15) << setprecision(8) << Energy;
    +	ofile << setw(15) << setprecision(8) << Variance;
    +	ofile << setw(15) << setprecision(8) << StandardDeviation << endl;
    +      }
    +    }
    +  }
    +  double EndTime = MPI_Wtime();
    +  double TotalTime = EndTime-StartTime;
    +  if ( MyRank == 0 )  cout << "Time = " <<  TotalTime  << " on number of processors: "  << NumberProcesses  << endl;
    +  if (MyRank == 0)  ofile.close();  // close output file
    +  // End MPI
    +  MPI_Finalize ();  
    +  return 0;
    +}  //  end of main function
    +
    +
    +// Monte Carlo sampling with the Metropolis algorithm  
    +
    +void MonteCarloSampling(int NumberMCsamples, double &cumulative_e, double &cumulative_e2, Vector &VariationalParameters)
    +{
    +
    + // Initialize the seed and call the Mersenne twister
    +  std::random_device rd;
    +  std::mt19937_64 gen(rd());
    +  // Set up the uniform distribution for x \in [0, 1]
    +  std::uniform_real_distribution<double> UniformNumberGenerator(0.0,1.0);
    +  std::normal_distribution<double> Normaldistribution(0.0,1.0);
    +  // diffusion constant from Schroedinger equation
    +  double D = 0.5; 
    +  double timestep = 0.05;  //  we fix the time step  for the gaussian deviate
    +  // allocate matrices which contain the position of the particles  
    +  Matrix OldPosition( NumberParticles, Dimension), NewPosition( NumberParticles, Dimension);
    +  Matrix OldQuantumForce(NumberParticles, Dimension), NewQuantumForce(NumberParticles, Dimension);
    +  double Energy = 0.0; double EnergySquared = 0.0; double DeltaE = 0.0;
    +  //  initial trial positions
    +  for (int i = 0; i < NumberParticles; i++) { 
    +    for (int j = 0; j < Dimension; j++) {
    +      OldPosition(i,j) = Normaldistribution(gen)*sqrt(timestep);
    +    }
    +  }
    +  double OldWaveFunction = WaveFunction(OldPosition, VariationalParameters);
    +  QuantumForce(OldPosition, OldQuantumForce, VariationalParameters);
    +  // loop over monte carlo cycles 
    +  for (int cycles = 1; cycles <= NumberMCsamples; cycles++){ 
    +    // new position 
    +    for (int i = 0; i < NumberParticles; i++) { 
    +      for (int j = 0; j < Dimension; j++) {
    +	// gaussian deviate to compute new positions using a given timestep
    +	NewPosition(i,j) = OldPosition(i,j) + Normaldistribution(gen)*sqrt(timestep)+OldQuantumForce(i,j)*timestep*D;
    +	//	NewPosition(i,j) = OldPosition(i,j) + gaussian_deviate(&idum)*sqrt(timestep)+OldQuantumForce(i,j)*timestep*D;
    +      }  
    +      //  for the other particles we need to set the position to the old position since
    +      //  we move only one particle at the time
    +      for (int k = 0; k < NumberParticles; k++) {
    +	if ( k != i) {
    +	  for (int j = 0; j < Dimension; j++) {
    +	    NewPosition(k,j) = OldPosition(k,j);
    +	  }
    +	} 
    +      }
    +      double NewWaveFunction = WaveFunction(NewPosition, VariationalParameters); 
    +      QuantumForce(NewPosition, NewQuantumForce, VariationalParameters);
    +      //  we compute the log of the ratio of the greens functions to be used in the 
    +      //  Metropolis-Hastings algorithm
    +      double GreensFunction = 0.0;            
    +      for (int j = 0; j < Dimension; j++) {
    +	GreensFunction += 0.5*(OldQuantumForce(i,j)+NewQuantumForce(i,j))*
    +	  (D*timestep*0.5*(OldQuantumForce(i,j)-NewQuantumForce(i,j))-NewPosition(i,j)+OldPosition(i,j));
    +      }
    +      GreensFunction = exp(GreensFunction);
    +      // The Metropolis test is performed by moving one particle at the time
    +      if(UniformNumberGenerator(gen) <= GreensFunction*NewWaveFunction*NewWaveFunction/OldWaveFunction/OldWaveFunction ) { 
    +	for (int  j = 0; j < Dimension; j++) {
    +	  OldPosition(i,j) = NewPosition(i,j);
    +	  OldQuantumForce(i,j) = NewQuantumForce(i,j);
    +	}
    +	OldWaveFunction = NewWaveFunction;
    +      }
    +    }  //  end of loop over particles
    +    // compute local energy  
    +    double DeltaE = LocalEnergy(OldPosition, VariationalParameters);
    +    // update energies
    +    Energy += DeltaE;
    +    EnergySquared += DeltaE*DeltaE;
    +  }   // end of loop over MC trials   
    +  // update the energy average and its squared 
    +  cumulative_e = Energy/NumberMCsamples;
    +  cumulative_e2 = EnergySquared/NumberMCsamples;
    +}   // end MonteCarloSampling function  
    +
    +
    +// Function to compute the squared wave function and the quantum force
    +
    +double  WaveFunction(Matrix &r, Vector &VariationalParameters)
    +{
    +  double wf = 0.0;
    +  // full Slater determinant for two particles, replace with Slater det for more particles 
    +  wf  = SPwavefunction(singleparticle_pos2(r, 0), VariationalParameters(0))*SPwavefunction(singleparticle_pos2(r, 1),VariationalParameters(0));
    +  // contribution from Jastrow factor
    +  for (int i = 0; i < NumberParticles-1; i++) { 
    +    for (int j = i+1; j < NumberParticles; j++) {
    +      wf *= exp(RelativeDistance(r, i, j)/((1.0+VariationalParameters(1)*RelativeDistance(r, i, j))));
    +    }
    +  }
    +  return wf;
    +}
    +
    +// Function to calculate the local energy without numerical derivation of kinetic energy
    +
    +double  LocalEnergy(Matrix &r, Vector &VariationalParameters)
    +{
    +
    +  // compute the kinetic and potential energy from the single-particle part
    +  // for a many-electron system this has to be replaced by a Slater determinant
    +  // The absolute value of the interparticle length
    +  Matrix length( NumberParticles, NumberParticles);
    +  // Set up interparticle distance
    +  for (int i = 0; i < NumberParticles-1; i++) { 
    +    for(int j = i+1; j < NumberParticles; j++){
    +      length(i,j) = RelativeDistance(r, i, j);
    +      length(j,i) =  length(i,j);
    +    }
    +  }
    +  double KineticEnergy = 0.0;
    +  // Set up kinetic energy from Slater and Jastrow terms
    +  for (int i = 0; i < NumberParticles; i++) { 
    +    for (int k = 0; k < Dimension; k++) {
    +      double sum1 = 0.0; 
    +      for(int j = 0; j < NumberParticles; j++){
    +	if ( j != i) {
    +	  sum1 += JastrowDerivative(r, VariationalParameters(1), i, j, k);
    +	}
    +      }
    +      KineticEnergy += (sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0)))*(sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0)));
    +    }
    +  }
    +  KineticEnergy += -2*VariationalParameters(0)*NumberParticles;
    +  for (int i = 0; i < NumberParticles-1; i++) {
    +      for (int j = i+1; j < NumberParticles; j++) {
    +        KineticEnergy += 2.0/(pow(1.0 + VariationalParameters(1)*length(i,j),2))*(1.0/length(i,j)-2*VariationalParameters(1)/(1+VariationalParameters(1)*length(i,j)) );
    +      }
    +  }
    +  KineticEnergy *= -0.5;
    +  // Set up potential energy, external potential + eventual electron-electron repulsion
    +  double PotentialEnergy = 0;
    +  for (int i = 0; i < NumberParticles; i++) { 
    +    double DistanceSquared = singleparticle_pos2(r, i);
    +    PotentialEnergy += 0.5*DistanceSquared;  // sp energy HO part, note it has the oscillator frequency set to 1!
    +  }
    +  // Add the electron-electron repulsion
    +  for (int i = 0; i < NumberParticles-1; i++) { 
    +    for (int j = i+1; j < NumberParticles; j++) {
    +      PotentialEnergy += 1.0/length(i,j);          
    +    }
    +  }
    +  double LocalE = KineticEnergy+PotentialEnergy;
    +  return LocalE;
    +}
    +
    +// Compute the analytical expression for the quantum force
    +void  QuantumForce(Matrix &r, Matrix &qforce, Vector &VariationalParameters)
    +{
    +  // compute the first derivative 
    +  for (int i = 0; i < NumberParticles; i++) {
    +    for (int k = 0; k < Dimension; k++) {
    +      // single-particle part, replace with Slater det for larger systems
    +      double sppart = DerivativeSPwavefunction(r(i,k),VariationalParameters(0));
    +      //  Jastrow factor contribution
    +      double Jsum = 0.0;
    +      for (int j = 0; j < NumberParticles; j++) {
    +	if ( j != i) {
    +	  Jsum += JastrowDerivative(r, VariationalParameters(1), i, j, k);
    +	}
    +      }
    +      qforce(i,k) = 2.0*(Jsum+sppart);
    +    }
    +  }
    +} // end of QuantumForce function
    +
    +
    +#define ITMAX 200
    +#define EPS 3.0e-8
    +#define TOLX (4*EPS)
    +#define STPMX 100.0
    +
    +void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret,
    +	    double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g))
    +{
    +
    +  int check,i,its,j;
    +  double den,fac,fad,fae,fp,stpmax,sum=0.0,sumdg,sumxi,temp,test;
    +  Vector dg(n), g(n), hdg(n), pnew(n), xi(n);
    +  Matrix hessian(n,n);
    +
    +  fp=(*func)(p);
    +  (*dfunc)(p,g);
    +  for (i = 0;i < n;i++) {
    +    for (j = 0; j< n;j++) hessian(i,j)=0.0;
    +    hessian(i,i)=1.0;
    +    xi(i) = -g(i);
    +    sum += p(i)*p(i);
    +  }
    +  stpmax=STPMX*FMAX(sqrt(sum),(double)n);
    +  for (its=1;its<=ITMAX;its++) {
    +    *iter=its;
    +    lnsrch(n,p,fp,g,xi,pnew,fret,stpmax,&check,func);
    +    fp = *fret;
    +    for (i = 0; i< n;i++) {
    +      xi(i)=pnew(i)-p(i);
    +      p(i)=pnew(i);
    +    }
    +    test=0.0;
    +    for (i = 0;i< n;i++) {
    +      temp=fabs(xi(i))/FMAX(fabs(p(i)),1.0);
    +      if (temp > test) test=temp;
    +    }
    +    if (test < TOLX) {
    +      return;
    +    }
    +    for (i=0;i<n;i++) dg(i)=g(i);
    +    (*dfunc)(p,g);
    +    test=0.0;
    +    den=FMAX(*fret,1.0);
    +    for (i=0;i<n;i++) {
    +      temp=fabs(g(i))*FMAX(fabs(p(i)),1.0)/den;
    +      if (temp > test) test=temp;
    +    }
    +    if (test < gtol) {
    +      return;
    +    }
    +    for (i=0;i<n;i++) dg(i)=g(i)-dg(i);
    +    for (i=0;i<n;i++) {
    +      hdg(i)=0.0;
    +      for (j=0;j<n;j++) hdg(i) += hessian(i,j)*dg(j);
    +    }
    +    fac=fae=sumdg=sumxi=0.0;
    +    for (i=0;i<n;i++) {
    +      fac += dg(i)*xi(i);
    +      fae += dg(i)*hdg(i);
    +      sumdg += SQR(dg(i));
    +      sumxi += SQR(xi(i));
    +    }
    +    if (fac*fac > EPS*sumdg*sumxi) {
    +      fac=1.0/fac;
    +      fad=1.0/fae;
    +      for (i=0;i<n;i++) dg(i)=fac*xi(i)-fad*hdg(i);
    +      for (i=0;i<n;i++) {
    +	for (j=0;j<n;j++) {
    +	  hessian(i,j) += fac*xi(i)*xi(j)
    +	    -fad*hdg(i)*hdg(j)+fae*dg(i)*dg(j);
    +	}
    +      }
    +    }
    +    for (i=0;i<n;i++) {
    +      xi(i)=0.0;
    +      for (j=0;j<n;j++) xi(i) -= hessian(i,j)*g(j);
    +    }
    +  }
    +  cout << "too many iterations in dfpmin" << endl;
    +}
    +#undef ITMAX
    +#undef EPS
    +#undef TOLX
    +#undef STPMX
    +
    +#define ALF 1.0e-4
    +#define TOLX 1.0e-7
    +
    +void lnsrch(int n, Vector &xold, double fold, Vector &g, Vector &p, Vector &x,
    +	    double *f, double stpmax, int *check, double (*func)(Vector &p))
    +{
    +  int i;
    +  double a,alam,alam2,alamin,b,disc,f2,fold2,rhs1,rhs2,slope,sum,temp,
    +    test,tmplam;
    +
    +  *check=0;
    +  for (sum=0.0,i=0;i<n;i++) sum += p(i)*p(i);
    +  sum=sqrt(sum);
    +  if (sum > stpmax)
    +    for (i=0;i<n;i++) p(i) *= stpmax/sum;
    +  for (slope=0.0,i=0;i<n;i++)
    +    slope += g(i)*p(i);
    +  test=0.0;
    +  for (i=0;i<n;i++) {
    +    temp=fabs(p(i))/FMAX(fabs(xold(i)),1.0);
    +    if (temp > test) test=temp;
    +  }
    +  alamin=TOLX/test;
    +  alam=1.0;
    +  for (;;) {
    +    for (i=0;i<n;i++) x(i)=xold(i)+alam*p(i);
    +    *f=(*func)(x);
    +    if (alam < alamin) {
    +      for (i=0;i<n;i++) x(i)=xold(i);
    +      *check=1;
    +      return;
    +    } else if (*f <= fold+ALF*alam*slope) return;
    +    else {
    +      if (alam == 1.0)
    +	tmplam = -slope/(2.0*(*f-fold-slope));
    +      else {
    +	rhs1 = *f-fold-alam*slope;
    +	rhs2=f2-fold2-alam2*slope;
    +	a=(rhs1/(alam*alam)-rhs2/(alam2*alam2))/(alam-alam2);
    +	b=(-alam2*rhs1/(alam*alam)+alam*rhs2/(alam2*alam2))/(alam-alam2);
    +	if (a == 0.0) tmplam = -slope/(2.0*b);
    +	else {
    +	  disc=b*b-3.0*a*slope;
    +	  if (disc<0.0) cout << "Roundoff problem in lnsrch." << endl;
    +	  else tmplam=(-b+sqrt(disc))/(3.0*a);
    +	}
    +	if (tmplam>0.5*alam)
    +	  tmplam=0.5*alam;
    +      }
    +    }
    +    alam2=alam;
    +    f2 = *f;
    +    fold2=fold;
    +    alam=FMAX(tmplam,0.1*alam);
    +  }
    +}
    +#undef ALF
    +#undef TOLX
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    What is OpenMP

    +
    + +

    +

    +

    +

    There are many good tutorials online and an excellent textbook

    +
      +

    1. Using OpenMP, by B. Chapman, G. Jost, and A. van der Pas
    2. +

    3. Many tutorials online like OpenMP official site
    4. +
    +
    +
    + +
    +

    Getting started, things to remember

    +
    + +

    +

    +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp...
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    + +
    +

    OpenMP syntax

    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp construct [ clause ...]
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + +
    +

    Different OpenMP styles of parallelism

    +

    OpenMP supports several different ways to specify thread parallelism

    + + +
    + +
    +

    General code structure

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +main ()
    +{
    +int var1, var2, var3;
    +/* serial code */
    +/* ... */
    +/* start of a parallel region */
    +#pragma omp parallel private(var1, var2) shared(var3)
    +{
    +/* ... */
    +}
    +/* more serial code */
    +/* ... */
    +/* another parallel region */
    +#pragma omp parallel
    +{
    +/* ... */
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Parallel region

    +
    + +

    +

    +

    + + +

    +
    +
    +
    +
    +
    #pragma omp parallel { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    + +
    +

    Hello world, not again, please!

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#include <cstdio>
    +int main (int argc, char *argv[])
    +{
    +int th_id, nthreads;
    +#pragma omp parallel private(th_id) shared(nthreads)
    +{
    +th_id = omp_get_thread_num();
    +printf("Hello World from thread %d\n", th_id);
    +#pragma omp barrier
    +if ( th_id == 0 ) {
    +nthreads = omp_get_num_threads();
    +printf("There are %d threads\n",nthreads);
    +}
    +}
    +return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Hello world, yet another variant

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <iostream>
    +#include <omp.h>
    +using namespace std; // needed for cout below
    +int main(int argc, char *argv[]) 
    +{
    + omp_set_num_threads(4); 
    +#pragma omp parallel
    + {
    +   int id = omp_get_thread_num();
    +   int nproc = omp_get_num_threads(); 
    +   cout << "Hello world with id number and processes " <<  id <<  nproc << endl;
    + } 
    +return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Variables declared outside of the parallel region are shared by all threads. If a variable like id is declared outside of the

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel, 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    it would have been shared by the various threads, possibly causing erroneous output.

    + +
    +
    + +
    +

    Important OpenMP library routines

    +
    + +

    + +

    +
    +
    + +
    +

    Private variables

    +
    + +

    +

    The private clause can be used to make thread-private versions of such variables:

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel private(id)
    +{
    + int id = omp_get_thread_num();
    + cout << "My thread num" << id << endl; 
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    + +
    +

    Master region

    +
    + +

    +

    It is often useful to have only one thread execute some of the code in a parallel region. I/O statements are a common example

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel 
    +{
    +  #pragma omp master
    +   {
    +      int id = omp_get_thread_num();
    +      cout << "My thread num" << id << endl; 
    +   } 
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Parallel for loop

    +
    + +

    +

    +

    + + +

    +
    +
    +
    +
    +
    #pragma omp for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    + +
    +

    Parallel computations and loops

    + +
    + +

    +

    OpenMP provides an easy way to parallelize a loop

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +  for (i=0; i<n; i++) c[i] = a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    OpenMP handles the index variable (no need to declare it in the for loop or make it private)

    + +

    Which thread handles which iterations? There are several options.

    +
    +
    + +
    +

    Scheduling of loop computations

    + +
    + +

    +

    We can let the OpenMP runtime decide. The decision is about how the loop iterations are scheduled, and OpenMP defines three choices of loop scheduling:

    +
      +

    1. Static: Predefined at compile time. Lowest overhead, predictable
    2. +

    3. Dynamic: Selection made at runtime
    4. +

    5. Guided: Special case of dynamic; attempts to reduce overhead
    6. +
    +
    +
    + +
    +

    Example code for loop scheduling

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#define CHUNKSIZE 100
    +#define N 1000
    +int main (int argc, char *argv[])
    +{
    +int i, chunk;
    +float a[N], b[N], c[N];
    +for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
    +chunk = CHUNKSIZE;
    +#pragma omp parallel shared(a,b,c,chunk) private(i)
    +{
    +#pragma omp for schedule(dynamic,chunk)
    +for (i=0; i < N; i++) c[i] = a[i] + b[i];
    +} /* end of parallel region */
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Example code for loop scheduling, guided instead of dynamic

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#define CHUNKSIZE 100
    +#define N 1000
    +int main (int argc, char *argv[])
    +{
    +int i, chunk;
    +float a[N], b[N], c[N];
    +for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
    +chunk = CHUNKSIZE;
    +#pragma omp parallel shared(a,b,c,chunk) private(i)
    +{
    +#pragma omp for schedule(guided,chunk)
    +for (i=0; i < N; i++) c[i] = a[i] + b[i];
    +} /* end of parallel region */
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    More on Parallel for loop

    +
    + +

    +

    +

    + + +

    +
    +
    +
    +
    +
    // #pragma omp parallel and #pragma omp for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    can be combined into

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    What can happen with this loop?

    + +
    + +

    +

    What happens with code like this

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +for (i=0; i<n; i++) sum += a[i]*a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    All threads can access the sum variable, but the addition is not atomic! It is important to avoid races between threads. So-called reductions in OpenMP are thus important for performance and for obtaining correct results. OpenMP lets us indicate that a variable is used for a reduction with a particular operator. The above code becomes

    + + +
    +
    +
    +
    +
    +
    sum = 0.0;
    +#pragma omp parallel for reduction(+:sum)
    +for (i=0; i<n; i++) sum += a[i]*a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Inner product

    +
    + +

    +

     
    +$$ +\sum_{i=0}^{n-1} a_ib_i +$$ +

     
    + + + +

    +
    +
    +
    +
    +
    int i;
    +double sum = 0.;
    +/* allocating and initializing arrays */
    +/* ... */
    +#pragma omp parallel for default(shared) private(i) reduction(+:sum)
    + for (i=0; i<N; i++) sum += a[i]*b[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Different threads do different tasks

    +
    + +

    + +

    Different threads do different tasks independently; each section is executed by one thread.

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel
    +{
    +#pragma omp sections
    +{
    +#pragma omp section
    +funcA ();
    +#pragma omp section
    +funcB ();
    +#pragma omp section
    +funcC ();
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Single execution

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp single { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The code is executed by one thread only; there is no guarantee which thread.

    + +

    There is an implicit barrier at the end of the single region (it can be removed with the nowait clause)

    + + +
    +
    +
    +
    +
    +
    #pragma omp master { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The code is executed by the master thread (guaranteed), and there is no implicit barrier at the end.

    +
    +
    + +
    +

    Coordination and synchronization

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp barrier
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Synchronization; it must be encountered by all threads in a team (or by none)

    + + +
    +
    +
    +
    +
    +
    #pragma omp ordered { a block of codes }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    is another form of synchronization (in sequential order). +The form +

    + + +
    +
    +
    +
    +
    +
    #pragma omp critical { a block of codes }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and

    + + +
    +
    +
    +
    +
    +
    #pragma omp atomic { single assignment statement }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    is more efficient than

    + + +
    +
    +
    +
    +
    +
    #pragma omp critical
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Data scope

    +
    + +

    +

    +

    +

    What are the purposes of these attributes

    + +
    +
    + +
    +

    Some remarks

    +
    + +

    + +

    +
    +
    + +
    +

    Parallelizing nested for-loops

    +
    + +

    + +

    +

    + + +

    +
    +
    +
    +
    +
    for (i=0; i<100; i++) {
    +    for (j=0; j<100; j++) {
    +        a[i][j] = b[i][j] + c[i][j];
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp parallel for private(j)
    +for (i=0; i<100; i++) {
    +    for (j=0; j<100; j++) {
    +       a[i][j] = b[i][j] + c[i][j];
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +
    +
    + +
    +

    Nested parallelism

    +
    + +

    +

    When a thread in a parallel region encounters another parallel construct, it +may create a new team of threads and become the master of the new +team. +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel num_threads(4)
    +{
    +/* .... */
    +#pragma omp parallel num_threads(2)
    +{
    +//  
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Parallel tasks

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp task 
    +#pragma omp parallel shared(p_vec) private(i)
    +{
    +#pragma omp single
    +{
    +for (i=0; i<N; i++) {
    +  double r = random_number();
    +  if (p_vec[i] > r) {
    +#pragma omp task
    +   do_work (p_vec[i]);
    +  }   // end if
    +}     // end for
    +}     // end single
    +}     // end parallel region
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Common mistakes

    +
    + +

    +

    Race condition

    + + +
    +
    +
    +
    +
    +
    int nthreads;
    +#pragma omp parallel shared(nthreads)
    +{
    +nthreads = omp_get_num_threads();
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Deadlock

#pragma omp parallel
{
...
#pragma omp critical
{
...
#pragma omp barrier
}
}

Here the barrier inside the critical region can never be reached by all threads, since only one thread at a time may be inside the critical region; the program deadlocks.

Not all computations are simple

Not all computations are simple loops where the data can be evenly divided among threads without any dependencies between threads.

An example is finding the location and value of the largest element in an array

for (i=0; i<n; i++) {
   if (x[i] > maxval) {
      maxval = x[i];
      maxloc = i;
   }
}

Not all computations are simple, competing threads

All threads are potentially accessing and changing the same values, maxloc and maxval.

1. OpenMP provides several ways to coordinate access to shared values

#pragma omp atomic

2. Only one thread at a time can execute the following statement (not block). We can use the critical option

#pragma omp critical

3. Only one thread at a time can execute the following block

Atomic may be faster than critical, but this depends on hardware.
    +

How to find the max value using OpenMP

Write down the simplest algorithm and look carefully for race conditions. How would you handle them?
The first step would be to parallelize as

#pragma omp parallel for
 for (i=0; i<n; i++) {
    if (x[i] > maxval) {
      maxval = x[i];
      maxloc = i;
    }
}

Then deal with the race conditions

Write down the simplest algorithm and look carefully for race conditions. How would you handle them?
The next step is to deal with the race conditions, for example by protecting the update with a critical region

#pragma omp parallel for
 for (i=0; i<n; i++) {
#pragma omp critical
  {
     if (x[i] > maxval) {
       maxval = x[i];
       maxloc = i;
     }
  }
}

Exercise: write a code which implements this and give an estimate of the performance. Perform several runs, with a serial code only, with and without vectorization, and compare the serial code with the one that uses OpenMP. Run on different architectures if you can.

What can slow down OpenMP performance?

Give it a thought!

What can slow down OpenMP performance?

Performance is poor because we insisted on keeping track of the maxval and its location during the execution of the loop.

This is a common source of performance issues: the description of the method used to compute a value imposes additional, unnecessary requirements or properties.

Idea: Have each thread find the maxloc in its own data, then combine them, using temporary arrays indexed by thread number to hold the values found by each thread.

Find the max location for each thread

int maxloc[MAX_THREADS], mloc;
double maxval[MAX_THREADS], mval;
#pragma omp parallel shared(maxval,maxloc)
{
  int id = omp_get_thread_num();
  maxval[id] = -1.0e30;
#pragma omp for
   for (int i=0; i<n; i++) {
       if (x[i] > maxval[id]) {
           maxloc[id] = i;
           maxval[id] = x[i];
       }
    }
}

Combine the values from each thread

#pragma omp flush (maxloc,maxval)
#pragma omp master
  {
    int nt = omp_get_num_threads();
    mloc = maxloc[0];
    mval = maxval[0];
    for (int i=1; i<nt; i++) {
        if (maxval[i] > mval) {
           mval = maxval[i];
           mloc = maxloc[i];
        }
     }
   }

Note that we let the master thread perform the last operation.
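For completeness, here is one way (our own assembly of the two fragments above, with a fixed MAX_THREADS and some invented test data) to put the pieces together into a runnable program:

#include <omp.h>
#include <cstdio>
#include <vector>

int main () {
  const int MAX_THREADS = 64;   // assumed upper bound on the number of threads
  const int n = 1000000;
  std::vector<double> x(n);
  for (int i = 0; i < n; i++) x[i] = (i*37) % 1001;   // some test data

  int maxloc[MAX_THREADS], mloc = 0;
  double maxval[MAX_THREADS], mval = -1.0e30;
#pragma omp parallel shared(maxval, maxloc, mloc, mval)
  {
    int id = omp_get_thread_num();
    maxval[id] = -1.0e30;  maxloc[id] = 0;
#pragma omp for
    for (int i = 0; i < n; i++) {
      if (x[i] > maxval[id]) { maxloc[id] = i; maxval[id] = x[i]; }
    }
    // the omp for construct ends with an implicit barrier, so all per-thread
    // results are ready when the master thread combines them
#pragma omp master
    {
      int nt = omp_get_num_threads();
      mloc = maxloc[0];  mval = maxval[0];
      for (int i = 1; i < nt; i++) {
        if (maxval[i] > mval) { mval = maxval[i]; mloc = maxloc[i]; }
      }
    }
  }
  printf("max value %g at location %d\n", mval, mloc);
  return 0;
}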

    Matrix-matrix multiplication

    +

This code computes the norm of a vector using OpenMP.

    + + +
    +
    +
    +
    +
    +
    //  OpenMP program to compute vector norm by adding two other vectors
    +#include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include  <omp.h>
    +# include <ctime>
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of vector
    +  int n = atoi(argv[1]);
    +  double *a, *b, *c;
    +  int i;
    +  int thread_num;
    +  double wtime, Norm2, s, angle;
    +  cout << "  Perform addition of two vectors and compute the norm-2." << endl;
    +  omp_set_num_threads(4);
    +  thread_num = omp_get_max_threads ();
    +  cout << "  The number of processors available = " << omp_get_num_procs () << endl ;
    +  cout << "  The number of threads available    = " << thread_num <<  endl;
    +  cout << "  The matrix order n                 = " << n << endl;
    +
    +  s = 1.0/sqrt( (double) n);
    +  wtime = omp_get_wtime ( );
    +  // Allocate space for the vectors to be used
    +  a = new double [n]; b = new double [n]; c = new double [n];
    +  // Define parallel region
    +# pragma omp parallel for default(shared) private (angle, i) reduction(+:Norm2)
    +  // Set up values for vectors  a and b
    +  for (i = 0; i < n; i++){
    +      angle = 2.0*M_PI*i/ (( double ) n);
    +      a[i] = s*(sin(angle) + cos(angle));
    +      b[i] =  s*sin(2.0*angle);
    +      c[i] = 0.0;
    +  }
    +  // Then perform the vector addition
    +  for (i = 0; i < n; i++){
    +     c[i] += a[i]+b[i];
    +  }
    +  // Compute now the norm-2
    +  Norm2 = 0.0;
    +  for (i = 0; i < n; i++){
    +     Norm2  += c[i]*c[i];
    +  }
    +// end parallel region
    +  wtime = omp_get_wtime ( ) - wtime;
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for norm-2 computation=" << wtime  << endl;
    +  cout << " Norm-2  = " << Norm2 << endl;
    +  // Free up space
    +  delete[] a;
    +  delete[] b;
    +  delete[] c;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +

    Matrix-matrix multiplication

    +

This is the matrix-matrix multiplication code with plain C++ memory allocation, using OpenMP.

    + + + +
    +
    +
    +
    +
    +
    //  Matrix-matrix multiplication and Frobenius norm of a matrix with OpenMP
    +#include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include  <omp.h>
    +# include <ctime>
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double **A, **B, **C;
    +  int i, j, k;
    +  int thread_num;
    +  double wtime, Fsum, s, angle;
    +  cout << "  Compute matrix product C = A * B and Frobenius norm." << endl;
    +  omp_set_num_threads(4);
    +  thread_num = omp_get_max_threads ();
    +  cout << "  The number of processors available = " << omp_get_num_procs () << endl ;
    +  cout << "  The number of threads available    = " << thread_num <<  endl;
    +  cout << "  The matrix order n                 = " << n << endl;
    +
    +  s = 1.0/sqrt( (double) n);
    +  wtime = omp_get_wtime ( );
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n]; C = new double*[n];
    +  for (i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +    C[i] = new double[n];
    +  }
    +  // Define parallel region
    +# pragma omp parallel for default(shared) private (angle, i, j, k) reduction(+:Fsum)
    +  // Set up values for matrix A and B and zero matrix C
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +      angle = 2.0*M_PI*i*j/ (( double ) n);
    +      A[i][j] = s * ( sin ( angle ) + cos ( angle ) );
    +      B[j][i] =  A[i][j];
    +    }
    +  }
    +  // Then perform the matrix-matrix multiplication
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +       C[i][j] =  0.0;    
    +       for (k = 0; k < n; k++) {
    +            C[i][j] += A[i][k]*B[k][j];
    +       }
    +    }
    +  }
    +  // Compute now the Frobenius norm
    +  Fsum = 0.0;
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +      Fsum += C[i][j]*C[i][j];
    +    }
    +  }
    +  Fsum = sqrt(Fsum);
    +// end parallel region and letting only one thread perform I/O
    +  wtime = omp_get_wtime ( ) - wtime;
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for matrix-matrix multiplication=" << wtime  << endl;
    +  cout << "  Frobenius norm  = " << Fsum << endl;
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +    delete[] C[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  delete[] C;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

Overview of week 11, March 11-15

Note, these notes contain additional material on optimization and parallelization. Parts of this material will be discussed this week.











Why resampling methods ?

Introducing the correlation function

Resampling methods: Blocking

The blocking method was made popular by Flyvbjerg and Pedersen (1989) and has become one of the standard ways to estimate the variance \( \mathrm{var}(\widehat{\theta}) \) for exactly one estimator \( \widehat{\theta} \), namely \( \widehat{\theta} = \overline{X} \), the mean value.

Assume \( n = 2^d \) for some integer \( d>1 \) and \( X_1,X_2,\cdots, X_n \) is a stationary time series to begin with.

Blocking Transformations, final expressions









    +

More on the blocking method

Flyvbjerg and Petersen demonstrated that the sequence \( \{e_k\}_{k=0}^{d-1} \) is decreasing, and conjectured that the term \( e_k \) can be made as small as we would like by making \( k \) (and hence \( d \)) sufficiently large. The sequence is decreasing. It means we can apply blocking transformations until \( e_k \) is sufficiently small, and then estimate \( \mathrm{var}(\overline{X}) \) by \( \widehat{\sigma}^2_k/n_k \).

    Resampling analysis

    +









    +

    Content

    + +









    +

    Optimization and profiling

    +
    + +

    + +

Till now we have not paid much attention to speed and the optimization possibilities inherent in the various compilers. We have compiled and linked as

    + + +
    +
    +
    +
    +
    +
    c++  -c  mycode.cpp
    +c++  -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

For Fortran, replace c++ with for example gfortran or ifort. This is what we call a flat compiler option and should be used when we develop the code. It normally produces a very large and slow code when translated to machine instructions. We use this option for debugging and for establishing the correct program output, because every operation is done precisely as the user specified it.

    + +

    It is instructive to look up the compiler manual for further instructions by writing

    + + +
    +
    +
    +
    +
    +
    man c++
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +









    +

    More on optimization

    +
    + +

    +

We have additional compiler options for optimization. These may include procedure inlining where performance may be improved, moving constants inside loops outside the loop, identifying potential parallelism, automatic vectorization, or replacing a division with a reciprocal and a multiplication if this speeds up the code.

    + + +
    +
    +
    +
    +
    +
    c++  -O3 -c  mycode.cpp
    +c++  -O3 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    This (other options are -O2 or -Ofast) is the recommended option.

    +
    + +









    +

    Optimization and profiling

    +
    + +

    +

It is also useful to profile your program during the development stage. You would then compile with

    + + +
    +
    +
    +
    +
    +
    c++  -pg -O3 -c  mycode.cpp
    +c++  -pg -O3 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    After you have run the code you can obtain the profiling information via

    + + +
    +
    +
    +
    +
    +
    gprof mycode.exe >  ProfileOutput
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

When you have properly profiled your code, you must take out this option as it slows down performance. For memory tests use valgrind. An excellent environment for all these aspects, and much more, is Qt Creator.

    +
    + + +









    +

    Optimization and debugging

    +
    + +

    +

Adding debugging options is a very useful alternative during the development stage of a program. You would then compile with

    + + +
    +
    +
    +
    +
    +
    c++  -g -O0 -c  mycode.cpp
    +c++  -g -O0 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

This option generates debugging information, allowing you to trace for example whether an array is properly allocated. Some compilers work best with the no-optimization option -O0.

Other optimization flags

Depending on the compiler, one can add flags which generate code that catches integer overflow errors. The flag -ftrapv does this for the clang compiler on OS X operating systems.

    +
    + + +









    +

    Other hints

    +
    + +

    +

    In general, irrespective of compiler options, it is useful to

    + +

    Here is an example of a part of a program where specific operations lead to a slower code

    + + +
    +
    +
    +
    +
    +
    k = n-1;
    +for (i = 0; i < n; i++){
    +    a[i] = b[i] +c*d;
    +    e = g[k];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    A better code is

    + + +
    +
    +
    +
    +
    +
    temp = c*d;
    +for (i = 0; i < n; i++){
    +    a[i] = b[i] + temp;
    +}
    +e = g[n-1];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Here we avoid a repeated multiplication inside a loop. +Most compilers, depending on compiler flags, identify and optimize such bottlenecks on their own, without requiring any particular action by the programmer. However, it is always useful to single out and avoid code examples like the first one discussed here. +

    +
    + + +









    +

    Vectorization and the basic idea behind parallel computing

    +
    + +

    +

    Present CPUs are highly parallel processors with varying levels of parallelism. The typical situation can be described via the following three statements.

    + +

    Before we proceed with a more detailed discussion of topics like vectorization and parallelization, we need to remind ourselves about some basic features of different hardware models.

    +
    + + +









    +

    A rough classification of hardware models

    +
    + +

    + +

    +
    + +









    +

    Shared memory and distributed memory

    +
    + +

    +

    One way of categorizing modern parallel computers is to look at the memory configuration.

    + +

    The CPUs are connected by some network and may exchange messages.

    +
    + + +









    +

    Different parallel programming paradigms

    +
    + +

    + +

    +
    + +









    +

    Different parallel programming paradigms

    +
    + +

    + +

    +
    + + + +

    What is vectorization?

    +

Vectorization is a special case of Single Instruction Multiple Data (SIMD) processing, denoting a single instruction stream capable of operating on multiple data elements in parallel. We can think of vectorization as the unrolling of loops accompanied by SIMD instructions.

    + +

    Vectorization is the process of converting an algorithm that performs scalar operations +(typically one operation at the time) to vector operations where a single operation can refer to many simultaneous operations. +Consider the following example +

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

If the code is not vectorized, the compiler will simply start with the first element and then perform subsequent additions operating on one address in memory at a time.

    + + +

Number of elements that can be acted upon

    +

    A SIMD instruction can operate on multiple data elements in one single instruction. +It uses the so-called 128-bit SIMD floating-point register. +In this sense, vectorization adds some form of parallelism since one instruction is applied +to many parts of say a vector. +

    + +

The number of elements which can be operated on in parallel ranges from four single-precision floating point data elements in so-called Streaming SIMD Extensions and two double-precision floating-point data elements in Streaming SIMD Extensions 2 to sixteen byte operations in a 128-bit register in Streaming SIMD Extensions 2. Thus, the vector length ranges from 2 to 16, depending on the instruction extensions used and on the data type.

    + +

In summary, our instructions operate on 128 bit (16 byte) operands

    + + +

Number of elements that can be acted upon, examples

    +

    We start with the simple scalar operations given by

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    If the code is not vectorized and we have a 128-bit register to store a 32 bits floating point number, +it means that we have \( 3\times 32 \) bits that are not used. +

    + +

    We have thus unused space in our SIMD registers. These registers could hold three additional integers.

    + + +

    Operation counts for scalar operation

    +

    The code

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    has for \( n \) repeats

    +
      +
1. one load for \( c[i] \) in address 1
2. one load for \( b[i] \) in address 2
3. add \( c[i] \) and \( b[i] \) to give \( a[i] \)
4. store \( a[i] \) in address 2

Number of elements that can be acted upon, examples

    +

    If we vectorize the code, we can perform, with a 128-bit register four simultaneous operations, that is +we have +

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i+=4){
    +    a[i] = b[i] + c[i];
    +    a[i+1] = b[i+1] + c[i+1];
    +    a[i+2] = b[i+2] + c[i+2];
    +    a[i+3] = b[i+3] + c[i+3];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Four additions are now done in a single step.

    + + +

    Number of operations when vectorized

    +

    For \( n/4 \) repeats assuming floats or integers

    +
      +
1. one vector load for \( c[i] \) in address 1
2. one vector load for \( b[i] \) in address 2
3. one vector add of \( c[i] \) and \( b[i] \) to give \( a[i] \)
4. one vector store of \( a[i] \) in address 2









    +

    A simple test case with and without vectorization

    +

    We implement these operations in a simple c++ program that computes at the end the norm of a vector.

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double s = 1.0/sqrt( (double) n);
    +  double *a, *b, *c;
    +  // Start timing
    +  clock_t start, finish;
    +  start = clock();
    +// Allocate space for the vectors to be used
    +    a = new double [n]; b = new double [n]; c = new double [n];
    +  // Define parallel region
    +  // Set up values for vectors  a and b
    +  for (int i = 0; i < n; i++){
    +    double angle = 2.0*M_PI*i/ (( double ) n);
    +    a[i] = s*(sin(angle) + cos(angle));
    +    b[i] =  s*sin(2.0*angle);
    +    c[i] = 0.0;
    +  }
    +  // Then perform the vector addition
    +  for (int i = 0; i < n; i++){
    +    c[i] += a[i]+b[i];
    +  }
    +  // Compute now the norm-2
    +  double Norm2 = 0.0;
    +  for (int i = 0; i < n; i++){
    +    Norm2  += c[i]*c[i];
    +  }
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for norm computation=" << timeused  << endl;
    +  cout << "  Norm-2  = " << Norm2 << endl;
    +  // Free up space
    +  delete[] a;
    +  delete[] b;
    +  delete[] c;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +

    Compiling with and without vectorization

    +

    We can compile and link without vectorization using the clang c++ compiler

    + + +
    +
    +
    +
    +
    +
clang++ -o novec.x vecexample.cpp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and with vectorization (and additional optimizations)

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

The speedup depends on the size of the vectors. In the example here we have run with \( 10^7 \) elements. The example was run on an iMac17.1 with OSX El Capitan (10.11.4) as operating system and an Intel i5 3.3 GHz CPU.

    + + +
    +
    +
    +
    +
    +
    Compphys:~ hjensen$ ./vec.x 10000000
    +Time used  for norm computation=0.04720500000
    +Compphys:~ hjensen$ ./novec.x 10000000
    +Time used  for norm computation=0.03311700000
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

This particular C++ compiler speeds up the above loop operations by a factor of 1.5. Performing the same operations for \( 10^9 \) elements results in a smaller speedup, since reading from main memory is required; the non-vectorized code is seemingly faster.

    + + +
    +
    +
    +
    +
    +
    Compphys:~ hjensen$ ./vec.x 1000000000
    +Time used  for norm computation=58.41391100
    +Compphys:~ hjensen$ ./novec.x 1000000000
    +Time used  for norm computation=46.51295300
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    We will discuss these issues further in the next slides.

    + + +

    Compiling with and without vectorization using clang

    +

    We can compile and link without vectorization with clang compiler

    + + +
    +
    +
    +
    +
    +
clang++ -fno-vectorize -o novec.x vecexample.cpp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and with vectorization

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    We can also add vectorization analysis, see for example

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass-analysis=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    or figure out if vectorization was missed

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass-missed=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Automatic vectorization and vectorization inhibitors, criteria

    + +

    Not all loops can be vectorized, as discussed in Intel's guide to vectorization

    + +

An important criterion is that the loop counter \( n \) is known at the entry of the loop.

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    a[j] = cos(j*1.0);
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

The variable \( n \) does not need to be known at compile time. However, this variable must stay the same for the entire duration of the loop. It implies that an exit statement inside the loop cannot be data dependent.

    + +









    +

    Automatic vectorization and vectorization inhibitors, exit criteria

    + +

    An exit statement should in general be avoided. +If the exit statement contains data-dependent conditions, the loop cannot be vectorized. +The following is an example of a non-vectorizable loop +

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    a[j] = cos(j*1.0);
    +    if (a[j] < 0 ) break;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Avoid loop termination conditions and opt for a single entry loop variable \( n \). The lower and upper bounds have to be kept fixed within the loop.

    + +









    +

    Automatic vectorization and vectorization inhibitors, straight-line code

    + +

SIMD instructions perform the same type of operations multiple times. A switch statement thus leads to a non-vectorizable loop, since different iterations cannot branch differently. The following code can however be vectorized, since the if statement is implemented as a masked assignment.

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    double x  = cos(j*1.0);
    +    if (x > 0 ) {
    +       a[j] =  x*sin(j*2.0); 
    +    }
    +    else {
    +       a[j] = 0.0;
    +    }
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    These operations can be performed for all data elements but only those elements which the mask evaluates as true are stored. In general, one should avoid branches such as switch, go to, or return statements or if constructs that cannot be treated as masked assignments.

    + +









    +

    Automatic vectorization and vectorization inhibitors, nested loops

    + +

    Only the innermost loop of the following example is vectorized

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      for (int j = 0; j < n; j++) {
    +           a[i][j] += b[i][j];
    +      }  
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The exception is if an original outer loop is transformed into an inner loop as the result of compiler optimizations.

    + +









    +

    Automatic vectorization and vectorization inhibitors, function calls

    + +

Calls to programmer-defined functions ruin vectorization. However, calls to intrinsic functions like \( \sin{x} \), \( \cos{x} \), \( \exp{x} \) etc. are allowed since they are normally efficiently vectorized. The following example is fully vectorizable

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      a[i] = log10(i)*cos(i);
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Similarly, inline functions defined by the programmer, allow for vectorization since the function statements are glued into the actual place where the function is called.

    + +









    +

    Automatic vectorization and vectorization inhibitors, data dependencies

    + +

    One has to keep in mind that vectorization changes the order of operations inside a loop. A so-called +read-after-write statement with an explicit flow dependency cannot be vectorized. The following code +

    + + +
    +
    +
    +
    +
    +
      double b = 15.;
    +  for (int i = 1; i < n; i++) {
    +      a[i] = a[i-1] + b;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    is an example of flow dependency and results in wrong numerical results if vectorized. For a scalar operation, the value \( a[i-1] \) computed during the iteration is loaded into the right-hand side and the results are fine. In vector mode however, with a vector length of four, the values \( a[0] \), \( a[1] \), \( a[2] \) and \( a[3] \) from the previous loop will be loaded into the right-hand side and produce wrong results. That is, we have

    + + +
    +
    +
    +
    +
    +
       a[1] = a[0] + b;
    +   a[2] = a[1] + b;
    +   a[3] = a[2] + b;
    +   a[4] = a[3] + b;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

and if the first two iterations are executed at the same time by the SIMD instruction, the value of say \( a[1] \) could be used by the second iteration before it has been calculated by the first iteration, leading thereby to wrong results.

    + +









    +

    Automatic vectorization and vectorization inhibitors, more data dependencies

    + +

    On the other hand, a so-called +write-after-read statement can be vectorized. The following code +

    + + +
    +
    +
    +
    +
    +
      double b = 15.;
    +  for (int i = 1; i < n; i++) {
    +      a[i-1] = a[i] + b;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

is an example of a write-after-read (anti-)dependency that can be vectorized, since no iteration with a higher value of \( i \) can complete before an iteration with a lower value of \( i \). However, such code leads to problems with parallelization.

    + +









    +

    Automatic vectorization and vectorization inhibitors, memory stride

    + +

    For C++ programmers it is also worth keeping in mind that an array notation is preferred to the more compact use of pointers to access array elements. The compiler can often not tell if it is safe to vectorize the code.

    + +

When dealing with arrays, you should also avoid memory strides, since they considerably slow down vectorization. When you access array elements, write for example the inner loop to vectorize using unit stride, that is, access successively the next array element in memory, as shown here

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      for (int j = 0; j < n; j++) {
    +           a[i][j] += b[i][j];
    +      }  
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Memory management

    +

    The main memory contains the program data

    +
      +
1. Cache memory contains a copy of the main memory data
2. Cache is faster but consumes more space and power. It is normally assumed to be much faster than main memory
3. Registers contain working data only
4. Multiple cache memories contain a copy of the main memory data

    Loads and stores to memory can be as important as floating point operations when we measure performance.

    + +









    +

    Memory and communication

    + +
      +
1. Most communication in a computer is carried out in chunks, blocks of bytes of data that move together
2. In the memory hierarchy, data moves between memory and cache, and between different levels of cache, in groups called lines

    Many of these performance features are not captured in most programming languages.

    + +









    +

    Measuring performance

    + +

    How do we measure performance? What is wrong with this code to time a loop?

    + + +
    +
    +
    +
    +
    +
      clock_t start, finish;
    +  start = clock();
    +  for (int j = 0; j < i; j++) {
    +    a[j] = b[j]+b[j]*c[j];
    +  }
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Problems with measuring time

    +
      +
1. Timers are not infinitely accurate
2. All clocks have a granularity, the minimum time that they can measure
3. The error in a time measurement, even if everything is perfect, may be the size of this granularity (sometimes called a clock tick)
4. Always know what your clock granularity is
5. Ensure that your measurement is for a long enough duration (say 100 times the tick)









    +

    Problems with cold start

    + +

    What happens when the code is executed? The assumption is that the code is ready to +execute. But +

    +
      +
1. Code may still be on disk, and not even read into memory.
2. Data may be in slow memory rather than fast (which may be wrong or right for what you are measuring)
3. Multiple tests are often necessary to ensure that cold-start effects are not present
4. Special effort is often required to ensure data is in the intended part of the memory hierarchy.









    +

    Problems with smart compilers

    + +
      +
1. If the result of the computation is not used, the compiler may eliminate the code
2. Performance will look impossibly fantastic
3. Even worse, the compiler may eliminate only some of the code, so the performance looks plausible
4. Ensure that the results are (or may be) used.
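As a small illustration (our own example, not from the slides): with aggressive optimization a compiler may remove the loop below entirely if sum is never used afterwards, making the measured time meaningless; printing the result forces the computation to be kept.

#include <chrono>
#include <iostream>

int main () {
  const int n = 100000000;
  double sum = 0.0;
  auto start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < n; i++) sum += 0.5*i;
  auto finish = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> elapsed = finish - start;
  // using sum here prevents the compiler from eliminating the loop as dead code
  std::cout << "sum = " << sum << ", time = " << elapsed.count() << " s" << std::endl;
  return 0;
}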









    +

    Problems with interference

    +
      +
1. Other activities are sharing your processor
2. Make multiple tests and report
3. Easy choices include









    +

    Problems with measuring performance

    +
      +
1. Accurate, reproducible performance measurement is hard
2. Think carefully about your experiment:
3. What is it, precisely, that you want to measure?
4. How representative is your test of the situation that you are trying to measure?









    +

    Thomas algorithm for tridiagonal linear algebra equations

    +
    + +

$$
\left( \begin{array}{ccccc}
 b_0 & c_0 & & & \\
 a_0 & b_1 & c_1 & & \\
 & & \ddots & & \\
 & & a_{m-3} & b_{m-2} & c_{m-2} \\
 & & & a_{m-2} & b_{m-1}
\end{array} \right)
\left( \begin{array}{c}
 x_0 \\ x_1 \\ \vdots \\ x_{m-2} \\ x_{m-1}
\end{array} \right)
=
\left( \begin{array}{c}
 f_0 \\ f_1 \\ \vdots \\ f_{m-2} \\ f_{m-1}
\end{array} \right)
$$

    + + +









    +

    Thomas algorithm, forward substitution

    +
    + +

    +

    The first step is to multiply the first row by \( a_0/b_0 \) and subtract it from the second row. This is known as the forward substitution step. We obtain then

$$
a_i = 0,
$$

$$
b_i = b_i - \frac{a_{i-1}}{b_{i-1}}c_{i-1},
$$

and

$$
f_i = f_i - \frac{a_{i-1}}{b_{i-1}}f_{i-1}.
$$

At this point the simplified equation, with only an upper triangular matrix, takes the form

$$
\left( \begin{array}{ccccc}
 b_0 & c_0 & & & \\
 & b_1 & c_1 & & \\
 & & \ddots & & \\
 & & & b_{m-2} & c_{m-2} \\
 & & & & b_{m-1}
\end{array} \right)
\left( \begin{array}{c}
 x_0 \\ x_1 \\ \vdots \\ x_{m-2} \\ x_{m-1}
\end{array} \right)
=
\left( \begin{array}{c}
 f_0 \\ f_1 \\ \vdots \\ f_{m-2} \\ f_{m-1}
\end{array} \right)
$$









    +

    Thomas algorithm, backward substitution

    +
    + +

    +

    The next step is the backward substitution step. The last row is multiplied by \( c_{N-3}/b_{N-2} \) and subtracted from the second to last row, thus eliminating \( c_{N-3} \) from the last row. The general backward substitution procedure is

$$
c_i = 0,
$$

and

$$
f_{i-1} = f_{i-1} - \frac{c_{i-1}}{b_i}f_i.
$$

All that remains to be computed is the solution, which is the very straightforward process of

$$
x_i = \frac{f_i}{b_i}.
$$









    +

    Thomas algorithm and counting of operations (floating point and memory)

    +
    + +

    + +

We have in this specific case the following floating-point operations

    + + +
    + + +
    + +

    + + +

    +
    +
    +
    +
    +
    // Forward substitution    
    +// Note that we can simplify by precalculating a[i-1]/b[i-1]
    +  for (int i=1; i < n; i++) {
    +     b[i] = b[i] - (a[i-1]*c[i-1])/b[i-1];
    +     f[i] = g[i] - (a[i-1]*f[i-1])/b[i-1];
    +  }
    +  x[n-1] = f[n-1] / b[n-1];
    +  // Backwards substitution                                                           
    +  for (int i = n-2; i >= 0; i--) {
    +     f[i] = f[i] - c[i]*f[i+1]/b[i+1];
    +     x[i] = f[i]/b[i];
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Example: Transpose of a matrix

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double **A, **B;
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n];
    +  for (int i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +  }
    +  // Set up values for matrix A
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      A[i][j] =  cos(i*1.0)*sin(j*3.0);
    +    }
    +  }
    +  clock_t start, finish;
    +  start = clock();
    +  // Then compute the transpose
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      B[i][j]= A[j][i];
    +    }
    +  }
    +
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for setting up transpose of matrix=" << timeused  << endl;
    +
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Matrix-matrix multiplication

    +

This is the matrix-matrix multiplication code with plain C++ memory allocation. It computes at the end the Frobenius norm.

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double s = 1.0/sqrt( (double) n);
    +  double **A, **B, **C;
    +  // Start timing
    +  clock_t start, finish;
    +  start = clock();
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n]; C = new double*[n];
    +  for (int i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +    C[i] = new double[n];
    +  }
    +  // Set up values for matrix A and B and zero matrix C
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      double angle = 2.0*M_PI*i*j/ (( double ) n);
    +      A[i][j] = s * ( sin ( angle ) + cos ( angle ) );
    +      B[j][i] =  A[i][j];
    +    }
    +  }
    +  // Then perform the matrix-matrix multiplication
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      double sum = 0.0;
    +       for (int k = 0; k < n; k++) {
    +           sum += B[i][k]*A[k][j];
    +       }
    +       C[i][j] = sum;
    +    }
    +  }
    +  // Compute now the Frobenius norm
    +  double Fsum = 0.0;
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      Fsum += C[i][j]*C[i][j];
    +    }
    +  }
    +  Fsum = sqrt(Fsum);
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for matrix-matrix multiplication=" << timeused  << endl;
    +  cout << "  Frobenius norm  = " << Fsum << endl;
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +    delete[] C[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  delete[] C;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    How do we define speedup? Simplest form

    +
    + +

    +
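The bullet points of this slide did not survive the conversion. The simplest form, stated here from general knowledge rather than reconstructed from the slide, is the ratio between the time of the original run and the time of the improved run,

$$
\text{Speedup} = \frac{T_{\mathrm{original}}}{T_{\mathrm{improved}}}.
$$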

    +
    + + +









    +

    How do we define speedup? Correct baseline

    +
    + +

    +

    The key is choosing the correct baseline for comparison

    + +
    + + +









    +

    Parallel speedup

    +
    + +

    +

    For parallel applications, speedup is typically defined as

$$
S_p = \frac{T_1}{T_p}.
$$

    Here \( T_1 \) is the time on one processor and \( T_p \) is the time using \( p \) processors.

    + +
    + + +









    +

    Speedup and memory

    +
    + +

    +

The speedup on \( p \) processors can be greater than \( p \) if memory usage is optimal! Consider the case of a memory-bound computation with \( M \) words of memory: if \( M/p \) words fit into the fast memory (cache) of each processor while all \( M \) words do not fit into the fast memory of a single processor, the memory traffic per processor drops and the speedup can exceed \( p \).

    + +
    + + +









    +

    Upper bounds on speedup

    +
    + +

    +

Assume that almost all parts of a code are perfectly parallelizable (fraction \( f \)). The remainder, fraction \( (1-f) \), cannot be parallelized at all.

    + +

That is, there is work that takes time \( W \) on one process; a fraction \( f \) of that work will take time \( Wf/p \) on \( p \) processors.

    + +
    + + +









    +

    Amdahl's law

    +
    + +

    +

    On one processor we have

$$
T_1 = (1-f)W + fW = W
$$

    On \( p \) processors we have

$$
T_p = (1-f)W + \frac{fW}{p},
$$

    resulting in a speedup of

$$
\frac{T_1}{T_p} = \frac{W}{(1-f)W+fW/p}.
$$

    As \( p \) goes to infinity, \( fW/p \) goes to zero, and the maximum speedup is

$$
\frac{1}{1-f},
$$

meaning that if \( f = 0.99 \) (all but \( 1\% \) parallelizable), the maximum speedup is \( 1/(1-0.99)=100 \)!
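As a quick numerical illustration (this snippet is not from the original slides; it simply tabulates the formula above for an assumed parallel fraction \( f=0.99 \)):

// Minimal sketch: tabulate the Amdahl speedup T_1/T_p = 1/((1-f) + f/p)
// for a parallel fraction f = 0.99 and increasing processor counts.
#include <iostream>
using namespace std;
int main()
{
  double f = 0.99;
  for (int p = 1; p <= 1024; p *= 4) {
    double speedup = 1.0/((1.0 - f) + f/p);
    cout << "p = " << p << "  speedup = " << speedup << endl;
  }
  return 0;
}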

    +
    + + +









    +

    How much is parallelizable

    +
    + +

    +

    If any non-parallel code slips into the +application, the parallel +performance is limited. +

    + +

In many simulations, however, the fraction of non-parallelizable work is \( 10^{-6} \) or less, since the work is dominated by large arrays or objects that are perfectly parallelizable.

    +
    + + +









    +

    Today's situation of parallel computing

    +
    + +

    + +

    +

    Our lectures will focus on both MPI and OpenMP.

    +
    + + +









    +

    Overhead present in parallel computing

    +
    + +

    + +

    +

Because of the above overhead, and because certain parts of a sequential algorithm cannot be parallelized, we may not achieve optimal parallel performance.

    +
    + + +









    +

    Parallelizing a sequential algorithm

    +
    + +

    + +

    +
    + + +









    +

    Strategies

    +
    + +

    +

    +
    + + +









    +

    How do I run MPI on a PC/Laptop? MPI

    +
    + +

    +

Installing MPI is rather easy on hardware running Unix/Linux as operating system; simply follow the instructions on the OpenMPI website (see also the subsequent slides). Once you have made sure that MPI is installed on your PC/laptop, you compile and run as follows.

    + + + +
    +
    +
    +
    +
    +
      # Compile and link
    +  mpic++ -O3 -o nameofprog.x nameofprog.cpp
    +  #  run code with for example 8 processes using mpirun/mpiexec
    +  mpiexec -n 8 ./nameofprog.x
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Can I do it on my own PC/laptop? OpenMP installation

    +
    + +

    +

If you wish to install MPI and OpenMP on your laptop/PC, we recommend the following (on macOS, using Homebrew):

    + + + + +
    +
    +
    +
    +
    +
      brew install libomp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and compile and link as

    + + +
    +
    +
    +
    +
    +
    c++ -o <name executable> <name program.cpp>  -lomp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Installing MPI

    +
    + +

    +

    For linux/ubuntu users, you need to install two packages (alternatively use the synaptic package manager)

    + + +
    +
    +
    +
    +
    +
      sudo apt-get install libopenmpi-dev
    +  sudo apt-get install openmpi-bin
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

For OS X users, install Homebrew (after having installed Xcode and gcc, needed for the gfortran compiler of OpenMPI) and then install OpenMPI with brew

    + + +
    +
    +
    +
    +
    +
       brew install openmpi
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    When running an executable (code.x), run as

    + + +
    +
    +
    +
    +
    +
      mpirun -n 10 ./code.x
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    where we indicate that we want the number of processes to be 10.

    +
    + + +









    +

    Installing MPI and using Qt

    +
    + +

    +

    With openmpi installed, when using Qt, add to your .pro file the instructions here

    + +

    You may need to tell Qt where openmpi is stored.

    +
    + + +









    +

    What is Message Passing Interface (MPI)?

    +
    + +

    + +

    MPI is a library, not a language. It specifies the names, calling sequences and results of functions +or subroutines to be called from C/C++ or Fortran programs, and the classes and methods that make up the MPI C++ +library. The programs that users write in Fortran, C or C++ are compiled with ordinary compilers and linked +with the MPI library. +

    + +

MPI programs should be able to run on all possible machines and with all MPI implementations without change.

    + +

    An MPI computation is a collection of processes communicating with messages.

    +
    + +









    +

    Going Parallel with MPI

    +
    + +

    +

    Task parallelism: the work of a global problem can be divided +into a number of independent tasks, which rarely need to synchronize. +Monte Carlo simulations or numerical integration are examples of this. +

    + +

MPI is a message-passing library where all the routines have a corresponding C/C++ binding

    + + +
    +
    +
    +
    +
    +
       MPI_Command_name
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

and a corresponding Fortran binding (routine names are in uppercase, but can also be in lower case)

    + + +
    +
    +
    +
    +
    +
       MPI_COMMAND_NAME
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    MPI is a library

    +
    + +

    +

    MPI is a library specification for the message passing interface, +proposed as a standard. +

    + + +

    A message passing standard for portability and ease-of-use. +Designed for high performance. +

    + +

    Insert communication and synchronization functions where necessary.

    +
    + + +









    +

    Bindings to MPI routines

    +
    + +

    + +

MPI is a message-passing library where all the routines have a corresponding C/C++ binding

    + + +
    +
    +
    +
    +
    +
       MPI_Command_name
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

and a corresponding Fortran binding (routine names are in uppercase, but can also be in lower case)

    + + +
    +
    +
    +
    +
    +
       MPI_COMMAND_NAME
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The discussion in these slides focuses on the C++ binding.

    +
    + + +









    +

    Communicator

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
      MPI_COMM_WORLD 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    Some of the most important MPI functions

    +
    + +

    + +
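The list on this slide was lost in the conversion. As a hedged reminder (these are standard MPI calls, and the ones used in the examples that follow), a minimal program touching the core functions looks like this:

// Minimal sketch of the MPI functions used throughout these slides
#include <mpi.h>
#include <iostream>
using namespace std;
int main(int argc, char* argv[])
{
  int numprocs, my_rank;
  MPI_Init(&argc, &argv);                     // MPI_Init: start MPI
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);   // MPI_Comm_size: number of processes
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);    // MPI_Comm_rank: rank of this process
  double local = my_rank, total = 0.0;
  MPI_Reduce(&local, &total, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); // MPI_Reduce: combine data
  if (my_rank == 0) cout << "Sum of ranks = " << total << endl;
  MPI_Finalize();                             // MPI_Finalize: shut down MPI
  return 0;
}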

    +
    + + +









    +

    The first MPI C/C++ program

    +
    + +

    + +

    Let every process write "Hello world" (oh not this program again!!) on the standard output.

    + + +
    +
    +
    +
    +
    +
    using namespace std;
    +#include <mpi.h>
    +#include <iostream>
    +int main (int nargs, char* args[])
    +{
    +int numprocs, my_rank;
    +//   MPI initializations
    +MPI_Init (&nargs, &args);
    +MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    +MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    +cout << "Hello world, I have  rank " << my_rank << " out of " 
    +     << numprocs << endl;
    +//  End MPI
+MPI_Finalize ();
+return 0;
+}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    The Fortran program

    +
    + +

    + + +

    +
    +
    +
    +
    +
    PROGRAM hello
    +INCLUDE "mpif.h"
    +INTEGER:: size, my_rank, ierr
    +
    +CALL  MPI_INIT(ierr)
    +CALL MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
    +CALL MPI_COMM_RANK(MPI_COMM_WORLD, my_rank, ierr)
    +WRITE(*,*)"Hello world, I've rank ",my_rank," out of ",size
    +CALL MPI_FINALIZE(ierr)
    +
    +END PROGRAM hello
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Note 1

    +
    + +

    + +

    +
    + + +









    +

Ordered output with MPI_Barrier

    +
    + +

    + + + +

    +
    +
    +
    +
    +
using namespace std;
+#include <mpi.h>
+#include <iostream>
+int main (int nargs, char* args[])
+{
+ int numprocs, my_rank, i;
+ MPI_Init (&nargs, &args);
+ MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
+ MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
+ for (i = 0; i < numprocs; i++) {
+   MPI_Barrier (MPI_COMM_WORLD);
+   if (i == my_rank) {
+     cout << "Hello world, I have  rank " << my_rank
+          << " out of " << numprocs << endl;
+   }
+ }
+ MPI_Finalize ();
+ return 0;
+}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Note 2

    +
    + +

    +

    +

However, this is slightly more time-consuming since the processes synchronize between themselves as many times as there are processes. In the next Hello world example we use the send and receive functions in order to have a synchronized action.

    +
    + + +









    +

    Ordered output

    +
    + +

    + + + +

    +
    +
    +
    +
    +
    .....
    +int numprocs, my_rank, flag;
    +MPI_Status status;
    +MPI_Init (&nargs, &args);
    +MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    +MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    +if (my_rank > 0)
    +MPI_Recv (&flag, 1, MPI_INT, my_rank-1, 100, 
    +           MPI_COMM_WORLD, &status);
    +cout << "Hello world, I have  rank " << my_rank << " out of " 
    +<< numprocs << endl;
    +if (my_rank < numprocs-1)
    +MPI_Send (&my_rank, 1, MPI_INT, my_rank+1, 
    +          100, MPI_COMM_WORLD);
    +MPI_Finalize ();
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Note 3

    +
    + +

    + +

    The basic sending of messages is given by the function \( MPI\_SEND \), which in C/C++ +is defined as +

    + + +
    +
    +
    +
    +
    +
int MPI_Send(void *buf, int count, 
+             MPI_Datatype datatype, 
+             int dest, int tag, MPI_Comm comm)
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

This single command allows the passing of any kind of variable, even a large array, to any group of tasks. The variable buf is the variable we wish to send, while count is the number of variables we are passing. If we are passing only a single value, this should be 1.

    + +

If we transfer an array, it is the overall size of the array. For example, if we want to send a 10 by 10 array, count would be \( 10\times 10=100 \) since we are actually passing 100 values.
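As a minimal illustration (not in the original slides; the array name, destination rank and tag are placeholders), sending such a \( 10\times 10 \) array of doubles to process 1 would read:

double a[10][10];
// ... fill a ...
// count = 100 since the whole 10 x 10 array (100 contiguous doubles) is sent
MPI_Send(&a[0][0], 100, MPI_DOUBLE, 1, 100, MPI_COMM_WORLD);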

    +
    + + +









    +

    Note 4

    +
    + +

    + +

    Once you have sent a message, you must receive it on another task. The function \( MPI\_RECV \) +is similar to the send call. +

    + + +
    +
    +
    +
    +
    +
    int MPI_Recv( void *buf, int count, MPI_Datatype datatype, 
    +            int source, 
    +            int tag, MPI_Comm comm, MPI_Status *status )
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

The arguments that are different from those in MPI\_SEND are buf, which is the name of the variable where you will be storing the received data, and source, which replaces the destination of the send command and is the rank of the sender.

    + +

Finally, we have the \( MPI\_Status \) argument \( status \), through which one can check whether the receive was completed.

    + +

    The output of this code is the same as the previous example, but now +process 0 sends a message to process 1, which forwards it further +to process 2, and so forth. +

    +
    + + +









    +

    Numerical integration in parallel

    +
    +Integrating \( \pi \) +

    + +

$$
I=\int_a^bf(x) dx\approx h\left(f(a)/2 + f(a+h) +f(a+2h)+\dots +f(b-h)+ f(b)/2\right).
$$

    Click on this link for the full program.

    +
    + + +









    +

Dissection of trapezoidal rule with \( MPI\_Reduce \)

    +
    + +

    + + + +

    +
    +
    +
    +
    +
//    Trapezoidal rule and numerical integration using MPI
    +using namespace std;
    +#include <mpi.h>
    +#include <iostream>
    +
    +//     Here we define various functions called by the main program
    +
    +double int_function(double );
    +double trapezoidal_rule(double , double , int , double (*)(double));
    +
    +//   Main function begins here
    +int main (int nargs, char* args[])
    +{
    +  int n, local_n, numprocs, my_rank; 
    +  double a, b, h, local_a, local_b, total_sum, local_sum;   
    +  double  time_start, time_end, total_time;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Dissection of trapezoidal rule

    +
    + +

    + + + +

    +
    +
    +
    +
    +
      //  MPI initializations
    +  MPI_Init (&nargs, &args);
    +  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    +  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    +  time_start = MPI_Wtime();
    +  //  Fixed values for a, b and n 
    +  a = 0.0 ; b = 1.0;  n = 1000;
    +  h = (b-a)/n;    // h is the same for all processes 
    +  local_n = n/numprocs;  
    +  // make sure n > numprocs, else integer division gives zero
    +  // Length of each process' interval of
    +  // integration = local_n*h.  
    +  local_a = a + my_rank*local_n*h;
    +  local_b = local_a + local_n*h;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Integrating with MPI

    +
    + +

    + + + +

    +
    +
    +
    +
    +
      total_sum = 0.0;
    +  local_sum = trapezoidal_rule(local_a, local_b, local_n, 
    +                               &int_function); 
    +  MPI_Reduce(&local_sum, &total_sum, 1, MPI_DOUBLE, 
    +              MPI_SUM, 0, MPI_COMM_WORLD);
    +  time_end = MPI_Wtime();
    +  total_time = time_end-time_start;
    +  if ( my_rank == 0) {
    +    cout << "Trapezoidal rule = " <<  total_sum << endl;
    +    cout << "Time = " <<  total_time  
    +         << " on number of processors: "  << numprocs  << endl;
    +  }
    +  // End MPI
    +  MPI_Finalize ();  
    +  return 0;
    +}  // end of main program
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

How do I use \( MPI\_Reduce \)?

    +
    + +

    + +

    Here we have used

    + + +
    +
    +
    +
    +
    +
int MPI_Reduce( void *senddata, void* resultdata, int count, 
+     MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm)
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

The two variables \( senddata \) and \( resultdata \) are obvious, besides the fact that one sends the address of the variable or of the first element of an array. If they are arrays they need to have the same size. The variable \( count \) represents the total number of elements (1 in the case of just one variable), while \( MPI\_Datatype \) defines the type of variable which is sent and received.

    + +

The new feature is \( MPI\_Op \). It defines the type of operation we want to do.

    +
    + + +









    +

    More on \( MPI\_Reduce \)

    +
    + +

    +

In our case, since we are summing the rectangle contributions from every process, we define \( MPI\_Op = MPI\_SUM \). If we have an array or matrix we can search for the largest or smallest element by sending either \( MPI\_MAX \) or \( MPI\_MIN \). If we want the location as well (which array element) we simply transfer \( MPI\_MAXLOC \) or \( MPI\_MINLOC \). If we want the product we write \( MPI\_PROD \).
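A small hedged sketch, not from the slides, of how \( MPI\_MAXLOC \) is typically used: each process packs its local maximum and its rank into a value/index pair of type MPI_DOUBLE_INT (local_max and my_rank are assumed to be set already).

struct { double val; int rank; } in, out;
in.val  = local_max;   // assumption: largest element found by this process
in.rank = my_rank;
MPI_Reduce(&in, &out, 1, MPI_DOUBLE_INT, MPI_MAXLOC, 0, MPI_COMM_WORLD);
if (my_rank == 0)
  cout << "Global max " << out.val << " on rank " << out.rank << endl;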

    + +

    \( MPI\_Allreduce \) is defined as

    + + +
    +
    +
    +
    +
    +
int MPI_Allreduce( void *senddata, void* resultdata, int count, 
+          MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Dissection of trapezoidal rule

    +
    + +

    + +

We use \( MPI\_Reduce \) to collect data from each process. Note also the use of the function \( MPI\_Wtime \).

    + + +
    +
    +
    +
    +
    +
    //  this function defines the function to integrate
    +double int_function(double x)
    +{
    +  double value = 4./(1.+x*x);
    +  return value;
    +} // end of function to evaluate
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Dissection of trapezoidal rule

    +
    + +

    + + +

    +
    +
    +
    +
    +
    //  this function defines the trapezoidal rule
    +double trapezoidal_rule(double a, double b, int n, 
    +                         double (*func)(double))
    +{
    +  double trapez_sum;
    +  double fa, fb, x, step;
    +  int    j;
    +  step=(b-a)/((double) n);
    +  fa=(*func)(a)/2. ;
    +  fb=(*func)(b)/2. ;
    +  trapez_sum=0.;
    +  for (j=1; j <= n-1; j++){
    +    x=j*step+a;
    +    trapez_sum+=(*func)(x);
    +  }
    +  trapez_sum=(trapez_sum+fb+fa)*step;
    +  return trapez_sum;
    +}  // end trapezoidal_rule 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    The quantum dot program for two electrons

    +
    + +

    + + +

    +
    +
    +
    +
    +
    // Variational Monte Carlo for atoms with importance sampling, slater det
    +// Test case for 2-electron quantum dot, no classes using Mersenne-Twister RNG
    +#include "mpi.h"
    +#include <cmath>
    +#include <random>
    +#include <string>
    +#include <iostream>
    +#include <fstream>
    +#include <iomanip>
    +#include "vectormatrixclass.h"
    +
    +using namespace  std;
    +// output file as global variable
    +ofstream ofile;  
    +// the step length and its squared inverse for the second derivative 
    +//  Here we define global variables  used in various functions
    +//  These can be changed by using classes
    +int Dimension = 2; 
    +int NumberParticles  = 2;  //  we fix also the number of electrons to be 2
    +
    +// declaration of functions 
    +
    +// The Mc sampling for the variational Monte Carlo 
    +void  MonteCarloSampling(int, double &, double &, Vector &);
    +
    +// The variational wave function
    +double  WaveFunction(Matrix &, Vector &);
    +
    +// The local energy 
    +double  LocalEnergy(Matrix &, Vector &);
    +
    +// The quantum force
    +void  QuantumForce(Matrix &, Matrix &, Vector &);
    +
    +
    +// inline function for single-particle wave function
    +inline double SPwavefunction(double r, double alpha) { 
    +   return exp(-alpha*r*0.5);
    +}
    +
    +// inline function for derivative of single-particle wave function
    +inline double DerivativeSPwavefunction(double r, double alpha) { 
    +  return -r*alpha;
    +}
    +
    +// function for absolute value of relative distance
    +double RelativeDistance(Matrix &r, int i, int j) { 
    +      double r_ij = 0;  
    +      for (int k = 0; k < Dimension; k++) { 
    +	r_ij += (r(i,k)-r(j,k))*(r(i,k)-r(j,k));
    +      }
    +      return sqrt(r_ij); 
    +}
    +
    +// inline function for derivative of Jastrow factor
    +inline double JastrowDerivative(Matrix &r, double beta, int i, int j, int k){
    +  return (r(i,k)-r(j,k))/(RelativeDistance(r, i, j)*pow(1.0+beta*RelativeDistance(r, i, j),2));
    +}
    +
    +// function for square of position of single particle
    +double singleparticle_pos2(Matrix &r, int i) { 
    +    double r_single_particle = 0;
    +    for (int j = 0; j < Dimension; j++) { 
    +      r_single_particle  += r(i,j)*r(i,j);
    +    }
    +    return r_single_particle;
    +}
    +
    +void lnsrch(int n, Vector &xold, double fold, Vector &g, Vector &p, Vector &x,
    +		 double *f, double stpmax, int *check, double (*func)(Vector &p));
    +
    +void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret,
    +	    double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g));
    +
    +static double sqrarg;
    +#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
    +
    +
    +static double maxarg1,maxarg2;
    +#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ?\
    +        (maxarg1) : (maxarg2))
    +
    +
    +// Begin of main program   
    +
    +int main(int argc, char* argv[])
    +{
    +
    +  //  MPI initializations
    +  int NumberProcesses, MyRank, NumberMCsamples;
    +  MPI_Init (&argc, &argv);
    +  MPI_Comm_size (MPI_COMM_WORLD, &NumberProcesses);
    +  MPI_Comm_rank (MPI_COMM_WORLD, &MyRank);
    +  double StartTime = MPI_Wtime();
    +  if (MyRank == 0 && argc <= 1) {
    +    cout << "Bad Usage: " << argv[0] << 
    +      " Read also output file on same line and number of Monte Carlo cycles" << endl;
    +  }
    +  // Read filename and number of Monte Carlo cycles from the command line
    +  if (MyRank == 0 && argc > 2) {
    +    string filename = argv[1]; // first command line argument after name of program
    +    NumberMCsamples  = atoi(argv[2]);
    +    string fileout = filename;
    +    string argument = to_string(NumberMCsamples);
    +    // Final filename as filename+NumberMCsamples
    +    fileout.append(argument);
    +    ofile.open(fileout);
    +  }
    +  // broadcast the number of  Monte Carlo samples
    +  MPI_Bcast (&NumberMCsamples, 1, MPI_INT, 0, MPI_COMM_WORLD);
    +  // Two variational parameters only
    +  Vector VariationalParameters(2);
    +  int TotalNumberMCsamples = NumberMCsamples*NumberProcesses; 
    +  // Loop over variational parameters
    +  for (double alpha = 0.5; alpha <= 1.5; alpha +=0.1){
    +    for (double beta = 0.1; beta <= 0.5; beta +=0.05){
    +      VariationalParameters(0) = alpha;  // value of alpha
    +      VariationalParameters(1) = beta;  // value of beta
    +      //  Do the mc sampling  and accumulate data with MPI_Reduce
    +      double TotalEnergy, TotalEnergySquared, LocalProcessEnergy, LocalProcessEnergy2;
    +      LocalProcessEnergy = LocalProcessEnergy2 = 0.0;
    +      MonteCarloSampling(NumberMCsamples, LocalProcessEnergy, LocalProcessEnergy2, VariationalParameters);
    +      //  Collect data in total averages
    +      MPI_Reduce(&LocalProcessEnergy, &TotalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    +      MPI_Reduce(&LocalProcessEnergy2, &TotalEnergySquared, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    +      // Print out results  in case of Master node, set to MyRank = 0
    +      if ( MyRank == 0) {
    +	double Energy = TotalEnergy/( (double)NumberProcesses);
    +	double Variance = TotalEnergySquared/( (double)NumberProcesses)-Energy*Energy;
    +	double StandardDeviation = sqrt(Variance/((double)TotalNumberMCsamples)); // over optimistic error
    +	ofile << setiosflags(ios::showpoint | ios::uppercase);
    +	ofile << setw(15) << setprecision(8) << VariationalParameters(0);
    +	ofile << setw(15) << setprecision(8) << VariationalParameters(1);
    +	ofile << setw(15) << setprecision(8) << Energy;
    +	ofile << setw(15) << setprecision(8) << Variance;
    +	ofile << setw(15) << setprecision(8) << StandardDeviation << endl;
    +      }
    +    }
    +  }
    +  double EndTime = MPI_Wtime();
    +  double TotalTime = EndTime-StartTime;
    +  if ( MyRank == 0 )  cout << "Time = " <<  TotalTime  << " on number of processors: "  << NumberProcesses  << endl;
    +  if (MyRank == 0)  ofile.close();  // close output file
    +  // End MPI
    +  MPI_Finalize ();  
    +  return 0;
    +}  //  end of main function
    +
    +
    +// Monte Carlo sampling with the Metropolis algorithm  
    +
    +void MonteCarloSampling(int NumberMCsamples, double &cumulative_e, double &cumulative_e2, Vector &VariationalParameters)
    +{
    +
+ // Initialize the seed and call the Mersenne Twister algorithm
    +  std::random_device rd;
    +  std::mt19937_64 gen(rd());
+  // Set up the uniform distribution for x \in [0, 1]
    +  std::uniform_real_distribution<double> UniformNumberGenerator(0.0,1.0);
    +  std::normal_distribution<double> Normaldistribution(0.0,1.0);
    +  // diffusion constant from Schroedinger equation
    +  double D = 0.5; 
    +  double timestep = 0.05;  //  we fix the time step  for the gaussian deviate
    +  // allocate matrices which contain the position of the particles  
    +  Matrix OldPosition( NumberParticles, Dimension), NewPosition( NumberParticles, Dimension);
    +  Matrix OldQuantumForce(NumberParticles, Dimension), NewQuantumForce(NumberParticles, Dimension);
    +  double Energy = 0.0; double EnergySquared = 0.0; double DeltaE = 0.0;
    +  //  initial trial positions
    +  for (int i = 0; i < NumberParticles; i++) { 
    +    for (int j = 0; j < Dimension; j++) {
    +      OldPosition(i,j) = Normaldistribution(gen)*sqrt(timestep);
    +    }
    +  }
    +  double OldWaveFunction = WaveFunction(OldPosition, VariationalParameters);
    +  QuantumForce(OldPosition, OldQuantumForce, VariationalParameters);
    +  // loop over monte carlo cycles 
    +  for (int cycles = 1; cycles <= NumberMCsamples; cycles++){ 
    +    // new position 
    +    for (int i = 0; i < NumberParticles; i++) { 
    +      for (int j = 0; j < Dimension; j++) {
    +	// gaussian deviate to compute new positions using a given timestep
    +	NewPosition(i,j) = OldPosition(i,j) + Normaldistribution(gen)*sqrt(timestep)+OldQuantumForce(i,j)*timestep*D;
    +	//	NewPosition(i,j) = OldPosition(i,j) + gaussian_deviate(&idum)*sqrt(timestep)+OldQuantumForce(i,j)*timestep*D;
    +      }  
    +      //  for the other particles we need to set the position to the old position since
    +      //  we move only one particle at the time
    +      for (int k = 0; k < NumberParticles; k++) {
    +	if ( k != i) {
    +	  for (int j = 0; j < Dimension; j++) {
    +	    NewPosition(k,j) = OldPosition(k,j);
    +	  }
    +	} 
    +      }
    +      double NewWaveFunction = WaveFunction(NewPosition, VariationalParameters); 
    +      QuantumForce(NewPosition, NewQuantumForce, VariationalParameters);
    +      //  we compute the log of the ratio of the greens functions to be used in the 
    +      //  Metropolis-Hastings algorithm
    +      double GreensFunction = 0.0;            
    +      for (int j = 0; j < Dimension; j++) {
    +	GreensFunction += 0.5*(OldQuantumForce(i,j)+NewQuantumForce(i,j))*
    +	  (D*timestep*0.5*(OldQuantumForce(i,j)-NewQuantumForce(i,j))-NewPosition(i,j)+OldPosition(i,j));
    +      }
    +      GreensFunction = exp(GreensFunction);
    +      // The Metropolis test is performed by moving one particle at the time
    +      if(UniformNumberGenerator(gen) <= GreensFunction*NewWaveFunction*NewWaveFunction/OldWaveFunction/OldWaveFunction ) { 
    +	for (int  j = 0; j < Dimension; j++) {
    +	  OldPosition(i,j) = NewPosition(i,j);
    +	  OldQuantumForce(i,j) = NewQuantumForce(i,j);
    +	}
    +	OldWaveFunction = NewWaveFunction;
    +      }
    +    }  //  end of loop over particles
    +    // compute local energy  
    +    double DeltaE = LocalEnergy(OldPosition, VariationalParameters);
    +    // update energies
    +    Energy += DeltaE;
    +    EnergySquared += DeltaE*DeltaE;
    +  }   // end of loop over MC trials   
    +  // update the energy average and its squared 
    +  cumulative_e = Energy/NumberMCsamples;
    +  cumulative_e2 = EnergySquared/NumberMCsamples;
    +}   // end MonteCarloSampling function  
    +
    +
    +// Function to compute the squared wave function and the quantum force
    +
    +double  WaveFunction(Matrix &r, Vector &VariationalParameters)
    +{
    +  double wf = 0.0;
    +  // full Slater determinant for two particles, replace with Slater det for more particles 
    +  wf  = SPwavefunction(singleparticle_pos2(r, 0), VariationalParameters(0))*SPwavefunction(singleparticle_pos2(r, 1),VariationalParameters(0));
    +  // contribution from Jastrow factor
    +  for (int i = 0; i < NumberParticles-1; i++) { 
    +    for (int j = i+1; j < NumberParticles; j++) {
    +      wf *= exp(RelativeDistance(r, i, j)/((1.0+VariationalParameters(1)*RelativeDistance(r, i, j))));
    +    }
    +  }
    +  return wf;
    +}
    +
    +// Function to calculate the local energy without numerical derivation of kinetic energy
    +
    +double  LocalEnergy(Matrix &r, Vector &VariationalParameters)
    +{
    +
    +  // compute the kinetic and potential energy from the single-particle part
    +  // for a many-electron system this has to be replaced by a Slater determinant
    +  // The absolute value of the interparticle length
    +  Matrix length( NumberParticles, NumberParticles);
    +  // Set up interparticle distance
    +  for (int i = 0; i < NumberParticles-1; i++) { 
    +    for(int j = i+1; j < NumberParticles; j++){
    +      length(i,j) = RelativeDistance(r, i, j);
    +      length(j,i) =  length(i,j);
    +    }
    +  }
    +  double KineticEnergy = 0.0;
    +  // Set up kinetic energy from Slater and Jastrow terms
    +  for (int i = 0; i < NumberParticles; i++) { 
    +    for (int k = 0; k < Dimension; k++) {
    +      double sum1 = 0.0; 
    +      for(int j = 0; j < NumberParticles; j++){
    +	if ( j != i) {
    +	  sum1 += JastrowDerivative(r, VariationalParameters(1), i, j, k);
    +	}
    +      }
    +      KineticEnergy += (sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0)))*(sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0)));
    +    }
    +  }
    +  KineticEnergy += -2*VariationalParameters(0)*NumberParticles;
    +  for (int i = 0; i < NumberParticles-1; i++) {
    +      for (int j = i+1; j < NumberParticles; j++) {
    +        KineticEnergy += 2.0/(pow(1.0 + VariationalParameters(1)*length(i,j),2))*(1.0/length(i,j)-2*VariationalParameters(1)/(1+VariationalParameters(1)*length(i,j)) );
    +      }
    +  }
    +  KineticEnergy *= -0.5;
    +  // Set up potential energy, external potential + eventual electron-electron repulsion
    +  double PotentialEnergy = 0;
    +  for (int i = 0; i < NumberParticles; i++) { 
    +    double DistanceSquared = singleparticle_pos2(r, i);
    +    PotentialEnergy += 0.5*DistanceSquared;  // sp energy HO part, note it has the oscillator frequency set to 1!
    +  }
    +  // Add the electron-electron repulsion
    +  for (int i = 0; i < NumberParticles-1; i++) { 
    +    for (int j = i+1; j < NumberParticles; j++) {
    +      PotentialEnergy += 1.0/length(i,j);          
    +    }
    +  }
    +  double LocalE = KineticEnergy+PotentialEnergy;
    +  return LocalE;
    +}
    +
    +// Compute the analytical expression for the quantum force
    +void  QuantumForce(Matrix &r, Matrix &qforce, Vector &VariationalParameters)
    +{
    +  // compute the first derivative 
    +  for (int i = 0; i < NumberParticles; i++) {
    +    for (int k = 0; k < Dimension; k++) {
    +      // single-particle part, replace with Slater det for larger systems
    +      double sppart = DerivativeSPwavefunction(r(i,k),VariationalParameters(0));
    +      //  Jastrow factor contribution
    +      double Jsum = 0.0;
    +      for (int j = 0; j < NumberParticles; j++) {
    +	if ( j != i) {
    +	  Jsum += JastrowDerivative(r, VariationalParameters(1), i, j, k);
    +	}
    +      }
    +      qforce(i,k) = 2.0*(Jsum+sppart);
    +    }
    +  }
    +} // end of QuantumForce function
    +
    +
    +#define ITMAX 200
    +#define EPS 3.0e-8
    +#define TOLX (4*EPS)
    +#define STPMX 100.0
    +
    +void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret,
    +	    double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g))
    +{
    +
    +  int check,i,its,j;
    +  double den,fac,fad,fae,fp,stpmax,sum=0.0,sumdg,sumxi,temp,test;
    +  Vector dg(n), g(n), hdg(n), pnew(n), xi(n);
    +  Matrix hessian(n,n);
    +
    +  fp=(*func)(p);
    +  (*dfunc)(p,g);
    +  for (i = 0;i < n;i++) {
    +    for (j = 0; j< n;j++) hessian(i,j)=0.0;
    +    hessian(i,i)=1.0;
    +    xi(i) = -g(i);
    +    sum += p(i)*p(i);
    +  }
    +  stpmax=STPMX*FMAX(sqrt(sum),(double)n);
    +  for (its=1;its<=ITMAX;its++) {
    +    *iter=its;
    +    lnsrch(n,p,fp,g,xi,pnew,fret,stpmax,&check,func);
    +    fp = *fret;
    +    for (i = 0; i< n;i++) {
    +      xi(i)=pnew(i)-p(i);
    +      p(i)=pnew(i);
    +    }
    +    test=0.0;
    +    for (i = 0;i< n;i++) {
    +      temp=fabs(xi(i))/FMAX(fabs(p(i)),1.0);
    +      if (temp > test) test=temp;
    +    }
    +    if (test < TOLX) {
    +      return;
    +    }
    +    for (i=0;i<n;i++) dg(i)=g(i);
    +    (*dfunc)(p,g);
    +    test=0.0;
    +    den=FMAX(*fret,1.0);
    +    for (i=0;i<n;i++) {
    +      temp=fabs(g(i))*FMAX(fabs(p(i)),1.0)/den;
    +      if (temp > test) test=temp;
    +    }
    +    if (test < gtol) {
    +      return;
    +    }
    +    for (i=0;i<n;i++) dg(i)=g(i)-dg(i);
    +    for (i=0;i<n;i++) {
    +      hdg(i)=0.0;
    +      for (j=0;j<n;j++) hdg(i) += hessian(i,j)*dg(j);
    +    }
    +    fac=fae=sumdg=sumxi=0.0;
    +    for (i=0;i<n;i++) {
    +      fac += dg(i)*xi(i);
    +      fae += dg(i)*hdg(i);
    +      sumdg += SQR(dg(i));
    +      sumxi += SQR(xi(i));
    +    }
    +    if (fac*fac > EPS*sumdg*sumxi) {
    +      fac=1.0/fac;
    +      fad=1.0/fae;
    +      for (i=0;i<n;i++) dg(i)=fac*xi(i)-fad*hdg(i);
    +      for (i=0;i<n;i++) {
    +	for (j=0;j<n;j++) {
    +	  hessian(i,j) += fac*xi(i)*xi(j)
    +	    -fad*hdg(i)*hdg(j)+fae*dg(i)*dg(j);
    +	}
    +      }
    +    }
    +    for (i=0;i<n;i++) {
    +      xi(i)=0.0;
    +      for (j=0;j<n;j++) xi(i) -= hessian(i,j)*g(j);
    +    }
    +  }
    +  cout << "too many iterations in dfpmin" << endl;
    +}
    +#undef ITMAX
    +#undef EPS
    +#undef TOLX
    +#undef STPMX
    +
    +#define ALF 1.0e-4
    +#define TOLX 1.0e-7
    +
    +void lnsrch(int n, Vector &xold, double fold, Vector &g, Vector &p, Vector &x,
    +	    double *f, double stpmax, int *check, double (*func)(Vector &p))
    +{
    +  int i;
    +  double a,alam,alam2,alamin,b,disc,f2,fold2,rhs1,rhs2,slope,sum,temp,
    +    test,tmplam;
    +
    +  *check=0;
    +  for (sum=0.0,i=0;i<n;i++) sum += p(i)*p(i);
    +  sum=sqrt(sum);
    +  if (sum > stpmax)
    +    for (i=0;i<n;i++) p(i) *= stpmax/sum;
    +  for (slope=0.0,i=0;i<n;i++)
    +    slope += g(i)*p(i);
    +  test=0.0;
    +  for (i=0;i<n;i++) {
    +    temp=fabs(p(i))/FMAX(fabs(xold(i)),1.0);
    +    if (temp > test) test=temp;
    +  }
    +  alamin=TOLX/test;
    +  alam=1.0;
    +  for (;;) {
    +    for (i=0;i<n;i++) x(i)=xold(i)+alam*p(i);
    +    *f=(*func)(x);
    +    if (alam < alamin) {
    +      for (i=0;i<n;i++) x(i)=xold(i);
    +      *check=1;
    +      return;
    +    } else if (*f <= fold+ALF*alam*slope) return;
    +    else {
    +      if (alam == 1.0)
    +	tmplam = -slope/(2.0*(*f-fold-slope));
    +      else {
    +	rhs1 = *f-fold-alam*slope;
    +	rhs2=f2-fold2-alam2*slope;
    +	a=(rhs1/(alam*alam)-rhs2/(alam2*alam2))/(alam-alam2);
    +	b=(-alam2*rhs1/(alam*alam)+alam*rhs2/(alam2*alam2))/(alam-alam2);
    +	if (a == 0.0) tmplam = -slope/(2.0*b);
    +	else {
    +	  disc=b*b-3.0*a*slope;
    +	  if (disc<0.0) cout << "Roundoff problem in lnsrch." << endl;
    +	  else tmplam=(-b+sqrt(disc))/(3.0*a);
    +	}
    +	if (tmplam>0.5*alam)
    +	  tmplam=0.5*alam;
    +      }
    +    }
    +    alam2=alam;
    +    f2 = *f;
    +    fold2=fold;
    +    alam=FMAX(tmplam,0.1*alam);
    +  }
    +}
    +#undef ALF
    +#undef TOLX
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    What is OpenMP

    +
    + +

    +

    +

    Many good tutorials online and excellent textbook

    +
      +
1. Using OpenMP, by B. Chapman, G. Jost, and R. van der Pas
2. Many tutorials online, like the official OpenMP site
    +
    + + +









    +

    Getting started, things to remember

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
    #include <omp.h>
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    #pragma omp...
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    OpenMP syntax

    + + + +
    +
    +
    +
    +
    +
    #pragma omp construct [ clause ...]
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    #include <omp.h>
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Different OpenMP styles of parallelism

    +

    OpenMP supports several different ways to specify thread parallelism

    + + +









    +

    General code structure

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +main ()
    +{
    +int var1, var2, var3;
    +/* serial code */
    +/* ... */
    +/* start of a parallel region */
    +#pragma omp parallel private(var1, var2) shared(var3)
    +{
    +/* ... */
    +}
    +/* more serial code */
    +/* ... */
    +/* another parallel region */
    +#pragma omp parallel
    +{
    +/* ... */
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Parallel region

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    Hello world, not again, please!

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#include <cstdio>
    +int main (int argc, char *argv[])
    +{
    +int th_id, nthreads;
    +#pragma omp parallel private(th_id) shared(nthreads)
    +{
    +th_id = omp_get_thread_num();
    +printf("Hello World from thread %d\n", th_id);
    +#pragma omp barrier
    +if ( th_id == 0 ) {
    +nthreads = omp_get_num_threads();
    +printf("There are %d threads\n",nthreads);
    +}
    +}
    +return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Hello world, yet another variant

    +
    + +

    + + +

    +
    +
    +
    +
    +
#include <iostream>
+#include <omp.h>
+using namespace std; // needed for cout
+int main(int argc, char *argv[]) 
    +{
    + omp_set_num_threads(4); 
    +#pragma omp parallel
    + {
    +   int id = omp_get_thread_num();
    +   int nproc = omp_get_num_threads(); 
    +   cout << "Hello world with id number and processes " <<  id <<  nproc << endl;
    + } 
    +return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Variables declared outside of the parallel region are shared by all threads. If a variable like id had been declared outside of the

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel, 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

it would have been shared by the various threads, possibly causing erroneous output.

    + +
    + + +









    +

    Important OpenMP library routines

    +
    + +

    + +
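The list itself was lost in the conversion. The runtime routines that appear in the examples below (standard OpenMP API calls, listed here as a reminder) can be exercised with a small sketch like this:

#include <omp.h>
#include <iostream>
using namespace std;
int main()
{
  omp_set_num_threads(4);                                 // request a number of threads
  cout << "procs: "   << omp_get_num_procs()   << endl;   // processors available
  cout << "threads: " << omp_get_max_threads() << endl;   // maximum threads for a parallel region
  double t0 = omp_get_wtime();                            // wall-clock timer
#pragma omp parallel
  {
#pragma omp critical
    cout << "thread " << omp_get_thread_num()
         << " of "    << omp_get_num_threads() << endl;   // id and team size
  }
  cout << "elapsed: " << omp_get_wtime() - t0 << endl;
  return 0;
}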

    +
    + + +









    +

    Private variables

    +
    + +

    +

The private clause can be used to make thread-private versions of such variables:

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel private(id)
    +{
    + int id = omp_get_thread_num();
    + cout << "My thread num" << id << endl; 
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    Master region

    +
    + +

    +

    It is often useful to have only one thread execute some of the code in a parallel region. I/O statements are a common example

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel 
    +{
    +  #pragma omp master
    +   {
    +      int id = omp_get_thread_num();
    +      cout << "My thread num" << id << endl; 
    +   } 
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Parallel for loop

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
    #pragma omp for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    Parallel computations and loops

    + +
    + +

    +

    OpenMP provides an easy way to parallelize a loop

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +  for (i=0; i<n; i++) c[i] = a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

OpenMP handles the loop index variable automatically (no need to declare it in the for loop or make it private)

    + +

Which thread handles which iterations? There are several options.

    +
    + + +









    +

    Scheduling of loop computations

    + +
    + +

    +

We can let the OpenMP runtime decide. The decision is about how the loop iterations are scheduled, and OpenMP defines three choices of loop scheduling:

    +
      +
1. Static: Predefined at compile time. Lowest overhead, predictable
2. Dynamic: Selection made at runtime
3. Guided: Special case of dynamic; attempts to reduce overhead
    +
    + + +









    +

    Example code for loop scheduling

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#define CHUNKSIZE 100
    +#define N 1000
    +int main (int argc, char *argv[])
    +{
    +int i, chunk;
    +float a[N], b[N], c[N];
    +for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
    +chunk = CHUNKSIZE;
    +#pragma omp parallel shared(a,b,c,chunk) private(i)
    +{
    +#pragma omp for schedule(dynamic,chunk)
    +for (i=0; i < N; i++) c[i] = a[i] + b[i];
    +} /* end of parallel region */
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Example code for loop scheduling, guided instead of dynamic

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#define CHUNKSIZE 100
    +#define N 1000
    +int main (int argc, char *argv[])
    +{
    +int i, chunk;
    +float a[N], b[N], c[N];
    +for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
    +chunk = CHUNKSIZE;
    +#pragma omp parallel shared(a,b,c,chunk) private(i)
    +{
    +#pragma omp for schedule(guided,chunk)
    +for (i=0; i < N; i++) c[i] = a[i] + b[i];
    +} /* end of parallel region */
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    More on Parallel for loop

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
    // #pragma omp parallel and #pragma omp for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    can be combined into

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    What can happen with this loop?

    + +
    + +

    +

    What happens with code like this

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +for (i=0; i<n; i++) sum += a[i]*a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    All threads can access the sum variable, but the addition is not atomic! It is important to avoid race between threads. So-called reductions in OpenMP are thus important for performance and for obtaining correct results. OpenMP lets us indicate that a variable is used for a reduction with a particular operator. The above code becomes

    + + +
    +
    +
    +
    +
    +
    sum = 0.0;
    +#pragma omp parallel for reduction(+:sum)
    +for (i=0; i<n; i++) sum += a[i]*a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Inner product

    +
    + +

$$
\sum_{i=0}^{n-1} a_ib_i
$$

    +
    +
    +
    +
    +
    int i;
    +double sum = 0.;
    +/* allocating and initializing arrays */
    +/* ... */
    +#pragma omp parallel for default(shared) private(i) reduction(+:sum)
+ for (i=0; i<N; i++) sum += a[i]*b[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Different threads do different tasks

    +
    + +

    + +

    Different threads do different tasks independently, each section is executed by one thread.

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel
    +{
    +#pragma omp sections
    +{
    +#pragma omp section
    +funcA ();
    +#pragma omp section
    +funcB ();
    +#pragma omp section
    +funcC ();
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Single execution

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp single { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

The code is executed by one thread only; there is no guarantee which thread.

    + +

    Can introduce an implicit barrier at the end

    + + +
    +
    +
    +
    +
    +
    #pragma omp master { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Code executed by the master thread, guaranteed and no implicit barrier at the end.
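A minimal sketch, not from the slides, contrasting the two constructs:

#include <omp.h>
#include <iostream>
using namespace std;
int main()
{
#pragma omp parallel
  {
#pragma omp single
    {
      cout << "single: executed by one (unspecified) thread" << endl;
    } // implicit barrier here: the other threads wait
#pragma omp master
    {
      cout << "master: executed by the master thread only" << endl;
    } // no implicit barrier here
  }
  return 0;
}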

    +
    + + +









    +

    Coordination and synchronization

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp barrier
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Synchronization, must be encountered by all threads in a team (or none)

    + + +
    +
    +
    +
    +
    +
    #pragma omp ordered { a block of codes }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

is another form of synchronization (in sequential order). The form

    + + +
    +
    +
    +
    +
    +
    #pragma omp critical { a block of codes }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and

    + + +
    +
    +
    +
    +
    +
    #pragma omp atomic { single assignment statement }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    is more efficient than

    + + +
    +
    +
    +
    +
    +
    #pragma omp critical
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
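A hedged sketch, not from the slides, of the difference: atomic protects a single update of a scalar, while critical protects an arbitrary block. The shared counters below are only illustrative.

#include <omp.h>
#include <iostream>
using namespace std;
int main()
{
  int counter_atomic = 0, counter_critical = 0;
#pragma omp parallel for
  for (int i = 0; i < 1000; i++) {
#pragma omp atomic            // single assignment statement, cheap
    counter_atomic += 1;
#pragma omp critical          // whole block, more general but slower
    {
      counter_critical += 1;
    }
  }
  cout << counter_atomic << " " << counter_critical << endl;
  return 0;
}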









    +

    Data scope

    +
    + +

    +

    +

What are the purposes of these attributes?
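The attribute list on this slide was lost in the conversion. The main data-scope clauses (shared, private, firstprivate and reduction; standard OpenMP, not reconstructed from the slide) are illustrated in this small sketch:

#include <omp.h>
#include <iostream>
using namespace std;
int main()
{
  const int n = 100;
  double x[n];
  for (int i = 0; i < n; i++) x[i] = 1.0;
  int offset = 5;      // firstprivate: each thread gets a copy initialized to 5
  double sum = 0.0;    // reduction: per-thread partial sums are combined at the end
#pragma omp parallel for shared(x) firstprivate(offset) reduction(+:sum)
  for (int i = 0; i < n; i++) {
    sum += x[i] + offset;    // x is shared (read only here), offset is a private copy
  }
  cout << "sum = " << sum << endl;   // 100*(1+5) = 600
  return 0;
}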

    + +
    + + +









    +

    Some remarks

    +
    + +

    + +

    +
    + + +









    +

    Parallelizing nested for-loops

    +
    + +

    + +

    + + +
    +
    +
    +
    +
    +
for (i=0; i<100; i++) {
+    for (j=0; j<100; j++) {
+        a[i][j] = b[i][j] + c[i][j];
+    }
+}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + + +
    +
    +
    +
    +
    +
#pragma omp parallel for private(j)
+for (i=0; i<100; i++) {
+    for (j=0; j<100; j++) {
+       a[i][j] = b[i][j] + c[i][j];
+    }
+}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +
    + + +









    +

    Nested parallelism

    +
    + +

    +

    When a thread in a parallel region encounters another parallel construct, it +may create a new team of threads and become the master of the new +team. +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel num_threads(4)
    +{
    +/* .... */
    +#pragma omp parallel num_threads(2)
    +{
    +//  
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Parallel tasks

    +
    + +

    + + +

    +
    +
    +
    +
    +
#pragma omp task 
+#pragma omp parallel shared(p_vec) private(i)
+{
+#pragma omp single
+{
+for (i=0; i<N; i++) {
+  double r = random_number();
+  if (p_vec[i] > r) {
+#pragma omp task
+   do_work (p_vec[i]);
+  }
+}
+} // end single
+} // end parallel region
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Common mistakes

    +
    + +

    +

    Race condition

    + + +
    +
    +
    +
    +
    +
    int nthreads;
    +#pragma omp parallel shared(nthreads)
    +{
    +nthreads = omp_get_num_threads();
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
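A hedged sketch, not from the slides, of one way to remove this race: let only one thread perform the assignment.

#include <omp.h>
#include <iostream>
using namespace std;
int main()
{
  int nthreads;
#pragma omp parallel shared(nthreads)
  {
#pragma omp single
    nthreads = omp_get_num_threads();  // one thread writes; the others wait at the implicit barrier
  }
  cout << "There are " << nthreads << " threads" << endl;
  return 0;
}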

    Deadlock

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel
    +{
    +...
    +#pragma omp critical
    +{
    +...
    +#pragma omp barrier
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +

    Not all computations are simple

    +
    + +

    +

    Not all computations are simple loops where the data can be evenly +divided among threads without any dependencies between threads +

    + +

    An example is finding the location and value of the largest element in an array

    + + +
    +
    +
    +
    +
    +
    for (i=0; i<n; i++) { 
    +   if (x[i] > maxval) {
    +      maxval = x[i];
    +      maxloc = i; 
    +   }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +

    Not all computations are simple, competing threads

    +
    + +

    +

    All threads are potentially accessing and changing the same values, maxloc and maxval.

    +
      +
    1. OpenMP provides several ways to coordinate access to shared values
    + + +
    +
    +
    +
    +
    +
    #pragma omp atomic
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    1. Only one thread at a time can execute the following statement (not block). We can use the critical option
    + + +
    +
    +
    +
    +
    +
    #pragma omp critical
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    1. Only one thread at a time can execute the following block
    +

    Atomic may be faster than critical but depends on hardware

    +
    + + +









    +

    How to find the max value using OpenMP

    +
    + +

    +

    Write down the simplest algorithm and look carefully for race conditions. How would you handle them? +The first step would be to parallelize as +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    + for (i=0; i<n; i++) {
    +    if (x[i] > maxval) {
    +      maxval = x[i];
    +      maxloc = i; 
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Then deal with the race conditions

    +
    + +

    +

Write down the simplest algorithm and look carefully for race conditions. Here the update of maxval and maxloc is the race condition; one way to handle it is to protect the update with a critical region:

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    + for (i=0; i<n; i++) {
    +#pragma omp critical
    +  {
    +     if (x[i] > maxval) {
    +       maxval = x[i];
    +       maxloc = i; 
    +     }
    +  }
    +} 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Exercise: write a code which implements this and give an estimate of the performance. Perform several runs, with a serial code only, with and without vectorization, and compare the serial code with the one that uses OpenMP. Run on different architectures if you can.

    +
    + +









    +

    What can slow down OpenMP performance?

    +

    Give it a thought!

    + +









    +

    What can slow down OpenMP performance?

    +
    + +

    +

Performance is poor because we insisted on keeping track of maxval and its location during the execution of the loop.

    + +

This is a common source of performance issues: the description of the method used to compute a value imposes additional, unnecessary requirements or properties.

Idea: have each thread find the maxloc in its own data, then combine the results; use temporary arrays indexed by thread number to hold the values found by each thread.
    + + +









    +

    Find the max location for each thread

    +
    + +

    + + +

    +
    +
    +
    +
    +
    int maxloc[MAX_THREADS], mloc;
    +double maxval[MAX_THREADS], mval; 
    +#pragma omp parallel shared(maxval,maxloc)
    +{
    +  int id = omp_get_thread_num(); 
    +  maxval[id] = -1.0e30;
    +#pragma omp for
    +   for (int i=0; i<n; i++) {
    +       if (x[i] > maxval[id]) { 
    +           maxloc[id] = i;
    +           maxval[id] = x[i]; 
    +       }
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Combine the values from each thread

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp flush (maxloc,maxval)
    +#pragma omp master
    +  {
    +    int nt = omp_get_num_threads(); 
    +    mloc = maxloc[0]; 
    +    mval = maxval[0]; 
    +    for (int i=1; i<nt; i++) {
    +        if (maxval[i] > mval) { 
    +           mval = maxval[i]; 
    +           mloc = maxloc[i];
    +        } 
    +     }
    +   }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Note that we let the master thread perform the last operation.

    +
    + +









    +

Vector norm with OpenMP

    +

This code computes the norm of a vector using OpenMP.

    + + +
    +
    +
    +
    +
    +
    //  OpenMP program to compute vector norm by adding two other vectors
    +#include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include  <omp.h>
    +# include <ctime>
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of vector
    +  int n = atoi(argv[1]);
    +  double *a, *b, *c;
    +  int i;
    +  int thread_num;
    +  double wtime, Norm2, s, angle;
    +  cout << "  Perform addition of two vectors and compute the norm-2." << endl;
    +  omp_set_num_threads(4);
    +  thread_num = omp_get_max_threads ();
    +  cout << "  The number of processors available = " << omp_get_num_procs () << endl ;
    +  cout << "  The number of threads available    = " << thread_num <<  endl;
    +  cout << "  The matrix order n                 = " << n << endl;
    +
    +  s = 1.0/sqrt( (double) n);
    +  wtime = omp_get_wtime ( );
    +  // Allocate space for the vectors to be used
    +  a = new double [n]; b = new double [n]; c = new double [n];
    +  // Define parallel region
    +# pragma omp parallel for default(shared) private (angle, i) reduction(+:Norm2)
    +  // Set up values for vectors  a and b
    +  for (i = 0; i < n; i++){
    +      angle = 2.0*M_PI*i/ (( double ) n);
    +      a[i] = s*(sin(angle) + cos(angle));
    +      b[i] =  s*sin(2.0*angle);
    +      c[i] = 0.0;
    +  }
    +  // Then perform the vector addition
    +  for (i = 0; i < n; i++){
    +     c[i] += a[i]+b[i];
    +  }
    +  // Compute now the norm-2
    +  Norm2 = 0.0;
    +  for (i = 0; i < n; i++){
    +     Norm2  += c[i]*c[i];
    +  }
    +// end parallel region
    +  wtime = omp_get_wtime ( ) - wtime;
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for norm-2 computation=" << wtime  << endl;
    +  cout << " Norm-2  = " << Norm2 << endl;
    +  // Free up space
    +  delete[] a;
    +  delete[] b;
    +  delete[] c;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
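A typical way to compile, link and run this example with the GNU compilers is shown below; the file name OpenMPvectornorm.cpp is the one used in the course repository, and the -fopenmp flag enables the OpenMP pragmas.

c++ -O3 -fopenmp -o OpenMPvectornorm.x OpenMPvectornorm.cpp
./OpenMPvectornorm.x 10000000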









    +

    Matrix-matrix multiplication

    +

This is the matrix-matrix multiplication code with plain C++ memory allocation, using OpenMP.

    + + + +
    +
    +
    +
    +
    +
    //  Matrix-matrix multiplication and Frobenius norm of a matrix with OpenMP
    +#include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include  <omp.h>
    +# include <ctime>
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double **A, **B, **C;
    +  int i, j, k;
    +  int thread_num;
    +  double wtime, Fsum, s, angle;
    +  cout << "  Compute matrix product C = A * B and Frobenius norm." << endl;
    +  omp_set_num_threads(4);
    +  thread_num = omp_get_max_threads ();
    +  cout << "  The number of processors available = " << omp_get_num_procs () << endl ;
    +  cout << "  The number of threads available    = " << thread_num <<  endl;
    +  cout << "  The matrix order n                 = " << n << endl;
    +
    +  s = 1.0/sqrt( (double) n);
    +  wtime = omp_get_wtime ( );
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n]; C = new double*[n];
    +  for (i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +    C[i] = new double[n];
    +  }
    +  // Define parallel region
    +# pragma omp parallel for default(shared) private (angle, i, j, k) reduction(+:Fsum)
    +  // Set up values for matrix A and B and zero matrix C
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +      angle = 2.0*M_PI*i*j/ (( double ) n);
    +      A[i][j] = s * ( sin ( angle ) + cos ( angle ) );
    +      B[j][i] =  A[i][j];
    +    }
    +  }
    +  // Then perform the matrix-matrix multiplication
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +       C[i][j] =  0.0;    
    +       for (k = 0; k < n; k++) {
    +            C[i][j] += A[i][k]*B[k][j];
    +       }
    +    }
    +  }
    +  // Compute now the Frobenius norm
    +  Fsum = 0.0;
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +      Fsum += C[i][j]*C[i][j];
    +    }
    +  }
    +  Fsum = sqrt(Fsum);
    +// end parallel region and letting only one thread perform I/O
    +  wtime = omp_get_wtime ( ) - wtime;
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for matrix-matrix multiplication=" << wtime  << endl;
    +  cout << "  Frobenius norm  = " << Fsum << endl;
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +    delete[] C[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  delete[] C;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Overview of week 11, March 11-15

    -
    -Teaching Material, videos and written material -

    -

    -
    - +

Note: these notes contain additional material on optimization and parallelization. Parts of this material will be discussed this week.











    Why resampling methods ?


Introducing the correlation function

    Resampling methods: Blocking

The blocking method was made popular by Flyvbjerg and Petersen (1989) and has become one of the standard ways to estimate the variance \( \mathrm{var}(\widehat{\theta}) \) for exactly one estimator \( \widehat{\theta} \), namely \( \widehat{\theta} = \overline{X} \), the mean value.

Assume \( n = 2^d \) for some integer \( d>1 \) and \( X_1,X_2,\cdots, X_n \) is a stationary time series to begin with.

Blocking Transformations, final expressions









    +

    More on the blocking method

    +

Flyvbjerg and Petersen demonstrated that the sequence \( \{e_k\}_{k=0}^{d-1} \) is decreasing, and conjectured that the term \( e_k \) can be made as small as we would like by making \( k \) (and hence \( d \)) sufficiently large. The sequence is indeed decreasing. It means we can apply blocking transformations until \( e_k \) is sufficiently small, and then estimate \( \mathrm{var}(\overline{X}) \) by \( \widehat{\sigma}^2_k/n_k \).

    Resampling analysis

    +









    +

    Content

    + +









    +

    Optimization and profiling

    +
    + +

    + +

Till now we have not paid much attention to speed and to the optimization possibilities inherent in the various compilers. We have compiled and linked as

    + + +
    +
    +
    +
    +
    +
    c++  -c  mycode.cpp
    +c++  -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

For Fortran, replace c++ with for example gfortran or ifort. This is what we call a flat compiler option and it should be used when we develop the code. It normally produces a very large and slow code when translated to machine instructions. We use this option for debugging and for establishing the correct program output, because every operation is done precisely as the user specified it.

    + +

    It is instructive to look up the compiler manual for further instructions by writing

    + + +
    +
    +
    +
    +
    +
    man c++
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +









    +

    More on optimization

    +
    + +

    +

We have additional compiler options for optimization. These may include procedure inlining (where performance may be improved), moving loop-invariant constants outside loops, identifying potential parallelism, automatic vectorization, or replacing a division with a reciprocal and a multiplication if this speeds up the code.

    + + +
    +
    +
    +
    +
    +
    c++  -O3 -c  mycode.cpp
    +c++  -O3 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    This (other options are -O2 or -Ofast) is the recommended option.
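As a small illustration of the last point above, here is a sketch (not something a compiler is guaranteed to do) of the reciprocal transformation; the variables a, b, d, i and n are assumed declared as in the earlier fragments.

// Division inside the loop: one division per iteration
for (i = 0; i < n; i++){
    a[i] = b[i]/d;
}
// Equivalent form an optimizing compiler may generate: one division in total
double dinv = 1.0/d;
for (i = 0; i < n; i++){
    a[i] = b[i]*dinv;
}

Note that the two forms may give slightly different floating point results, which is one reason such rewrites are tied to aggressive optimization levels.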

    +
    + +









    +

    Optimization and profiling

    +
    + +

    +

It is also useful to profile your program during the development stage. You would then compile with

    + + +
    +
    +
    +
    +
    +
    c++  -pg -O3 -c  mycode.cpp
    +c++  -pg -O3 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    After you have run the code you can obtain the profiling information via

    + + +
    +
    +
    +
    +
    +
    gprof mycode.exe >  ProfileOutput
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

When you have properly profiled your code, you must remove this option as it slows down performance. For memory tests use valgrind. An excellent environment for all these aspects, and much more, is Qt Creator.
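For example, a minimal memory check with valgrind on the executable used above looks like

valgrind --leak-check=full ./mycode.exe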

    +
    + + +









    +

    Optimization and debugging

    +
    + +

    +

Adding debugging options is a very useful alternative during the development stage of a program. You would then compile with

    + + +
    +
    +
    +
    +
    +
    c++  -g -O0 -c  mycode.cpp
    +c++  -g -O0 -o  mycode.exe  mycode.o
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    This option generates debugging information allowing you to trace for example if an array is properly allocated. Some compilers work best with the no optimization option -O0.
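With the -g flag in place you can, as a minimal sketch, run the executable under gdb and inspect the call stack after a crash; the executable name is the one used above.

gdb ./mycode.exe
(gdb) run
(gdb) backtrace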

    +
    + +
    +Other optimization flags +

    +

    Depending on the compiler, one can add flags which generate code that catches integer overflow errors. +The flag -ftrapv does this for the CLANG compiler on OS X operating systems. +

    +
    + + +









    +

    Other hints

    +
    + +

    +

    In general, irrespective of compiler options, it is useful to

    + +

    Here is an example of a part of a program where specific operations lead to a slower code

    + + +
    +
    +
    +
    +
    +
    k = n-1;
    +for (i = 0; i < n; i++){
    +    a[i] = b[i] +c*d;
    +    e = g[k];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    A better code is

    + + +
    +
    +
    +
    +
    +
    temp = c*d;
    +for (i = 0; i < n; i++){
    +    a[i] = b[i] + temp;
    +}
    +e = g[n-1];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Here we avoid a repeated multiplication inside a loop. +Most compilers, depending on compiler flags, identify and optimize such bottlenecks on their own, without requiring any particular action by the programmer. However, it is always useful to single out and avoid code examples like the first one discussed here. +

    +
    + + +









    +

    Vectorization and the basic idea behind parallel computing

    +
    + +

    +

    Present CPUs are highly parallel processors with varying levels of parallelism. The typical situation can be described via the following three statements.

    + +

    Before we proceed with a more detailed discussion of topics like vectorization and parallelization, we need to remind ourselves about some basic features of different hardware models.

    +
    + + +









    +

    A rough classification of hardware models

    +
    + +

    + +

    +
    + +









    +

    Shared memory and distributed memory

    +
    + +

    +

    One way of categorizing modern parallel computers is to look at the memory configuration.

    + +

    The CPUs are connected by some network and may exchange messages.

    +
    + + +









    +

    Different parallel programming paradigms

    +
    + +

    + +

    +
    + +









    +

    Different parallel programming paradigms

    +
    + +

    + +

    +
    + + + +

    What is vectorization?

    +

    Vectorization is a special +case of Single Instructions Multiple Data (SIMD) to denote a single +instruction stream capable of operating on multiple data elements in +parallel. +We can think of vectorization as the unrolling of loops accompanied with SIMD instructions. +

    + +

    Vectorization is the process of converting an algorithm that performs scalar operations +(typically one operation at the time) to vector operations where a single operation can refer to many simultaneous operations. +Consider the following example +

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    If the code is not vectorized, the compiler will simply start with the first element and +then perform subsequent additions operating on one address in memory at the time. +

    + + +

Number of elements that can be acted upon

    +

    A SIMD instruction can operate on multiple data elements in one single instruction. +It uses the so-called 128-bit SIMD floating-point register. +In this sense, vectorization adds some form of parallelism since one instruction is applied +to many parts of say a vector. +

    + +

    The number of elements which can be operated on in parallel +range from four single-precision floating point data elements in so-called +Streaming SIMD Extensions and two double-precision floating-point data +elements in Streaming SIMD Extensions 2 to sixteen byte operations in +a 128-bit register in Streaming SIMD Extensions 2. Thus, vector-length +ranges from 2 to 16, depending on the instruction extensions used and +on the data type. +

    + +

In summary, our instructions operate on 128 bit (16 byte) operands.

    + + +

Number of elements that can be acted upon, examples

    +

    We start with the simple scalar operations given by

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

If the code is not vectorized and we have a 128-bit register to store a 32-bit floating point number, it means that we have \( 3\times 32 \) bits that are not used.

    + +

    We have thus unused space in our SIMD registers. These registers could hold three additional integers.

    + + +

    Operation counts for scalar operation

    +

    The code

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i++){
    +    a[i] = b[i] + c[i];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    has for \( n \) repeats

    +
      +
1. one load for \( c[i] \) in address 1
2. one load for \( b[i] \) in address 2
3. add \( c[i] \) and \( b[i] \) to give \( a[i] \)
4. store \( a[i] \) in address 2
    + +

Number of elements that can be acted upon, examples

    +

If we vectorize the code, we can perform, with a 128-bit register, four simultaneous operations, that is we have

    + + +
    +
    +
    +
    +
    +
    for (i = 0; i < n; i+=4){
    +    a[i] = b[i] + c[i];
    +    a[i+1] = b[i+1] + c[i+1];
    +    a[i+2] = b[i+2] + c[i+2];
    +    a[i+3] = b[i+3] + c[i+3];
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Four additions are now done in a single step.
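In terms of hardware instructions, the unrolled loop above corresponds roughly to the following sketch using 128-bit SSE intrinsics for single precision. This is only an illustration of what the compiler normally generates for us; a, b and c are assumed to be float arrays and n a multiple of 4.

#include <xmmintrin.h>   // SSE intrinsics, 128-bit registers

for (int i = 0; i < n; i += 4) {
    __m128 vb = _mm_loadu_ps(&b[i]);   // load four floats from b
    __m128 vc = _mm_loadu_ps(&c[i]);   // load four floats from c
    __m128 va = _mm_add_ps(vb, vc);    // four additions in one instruction
    _mm_storeu_ps(&a[i], va);          // store four floats to a
}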

    + + +

    Number of operations when vectorized

    +

    For \( n/4 \) repeats assuming floats or integers

    +
      +
1. one vector load for \( c[i] \) in address 1
2. one load for \( b[i] \) in address 2
3. add \( c[i] \) and \( b[i] \) to give \( a[i] \)
4. store \( a[i] \) in address 2
    +









    +

    A simple test case with and without vectorization

    +

    We implement these operations in a simple c++ program that computes at the end the norm of a vector.

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double s = 1.0/sqrt( (double) n);
    +  double *a, *b, *c;
    +  // Start timing
    +  clock_t start, finish;
    +  start = clock();
    +// Allocate space for the vectors to be used
    +    a = new double [n]; b = new double [n]; c = new double [n];
    +  // Define parallel region
    +  // Set up values for vectors  a and b
    +  for (int i = 0; i < n; i++){
    +    double angle = 2.0*M_PI*i/ (( double ) n);
    +    a[i] = s*(sin(angle) + cos(angle));
    +    b[i] =  s*sin(2.0*angle);
    +    c[i] = 0.0;
    +  }
    +  // Then perform the vector addition
    +  for (int i = 0; i < n; i++){
    +    c[i] += a[i]+b[i];
    +  }
    +  // Compute now the norm-2
    +  double Norm2 = 0.0;
    +  for (int i = 0; i < n; i++){
    +    Norm2  += c[i]*c[i];
    +  }
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for norm computation=" << timeused  << endl;
    +  cout << "  Norm-2  = " << Norm2 << endl;
    +  // Free up space
    +  delete[] a;
    +  delete[] b;
    +  delete[] c;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +

    Compiling with and without vectorization

    +

    We can compile and link without vectorization using the clang c++ compiler

    + + +
    +
    +
    +
    +
    +
clang++ -o novec.x vecexample.cpp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and with vectorization (and additional optimizations)

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The speedup depends on the size of the vectors. In the example here we have run with \( 10^7 \) elements. +The example here was run on an IMac17.1 with OSX El Capitan (10.11.4) as operating system and an Intel i5 3.3 GHz CPU. +

    + + +
    +
    +
    +
    +
    +
    Compphys:~ hjensen$ ./vec.x 10000000
    +Time used  for norm computation=0.04720500000
    +Compphys:~ hjensen$ ./novec.x 10000000
    +Time used  for norm computation=0.03311700000
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

This particular C++ compiler speeds up the above loop operations with a factor of 1.5. Performing the same operations for \( 10^9 \) elements results in a smaller speedup since reading from main memory is required. The non-vectorized code is seemingly faster.

    + + +
    +
    +
    +
    +
    +
    Compphys:~ hjensen$ ./vec.x 1000000000
    +Time used  for norm computation=58.41391100
    +Compphys:~ hjensen$ ./novec.x 1000000000
    +Time used  for norm computation=46.51295300
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    We will discuss these issues further in the next slides.

    + + +

    Compiling with and without vectorization using clang

    +

    We can compile and link without vectorization with clang compiler

    + + +
    +
    +
    +
    +
    +
clang++ -fno-vectorize -o novec.x vecexample.cpp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and with vectorization

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    We can also add vectorization analysis, see for example

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass-analysis=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    or figure out if vectorization was missed

    + + +
    +
    +
    +
    +
    +
    clang++ -O3 -Rpass-missed=loop-vectorize -o  vec.x vecexample.cpp 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Automatic vectorization and vectorization inhibitors, criteria

    + +

    Not all loops can be vectorized, as discussed in Intel's guide to vectorization

    + +

An important criterion is that the loop counter \( n \) is known at the entry of the loop.

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    a[j] = cos(j*1.0);
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

The variable \( n \) does not need to be known at compile time. However, it must stay the same for the entire duration of the loop. It also implies that an exit statement inside the loop cannot be data dependent.

    + +









    +

    Automatic vectorization and vectorization inhibitors, exit criteria

    + +

    An exit statement should in general be avoided. +If the exit statement contains data-dependent conditions, the loop cannot be vectorized. +The following is an example of a non-vectorizable loop +

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    a[j] = cos(j*1.0);
    +    if (a[j] < 0 ) break;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Avoid loop termination conditions and opt for a single entry loop variable \( n \). The lower and upper bounds have to be kept fixed within the loop.

    + +









    +

    Automatic vectorization and vectorization inhibitors, straight-line code

    + +

SIMD instructions perform the same type of operations multiple times. A switch statement thus leads to a non-vectorizable loop since different statements cannot branch. The following code can however be vectorized since the if statement is implemented as a masked assignment.

    + + +
    +
    +
    +
    +
    +
      for (int j = 0; j < n; j++) {
    +    double x  = cos(j*1.0);
    +    if (x > 0 ) {
    +       a[j] =  x*sin(j*2.0); 
    +    }
    +    else {
    +       a[j] = 0.0;
    +    }
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    These operations can be performed for all data elements but only those elements which the mask evaluates as true are stored. In general, one should avoid branches such as switch, go to, or return statements or if constructs that cannot be treated as masked assignments.

    + +









    +

    Automatic vectorization and vectorization inhibitors, nested loops

    + +

    Only the innermost loop of the following example is vectorized

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      for (int j = 0; j < n; j++) {
    +           a[i][j] += b[i][j];
    +      }  
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The exception is if an original outer loop is transformed into an inner loop as the result of compiler optimizations.

    + +









    +

    Automatic vectorization and vectorization inhibitors, function calls

    + +

Calls to programmer-defined functions ruin vectorization. However, calls to intrinsic functions like \( \sin{x} \), \( \cos{x} \), \( \exp{x} \) etc. are allowed since they are normally efficiently vectorized. The following example is fully vectorizable

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      a[i] = log10(i)*cos(i);
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

Similarly, inline functions defined by the programmer allow for vectorization since the function statements are glued into the actual place where the function is called.

    + +









    +

    Automatic vectorization and vectorization inhibitors, data dependencies

    + +

    One has to keep in mind that vectorization changes the order of operations inside a loop. A so-called +read-after-write statement with an explicit flow dependency cannot be vectorized. The following code +

    + + +
    +
    +
    +
    +
    +
      double b = 15.;
    +  for (int i = 1; i < n; i++) {
    +      a[i] = a[i-1] + b;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    is an example of flow dependency and results in wrong numerical results if vectorized. For a scalar operation, the value \( a[i-1] \) computed during the iteration is loaded into the right-hand side and the results are fine. In vector mode however, with a vector length of four, the values \( a[0] \), \( a[1] \), \( a[2] \) and \( a[3] \) from the previous loop will be loaded into the right-hand side and produce wrong results. That is, we have

    + + +
    +
    +
    +
    +
    +
       a[1] = a[0] + b;
    +   a[2] = a[1] + b;
    +   a[3] = a[2] + b;
    +   a[4] = a[3] + b;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

and if the first two iterations are executed at the same time by the SIMD instruction, the value of say \( a[1] \) could be used by the second iteration before it has been calculated by the first iteration, leading thereby to wrong results.

    + +









    +

    Automatic vectorization and vectorization inhibitors, more data dependencies

    + +

    On the other hand, a so-called +write-after-read statement can be vectorized. The following code +

    + + +
    +
    +
    +
    +
    +
      double b = 15.;
    +  for (int i = 1; i < n; i++) {
    +      a[i-1] = a[i] + b;
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

is an example of an anti-dependency (write-after-read) that can be vectorized, since no iteration with a higher value of \( i \) can complete before an iteration with a lower value of \( i \). However, such code leads to problems with parallelization.

    + +









    +

    Automatic vectorization and vectorization inhibitors, memory stride

    + +

    For C++ programmers it is also worth keeping in mind that an array notation is preferred to the more compact use of pointers to access array elements. The compiler can often not tell if it is safe to vectorize the code.

    + +

When dealing with arrays, you should also avoid memory stride, since this considerably slows down vectorization. When you access array elements, write the inner loop to vectorize using unit stride, that is, access successive array elements in memory, as shown here

    + + +
    +
    +
    +
    +
    +
      for (int i = 0; i < n; i++) {
    +      for (int j = 0; j < n; j++) {
    +           a[i][j] += b[i][j];
    +      }  
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
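For contrast, here is a sketch of the same update with the loops interchanged. For row-major C++ arrays the inner loop now jumps through memory with stride \( n \), which typically inhibits or slows down vectorization.

  for (int j = 0; j < n; j++) {
      for (int i = 0; i < n; i++) {
           a[i][j] += b[i][j];   // stride-n access in the inner loop
      }
  }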









    +

    Memory management

    +

    The main memory contains the program data

    +
      +
1. Cache memory contains a copy of the main memory data
2. Cache is faster but consumes more space and power. It is normally assumed to be much faster than main memory
3. Registers contain working data only
4. Multiple cache memories contain a copy of the main memory data
    +

    Loads and stores to memory can be as important as floating point operations when we measure performance.

    + +









    +

    Memory and communication

    + +
      +
1. Most communication in a computer is carried out in chunks, blocks of bytes of data that move together
2. In the memory hierarchy, data moves between memory and cache, and between different levels of cache, in groups called lines
    +

    Many of these performance features are not captured in most programming languages.

    + +









    +

    Measuring performance

    + +

    How do we measure performance? What is wrong with this code to time a loop?

    + + +
    +
    +
    +
    +
    +
      clock_t start, finish;
    +  start = clock();
    +  for (int j = 0; j < i; j++) {
    +    a[j] = b[j]+b[j]*c[j];
    +  }
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Problems with measuring time

    +
      +
1. Timers are not infinitely accurate
2. All clocks have a granularity, the minimum time that they can measure
3. The error in a time measurement, even if everything is perfect, may be the size of this granularity (sometimes called a clock tick)
4. Always know what your clock granularity is
5. Ensure that your measurement is for a long enough duration (say 100 times the tick)
    +
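One common remedy, sketched here with std::chrono (any monotonic, high-resolution clock will do), is to repeat the measured kernel enough times that the total duration is much larger than the clock granularity. The variables a, b, c, n and the repetition count nrep are assumed declared.

#include <chrono>

// Repeat the kernel nrep times and report the average time per run
auto t0 = std::chrono::steady_clock::now();
for (int rep = 0; rep < nrep; rep++) {
    for (int j = 0; j < n; j++) {
        a[j] = b[j] + b[j]*c[j];
    }
}
auto t1 = std::chrono::steady_clock::now();
double seconds = std::chrono::duration<double>(t1 - t0).count()/nrep;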









    +

    Problems with cold start

    + +

    What happens when the code is executed? The assumption is that the code is ready to +execute. But +

    +
      +
1. Code may still be on disk, and not even read into memory.
2. Data may be in slow memory rather than fast (which may be wrong or right for what you are measuring)
3. Multiple tests are often necessary to ensure that cold start effects are not present
4. Special effort is often required to ensure data is in the intended part of the memory hierarchy.
    +









    +

    Problems with smart compilers

    + +
      +
1. If the result of the computation is not used, the compiler may eliminate the code
2. Performance will look impossibly fantastic
3. Even worse, the compiler may eliminate some of the code so the performance looks plausible
4. Ensure that the results are (or may be) used.
    +









    +

    Problems with interference

    +
      +
1. Other activities are sharing your processor
2. Make multiple tests and report
3. Easy choices include
    +









    +

    Problems with measuring performance

    +
      +
1. Accurate, reproducible performance measurement is hard
2. Think carefully about your experiment:
3. What is it, precisely, that you want to measure?
4. How representative is your test of the situation that you are trying to measure?
    +









    +

    Thomas algorithm for tridiagonal linear algebra equations

    +
    + +

    +$$ +\left( \begin{array}{ccccc} + b_0 & c_0 & & & \\ + a_0 & b_1 & c_1 & & \\ + & & \ddots & & \\ + & & a_{m-3} & b_{m-2} & c_{m-2} \\ + & & & a_{m-2} & b_{m-1} + \end{array} \right) +\left( \begin{array}{c} + x_0 \\ + x_1 \\ + \vdots \\ + x_{m-2} \\ + x_{m-1} + \end{array} \right)=\left( \begin{array}{c} + f_0 \\ + f_1 \\ + \vdots \\ + f_{m-2} \\ + f_{m-1} \\ + \end{array} \right) +$$ +

    + + +









    +

    Thomas algorithm, forward substitution

    +
    + +

    +

    The first step is to multiply the first row by \( a_0/b_0 \) and subtract it from the second row. This is known as the forward substitution step. We obtain then

    +$$ + a_i = 0, +$$ + + +$$ + b_i = b_i - \frac{a_{i-1}}{b_{i-1}}c_{i-1}, +$$ + +

    and

    +$$ + f_i = f_i - \frac{a_{i-1}}{b_{i-1}}f_{i-1}. +$$ + +

    At this point the simplified equation, with only an upper triangular matrix takes the form

    +$$ +\left( \begin{array}{ccccc} + b_0 & c_0 & & & \\ + & b_1 & c_1 & & \\ + & & \ddots & & \\ + & & & b_{m-2} & c_{m-2} \\ + & & & & b_{m-1} + \end{array} \right)\left( \begin{array}{c} + x_0 \\ + x_1 \\ + \vdots \\ + x_{m-2} \\ + x_{m-1} + \end{array} \right)=\left( \begin{array}{c} + f_0 \\ + f_1 \\ + \vdots \\ + f_{m-2} \\ + f_{m-1} \\ + \end{array} \right) +$$ +
    + + +









    +

    Thomas algorithm, backward substitution

    +
    + +

    +

The next step is the backward substitution step. The last row is multiplied by \( c_{N-3}/b_{N-2} \) and subtracted from the second to last row, thus eliminating \( c_{N-3} \) from the second to last row. The general backward substitution procedure is

    +$$ + c_i = 0, +$$ + +

    and

    +$$ + f_{i-1} = f_{i-1} - \frac{c_{i-1}}{b_i}f_i +$$ + +

All that remains to be computed is the solution, which is the very straightforward process of

    +$$ +x_i = \frac{f_i}{b_i} +$$ +
    + + +









    +

    Thomas algorithm and counting of operations (floating point and memory)

    +
    + +

    + +

In this specific case we have the following floating point operations

    + + +
    + + +
    + +

    + + +

    +
    +
    +
    +
    +
    // Forward substitution    
    +// Note that we can simplify by precalculating a[i-1]/b[i-1]
    +  for (int i=1; i < n; i++) {
    +     b[i] = b[i] - (a[i-1]*c[i-1])/b[i-1];
    +     f[i] = g[i] - (a[i-1]*f[i-1])/b[i-1];
    +  }
    +  x[n-1] = f[n-1] / b[n-1];
    +  // Backwards substitution                                                           
    +  for (int i = n-2; i >= 0; i--) {
    +     f[i] = f[i] - c[i]*f[i+1]/b[i+1];
    +     x[i] = f[i]/b[i];
    +  }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
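A rough count based on the code above (a sketch, not a rigorous tally): each iteration of the forward loop performs two subtractions, two multiplications and two divisions (six floating point operations), and each iteration of the backward loop performs one subtraction, one multiplication and two divisions (four operations). The total work therefore scales as roughly \( 10n \) floating point operations, accompanied by \( O(n) \) memory loads and stores.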









    +

    Example: Transpose of a matrix

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double **A, **B;
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n];
    +  for (int i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +  }
    +  // Set up values for matrix A
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      A[i][j] =  cos(i*1.0)*sin(j*3.0);
    +    }
    +  }
    +  clock_t start, finish;
    +  start = clock();
    +  // Then compute the transpose
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      B[i][j]= A[j][i];
    +    }
    +  }
    +
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for setting up transpose of matrix=" << timeused  << endl;
    +
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Matrix-matrix multiplication

    +

This is the matrix-matrix multiplication code with plain C++ memory allocation. It computes at the end the Frobenius norm.

    + + + +
    +
    +
    +
    +
    +
    #include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include "time.h"
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double s = 1.0/sqrt( (double) n);
    +  double **A, **B, **C;
    +  // Start timing
    +  clock_t start, finish;
    +  start = clock();
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n]; C = new double*[n];
    +  for (int i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +    C[i] = new double[n];
    +  }
    +  // Set up values for matrix A and B and zero matrix C
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      double angle = 2.0*M_PI*i*j/ (( double ) n);
    +      A[i][j] = s * ( sin ( angle ) + cos ( angle ) );
    +      B[j][i] =  A[i][j];
    +    }
    +  }
    +  // Then perform the matrix-matrix multiplication
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      double sum = 0.0;
    +       for (int k = 0; k < n; k++) {
    +           sum += B[i][k]*A[k][j];
    +       }
    +       C[i][j] = sum;
    +    }
    +  }
    +  // Compute now the Frobenius norm
    +  double Fsum = 0.0;
    +  for (int i = 0; i < n; i++){
    +    for (int j = 0; j < n; j++) {
    +      Fsum += C[i][j]*C[i][j];
    +    }
    +  }
    +  Fsum = sqrt(Fsum);
    +  finish = clock();
    +  double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for matrix-matrix multiplication=" << timeused  << endl;
    +  cout << "  Frobenius norm  = " << Fsum << endl;
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +    delete[] C[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  delete[] C;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    How do we define speedup? Simplest form

    +
    + +

    +

    +
    + + +









    +

    How do we define speedup? Correct baseline

    +
    + +

    +

    The key is choosing the correct baseline for comparison

    + +
    + + +









    +

    Parallel speedup

    +
    + +

    +

    For parallel applications, speedup is typically defined as

$$
S_p = \frac{T_1}{T_p}.
$$

    Here \( T_1 \) is the time on one processor and \( T_p \) is the time using \( p \) processors.

    + +
    + + +









    +

    Speedup and memory

    +
    + +

    +

The speedup on \( p \) processors can be greater than \( p \) if memory usage is optimal! Consider the case of a memory-bound computation with \( M \) words of memory

    + +
    + + +









    +

    Upper bounds on speedup

    +
    + +

    +

    Assume that almost all parts of a code are perfectly +parallelizable (fraction \( f \)). The remainder, +fraction \( (1-f) \) cannot be parallelized at all. +

    + +

    That is, there is work that takes time \( W \) on one process; a fraction \( f \) of that work will take +time \( Wf/p \) on \( p \) processors. +

    + +
    + + +









    +

    Amdahl's law

    +
    + +

    +

    On one processor we have

    +$$ +T_1 = (1-f)W + fW = W +$$ + +

    On \( p \) processors we have

    +$$ +T_p = (1-f)W + \frac{fW}{p}, +$$ + +

    resulting in a speedup of

    +$$ +\frac{T_1}{T_p} = \frac{W}{(1-f)W+fW/p} +$$ + +

    As \( p \) goes to infinity, \( fW/p \) goes to zero, and the maximum speedup is

    +$$ +\frac{1}{1-f}, +$$ + +

meaning that if \( f = 0.99 \) (all but \( 1\% \) parallelizable), the maximum speedup is \( 1/(1-.99)=100 \)!

    +
    + + +
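To make the numbers concrete, here is a minimal sketch that evaluates Amdahl's bound \( 1/((1-f)+f/p) \) for a few processor counts; the value \( f=0.99 \) is the one used above and the chosen processor counts are only illustrative.

#include <iostream>
#include <initializer_list>

int main()
{
  double f = 0.99;                            // parallelizable fraction
  for (int p : {10, 100, 1000, 10000}) {
    double speedup = 1.0/((1.0 - f) + f/p);   // Amdahl's law
    std::cout << "p = " << p << "  speedup = " << speedup << "\n";
  }
  return 0;
}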









    +

    How much is parallelizable

    +
    + +

    +

    If any non-parallel code slips into the +application, the parallel +performance is limited. +

    + +

    In many simulations, however, the fraction of non-parallelizable work +is \( 10^{-6} \) or less due to large arrays or objects that are perfectly parallelizable. +

    +
    + + +









    +

    Today's situation of parallel computing

    +
    + +

    + +

    +

    Our lectures will focus on both MPI and OpenMP.

    +
    + + +









    +

    Overhead present in parallel computing

    +
    + +

    + +

    +

Due to the above overhead, and because certain parts of a sequential algorithm cannot be parallelized, we may not achieve optimal parallelization.

    +
    + + +









    +

    Parallelizing a sequential algorithm

    +
    + +

    + +

    +
    + + +









    +

    Strategies

    +
    + +

    +

    +
    + + +









    +

    How do I run MPI on a PC/Laptop? MPI

    +
    + +

    +

Installing MPI is rather easy on hardware running Unix/Linux as operating system; simply follow the instructions from the OpenMPI website. See also subsequent slides. When you have made sure you have installed MPI on your PC/laptop,

    + + + +
    +
    +
    +
    +
    +
      # Compile and link
    +  mpic++ -O3 -o nameofprog.x nameofprog.cpp
    +  #  run code with for example 8 processes using mpirun/mpiexec
    +  mpiexec -n 8 ./nameofprog.x
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Can I do it on my own PC/laptop? OpenMP installation

    +
    + +

    +

    If you wish to install MPI and OpenMP +on your laptop/PC, we recommend the following: +

    + + + + +
    +
    +
    +
    +
    +
      brew install libomp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

and compile and link as follows (depending on your compiler, you may also need the -fopenmp flag)

    + + +
    +
    +
    +
    +
    +
    c++ -o <name executable> <name program.cpp>  -lomp
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Installing MPI

    +
    + +

    +

    For linux/ubuntu users, you need to install two packages (alternatively use the synaptic package manager)

    + + +
    +
    +
    +
    +
    +
      sudo apt-get install libopenmpi-dev
    +  sudo apt-get install openmpi-bin
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    For OS X users, install brew (after having installed xcode and gcc, needed for the +gfortran compiler of openmpi) and then install with brew +

    + + +
    +
    +
    +
    +
    +
       brew install openmpi
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    When running an executable (code.x), run as

    + + +
    +
    +
    +
    +
    +
      mpirun -n 10 ./code.x
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    where we indicate that we want the number of processes to be 10.

    +
    + + +









    +

    Installing MPI and using Qt

    +
    + +

    +

    With openmpi installed, when using Qt, add to your .pro file the instructions here

    + +

    You may need to tell Qt where openmpi is stored.

    +
    + + +









    +

    What is Message Passing Interface (MPI)?

    +
    + +

    + +

    MPI is a library, not a language. It specifies the names, calling sequences and results of functions +or subroutines to be called from C/C++ or Fortran programs, and the classes and methods that make up the MPI C++ +library. The programs that users write in Fortran, C or C++ are compiled with ordinary compilers and linked +with the MPI library. +

    + +

MPI programs should be able to run on all possible machines and run with all MPI implementations without change.

    + +

    An MPI computation is a collection of processes communicating with messages.

    +
    + +









    +

    Going Parallel with MPI

    +
    + +

    +

    Task parallelism: the work of a global problem can be divided +into a number of independent tasks, which rarely need to synchronize. +Monte Carlo simulations or numerical integration are examples of this. +

    + +

    MPI is a message-passing library where all the routines +have corresponding C/C++-binding +

    + + +
    +
    +
    +
    +
    +
       MPI_Command_name
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and Fortran-binding (routine names are in uppercase, but can also be in lower case)

    + + +
    +
    +
    +
    +
    +
       MPI_COMMAND_NAME
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    MPI is a library

    +
    + +

    +

    MPI is a library specification for the message passing interface, +proposed as a standard. +

    + + +

    A message passing standard for portability and ease-of-use. +Designed for high performance. +

    + +

    Insert communication and synchronization functions where necessary.

    +
    + + +









    +

    Bindings to MPI routines

    +
    + +

    + +

    MPI is a message-passing library where all the routines +have corresponding C/C++-binding +

    + + +
    +
    +
    +
    +
    +
       MPI_Command_name
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and Fortran-binding (routine names are in uppercase, but can also be in lower case)

    + + +
    +
    +
    +
    +
    +
       MPI_COMMAND_NAME
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The discussion in these slides focuses on the C++ binding.

    +
    + + +









    +

    Communicator

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
      MPI_COMM_WORLD 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    Some of the most important MPI functions

    +
    + +

    + +
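    A short reference (these are the standard C prototypes of the MPI calls that are actually used in the example programs later in these slides; consult the MPI documentation for the full list):

      // Standard C prototypes of the MPI functions used in these slides
      int    MPI_Init(int *argc, char ***argv);            // initialize MPI
      int    MPI_Finalize(void);                           // shut down MPI
      int    MPI_Comm_size(MPI_Comm comm, int *size);      // number of processes
      int    MPI_Comm_rank(MPI_Comm comm, int *rank);      // rank of this process
      int    MPI_Send(void *buf, int count, MPI_Datatype datatype,
                      int dest, int tag, MPI_Comm comm);
      int    MPI_Recv(void *buf, int count, MPI_Datatype datatype,
                      int source, int tag, MPI_Comm comm, MPI_Status *status);
      int    MPI_Bcast(void *buffer, int count, MPI_Datatype datatype,
                       int root, MPI_Comm comm);
      int    MPI_Reduce(void *sendbuf, void *recvbuf, int count,
                        MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm);
      int    MPI_Allreduce(void *sendbuf, void *recvbuf, int count,
                           MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
      int    MPI_Barrier(MPI_Comm comm);                   // synchronize all processes
      double MPI_Wtime(void);                              // wall-clock time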

    +
    + + +









    +

    The first MPI C/C++ program

    +
    + +

    + +

    Let every process write "Hello world" (oh not this program again!!) on the standard output.

    + + +
    +
    +
    +
    +
    +
    using namespace std;
    +#include <mpi.h>
    +#include <iostream>
    +int main (int nargs, char* args[])
    +{
    +int numprocs, my_rank;
    +//   MPI initializations
    +MPI_Init (&nargs, &args);
    +MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    +MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    +cout << "Hello world, I have  rank " << my_rank << " out of " 
    +     << numprocs << endl;
    +//  End MPI
    +MPI_Finalize ();
    +return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    The Fortran program

    +
    + +

    + + +

    +
    +
    +
    +
    +
    PROGRAM hello
    +INCLUDE "mpif.h"
    +INTEGER:: size, my_rank, ierr
    +
    +CALL  MPI_INIT(ierr)
    +CALL MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
    +CALL MPI_COMM_RANK(MPI_COMM_WORLD, my_rank, ierr)
    +WRITE(*,*)"Hello world, I've rank ",my_rank," out of ",size
    +CALL MPI_FINALIZE(ierr)
    +
    +END PROGRAM hello
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Note 1

    +
    + +

    + +

    +
    + + +









    +

    Ordered output with MPI_Barrier

    +
    + +

    + + + +

    +
    +
    +
    +
    +
    int main (int nargs, char* args[])
    +{
    + int numprocs, my_rank, i;
    + MPI_Init (&nargs, &args);
    + MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    + MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    + for (i = 0; i < numprocs; i++) {
    +   MPI_Barrier (MPI_COMM_WORLD);
    +   if (i == my_rank) {
    +     cout << "Hello world, I have  rank " << my_rank <<
    +            " out of " << numprocs << endl;
    +   }
    + }
    + MPI_Finalize ();
    + return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Note 2

    +
    + +

    +

    +

    However, this is slightly more time-consuming since the processes synchronize between themselves as many times as there are processes. In the next Hello world example we use the send and receive functions in order to have a synchronized action.

    +
    + + +









    +

    Ordered output

    +
    + +

    + + + +

    +
    +
    +
    +
    +
    .....
    +int numprocs, my_rank, flag;
    +MPI_Status status;
    +MPI_Init (&nargs, &args);
    +MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    +MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    +if (my_rank > 0)
    +MPI_Recv (&flag, 1, MPI_INT, my_rank-1, 100, 
    +           MPI_COMM_WORLD, &status);
    +cout << "Hello world, I have  rank " << my_rank << " out of " 
    +<< numprocs << endl;
    +if (my_rank < numprocs-1)
    +MPI_Send (&my_rank, 1, MPI_INT, my_rank+1, 
    +          100, MPI_COMM_WORLD);
    +MPI_Finalize ();
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Note 3

    +
    + +

    + +

    The basic sending of messages is given by the function \( MPI\_SEND \), which in C/C++ +is defined as +

    + + +
    +
    +
    +
    +
    +
    int MPI_Send(void *buf, int count, 
    +             MPI_Datatype datatype, 
    +             int dest, int tag, MPI_Comm comm)
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    This single command allows the passing of any kind of variable, even a large array, to any group of tasks. +The variable buf is the variable we wish to send while count +is the number of variables we are passing. If we are passing only a single value, this should be 1. +

    + +

    If we transfer an array, it is the overall size of the array. +For example, if we want to send a 10 by 10 array, count would be \( 10\times 10=100 \) +since we are actually passing 100 values. +

    +
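    As a minimal sketch of the count argument (an illustration only, assuming an MPI program that has already called MPI_Init and set up my_rank and an MPI_Status status as in the previous examples):

      // Hypothetical fragment: send a 10 x 10 array of doubles from rank 0 to rank 1.
      // The array is contiguous in memory, so count is the total number of elements, 100.
      double a[10][10];
      if (my_rank == 0)
        MPI_Send(&a[0][0], 100, MPI_DOUBLE, 1, 100, MPI_COMM_WORLD);
      else if (my_rank == 1)
        MPI_Recv(&a[0][0], 100, MPI_DOUBLE, 0, 100, MPI_COMM_WORLD, &status);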
    + + +









    +

    Note 4

    +
    + +

    + +

    Once you have sent a message, you must receive it on another task. The function \( MPI\_RECV \) +is similar to the send call. +

    + + +
    +
    +
    +
    +
    +
    int MPI_Recv( void *buf, int count, MPI_Datatype datatype, 
    +            int source, 
    +            int tag, MPI_Comm comm, MPI_Status *status )
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The arguments that differ from those of MPI\_SEND are buf, which is the name of the variable where the received data will be stored, and source, which replaces the destination of the send command and gives the rank (ID) of the sending process.

    + +

    Finally, we have the \( MPI\_Status \) argument \( status \), through which one can check whether the receive was completed.

    + +
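    A minimal sketch of what the status object can be used for (an illustration only, to be placed inside an initialized MPI program; MPI_Get_count and the MPI_SOURCE and MPI_TAG fields are standard MPI features):

      // Hypothetical fragment: inspect the status object after a receive
      int flag, received;
      MPI_Status status;
      MPI_Recv(&flag, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
      MPI_Get_count(&status, MPI_INT, &received);   // number of elements actually received
      cout << "Got " << received << " integer(s) from rank " << status.MPI_SOURCE
           << " with tag " << status.MPI_TAG << endl;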

    The output of this code is the same as the previous example, but now +process 0 sends a message to process 1, which forwards it further +to process 2, and so forth. +

    +
    + + +









    +

    Numerical integration in parallel

    +
    +Integrating \( \pi \) +

    + +

    +$$ + I=\int_a^bf(x) dx\approx h\left(f(a)/2 + f(a+h) +f(a+2h)+\dots +f(b-h)+ f(b)/2\right). +$$ + +

    Click on this link for the full program.

    +
    + + +









    +

    Dissection of trapezoidal rule with \( MPI\_Reduce \)

    +
    + +

    + + + +

    +
    +
    +
    +
    +
    //    Trapezoidal rule and numerical integration using MPI
    +using namespace std;
    +#include <mpi.h>
    +#include <iostream>
    +
    +//     Here we define various functions called by the main program
    +
    +double int_function(double );
    +double trapezoidal_rule(double , double , int , double (*)(double));
    +
    +//   Main function begins here
    +int main (int nargs, char* args[])
    +{
    +  int n, local_n, numprocs, my_rank; 
    +  double a, b, h, local_a, local_b, total_sum, local_sum;   
    +  double  time_start, time_end, total_time;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Dissection of trapezoidal rule

    +
    + +

    + + + +

    +
    +
    +
    +
    +
      //  MPI initializations
    +  MPI_Init (&nargs, &args);
    +  MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    +  MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    +  time_start = MPI_Wtime();
    +  //  Fixed values for a, b and n 
    +  a = 0.0 ; b = 1.0;  n = 1000;
    +  h = (b-a)/n;    // h is the same for all processes 
    +  local_n = n/numprocs;  
    +  // make sure n > numprocs, else integer division gives zero
    +  // Length of each process' interval of
    +  // integration = local_n*h.  
    +  local_a = a + my_rank*local_n*h;
    +  local_b = local_a + local_n*h;
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Integrating with MPI

    +
    + +

    + + + +

    +
    +
    +
    +
    +
      total_sum = 0.0;
    +  local_sum = trapezoidal_rule(local_a, local_b, local_n, 
    +                               &int_function); 
    +  MPI_Reduce(&local_sum, &total_sum, 1, MPI_DOUBLE, 
    +              MPI_SUM, 0, MPI_COMM_WORLD);
    +  time_end = MPI_Wtime();
    +  total_time = time_end-time_start;
    +  if ( my_rank == 0) {
    +    cout << "Trapezoidal rule = " <<  total_sum << endl;
    +    cout << "Time = " <<  total_time  
    +         << " on number of processors: "  << numprocs  << endl;
    +  }
    +  // End MPI
    +  MPI_Finalize ();  
    +  return 0;
    +}  // end of main program
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    How do I use \( MPI\_Reduce \)?

    +
    + +

    + +

    Here we have used

    + + +
    +
    +
    +
    +
    +
    MPI_Reduce( void *senddata, void* resultdata, int count, 
    +     MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm)
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The two variables \( senddata \) and \( resultdata \) are obvious, besides the fact that one sends the address +of the variable or the first element of an array. If they are arrays they need to have the same size. +The variable \( count \) represents the total dimensionality, 1 in case of just one variable, +while \( MPI\_Datatype \) +defines the type of variable which is sent and received. +

    + +

    The new feature is \( MPI\_Op \). It defines the type +of operation we want to do. +

    +
    + + +









    +

    More on \( MPI\_Reduce \)

    +
    + +

    +

    In our case, since we are summing the contributions from every process, we define \( MPI\_Op = MPI\_SUM \). If we have an array or matrix we can search for the largest or smallest element by sending either \( MPI\_MAX \) or \( MPI\_MIN \). If we want the location as well (which array element), we simply transfer \( MPI\_MAXLOC \) or \( MPI\_MINLOC \). If we want the product we write \( MPI\_PROD \).

    + +

    \( MPI\_Allreduce \) is defined as

    + + +
    +
    +
    +
    +
    +
    MPI_Allreduce( void *senddata, void* resultdata, int count, 
    +          MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)        
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
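    A minimal sketch of the difference from \( MPI\_Reduce \) (illustration only, assuming an initialized MPI program with my_rank defined): with \( MPI\_Allreduce \) there is no root argument and every process obtains the reduced value, so no extra broadcast is needed afterwards.

      // Hypothetical fragment: every rank contributes one number; after the call
      // every rank (not only the root) holds the total sum.
      double LocalSum = 1.0*my_rank;
      double TotalSum = 0.0;
      MPI_Allreduce(&LocalSum, &TotalSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);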









    +

    Dissection of trapezoidal rule

    +
    + +

    + +

    We use \( MPI\_Reduce \) to collect data from each process. Note also the use of the function \( MPI\_Wtime \).

    + + +
    +
    +
    +
    +
    +
    //  this function defines the function to integrate
    +double int_function(double x)
    +{
    +  double value = 4./(1.+x*x);
    +  return value;
    +} // end of function to evaluate
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Dissection of trapezoidal rule

    +
    + +

    + + +

    +
    +
    +
    +
    +
    //  this function defines the trapezoidal rule
    +double trapezoidal_rule(double a, double b, int n, 
    +                         double (*func)(double))
    +{
    +  double trapez_sum;
    +  double fa, fb, x, step;
    +  int    j;
    +  step=(b-a)/((double) n);
    +  fa=(*func)(a)/2. ;
    +  fb=(*func)(b)/2. ;
    +  trapez_sum=0.;
    +  for (j=1; j <= n-1; j++){
    +    x=j*step+a;
    +    trapez_sum+=(*func)(x);
    +  }
    +  trapez_sum=(trapez_sum+fb+fa)*step;
    +  return trapez_sum;
    +}  // end trapezoidal_rule 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    The quantum dot program for two electrons

    +
    + +

    + + +

    +
    +
    +
    +
    +
    // Variational Monte Carlo for atoms with importance sampling, slater det
    +// Test case for 2-electron quantum dot, no classes using Mersenne-Twister RNG
    +#include "mpi.h"
    +#include <cmath>
    +#include <random>
    +#include <string>
    +#include <iostream>
    +#include <fstream>
    +#include <iomanip>
    +#include "vectormatrixclass.h"
    +
    +using namespace  std;
    +// output file as global variable
    +ofstream ofile;  
    +// the step length and its squared inverse for the second derivative 
    +//  Here we define global variables  used in various functions
    +//  These can be changed by using classes
    +int Dimension = 2; 
    +int NumberParticles  = 2;  //  we fix also the number of electrons to be 2
    +
    +// declaration of functions 
    +
    +// The Mc sampling for the variational Monte Carlo 
    +void  MonteCarloSampling(int, double &, double &, Vector &);
    +
    +// The variational wave function
    +double  WaveFunction(Matrix &, Vector &);
    +
    +// The local energy 
    +double  LocalEnergy(Matrix &, Vector &);
    +
    +// The quantum force
    +void  QuantumForce(Matrix &, Matrix &, Vector &);
    +
    +
    +// inline function for single-particle wave function
    +inline double SPwavefunction(double r, double alpha) { 
    +   return exp(-alpha*r*0.5);
    +}
    +
    +// inline function for derivative of single-particle wave function
    +inline double DerivativeSPwavefunction(double r, double alpha) { 
    +  return -r*alpha;
    +}
    +
    +// function for absolute value of relative distance
    +double RelativeDistance(Matrix &r, int i, int j) { 
    +      double r_ij = 0;  
    +      for (int k = 0; k < Dimension; k++) { 
    +	r_ij += (r(i,k)-r(j,k))*(r(i,k)-r(j,k));
    +      }
    +      return sqrt(r_ij); 
    +}
    +
    +// inline function for derivative of Jastrow factor
    +inline double JastrowDerivative(Matrix &r, double beta, int i, int j, int k){
    +  return (r(i,k)-r(j,k))/(RelativeDistance(r, i, j)*pow(1.0+beta*RelativeDistance(r, i, j),2));
    +}
    +
    +// function for square of position of single particle
    +double singleparticle_pos2(Matrix &r, int i) { 
    +    double r_single_particle = 0;
    +    for (int j = 0; j < Dimension; j++) { 
    +      r_single_particle  += r(i,j)*r(i,j);
    +    }
    +    return r_single_particle;
    +}
    +
    +void lnsrch(int n, Vector &xold, double fold, Vector &g, Vector &p, Vector &x,
    +		 double *f, double stpmax, int *check, double (*func)(Vector &p));
    +
    +void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret,
    +	    double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g));
    +
    +static double sqrarg;
    +#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
    +
    +
    +static double maxarg1,maxarg2;
    +#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ?\
    +        (maxarg1) : (maxarg2))
    +
    +
    +// Begin of main program   
    +
    +int main(int argc, char* argv[])
    +{
    +
    +  //  MPI initializations
    +  int NumberProcesses, MyRank, NumberMCsamples;
    +  MPI_Init (&argc, &argv);
    +  MPI_Comm_size (MPI_COMM_WORLD, &NumberProcesses);
    +  MPI_Comm_rank (MPI_COMM_WORLD, &MyRank);
    +  double StartTime = MPI_Wtime();
    +  if (MyRank == 0 && argc <= 1) {
    +    cout << "Bad Usage: " << argv[0] << 
    +      " Read also output file on same line and number of Monte Carlo cycles" << endl;
    +  }
    +  // Read filename and number of Monte Carlo cycles from the command line
    +  if (MyRank == 0 && argc > 2) {
    +    string filename = argv[1]; // first command line argument after name of program
    +    NumberMCsamples  = atoi(argv[2]);
    +    string fileout = filename;
    +    string argument = to_string(NumberMCsamples);
    +    // Final filename as filename+NumberMCsamples
    +    fileout.append(argument);
    +    ofile.open(fileout);
    +  }
    +  // broadcast the number of  Monte Carlo samples
    +  MPI_Bcast (&NumberMCsamples, 1, MPI_INT, 0, MPI_COMM_WORLD);
    +  // Two variational parameters only
    +  Vector VariationalParameters(2);
    +  int TotalNumberMCsamples = NumberMCsamples*NumberProcesses; 
    +  // Loop over variational parameters
    +  for (double alpha = 0.5; alpha <= 1.5; alpha +=0.1){
    +    for (double beta = 0.1; beta <= 0.5; beta +=0.05){
    +      VariationalParameters(0) = alpha;  // value of alpha
    +      VariationalParameters(1) = beta;  // value of beta
    +      //  Do the mc sampling  and accumulate data with MPI_Reduce
    +      double TotalEnergy, TotalEnergySquared, LocalProcessEnergy, LocalProcessEnergy2;
    +      LocalProcessEnergy = LocalProcessEnergy2 = 0.0;
    +      MonteCarloSampling(NumberMCsamples, LocalProcessEnergy, LocalProcessEnergy2, VariationalParameters);
    +      //  Collect data in total averages
    +      MPI_Reduce(&LocalProcessEnergy, &TotalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    +      MPI_Reduce(&LocalProcessEnergy2, &TotalEnergySquared, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    +      // Print out results  in case of Master node, set to MyRank = 0
    +      if ( MyRank == 0) {
    +	double Energy = TotalEnergy/( (double)NumberProcesses);
    +	double Variance = TotalEnergySquared/( (double)NumberProcesses)-Energy*Energy;
    +	double StandardDeviation = sqrt(Variance/((double)TotalNumberMCsamples)); // over optimistic error
    +	ofile << setiosflags(ios::showpoint | ios::uppercase);
    +	ofile << setw(15) << setprecision(8) << VariationalParameters(0);
    +	ofile << setw(15) << setprecision(8) << VariationalParameters(1);
    +	ofile << setw(15) << setprecision(8) << Energy;
    +	ofile << setw(15) << setprecision(8) << Variance;
    +	ofile << setw(15) << setprecision(8) << StandardDeviation << endl;
    +      }
    +    }
    +  }
    +  double EndTime = MPI_Wtime();
    +  double TotalTime = EndTime-StartTime;
    +  if ( MyRank == 0 )  cout << "Time = " <<  TotalTime  << " on number of processors: "  << NumberProcesses  << endl;
    +  if (MyRank == 0)  ofile.close();  // close output file
    +  // End MPI
    +  MPI_Finalize ();  
    +  return 0;
    +}  //  end of main function
    +
    +
    +// Monte Carlo sampling with the Metropolis algorithm  
    +
    +void MonteCarloSampling(int NumberMCsamples, double &cumulative_e, double &cumulative_e2, Vector &VariationalParameters)
    +{
    +
    + // Initialize the seed and call the Mersenne twister algorithm
    +  std::random_device rd;
    +  std::mt19937_64 gen(rd());
    +  // Set up the uniform distribution for x in [0, 1]
    +  std::uniform_real_distribution<double> UniformNumberGenerator(0.0,1.0);
    +  std::normal_distribution<double> Normaldistribution(0.0,1.0);
    +  // diffusion constant from Schroedinger equation
    +  double D = 0.5; 
    +  double timestep = 0.05;  //  we fix the time step  for the gaussian deviate
    +  // allocate matrices which contain the position of the particles  
    +  Matrix OldPosition( NumberParticles, Dimension), NewPosition( NumberParticles, Dimension);
    +  Matrix OldQuantumForce(NumberParticles, Dimension), NewQuantumForce(NumberParticles, Dimension);
    +  double Energy = 0.0; double EnergySquared = 0.0; double DeltaE = 0.0;
    +  //  initial trial positions
    +  for (int i = 0; i < NumberParticles; i++) { 
    +    for (int j = 0; j < Dimension; j++) {
    +      OldPosition(i,j) = Normaldistribution(gen)*sqrt(timestep);
    +    }
    +  }
    +  double OldWaveFunction = WaveFunction(OldPosition, VariationalParameters);
    +  QuantumForce(OldPosition, OldQuantumForce, VariationalParameters);
    +  // loop over monte carlo cycles 
    +  for (int cycles = 1; cycles <= NumberMCsamples; cycles++){ 
    +    // new position 
    +    for (int i = 0; i < NumberParticles; i++) { 
    +      for (int j = 0; j < Dimension; j++) {
    +	// gaussian deviate to compute new positions using a given timestep
    +	NewPosition(i,j) = OldPosition(i,j) + Normaldistribution(gen)*sqrt(timestep)+OldQuantumForce(i,j)*timestep*D;
    +	//	NewPosition(i,j) = OldPosition(i,j) + gaussian_deviate(&idum)*sqrt(timestep)+OldQuantumForce(i,j)*timestep*D;
    +      }  
    +      //  for the other particles we need to set the position to the old position since
    +      //  we move only one particle at the time
    +      for (int k = 0; k < NumberParticles; k++) {
    +	if ( k != i) {
    +	  for (int j = 0; j < Dimension; j++) {
    +	    NewPosition(k,j) = OldPosition(k,j);
    +	  }
    +	} 
    +      }
    +      double NewWaveFunction = WaveFunction(NewPosition, VariationalParameters); 
    +      QuantumForce(NewPosition, NewQuantumForce, VariationalParameters);
    +      //  we compute the log of the ratio of the greens functions to be used in the 
    +      //  Metropolis-Hastings algorithm
    +      double GreensFunction = 0.0;            
    +      for (int j = 0; j < Dimension; j++) {
    +	GreensFunction += 0.5*(OldQuantumForce(i,j)+NewQuantumForce(i,j))*
    +	  (D*timestep*0.5*(OldQuantumForce(i,j)-NewQuantumForce(i,j))-NewPosition(i,j)+OldPosition(i,j));
    +      }
    +      GreensFunction = exp(GreensFunction);
    +      // The Metropolis test is performed by moving one particle at the time
    +      if(UniformNumberGenerator(gen) <= GreensFunction*NewWaveFunction*NewWaveFunction/OldWaveFunction/OldWaveFunction ) { 
    +	for (int  j = 0; j < Dimension; j++) {
    +	  OldPosition(i,j) = NewPosition(i,j);
    +	  OldQuantumForce(i,j) = NewQuantumForce(i,j);
    +	}
    +	OldWaveFunction = NewWaveFunction;
    +      }
    +    }  //  end of loop over particles
    +    // compute local energy  
    +    double DeltaE = LocalEnergy(OldPosition, VariationalParameters);
    +    // update energies
    +    Energy += DeltaE;
    +    EnergySquared += DeltaE*DeltaE;
    +  }   // end of loop over MC trials   
    +  // update the energy average and its squared 
    +  cumulative_e = Energy/NumberMCsamples;
    +  cumulative_e2 = EnergySquared/NumberMCsamples;
    +}   // end MonteCarloSampling function  
    +
    +
    +// Function to compute the squared wave function and the quantum force
    +
    +double  WaveFunction(Matrix &r, Vector &VariationalParameters)
    +{
    +  double wf = 0.0;
    +  // full Slater determinant for two particles, replace with Slater det for more particles 
    +  wf  = SPwavefunction(singleparticle_pos2(r, 0), VariationalParameters(0))*SPwavefunction(singleparticle_pos2(r, 1),VariationalParameters(0));
    +  // contribution from Jastrow factor
    +  for (int i = 0; i < NumberParticles-1; i++) { 
    +    for (int j = i+1; j < NumberParticles; j++) {
    +      wf *= exp(RelativeDistance(r, i, j)/((1.0+VariationalParameters(1)*RelativeDistance(r, i, j))));
    +    }
    +  }
    +  return wf;
    +}
    +
    +// Function to calculate the local energy without numerical derivation of kinetic energy
    +
    +double  LocalEnergy(Matrix &r, Vector &VariationalParameters)
    +{
    +
    +  // compute the kinetic and potential energy from the single-particle part
    +  // for a many-electron system this has to be replaced by a Slater determinant
    +  // The absolute value of the interparticle length
    +  Matrix length( NumberParticles, NumberParticles);
    +  // Set up interparticle distance
    +  for (int i = 0; i < NumberParticles-1; i++) { 
    +    for(int j = i+1; j < NumberParticles; j++){
    +      length(i,j) = RelativeDistance(r, i, j);
    +      length(j,i) =  length(i,j);
    +    }
    +  }
    +  double KineticEnergy = 0.0;
    +  // Set up kinetic energy from Slater and Jastrow terms
    +  for (int i = 0; i < NumberParticles; i++) { 
    +    for (int k = 0; k < Dimension; k++) {
    +      double sum1 = 0.0; 
    +      for(int j = 0; j < NumberParticles; j++){
    +	if ( j != i) {
    +	  sum1 += JastrowDerivative(r, VariationalParameters(1), i, j, k);
    +	}
    +      }
    +      KineticEnergy += (sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0)))*(sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0)));
    +    }
    +  }
    +  KineticEnergy += -2*VariationalParameters(0)*NumberParticles;
    +  for (int i = 0; i < NumberParticles-1; i++) {
    +      for (int j = i+1; j < NumberParticles; j++) {
    +        KineticEnergy += 2.0/(pow(1.0 + VariationalParameters(1)*length(i,j),2))*(1.0/length(i,j)-2*VariationalParameters(1)/(1+VariationalParameters(1)*length(i,j)) );
    +      }
    +  }
    +  KineticEnergy *= -0.5;
    +  // Set up potential energy, external potential + eventual electron-electron repulsion
    +  double PotentialEnergy = 0;
    +  for (int i = 0; i < NumberParticles; i++) { 
    +    double DistanceSquared = singleparticle_pos2(r, i);
    +    PotentialEnergy += 0.5*DistanceSquared;  // sp energy HO part, note it has the oscillator frequency set to 1!
    +  }
    +  // Add the electron-electron repulsion
    +  for (int i = 0; i < NumberParticles-1; i++) { 
    +    for (int j = i+1; j < NumberParticles; j++) {
    +      PotentialEnergy += 1.0/length(i,j);          
    +    }
    +  }
    +  double LocalE = KineticEnergy+PotentialEnergy;
    +  return LocalE;
    +}
    +
    +// Compute the analytical expression for the quantum force
    +void  QuantumForce(Matrix &r, Matrix &qforce, Vector &VariationalParameters)
    +{
    +  // compute the first derivative 
    +  for (int i = 0; i < NumberParticles; i++) {
    +    for (int k = 0; k < Dimension; k++) {
    +      // single-particle part, replace with Slater det for larger systems
    +      double sppart = DerivativeSPwavefunction(r(i,k),VariationalParameters(0));
    +      //  Jastrow factor contribution
    +      double Jsum = 0.0;
    +      for (int j = 0; j < NumberParticles; j++) {
    +	if ( j != i) {
    +	  Jsum += JastrowDerivative(r, VariationalParameters(1), i, j, k);
    +	}
    +      }
    +      qforce(i,k) = 2.0*(Jsum+sppart);
    +    }
    +  }
    +} // end of QuantumForce function
    +
    +
    +#define ITMAX 200
    +#define EPS 3.0e-8
    +#define TOLX (4*EPS)
    +#define STPMX 100.0
    +
    +void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret,
    +	    double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g))
    +{
    +
    +  int check,i,its,j;
    +  double den,fac,fad,fae,fp,stpmax,sum=0.0,sumdg,sumxi,temp,test;
    +  Vector dg(n), g(n), hdg(n), pnew(n), xi(n);
    +  Matrix hessian(n,n);
    +
    +  fp=(*func)(p);
    +  (*dfunc)(p,g);
    +  for (i = 0;i < n;i++) {
    +    for (j = 0; j< n;j++) hessian(i,j)=0.0;
    +    hessian(i,i)=1.0;
    +    xi(i) = -g(i);
    +    sum += p(i)*p(i);
    +  }
    +  stpmax=STPMX*FMAX(sqrt(sum),(double)n);
    +  for (its=1;its<=ITMAX;its++) {
    +    *iter=its;
    +    lnsrch(n,p,fp,g,xi,pnew,fret,stpmax,&check,func);
    +    fp = *fret;
    +    for (i = 0; i< n;i++) {
    +      xi(i)=pnew(i)-p(i);
    +      p(i)=pnew(i);
    +    }
    +    test=0.0;
    +    for (i = 0;i< n;i++) {
    +      temp=fabs(xi(i))/FMAX(fabs(p(i)),1.0);
    +      if (temp > test) test=temp;
    +    }
    +    if (test < TOLX) {
    +      return;
    +    }
    +    for (i=0;i<n;i++) dg(i)=g(i);
    +    (*dfunc)(p,g);
    +    test=0.0;
    +    den=FMAX(*fret,1.0);
    +    for (i=0;i<n;i++) {
    +      temp=fabs(g(i))*FMAX(fabs(p(i)),1.0)/den;
    +      if (temp > test) test=temp;
    +    }
    +    if (test < gtol) {
    +      return;
    +    }
    +    for (i=0;i<n;i++) dg(i)=g(i)-dg(i);
    +    for (i=0;i<n;i++) {
    +      hdg(i)=0.0;
    +      for (j=0;j<n;j++) hdg(i) += hessian(i,j)*dg(j);
    +    }
    +    fac=fae=sumdg=sumxi=0.0;
    +    for (i=0;i<n;i++) {
    +      fac += dg(i)*xi(i);
    +      fae += dg(i)*hdg(i);
    +      sumdg += SQR(dg(i));
    +      sumxi += SQR(xi(i));
    +    }
    +    if (fac*fac > EPS*sumdg*sumxi) {
    +      fac=1.0/fac;
    +      fad=1.0/fae;
    +      for (i=0;i<n;i++) dg(i)=fac*xi(i)-fad*hdg(i);
    +      for (i=0;i<n;i++) {
    +	for (j=0;j<n;j++) {
    +	  hessian(i,j) += fac*xi(i)*xi(j)
    +	    -fad*hdg(i)*hdg(j)+fae*dg(i)*dg(j);
    +	}
    +      }
    +    }
    +    for (i=0;i<n;i++) {
    +      xi(i)=0.0;
    +      for (j=0;j<n;j++) xi(i) -= hessian(i,j)*g(j);
    +    }
    +  }
    +  cout << "too many iterations in dfpmin" << endl;
    +}
    +#undef ITMAX
    +#undef EPS
    +#undef TOLX
    +#undef STPMX
    +
    +#define ALF 1.0e-4
    +#define TOLX 1.0e-7
    +
    +void lnsrch(int n, Vector &xold, double fold, Vector &g, Vector &p, Vector &x,
    +	    double *f, double stpmax, int *check, double (*func)(Vector &p))
    +{
    +  int i;
    +  double a,alam,alam2,alamin,b,disc,f2,fold2,rhs1,rhs2,slope,sum,temp,
    +    test,tmplam;
    +
    +  *check=0;
    +  for (sum=0.0,i=0;i<n;i++) sum += p(i)*p(i);
    +  sum=sqrt(sum);
    +  if (sum > stpmax)
    +    for (i=0;i<n;i++) p(i) *= stpmax/sum;
    +  for (slope=0.0,i=0;i<n;i++)
    +    slope += g(i)*p(i);
    +  test=0.0;
    +  for (i=0;i<n;i++) {
    +    temp=fabs(p(i))/FMAX(fabs(xold(i)),1.0);
    +    if (temp > test) test=temp;
    +  }
    +  alamin=TOLX/test;
    +  alam=1.0;
    +  for (;;) {
    +    for (i=0;i<n;i++) x(i)=xold(i)+alam*p(i);
    +    *f=(*func)(x);
    +    if (alam < alamin) {
    +      for (i=0;i<n;i++) x(i)=xold(i);
    +      *check=1;
    +      return;
    +    } else if (*f <= fold+ALF*alam*slope) return;
    +    else {
    +      if (alam == 1.0)
    +	tmplam = -slope/(2.0*(*f-fold-slope));
    +      else {
    +	rhs1 = *f-fold-alam*slope;
    +	rhs2=f2-fold2-alam2*slope;
    +	a=(rhs1/(alam*alam)-rhs2/(alam2*alam2))/(alam-alam2);
    +	b=(-alam2*rhs1/(alam*alam)+alam*rhs2/(alam2*alam2))/(alam-alam2);
    +	if (a == 0.0) tmplam = -slope/(2.0*b);
    +	else {
    +	  disc=b*b-3.0*a*slope;
    +	  if (disc<0.0) cout << "Roundoff problem in lnsrch." << endl;
    +	  else tmplam=(-b+sqrt(disc))/(3.0*a);
    +	}
    +	if (tmplam>0.5*alam)
    +	  tmplam=0.5*alam;
    +      }
    +    }
    +    alam2=alam;
    +    f2 = *f;
    +    fold2=fold;
    +    alam=FMAX(tmplam,0.1*alam);
    +  }
    +}
    +#undef ALF
    +#undef TOLX
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    What is OpenMP

    +
    + +

    +

    +

    There are many good tutorials online and an excellent textbook

    +
      +
    1. Using OpenMP, by B. Chapman, G. Jost, and A. van der Pas
    2. Many tutorials online, like the OpenMP official site
    +
    + + +









    +

    Getting started, things to remember

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
    #include <omp.h>
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    #pragma omp...
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    OpenMP syntax

    + + + +
    +
    +
    +
    +
    +
    #pragma omp construct [ clause ...]
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    #include <omp.h>
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Different OpenMP styles of parallelism

    +

    OpenMP supports several different ways to specify thread parallelism

    + + +
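    A minimal sketch of the main styles (my own summary; all of these constructs are standard OpenMP and appear again in the following slides): a plain parallel region, a work-shared loop, independent sections and explicit tasks.

      // Minimal sketch of the different OpenMP styles of parallelism
      #include <omp.h>
      #include <cstdio>
      int main()
      {
        // 1) plain parallel region: every thread executes the same block
        #pragma omp parallel
        {
          printf("thread %d\n", omp_get_thread_num());
        }
        // 2) work-sharing loop: iterations are divided among the threads
        double s = 0.0;
        #pragma omp parallel for reduction(+:s)
        for (int i = 0; i < 100; i++) s += i;
        // 3) sections: different threads execute different blocks
        #pragma omp parallel sections
        {
          #pragma omp section
          { printf("section A\n"); }
          #pragma omp section
          { printf("section B\n"); }
        }
        // 4) tasks: work units created at runtime and executed by available threads
        #pragma omp parallel
        {
          #pragma omp single
          {
            #pragma omp task
            { printf("a task\n"); }
          }
        }
        printf("sum = %g\n", s);
        return 0;
      }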









    +

    General code structure

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +main ()
    +{
    +int var1, var2, var3;
    +/* serial code */
    +/* ... */
    +/* start of a parallel region */
    +#pragma omp parallel private(var1, var2) shared(var3)
    +{
    +/* ... */
    +}
    +/* more serial code */
    +/* ... */
    +/* another parallel region */
    +#pragma omp parallel
    +{
    +/* ... */
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Parallel region

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    Hello world, not again, please!

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#include <cstdio>
    +int main (int argc, char *argv[])
    +{
    +int th_id, nthreads;
    +#pragma omp parallel private(th_id) shared(nthreads)
    +{
    +th_id = omp_get_thread_num();
    +printf("Hello World from thread %d\n", th_id);
    +#pragma omp barrier
    +if ( th_id == 0 ) {
    +nthreads = omp_get_num_threads();
    +printf("There are %d threads\n",nthreads);
    +}
    +}
    +return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Hello world, yet another variant

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <cstdio>
    +#include <iostream>
    +#include <omp.h>
    +using namespace std;
    +int main(int argc, char *argv[]) 
    +{
    + omp_set_num_threads(4); 
    +#pragma omp parallel
    + {
    +   int id = omp_get_thread_num();
    +   int nproc = omp_get_num_threads(); 
    +   cout << "Hello world with id number and processes " <<  id <<  nproc << endl;
    + } 
    +return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Variables declared outside of the parallel region are shared by all threads. If a variable like id had been declared outside of the

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel, 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    it would have been shared by the various threads, possibly causing erroneous output.

    + +
    + + +









    +

    Important OpenMP library routines

    +
    + +

    + +
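    The routines that actually appear in the example programs of these slides are listed below, with their standard signatures from omp.h (a reference listing, not a complete program).

      // OpenMP runtime library routines used in these slides (from omp.h)
      int    omp_get_num_threads(void);  // number of threads in the current team
      int    omp_get_thread_num(void);   // id of the calling thread, 0,...,nthreads-1
      int    omp_get_max_threads(void);  // upper bound on threads for the next parallel region
      int    omp_get_num_procs(void);    // number of processors available
      void   omp_set_num_threads(int);   // request a given number of threads
      double omp_get_wtime(void);        // wall-clock time, used for timing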

    +
    + + +









    +

    Private variables

    +
    + +

    +

    The private clause can be used to make thread-private versions of such variables:

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel private(id)
    +{
    + int id = omp_get_thread_num();
    + cout << "My thread num" << id << endl; 
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    Master region

    +
    + +

    +

    It is often useful to have only one thread execute some of the code in a parallel region. I/O statements are a common example

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel 
    +{
    +  #pragma omp master
    +   {
    +      int id = omp_get_thread_num();
    +      cout << "My thread num" << id << endl; 
    +   } 
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Parallel for loop

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
    #pragma omp for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + + +









    +

    Parallel computations and loops

    + +
    + +

    +

    OpenMP provides an easy way to parallelize a loop

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +  for (i=0; i<n; i++) c[i] = a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    OpenMP handles the loop index variable (no need to declare it in the for loop or to make it private yourself)

    + +

    Which thread handles which iterations? There are several options.

    +
    + + +









    +

    Scheduling of loop computations

    + +
    + +

    +

    We can let the OpenMP runtime decide. The decision is about how the loop iterates are scheduled +and OpenMP defines three choices of loop scheduling: +

    +
      +
    1. Static: Predefined at compile time. Lowest overhead, predictable
    2. Dynamic: Selection made at runtime
    3. Guided: Special case of dynamic; attempts to reduce overhead
    +
    + + +









    +

    Example code for loop scheduling

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#define CHUNKSIZE 100
    +#define N 1000
    +int main (int argc, char *argv[])
    +{
    +int i, chunk;
    +float a[N], b[N], c[N];
    +for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
    +chunk = CHUNKSIZE;
    +#pragma omp parallel shared(a,b,c,chunk) private(i)
    +{
    +#pragma omp for schedule(dynamic,chunk)
    +for (i=0; i < N; i++) c[i] = a[i] + b[i];
    +} /* end of parallel region */
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Example code for loop scheduling, guided instead of dynamic

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #include <omp.h>
    +#define CHUNKSIZE 100
    +#define N 1000
    +int main (int argc, char *argv[])
    +{
    +int i, chunk;
    +float a[N], b[N], c[N];
    +for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;
    +chunk = CHUNKSIZE;
    +#pragma omp parallel shared(a,b,c,chunk) private(i)
    +{
    +#pragma omp for schedule(guided,chunk)
    +for (i=0; i < N; i++) c[i] = a[i] + b[i];
    +} /* end of parallel region */
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    More on Parallel for loop

    +
    + +

    +

    + + +
    +
    +
    +
    +
    +
    // #pragma omp parallel and #pragma omp for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    can be combined into

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    What can happen with this loop?

    + +
    + +

    +

    What happens with code like this

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    +for (i=0; i<n; i++) sum += a[i]*a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    All threads can access the sum variable, but the addition is not atomic! It is important to avoid race between threads. So-called reductions in OpenMP are thus important for performance and for obtaining correct results. OpenMP lets us indicate that a variable is used for a reduction with a particular operator. The above code becomes

    + + +
    +
    +
    +
    +
    +
    sum = 0.0;
    +#pragma omp parallel for reduction(+:sum)
    +for (i=0; i<n; i++) sum += a[i]*a[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Inner product

    +
    + +

    +$$ +\sum_{i=0}^{n-1} a_ib_i +$$ + + + +

    +
    +
    +
    +
    +
    int i;
    +double sum = 0.;
    +/* allocating and initializing arrays */
    +/* ... */
    +#pragma omp parallel for default(shared) private(i) reduction(+:sum)
    + for (i=0; i<N; i++) sum += a[i]*b[i];
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Different threads do different tasks

    +
    + +

    + +

    Different threads do different tasks independently, each section is executed by one thread.

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel
    +{
    +#pragma omp sections
    +{
    +#pragma omp section
    +funcA ();
    +#pragma omp section
    +funcB ();
    +#pragma omp section
    +funcC ();
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Single execution

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp single { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    The code is executed by one thread only, no guarantee which thread

    + +

    Can introduce an implicit barrier at the end

    + + +
    +
    +
    +
    +
    +
    #pragma omp master { ... }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Code executed by the master thread, guaranteed and no implicit barrier at the end.

    +
    + + +









    +

    Coordination and synchronization

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp barrier
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Synchronization, must be encountered by all threads in a team (or none)

    + + +
    +
    +
    +
    +
    +
    #pragma omp ordered { a block of codes }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    is another form of synchronization (in sequential order). +The form +

    + + +
    +
    +
    +
    +
    +
    #pragma omp critical { a block of codes }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    and

    + + +
    +
    +
    +
    +
    +
    #pragma omp atomic { single assignment statement }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    is more efficient than

    + + +
    +
    +
    +
    +
    +
    #pragma omp critical
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
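    A minimal sketch (illustration only) of the difference in practice: atomic protects a single update of a scalar and can often use hardware support, while critical protects an arbitrary block and carries more overhead.

      // Hypothetical example: both loops produce the same count, but the atomic
      // version only serializes the single update, not a whole block.
      #include <omp.h>
      #include <cstdio>
      int main()
      {
        const long n = 1000000;
        long count_atomic = 0, count_critical = 0;
        #pragma omp parallel for
        for (long i = 0; i < n; i++) {
          #pragma omp atomic
          count_atomic++;                 // single assignment statement
        }
        #pragma omp parallel for
        for (long i = 0; i < n; i++) {
          #pragma omp critical
          {
            count_critical++;             // a whole block, more overhead
          }
        }
        printf("%ld %ld\n", count_atomic, count_critical);
        return 0;
      }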









    +

    Data scope

    +
    + +

    +

    +

    What are the purposes of these attributes

    + +
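    The attributes in question are the data-scope clauses shared, private, firstprivate, lastprivate, reduction and default. A minimal sketch (illustration only) of how the most common ones behave:

      // Hypothetical example of the most common data-scope clauses
      #include <omp.h>
      #include <cstdio>
      int main()
      {
        int n = 8;             // shared: one copy, visible to all threads
        double sum = 0.0;      // reduction: each thread gets a private partial sum,
                               // the partial sums are combined when the loop ends
        int scratch = -1;      // private: each thread gets its own uninitialized copy
        #pragma omp parallel for default(shared) private(scratch) reduction(+:sum)
        for (int i = 0; i < n; i++) {
          scratch = i*i;       // work on the thread-private copy
          sum += scratch;
        }
        printf("sum = %g\n", sum);  // 0+1+4+...+49 = 140
        return 0;
      }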
    + + +









    +

    Some remarks

    +
    + +

    + +

    +
    + + +









    +

    Parallelizing nested for-loops

    +
    + +

    + +

    + + +
    +
    +
    +
    +
    +
    for (i=0; i<100; i++) {
    +    for (j=0; j<100; j++) {
    +        a[i][j] = b[i][j] + c[i][j];
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for private(j)
    +for (i=0; i<100; i++) {
    +    for (j=0; j<100; j++) {
    +       a[i][j] = b[i][j] + c[i][j];
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +
    + + +









    +

    Nested parallelism

    +
    + +

    +

    When a thread in a parallel region encounters another parallel construct, it +may create a new team of threads and become the master of the new +team. +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel num_threads(4)
    +{
    +/* .... */
    +#pragma omp parallel num_threads(2)
    +{
    +//  
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Parallel tasks

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp task 
    +#pragma omp parallel shared(p_vec) private(i)
    +{
    +#pragma omp single
    +{
    +for (i=0; i<N; i++) {
    +  double r = random_number();
    +  if (p_vec[i] > r) {
    +#pragma omp task
    +   do_work (p_vec[i]);
    +  }   // end if
    + }    // end for loop
    +}     // end omp single
    +}     // end parallel region
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Common mistakes

    +
    + +

    +

    Race condition

    + + +
    +
    +
    +
    +
    +
    int nthreads;
    +#pragma omp parallel shared(nthreads)
    +{
    +nthreads = omp_get_num_threads();
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Deadlock

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel
    +{
    +...
    +#pragma omp critical
    +{
    +...
    +#pragma omp barrier
    +}
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +

    Not all computations are simple

    +
    + +

    +

    Not all computations are simple loops where the data can be evenly +divided among threads without any dependencies between threads +

    + +

    An example is finding the location and value of the largest element in an array

    + + +
    +
    +
    +
    +
    +
    for (i=0; i<n; i++) { 
    +   if (x[i] > maxval) {
    +      maxval = x[i];
    +      maxloc = i; 
    +   }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +

    Not all computations are simple, competing threads

    +
    + +

    +

    All threads are potentially accessing and changing the same values, maxloc and maxval.

    +
      +
    1. OpenMP provides several ways to coordinate access to shared values
    + + +
    +
    +
    +
    +
    +
    #pragma omp atomic
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    1. Only one thread at a time can execute the following statement (not block). We can use the critical option
    + + +
    +
    +
    +
    +
    +
    #pragma omp critical
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    1. Only one thread at a time can execute the following block
    +

    Atomic may be faster than critical but depends on hardware

    +
    + + +









    +

    How to find the max value using OpenMP

    +
    + +

    +

    Write down the simplest algorithm and look carefully for race conditions. How would you handle them? +The first step would be to parallelize as +

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    + for (i=0; i<n; i++) {
    +    if (x[i] > maxval) {
    +      maxval = x[i];
    +      maxloc = i; 
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Then deal with the race conditions

    +
    + +

    +

    Write down the simplest algorithm and look carefully for race conditions. How would you handle them? Here we remove the race condition by protecting the update of maxval and maxloc with a critical region:

    + + +
    +
    +
    +
    +
    +
    #pragma omp parallel for
    + for (i=0; i<n; i++) {
    +#pragma omp critical
    +  {
    +     if (x[i] > maxval) {
    +       maxval = x[i];
    +       maxloc = i; 
    +     }
    +  }
    +} 
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Exercise: write a code which implements this and estimate its performance. Perform several runs: run a serial code with and without vectorization, and compare the serial code with the version that uses OpenMP. Run on different architectures if you can.

    +
    + +









    +

    What can slow down OpenMP performance?

    +

    Give it a thought!

    + +









    +

    What can slow down OpenMP performance?

    +
    + +

    +

    Performance is poor because we insisted on keeping track of maxval and its location during the execution of the loop.

    + +

    This is a common source of performance issues: the description of the method used to compute a value imposes additional, unnecessary requirements or properties.

    Idea: have each thread find the maxloc in its own data, then combine the results, using temporary arrays indexed by thread number to hold the values found by each thread.
    + + +









    +

    Find the max location for each thread

    +
    + +

    + + +

    +
    +
    +
    +
    +
    int maxloc[MAX_THREADS], mloc;
    +double maxval[MAX_THREADS], mval; 
    +#pragma omp parallel shared(maxval,maxloc)
    +{
    +  int id = omp_get_thread_num(); 
    +  maxval[id] = -1.0e30;
    +#pragma omp for
    +   for (int i=0; i<n; i++) {
    +       if (x[i] > maxval[id]) { 
    +           maxloc[id] = i;
    +           maxval[id] = x[i]; 
    +       }
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Combine the values from each thread

    +
    + +

    + + +

    +
    +
    +
    +
    +
    #pragma omp flush (maxloc,maxval)
    +#pragma omp master
    +  {
    +    int nt = omp_get_num_threads(); 
    +    mloc = maxloc[0]; 
    +    mval = maxval[0]; 
    +    for (int i=1; i<nt; i++) {
    +        if (maxval[i] > mval) { 
    +           mval = maxval[i]; 
    +           mloc = maxloc[i];
    +        } 
    +     }
    +   }
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    Note that we let the master thread perform the last operation.

    +
    + +









    +

    Matrix-matrix multiplication

    +

    This code computes the norm of a vector using OpenMP.

    + + +
    +
    +
    +
    +
    +
    //  OpenMP program to compute vector norm by adding two other vectors
    +#include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include  <omp.h>
    +# include <ctime>
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of vector
    +  int n = atoi(argv[1]);
    +  double *a, *b, *c;
    +  int i;
    +  int thread_num;
    +  double wtime, Norm2, s, angle;
    +  cout << "  Perform addition of two vectors and compute the norm-2." << endl;
    +  omp_set_num_threads(4);
    +  thread_num = omp_get_max_threads ();
    +  cout << "  The number of processors available = " << omp_get_num_procs () << endl ;
    +  cout << "  The number of threads available    = " << thread_num <<  endl;
    +  cout << "  The matrix order n                 = " << n << endl;
    +
    +  s = 1.0/sqrt( (double) n);
    +  wtime = omp_get_wtime ( );
    +  // Allocate space for the vectors to be used
    +  a = new double [n]; b = new double [n]; c = new double [n];
    +  // Define parallel region
    +# pragma omp parallel for default(shared) private (angle, i) reduction(+:Norm2)
    +  // Set up values for vectors  a and b
    +  for (i = 0; i < n; i++){
    +      angle = 2.0*M_PI*i/ (( double ) n);
    +      a[i] = s*(sin(angle) + cos(angle));
    +      b[i] =  s*sin(2.0*angle);
    +      c[i] = 0.0;
    +  }
    +  // Then perform the vector addition
    +  for (i = 0; i < n; i++){
    +     c[i] += a[i]+b[i];
    +  }
    +  // Compute now the norm-2
    +  Norm2 = 0.0;
    +  for (i = 0; i < n; i++){
    +     Norm2  += c[i]*c[i];
    +  }
    +// end parallel region
    +  wtime = omp_get_wtime ( ) - wtime;
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for norm-2 computation=" << wtime  << endl;
    +  cout << " Norm-2  = " << Norm2 << endl;
    +  // Free up space
    +  delete[] a;
    +  delete[] b;
    +  delete[] c;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +









    +

    Matrix-matrix multiplication

    +

    This is the matrix-matrix multiplication code with plain C++ memory allocation, using OpenMP.

    + + + +
    +
    +
    +
    +
    +
    //  Matrix-matrix multiplication and Frobenius norm of a matrix with OpenMP
    +#include <cstdlib>
    +#include <iostream>
    +#include <cmath>
    +#include <iomanip>
    +#include  <omp.h>
    +# include <ctime>
    +
    +using namespace std; // note use of namespace
    +int main (int argc, char* argv[])
    +{
    +  // read in dimension of square matrix
    +  int n = atoi(argv[1]);
    +  double **A, **B, **C;
    +  int i, j, k;
    +  int thread_num;
    +  double wtime, Fsum, s, angle;
    +  cout << "  Compute matrix product C = A * B and Frobenius norm." << endl;
    +  omp_set_num_threads(4);
    +  thread_num = omp_get_max_threads ();
    +  cout << "  The number of processors available = " << omp_get_num_procs () << endl ;
    +  cout << "  The number of threads available    = " << thread_num <<  endl;
    +  cout << "  The matrix order n                 = " << n << endl;
    +
    +  s = 1.0/sqrt( (double) n);
    +  wtime = omp_get_wtime ( );
    +  // Allocate space for the two matrices
    +  A = new double*[n]; B = new double*[n]; C = new double*[n];
    +  for (i = 0; i < n; i++){
    +    A[i] = new double[n];
    +    B[i] = new double[n];
    +    C[i] = new double[n];
    +  }
    +  // Define parallel region
    +# pragma omp parallel for default(shared) private (angle, i, j, k) reduction(+:Fsum)
    +  // Set up values for matrix A and B and zero matrix C
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +      angle = 2.0*M_PI*i*j/ (( double ) n);
    +      A[i][j] = s * ( sin ( angle ) + cos ( angle ) );
    +      B[j][i] =  A[i][j];
    +    }
    +  }
    +  // Then perform the matrix-matrix multiplication
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +       C[i][j] =  0.0;    
    +       for (k = 0; k < n; k++) {
    +            C[i][j] += A[i][k]*B[k][j];
    +       }
    +    }
    +  }
    +  // Compute now the Frobenius norm
    +  Fsum = 0.0;
    +  for (i = 0; i < n; i++){
    +    for (j = 0; j < n; j++) {
    +      Fsum += C[i][j]*C[i][j];
    +    }
    +  }
    +  Fsum = sqrt(Fsum);
    +// end parallel region and letting only one thread perform I/O
    +  wtime = omp_get_wtime ( ) - wtime;
    +  cout << setiosflags(ios::showpoint | ios::uppercase);
    +  cout << setprecision(10) << setw(20) << "Time used  for matrix-matrix multiplication=" << wtime  << endl;
    +  cout << "  Frobenius norm  = " << Fsum << endl;
    +  // Free up space
    +  for (int i = 0; i < n; i++){
    +    delete[] A[i];
    +    delete[] B[i];
    +    delete[] C[i];
    +  }
    +  delete[] A;
    +  delete[] B;
    +  delete[] C;
    +  return 0;
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    © 1999-2024, Morten Hjorth-Jensen Email morten.hjorth-jensen@fys.uio.no. Released under CC Attribution-NonCommercial 4.0 license diff --git a/doc/pub/week9/ipynb/ipynb-week9-src.tar.gz b/doc/pub/week9/ipynb/ipynb-week9-src.tar.gz index 1d027324..e1f7c1fa 100644 Binary files a/doc/pub/week9/ipynb/ipynb-week9-src.tar.gz and b/doc/pub/week9/ipynb/ipynb-week9-src.tar.gz differ diff --git a/doc/pub/week9/ipynb/week9.ipynb b/doc/pub/week9/ipynb/week9.ipynb index 2a9be3da..e6cc2d25 100644 --- a/doc/pub/week9/ipynb/week9.ipynb +++ b/doc/pub/week9/ipynb/week9.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "4b5596ee", + "id": "7f934c76", "metadata": { "editable": true }, @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "d33a95e3", + "id": "5d8c5800", "metadata": { "editable": true }, @@ -27,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "d548cf0f", + "id": "7bd2e26c", "metadata": { "editable": true }, @@ -45,16 +45,12 @@ "\n", "\n", "\n", - "**Teaching Material, videos and written material.**\n", - "\n", - "* Overview video on the [Bootstrap method](https://www.youtube.com/watch?v=O_Fj4q8lgmc&ab_channel=MarinStatsLectures-RProgramming%26Statistics)\n", - "\n", - "* [Marius Johnson's Master thesis on the Blocking Method](https://www.duo.uio.no/bitstream/handle/10852/68360/PhysRevE.98.043304.pdf?sequence=2&isAllowed=y)" + "Note, these notes contain additional material om optimization and parallelization. Parts of this material will be discussed this week." ] }, { "cell_type": "markdown", - "id": "930c8d02", + "id": "181ed6f2", "metadata": { "editable": true }, @@ -71,7 +67,7 @@ }, { "cell_type": "markdown", - "id": "4d06554e", + "id": "e47adac5", "metadata": { "editable": true }, @@ -90,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "0be7d825", + "id": "e143647e", "metadata": { "editable": true }, @@ -108,7 +104,7 @@ }, { "cell_type": "markdown", - "id": "f008a4ba", + "id": "b8c9399e", "metadata": { "editable": true }, @@ -125,7 +121,7 @@ }, { "cell_type": "markdown", - "id": "f32d922f", + "id": "72de4651", "metadata": { "editable": true }, @@ -137,7 +133,7 @@ }, { "cell_type": "markdown", - "id": "2484aca0", + "id": "8e9371fe", "metadata": { "editable": true }, @@ -147,7 +143,7 @@ }, { "cell_type": "markdown", - "id": "7b45ca9a", + "id": "cd3e9e74", "metadata": { "editable": true }, @@ -159,7 +155,7 @@ }, { "cell_type": "markdown", - "id": "acf96a39", + "id": "9cc43d3d", "metadata": { "editable": true }, @@ -174,7 +170,7 @@ }, { "cell_type": "markdown", - "id": "9369eada", + "id": "e4d44112", "metadata": { "editable": true }, @@ -189,7 +185,7 @@ }, { "cell_type": "markdown", - "id": "50693dbd", + "id": "cd1734a3", "metadata": { "editable": true }, @@ -201,7 +197,7 @@ }, { "cell_type": "markdown", - "id": "7a026966", + "id": "0edda6fb", "metadata": { "editable": true }, @@ -211,7 +207,7 @@ }, { "cell_type": "markdown", - "id": "82e82fa9", + "id": "485ab088", "metadata": { "editable": true }, @@ -223,7 +219,7 @@ }, { "cell_type": "markdown", - "id": "5b3f8c63", + "id": "428e6c44", "metadata": { "editable": true }, @@ -233,7 +229,7 @@ }, { "cell_type": "markdown", - "id": "2e651fdb", + "id": "703c3170", "metadata": { "editable": true }, @@ -245,7 +241,7 @@ }, { "cell_type": "markdown", - "id": "bf9b0aa4", + "id": "3404f05c", "metadata": { "editable": true }, @@ -257,7 +253,7 @@ }, { "cell_type": "markdown", - "id": "0f3459da", + "id": "56c64606", "metadata": { "editable": true }, @@ -267,7 +263,7 @@ }, { "cell_type": "markdown", - "id": "3aecc426", + "id": "5c1f6ee5", 
"metadata": { "editable": true }, @@ -279,7 +275,7 @@ }, { "cell_type": "markdown", - "id": "451588ba", + "id": "a192cbd4", "metadata": { "editable": true }, @@ -289,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "8cbe60fb", + "id": "a4458392", "metadata": { "editable": true }, @@ -301,7 +297,7 @@ }, { "cell_type": "markdown", - "id": "cf819af3", + "id": "085cafc0", "metadata": { "editable": true }, @@ -313,7 +309,7 @@ }, { "cell_type": "markdown", - "id": "94d446e9", + "id": "f8b1bd83", "metadata": { "editable": true }, @@ -328,7 +324,7 @@ }, { "cell_type": "markdown", - "id": "a3bed598", + "id": "085dc147", "metadata": { "editable": true }, @@ -338,7 +334,7 @@ }, { "cell_type": "markdown", - "id": "289ea914", + "id": "b07f69d6", "metadata": { "editable": true }, @@ -350,7 +346,7 @@ }, { "cell_type": "markdown", - "id": "e4f189cc", + "id": "524a3980", "metadata": { "editable": true }, @@ -364,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "e0c75890", + "id": "aa229836", "metadata": { "editable": true }, @@ -383,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "b5fc2142", + "id": "a74e265c", "metadata": { "editable": true }, @@ -395,7 +391,7 @@ }, { "cell_type": "markdown", - "id": "3ba59e20", + "id": "48a0b0b5", "metadata": { "editable": true }, @@ -407,7 +403,7 @@ }, { "cell_type": "markdown", - "id": "85c360b6", + "id": "4fa10f66", "metadata": { "editable": true }, @@ -417,7 +413,7 @@ }, { "cell_type": "markdown", - "id": "88132941", + "id": "10aaa6c6", "metadata": { "editable": true }, @@ -429,7 +425,7 @@ }, { "cell_type": "markdown", - "id": "47ce48bd", + "id": "6dc77832", "metadata": { "editable": true }, @@ -439,7 +435,7 @@ }, { "cell_type": "markdown", - "id": "a17dde39", + "id": "78b3c31c", "metadata": { "editable": true }, @@ -451,7 +447,7 @@ }, { "cell_type": "markdown", - "id": "dbcd8555", + "id": "4c584946", "metadata": { "editable": true }, @@ -463,7 +459,7 @@ }, { "cell_type": "markdown", - "id": "9753ec4d", + "id": "bb141426", "metadata": { "editable": true }, @@ -477,7 +473,7 @@ }, { "cell_type": "markdown", - "id": "aed90511", + "id": "6f36565b", "metadata": { "editable": true }, @@ -487,7 +483,7 @@ }, { "cell_type": "markdown", - "id": "3d10512f", + "id": "250b1b3e", "metadata": { "editable": true }, @@ -495,9 +491,9 @@ "## Resampling methods: Blocking\n", "\n", "The blocking method was made popular by [Flyvbjerg and Pedersen (1989)](https://aip.scitation.org/doi/10.1063/1.457480)\n", - "and has become one of the standard ways to estimate\n", - "$V(\\widehat{\\theta})$ for exactly one $\\widehat{\\theta}$, namely\n", - "$\\widehat{\\theta} = \\overline{X}$. \n", + "and has become one of the standard ways to estimate the variance\n", + "$\\mathrm{var}(\\widehat{\\theta})$ for exactly one estimator $\\widehat{\\theta}$, namely\n", + "$\\widehat{\\theta} = \\overline{X}$, the mean value. \n", "\n", "Assume $n = 2^d$ for some integer $d>1$ and $X_1,X_2,\\cdots, X_n$ is a stationary time series to begin with. \n", "Moreover, assume that the series is asymptotically uncorrelated. We switch to vector notation by arranging $X_1,X_2,\\cdots,X_n$ in an $n$-tuple. 
Define:" @@ -505,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "297d90f5", + "id": "233f70b8", "metadata": { "editable": true }, @@ -519,7 +515,7 @@ }, { "cell_type": "markdown", - "id": "74859bc8", + "id": "b17d9e13", "metadata": { "editable": true }, @@ -534,7 +530,7 @@ }, { "cell_type": "markdown", - "id": "0bed800d", + "id": "179d551e", "metadata": { "editable": true }, @@ -550,7 +546,7 @@ }, { "cell_type": "markdown", - "id": "a7e0c557", + "id": "f89ed58e", "metadata": { "editable": true }, @@ -562,7 +558,7 @@ }, { "cell_type": "markdown", - "id": "7557cd42", + "id": "44308237", "metadata": { "editable": true }, @@ -581,7 +577,7 @@ }, { "cell_type": "markdown", - "id": "315c77b3", + "id": "1a9ef202", "metadata": { "editable": true }, @@ -605,7 +601,7 @@ }, { "cell_type": "markdown", - "id": "e11a5834", + "id": "0ca89ff0", "metadata": { "editable": true }, @@ -620,7 +616,7 @@ }, { "cell_type": "markdown", - "id": "03174f94", + "id": "6bc37505", "metadata": { "editable": true }, @@ -632,7 +628,7 @@ }, { "cell_type": "markdown", - "id": "741ee4c9", + "id": "87775adf", "metadata": { "editable": true }, @@ -644,7 +640,7 @@ }, { "cell_type": "markdown", - "id": "f14a96cb", + "id": "14149079", "metadata": { "editable": true }, @@ -662,7 +658,7 @@ }, { "cell_type": "markdown", - "id": "323e46df", + "id": "73987934", "metadata": { "editable": true }, @@ -680,7 +676,7 @@ }, { "cell_type": "markdown", - "id": "a71d9b3c", + "id": "69fbf5d1", "metadata": { "editable": true }, @@ -691,7 +687,7 @@ }, { "cell_type": "markdown", - "id": "dd98a112", + "id": "81fe5990", "metadata": { "editable": true }, @@ -702,7 +698,7 @@ }, { "cell_type": "markdown", - "id": "0e0bec22", + "id": "d4706212", "metadata": { "editable": true }, @@ -720,7 +716,7 @@ }, { "cell_type": "markdown", - "id": "7b004b32", + "id": "eb661f47", "metadata": { "editable": true }, @@ -730,7 +726,7 @@ }, { "cell_type": "markdown", - "id": "a70cba82", + "id": "2f1d607f", "metadata": { "editable": true }, @@ -748,7 +744,7 @@ }, { "cell_type": "markdown", - "id": "14a5f387", + "id": "eecababc", "metadata": { "editable": true }, @@ -758,7 +754,7 @@ }, { "cell_type": "markdown", - "id": "68a6d6ac", + "id": "49a6dfb0", "metadata": { "editable": true }, @@ -770,7 +766,7 @@ }, { "cell_type": "markdown", - "id": "86df1a10", + "id": "2c62cc63", "metadata": { "editable": true }, @@ -782,7 +778,7 @@ }, { "cell_type": "markdown", - "id": "0a2ea53b", + "id": "06fde16f", "metadata": { "editable": true }, @@ -800,7 +796,7 @@ }, { "cell_type": "markdown", - "id": "c5852aa3", + "id": "61e106ab", "metadata": { "editable": true }, @@ -810,7 +806,7 @@ }, { "cell_type": "markdown", - "id": "09cbe900", + "id": "a9f5001a", "metadata": { "editable": true }, @@ -827,15 +823,17 @@ }, { "cell_type": "markdown", - "id": "a15058db", + "id": "aed624ba", "metadata": { "editable": true }, "source": [ + "## More on the blocking method\n", + "\n", "Flyvbjerg and Petersen demonstrated that the sequence\n", "$\\{e_k\\}_{k=0}^{d-1}$ is decreasing, and conjecture that the term\n", "$e_k$ can be made as small as we would like by making $k$ (and hence\n", - "$d$) sufficiently large. The sequence is decreasing (Master of Science thesis by Marius Jonsson, UiO 2018).\n", + "$d$) sufficiently large. The sequence is decreasing.\n", "It means we can apply blocking transformations until\n", "$e_k$ is sufficiently small, and then estimate $\\mathrm{var}(\\overline{X})$ by\n", "$\\widehat{\\sigma}^2_k/n_k$. 
\n", @@ -845,7 +843,7 @@ }, { "cell_type": "markdown", - "id": "4f46a36c", + "id": "421f9f8b", "metadata": { "editable": true }, @@ -856,7 +854,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "9b554ff8", + "id": "8d23bb07", "metadata": { "collapsed": false, "editable": true @@ -1085,7 +1083,7 @@ }, { "cell_type": "markdown", - "id": "7dee6cbf", + "id": "7d5001dc", "metadata": { "editable": true }, @@ -1100,7 +1098,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "989a7557", + "id": "6da7142f", "metadata": { "collapsed": false, "editable": true @@ -1163,6 +1161,4877 @@ "frame = pd.DataFrame(data,index=['Values'])\n", "print(frame)" ] + }, + { + "cell_type": "markdown", + "id": "6d0b7e85", + "metadata": { + "editable": true + }, + "source": [ + "## Content\n", + "* Simple compiler options \n", + "\n", + "* Tools to benchmark your code\n", + "\n", + "* Machine architectures\n", + "\n", + "* What is vectorization?\n", + "\n", + "* How to measure code performance\n", + "\n", + "* Parallelization with OpenMP\n", + "\n", + "* Parallelization with MPI\n", + "\n", + "* Vectorization and parallelization, examples" + ] + }, + { + "cell_type": "markdown", + "id": "17407842", + "metadata": { + "editable": true + }, + "source": [ + "## Optimization and profiling\n", + "\n", + "Till now we have not paid much attention to speed and possible optimization possibilities\n", + "inherent in the various compilers. We have compiled and linked as" + ] + }, + { + "cell_type": "markdown", + "id": "ad210718", + "metadata": { + "editable": true + }, + "source": [ + " c++ -c mycode.cpp\n", + " c++ -o mycode.exe mycode.o\n" + ] + }, + { + "cell_type": "markdown", + "id": "69401651", + "metadata": { + "editable": true + }, + "source": [ + "For Fortran replace with for example **gfortran** or **ifort**.\n", + "This is what we call a flat compiler option and should be used when we develop the code.\n", + "It produces normally a very large and slow code when translated to machine instructions.\n", + "We use this option for debugging and for establishing the correct program output because\n", + "every operation is done precisely as the user specified it.\n", + "\n", + "It is instructive to look up the compiler manual for further instructions by writing" + ] + }, + { + "cell_type": "markdown", + "id": "68b36411", + "metadata": { + "editable": true + }, + "source": [ + " man c++\n" + ] + }, + { + "cell_type": "markdown", + "id": "6c321ece", + "metadata": { + "editable": true + }, + "source": [ + "## More on optimization\n", + "We have additional compiler options for optimization. These may include procedure inlining where \n", + "performance may be improved, moving constants inside loops outside the loop, \n", + "identify potential parallelism, include automatic vectorization or replace a division with a reciprocal\n", + "and a multiplication if this speeds up the code." + ] + }, + { + "cell_type": "markdown", + "id": "e8c5dca5", + "metadata": { + "editable": true + }, + "source": [ + " c++ -O3 -c mycode.cpp\n", + " c++ -O3 -o mycode.exe mycode.o\n" + ] + }, + { + "cell_type": "markdown", + "id": "5dbfe538", + "metadata": { + "editable": true + }, + "source": [ + "This (other options are -O2 or -Ofast) is the recommended option." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f67659a4", + "metadata": { + "editable": true + }, + "source": [ + "## Optimization and profiling\n", + "It is also useful to profile your program under the development stage.\n", + "You would then compile with" + ] + }, + { + "cell_type": "markdown", + "id": "77187b55", + "metadata": { + "editable": true + }, + "source": [ + " c++ -pg -O3 -c mycode.cpp\n", + " c++ -pg -O3 -o mycode.exe mycode.o\n" + ] + }, + { + "cell_type": "markdown", + "id": "92aa4055", + "metadata": { + "editable": true + }, + "source": [ + "After you have run the code you can obtain the profiling information via" + ] + }, + { + "cell_type": "markdown", + "id": "1e09c863", + "metadata": { + "editable": true + }, + "source": [ + " gprof mycode.exe > ProfileOutput\n" + ] + }, + { + "cell_type": "markdown", + "id": "ec018a86", + "metadata": { + "editable": true + }, + "source": [ + "When you have profiled properly your code, you must take out this option as it \n", + "slows down performance.\n", + "For memory tests use [valgrind](http://www.valgrind.org). An excellent environment for all these aspects, and much more, is Qt creator." + ] + }, + { + "cell_type": "markdown", + "id": "0d745d5f", + "metadata": { + "editable": true + }, + "source": [ + "## Optimization and debugging\n", + "Adding debugging options is a very useful alternative under the development stage of a program.\n", + "You would then compile with" + ] + }, + { + "cell_type": "markdown", + "id": "9d0406a1", + "metadata": { + "editable": true + }, + "source": [ + " c++ -g -O0 -c mycode.cpp\n", + " c++ -g -O0 -o mycode.exe mycode.o\n" + ] + }, + { + "cell_type": "markdown", + "id": "7c0f74b9", + "metadata": { + "editable": true + }, + "source": [ + "This option generates debugging information allowing you to trace for example if an array is properly allocated. Some compilers work best with the no optimization option **-O0**.\n", + "\n", + "**Other optimization flags.**\n", + "\n", + "Depending on the compiler, one can add flags which generate code that catches integer overflow errors. \n", + "The flag **-ftrapv** does this for the CLANG compiler on OS X operating systems." + ] + }, + { + "cell_type": "markdown", + "id": "018d7811", + "metadata": { + "editable": true + }, + "source": [ + "## Other hints\n", + "In general, irrespective of compiler options, it is useful to\n", + "* avoid if tests or call to functions inside loops, if possible. \n", + "\n", + "* avoid multiplication with constants inside loops if possible\n", + "\n", + "Here is an example of a part of a program where specific operations lead to a slower code" + ] + }, + { + "cell_type": "markdown", + "id": "897c72c9", + "metadata": { + "editable": true + }, + "source": [ + " k = n-1;\n", + " for (i = 0; i < n; i++){\n", + " a[i] = b[i] +c*d;\n", + " e = g[k];\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "adec4792", + "metadata": { + "editable": true + }, + "source": [ + "A better code is" + ] + }, + { + "cell_type": "markdown", + "id": "cb465899", + "metadata": { + "editable": true + }, + "source": [ + " temp = c*d;\n", + " for (i = 0; i < n; i++){\n", + " a[i] = b[i] + temp;\n", + " }\n", + " e = g[n-1];\n" + ] + }, + { + "cell_type": "markdown", + "id": "3d14ad95", + "metadata": { + "editable": true + }, + "source": [ + "Here we avoid a repeated multiplication inside a loop. 
\n", + "Most compilers, depending on compiler flags, identify and optimize such bottlenecks on their own, without requiring any particular action by the programmer. However, it is always useful to single out and avoid code examples like the first one discussed here." + ] + }, + { + "cell_type": "markdown", + "id": "002fc562", + "metadata": { + "editable": true + }, + "source": [ + "## Vectorization and the basic idea behind parallel computing\n", + "Present CPUs are highly parallel processors with varying levels of parallelism. The typical situation can be described via the following three statements.\n", + "* Pursuit of shorter computation time and larger simulation size gives rise to parallel computing.\n", + "\n", + "* Multiple processors are involved to solve a global problem.\n", + "\n", + "* The essence is to divide the entire computation evenly among collaborative processors. Divide and conquer.\n", + "\n", + "Before we proceed with a more detailed discussion of topics like vectorization and parallelization, we need to remind ourselves about some basic features of different hardware models." + ] + }, + { + "cell_type": "markdown", + "id": "d03ee544", + "metadata": { + "editable": true + }, + "source": [ + "## A rough classification of hardware models\n", + "\n", + "* Conventional single-processor computers are named SISD (single-instruction-single-data) machines.\n", + "\n", + "* SIMD (single-instruction-multiple-data) machines incorporate the idea of parallel processing, using a large number of processing units to execute the same instruction on different data.\n", + "\n", + "* Modern parallel computers are so-called MIMD (multiple-instruction-multiple-data) machines and can execute different instruction streams in parallel on different data." + ] + }, + { + "cell_type": "markdown", + "id": "9a764260", + "metadata": { + "editable": true + }, + "source": [ + "## Shared memory and distributed memory\n", + "One way of categorizing modern parallel computers is to look at the memory configuration.\n", + "* In shared memory systems the CPUs share the same address space. Any CPU can access any data in the global memory.\n", + "\n", + "* In distributed memory systems each CPU has its own memory.\n", + "\n", + "The CPUs are connected by some network and may exchange messages." + ] + }, + { + "cell_type": "markdown", + "id": "0de76fb9", + "metadata": { + "editable": true + }, + "source": [ + "## Different parallel programming paradigms\n", + "\n", + "* **Task parallelism**: the work of a global problem can be divided into a number of independent tasks, which rarely need to synchronize. Monte Carlo simulations represent a typical situation. Integration is another. However this paradigm is of limited use.\n", + "\n", + "* **Data parallelism**: use of multiple threads (e.g. one or more threads per processor) to dissect loops over arrays etc. Communication and synchronization between processors are often hidden, thus easy to program. However, the user surrenders much control to a specialized compiler. Examples of data parallelism are compiler-based parallelization and OpenMP directives." + ] + }, + { + "cell_type": "markdown", + "id": "7fd90368", + "metadata": { + "editable": true + }, + "source": [ + "## Different parallel programming paradigms\n", + "\n", + "* **Message passing**: all involved processors have an independent memory address space. The user is responsible for partitioning the data/work of a global problem and distributing the subproblems to the processors. 
Collaboration between processors is achieved by explicit message passing, which is used for data transfer plus synchronization.\n", + "\n", + "* This paradigm is the most general one where the user has full control. Better parallel efficiency is usually achieved by explicit message passing. However, message-passing programming is more difficult." + ] + }, + { + "cell_type": "markdown", + "id": "18ac99c8", + "metadata": { + "editable": true + }, + "source": [ + "## What is vectorization?\n", + "Vectorization is a special\n", + "case of **Single Instructions Multiple Data** (SIMD) to denote a single\n", + "instruction stream capable of operating on multiple data elements in\n", + "parallel. \n", + "We can think of vectorization as the unrolling of loops accompanied with SIMD instructions.\n", + "\n", + "Vectorization is the process of converting an algorithm that performs scalar operations\n", + "(typically one operation at the time) to vector operations where a single operation can refer to many simultaneous operations.\n", + "Consider the following example" + ] + }, + { + "cell_type": "markdown", + "id": "10f0f1c5", + "metadata": { + "editable": true + }, + "source": [ + " for (i = 0; i < n; i++){\n", + " a[i] = b[i] + c[i];\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "76cbe575", + "metadata": { + "editable": true + }, + "source": [ + "If the code is not vectorized, the compiler will simply start with the first element and \n", + "then perform subsequent additions operating on one address in memory at the time." + ] + }, + { + "cell_type": "markdown", + "id": "ac611c90", + "metadata": { + "editable": true + }, + "source": [ + "## Number of elements that can acted upon\n", + "A SIMD instruction can operate on multiple data elements in one single instruction.\n", + "It uses the so-called 128-bit SIMD floating-point register. \n", + "In this sense, vectorization adds some form of parallelism since one instruction is applied \n", + "to many parts of say a vector.\n", + "\n", + "The number of elements which can be operated on in parallel\n", + "range from four single-precision floating point data elements in so-called \n", + "Streaming SIMD Extensions and two double-precision floating-point data\n", + "elements in Streaming SIMD Extensions 2 to sixteen byte operations in\n", + "a 128-bit register in Streaming SIMD Extensions 2. Thus, vector-length\n", + "ranges from 2 to 16, depending on the instruction extensions used and\n", + "on the data type. \n", + "\n", + "IN summary, our instructions operate on 128 bit (16 byte) operands\n", + "* 4 floats or ints\n", + "\n", + "* 2 doubles\n", + "\n", + "* Data paths 128 bits vide for vector unit" + ] + }, + { + "cell_type": "markdown", + "id": "ce267e9b", + "metadata": { + "editable": true + }, + "source": [ + "## Number of elements that can acted upon, examples\n", + "We start with the simple scalar operations given by" + ] + }, + { + "cell_type": "markdown", + "id": "b5e50c74", + "metadata": { + "editable": true + }, + "source": [ + " for (i = 0; i < n; i++){\n", + " a[i] = b[i] + c[i];\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "4834c522", + "metadata": { + "editable": true + }, + "source": [ + "If the code is not vectorized and we have a 128-bit register to store a 32 bits floating point number,\n", + "it means that we have $3\\times 32$ bits that are not used. \n", + "\n", + "We have thus unused space in our SIMD registers. These registers could hold three additional integers." 
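+    ,
+    "\n",
+    "As a concrete, purely illustrative sketch of four single-precision additions carried out by one 128-bit SIMD instruction, one can use the SSE intrinsics available on x86 CPUs. The function name **add4** and the explicit use of intrinsics are our own illustration; the codes in these notes rely on the compiler's automatic vectorization instead.\n",
+    "\n",
+    "        #include <xmmintrin.h>   // SSE intrinsics, x86 only\n",
+    "        \n",
+    "        void add4(const float *b, const float *c, float *a) {\n",
+    "          __m128 vb = _mm_loadu_ps(b);     // load four floats\n",
+    "          __m128 vc = _mm_loadu_ps(c);     // load four floats\n",
+    "          __m128 va = _mm_add_ps(vb, vc);  // one instruction, four additions\n",
+    "          _mm_storeu_ps(a, va);            // store four floats\n",
+    "        }\n"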
+ ] + }, + { + "cell_type": "markdown", + "id": "cd5abf08", + "metadata": { + "editable": true + }, + "source": [ + "## Operation counts for scalar operation\n", + "The code" + ] + }, + { + "cell_type": "markdown", + "id": "c0cfbeb1", + "metadata": { + "editable": true + }, + "source": [ + " for (i = 0; i < n; i++){\n", + " a[i] = b[i] + c[i];\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "1c2b9caf", + "metadata": { + "editable": true + }, + "source": [ + "has for $n$ repeats\n", + "1. one load for $c[i]$ in address 1\n", + "\n", + "2. one load for $b[i]$ in address 2\n", + "\n", + "3. add $c[i]$ and $b[i]$ to give $a[i]$\n", + "\n", + "4. store $a[i]$ in address 2" + ] + }, + { + "cell_type": "markdown", + "id": "b3627ae3", + "metadata": { + "editable": true + }, + "source": [ + "## Number of elements that can acted upon, examples\n", + "If we vectorize the code, we can perform, with a 128-bit register four simultaneous operations, that is\n", + "we have" + ] + }, + { + "cell_type": "markdown", + "id": "c85f3d13", + "metadata": { + "editable": true + }, + "source": [ + " for (i = 0; i < n; i+=4){\n", + " a[i] = b[i] + c[i];\n", + " a[i+1] = b[i+1] + c[i+1];\n", + " a[i+2] = b[i+2] + c[i+2];\n", + " a[i+3] = b[i+3] + c[i+3];\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "b3d4e458", + "metadata": { + "editable": true + }, + "source": [ + "Four additions are now done in a single step." + ] + }, + { + "cell_type": "markdown", + "id": "c45ed72e", + "metadata": { + "editable": true + }, + "source": [ + "## Number of operations when vectorized\n", + "For $n/4$ repeats assuming floats or integers\n", + "1. one vector load for $c[i]$ in address 1\n", + "\n", + "2. one load for $b[i]$ in address 2\n", + "\n", + "3. add $c[i]$ and $b[i]$ to give $a[i]$\n", + "\n", + "4. store $a[i]$ in address 2" + ] + }, + { + "cell_type": "markdown", + "id": "6b9f160e", + "metadata": { + "editable": true + }, + "source": [ + "## [A simple test case with and without vectorization](https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program7.cpp)\n", + "We implement these operations in a simple c++ program that computes at the end the norm of a vector." 
+ ] + }, + { + "cell_type": "markdown", + "id": "9251ed34", + "metadata": { + "editable": true + }, + "source": [ + " #include \n", + " #include \n", + " #include \n", + " #include \n", + " #include \"time.h\"\n", + " \n", + " using namespace std; // note use of namespace\n", + " int main (int argc, char* argv[])\n", + " {\n", + " // read in dimension of square matrix\n", + " int n = atoi(argv[1]);\n", + " double s = 1.0/sqrt( (double) n);\n", + " double *a, *b, *c;\n", + " // Start timing\n", + " clock_t start, finish;\n", + " start = clock();\n", + " // Allocate space for the vectors to be used\n", + " a = new double [n]; b = new double [n]; c = new double [n];\n", + " // Define parallel region\n", + " // Set up values for vectors a and b\n", + " for (int i = 0; i < n; i++){\n", + " double angle = 2.0*M_PI*i/ (( double ) n);\n", + " a[i] = s*(sin(angle) + cos(angle));\n", + " b[i] = s*sin(2.0*angle);\n", + " c[i] = 0.0;\n", + " }\n", + " // Then perform the vector addition\n", + " for (int i = 0; i < n; i++){\n", + " c[i] += a[i]+b[i];\n", + " }\n", + " // Compute now the norm-2\n", + " double Norm2 = 0.0;\n", + " for (int i = 0; i < n; i++){\n", + " Norm2 += c[i]*c[i];\n", + " }\n", + " finish = clock();\n", + " double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );\n", + " cout << setiosflags(ios::showpoint | ios::uppercase);\n", + " cout << setprecision(10) << setw(20) << \"Time used for norm computation=\" << timeused << endl;\n", + " cout << \" Norm-2 = \" << Norm2 << endl;\n", + " // Free up space\n", + " delete[] a;\n", + " delete[] b;\n", + " delete[] c;\n", + " return 0;\n", + " }\n", + " \n", + " \n", + " \n", + " \n" + ] + }, + { + "cell_type": "markdown", + "id": "d4cb34c8", + "metadata": { + "editable": true + }, + "source": [ + "## Compiling with and without vectorization\n", + "We can compile and link without vectorization using the clang c++ compiler" + ] + }, + { + "cell_type": "markdown", + "id": "aba109f4", + "metadata": { + "editable": true + }, + "source": [ + " clang -o novec.x vecexample.cpp\n" + ] + }, + { + "cell_type": "markdown", + "id": "b29b64ce", + "metadata": { + "editable": true + }, + "source": [ + "and with vectorization (and additional optimizations)" + ] + }, + { + "cell_type": "markdown", + "id": "ecb51ee3", + "metadata": { + "editable": true + }, + "source": [ + " clang++ -O3 -Rpass=loop-vectorize -o vec.x vecexample.cpp \n" + ] + }, + { + "cell_type": "markdown", + "id": "ef5054ab", + "metadata": { + "editable": true + }, + "source": [ + "The speedup depends on the size of the vectors. In the example here we have run with $10^7$ elements.\n", + "The example here was run on an IMac17.1 with OSX El Capitan (10.11.4) as operating system and an Intel i5 3.3 GHz CPU." + ] + }, + { + "cell_type": "markdown", + "id": "f09ca19c", + "metadata": { + "editable": true + }, + "source": [ + " Compphys:~ hjensen$ ./vec.x 10000000\n", + " Time used for norm computation=0.04720500000\n", + " Compphys:~ hjensen$ ./novec.x 10000000\n", + " Time used for norm computation=0.03311700000\n" + ] + }, + { + "cell_type": "markdown", + "id": "0efe59e0", + "metadata": { + "editable": true + }, + "source": [ + "This particular C++ compiler speeds up the above loop operations with a factor of 1.5 \n", + "Performing the same operations for $10^9$ elements results in a smaller speedup since reading from main memory is required. The non-vectorized code is seemingly faster." 
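+    ,
+    "\n",
+    "A side remark on the timing itself: the program above uses **clock()**, which measures processor time. If one wants wall-clock time instead, a small sketch (our own, assuming the vectors a, b and c from the program above) based on the C++11 **chrono** library could look like\n",
+    "\n",
+    "        #include <chrono>\n",
+    "        \n",
+    "        auto start = std::chrono::steady_clock::now();\n",
+    "        for (int i = 0; i < n; i++){\n",
+    "          c[i] += a[i]+b[i];\n",
+    "        }\n",
+    "        auto finish = std::chrono::steady_clock::now();\n",
+    "        double timeused = std::chrono::duration<double>(finish-start).count();  // seconds\n",
+    "\n",
+    "We return to the pitfalls of such measurements in the slides on measuring performance below.\n"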
+ ] + }, + { + "cell_type": "markdown", + "id": "bc05c8e0", + "metadata": { + "editable": true + }, + "source": [ + " Compphys:~ hjensen$ ./vec.x 1000000000\n", + " Time used for norm computation=58.41391100\n", + " Compphys:~ hjensen$ ./novec.x 1000000000\n", + " Time used for norm computation=46.51295300\n" + ] + }, + { + "cell_type": "markdown", + "id": "57fe77fa", + "metadata": { + "editable": true + }, + "source": [ + "We will discuss these issues further in the next slides." + ] + }, + { + "cell_type": "markdown", + "id": "ee0eb549", + "metadata": { + "editable": true + }, + "source": [ + "## Compiling with and without vectorization using clang\n", + "We can compile and link without vectorization with clang compiler" + ] + }, + { + "cell_type": "markdown", + "id": "a9b8a0cb", + "metadata": { + "editable": true + }, + "source": [ + " clang++ -o -fno-vectorize novec.x vecexample.cpp\n" + ] + }, + { + "cell_type": "markdown", + "id": "05e175db", + "metadata": { + "editable": true + }, + "source": [ + "and with vectorization" + ] + }, + { + "cell_type": "markdown", + "id": "f6ecf4e7", + "metadata": { + "editable": true + }, + "source": [ + " clang++ -O3 -Rpass=loop-vectorize -o vec.x vecexample.cpp \n" + ] + }, + { + "cell_type": "markdown", + "id": "c0ee7b75", + "metadata": { + "editable": true + }, + "source": [ + "We can also add vectorization analysis, see for example" + ] + }, + { + "cell_type": "markdown", + "id": "74dcf2f8", + "metadata": { + "editable": true + }, + "source": [ + " clang++ -O3 -Rpass-analysis=loop-vectorize -o vec.x vecexample.cpp \n" + ] + }, + { + "cell_type": "markdown", + "id": "b72d5a50", + "metadata": { + "editable": true + }, + "source": [ + "or figure out if vectorization was missed" + ] + }, + { + "cell_type": "markdown", + "id": "6c936d22", + "metadata": { + "editable": true + }, + "source": [ + " clang++ -O3 -Rpass-missed=loop-vectorize -o vec.x vecexample.cpp \n" + ] + }, + { + "cell_type": "markdown", + "id": "e57b5c9b", + "metadata": { + "editable": true + }, + "source": [ + "## Automatic vectorization and vectorization inhibitors, criteria\n", + "\n", + "Not all loops can be vectorized, as discussed in [Intel's guide to vectorization](https://software.intel.com/en-us/articles/a-guide-to-auto-vectorization-with-intel-c-compilers)\n", + "\n", + "An important criteria is that the loop counter $n$ is known at the entry of the loop." + ] + }, + { + "cell_type": "markdown", + "id": "591ea5f8", + "metadata": { + "editable": true + }, + "source": [ + " for (int j = 0; j < n; j++) {\n", + " a[j] = cos(j*1.0);\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "86e62af7", + "metadata": { + "editable": true + }, + "source": [ + "The variable $n$ does need to be known at compile time. However, this variable must stay the same for the entire duration of the loop. It implies that an exit statement inside the loop cannot be data dependent." + ] + }, + { + "cell_type": "markdown", + "id": "d4eadc28", + "metadata": { + "editable": true + }, + "source": [ + "## Automatic vectorization and vectorization inhibitors, exit criteria\n", + "\n", + "An exit statement should in general be avoided. \n", + "If the exit statement contains data-dependent conditions, the loop cannot be vectorized. 
\n", + "The following is an example of a non-vectorizable loop" + ] + }, + { + "cell_type": "markdown", + "id": "9fb55622", + "metadata": { + "editable": true + }, + "source": [ + " for (int j = 0; j < n; j++) {\n", + " a[j] = cos(j*1.0);\n", + " if (a[j] < 0 ) break;\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "b4c7cce6", + "metadata": { + "editable": true + }, + "source": [ + "Avoid loop termination conditions and opt for a single entry loop variable $n$. The lower and upper bounds have to be kept fixed within the loop." + ] + }, + { + "cell_type": "markdown", + "id": "63f1d357", + "metadata": { + "editable": true + }, + "source": [ + "## Automatic vectorization and vectorization inhibitors, straight-line code\n", + "\n", + "SIMD instructions perform the same type of operations multiple times. \n", + "A **switch** statement leads thus to a non-vectorizable loop since different statemens cannot branch.\n", + "The following code can however be vectorized since the **if** statement is implemented as a masked assignment." + ] + }, + { + "cell_type": "markdown", + "id": "a485face", + "metadata": { + "editable": true + }, + "source": [ + " for (int j = 0; j < n; j++) {\n", + " double x = cos(j*1.0);\n", + " if (x > 0 ) {\n", + " a[j] = x*sin(j*2.0); \n", + " }\n", + " else {\n", + " a[j] = 0.0;\n", + " }\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "ad55a449", + "metadata": { + "editable": true + }, + "source": [ + "These operations can be performed for all data elements but only those elements which the mask evaluates as true are stored. In general, one should avoid branches such as **switch**, **go to**, or **return** statements or **if** constructs that cannot be treated as masked assignments." + ] + }, + { + "cell_type": "markdown", + "id": "776ddba9", + "metadata": { + "editable": true + }, + "source": [ + "## Automatic vectorization and vectorization inhibitors, nested loops\n", + "\n", + "Only the innermost loop of the following example is vectorized" + ] + }, + { + "cell_type": "markdown", + "id": "ca642f90", + "metadata": { + "editable": true + }, + "source": [ + " for (int i = 0; i < n; i++) {\n", + " for (int j = 0; j < n; j++) {\n", + " a[i][j] += b[i][j];\n", + " } \n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "14123a0d", + "metadata": { + "editable": true + }, + "source": [ + "The exception is if an original outer loop is transformed into an inner loop as the result of compiler optimizations." + ] + }, + { + "cell_type": "markdown", + "id": "3a9e1c79", + "metadata": { + "editable": true + }, + "source": [ + "## Automatic vectorization and vectorization inhibitors, function calls\n", + "\n", + "Calls to programmer defined functions ruin vectorization. However, calls to intrinsic functions like\n", + "$\\sin{x}$, $\\cos{x}$, $\\exp{x}$ etc are allowed since they are normally efficiently vectorized. \n", + "The following example is fully vectorizable" + ] + }, + { + "cell_type": "markdown", + "id": "0224f0d3", + "metadata": { + "editable": true + }, + "source": [ + " for (int i = 0; i < n; i++) {\n", + " a[i] = log10(i)*cos(i);\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "4643b973", + "metadata": { + "editable": true + }, + "source": [ + "Similarly, **inline** functions defined by the programmer, allow for vectorization since the function statements are glued into the actual place where the function is called." 
+ ] + }, + { + "cell_type": "markdown", + "id": "bd6152a6", + "metadata": { + "editable": true + }, + "source": [ + "## Automatic vectorization and vectorization inhibitors, data dependencies\n", + "\n", + "One has to keep in mind that vectorization changes the order of operations inside a loop. A so-called\n", + "read-after-write statement with an explicit flow dependency cannot be vectorized. The following code" + ] + }, + { + "cell_type": "markdown", + "id": "3d86ddd8", + "metadata": { + "editable": true + }, + "source": [ + " double b = 15.;\n", + " for (int i = 1; i < n; i++) {\n", + " a[i] = a[i-1] + b;\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "adb31489", + "metadata": { + "editable": true + }, + "source": [ + "is an example of flow dependency and results in wrong numerical results if vectorized. For a scalar operation, the value $a[i-1]$ computed during the iteration is loaded into the right-hand side and the results are fine. In vector mode however, with a vector length of four, the values $a[0]$, $a[1]$, $a[2]$ and $a[3]$ from the previous loop will be loaded into the right-hand side and produce wrong results. That is, we have" + ] + }, + { + "cell_type": "markdown", + "id": "77d0b5d3", + "metadata": { + "editable": true + }, + "source": [ + " a[1] = a[0] + b;\n", + " a[2] = a[1] + b;\n", + " a[3] = a[2] + b;\n", + " a[4] = a[3] + b;\n" + ] + }, + { + "cell_type": "markdown", + "id": "c31215ca", + "metadata": { + "editable": true + }, + "source": [ + "and if the two first iterations are executed at the same by the SIMD instruction, the value of say $a[1]$ could be used by the second iteration before it has been calculated by the first iteration, leading thereby to wrong results." + ] + }, + { + "cell_type": "markdown", + "id": "46ed8de3", + "metadata": { + "editable": true + }, + "source": [ + "## Automatic vectorization and vectorization inhibitors, more data dependencies\n", + "\n", + "On the other hand, a so-called \n", + "write-after-read statement can be vectorized. The following code" + ] + }, + { + "cell_type": "markdown", + "id": "85a36227", + "metadata": { + "editable": true + }, + "source": [ + " double b = 15.;\n", + " for (int i = 1; i < n; i++) {\n", + " a[i-1] = a[i] + b;\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "8e5714f0", + "metadata": { + "editable": true + }, + "source": [ + "is an example of flow dependency that can be vectorized since no iteration with a higher value of $i$\n", + "can complete before an iteration with a lower value of $i$. However, such code leads to problems with parallelization." + ] + }, + { + "cell_type": "markdown", + "id": "efcc358e", + "metadata": { + "editable": true + }, + "source": [ + "## Automatic vectorization and vectorization inhibitors, memory stride\n", + "\n", + "For C++ programmers it is also worth keeping in mind that an array notation is preferred to the more compact use of pointers to access array elements. The compiler can often not tell if it is safe to vectorize the code. \n", + "\n", + "When dealing with arrays, you should also avoid memory stride, since this slows down considerably vectorization. 
When you access array element, write for example the inner loop to vectorize using unit stride, that is, access successively the next array element in memory, as shown here" + ] + }, + { + "cell_type": "markdown", + "id": "925846f6", + "metadata": { + "editable": true + }, + "source": [ + " for (int i = 0; i < n; i++) {\n", + " for (int j = 0; j < n; j++) {\n", + " a[i][j] += b[i][j];\n", + " } \n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "30c20cb6", + "metadata": { + "editable": true + }, + "source": [ + "## Memory management\n", + "The main memory contains the program data\n", + "1. Cache memory contains a copy of the main memory data\n", + "\n", + "2. Cache is faster but consumes more space and power. It is normally assumed to be much faster than main memory\n", + "\n", + "3. Registers contain working data only\n", + "\n", + " * Modern CPUs perform most or all operations only on data in register\n", + "\n", + "4. Multiple Cache memories contain a copy of the main memory data\n", + "\n", + " * Cache items accessed by their address in main memory\n", + "\n", + " * L1 cache is the fastest but has the least capacity\n", + "\n", + " * L2, L3 provide intermediate performance/size tradeoffs\n", + "\n", + "Loads and stores to memory can be as important as floating point operations when we measure performance." + ] + }, + { + "cell_type": "markdown", + "id": "40b4b92c", + "metadata": { + "editable": true + }, + "source": [ + "## Memory and communication\n", + "\n", + "1. Most communication in a computer is carried out in chunks, blocks of bytes of data that move together\n", + "\n", + "2. In the memory hierarchy, data moves between memory and cache, and between different levels of cache, in groups called lines\n", + "\n", + " * Lines are typically 64-128 bytes, or 8-16 double precision words\n", + "\n", + " * Even if you do not use the data, it is moved and occupies space in the cache\n", + "\n", + "Many of these performance features are not captured in most programming languages." + ] + }, + { + "cell_type": "markdown", + "id": "da5377a0", + "metadata": { + "editable": true + }, + "source": [ + "## Measuring performance\n", + "\n", + "How do we measure performance? What is wrong with this code to time a loop?" + ] + }, + { + "cell_type": "markdown", + "id": "36b63048", + "metadata": { + "editable": true + }, + "source": [ + " clock_t start, finish;\n", + " start = clock();\n", + " for (int j = 0; j < i; j++) {\n", + " a[j] = b[j]+b[j]*c[j];\n", + " }\n", + " finish = clock();\n", + " double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );\n" + ] + }, + { + "cell_type": "markdown", + "id": "b73861c2", + "metadata": { + "editable": true + }, + "source": [ + "## Problems with measuring time\n", + "1. Timers are not infinitely accurate\n", + "\n", + "2. All clocks have a granularity, the minimum time that they can measure\n", + "\n", + "3. The error in a time measurement, even if everything is perfect, may be the size of this granularity (sometimes called a clock tick)\n", + "\n", + "4. Always know what your clock granularity is\n", + "\n", + "5. Ensure that your measurement is for a long enough duration (say 100 times the **tick**)" + ] + }, + { + "cell_type": "markdown", + "id": "9aec5620", + "metadata": { + "editable": true + }, + "source": [ + "## Problems with cold start\n", + "\n", + "What happens when the code is executed? The assumption is that the code is ready to\n", + "execute. But\n", + "1. 
Code may still be on disk, and not even read into memory.\n", + "\n", + "2. Data may be in slow memory rather than fast (which may be wrong or right for what you are measuring)\n", + "\n", + "3. Multiple tests often necessary to ensure that cold start effects are not present\n", + "\n", + "4. Special effort often required to ensure data in the intended part of the memory hierarchy." + ] + }, + { + "cell_type": "markdown", + "id": "f9313bb0", + "metadata": { + "editable": true + }, + "source": [ + "## Problems with smart compilers\n", + "\n", + "1. If the result of the computation is not used, the compiler may eliminate the code\n", + "\n", + "2. Performance will look impossibly fantastic\n", + "\n", + "3. Even worse, eliminate some of the code so the performance looks plausible\n", + "\n", + "4. Ensure that the results are (or may be) used." + ] + }, + { + "cell_type": "markdown", + "id": "7a2037ca", + "metadata": { + "editable": true + }, + "source": [ + "## Problems with interference\n", + "1. Other activities are sharing your processor\n", + "\n", + " * Operating system, system demons, other users\n", + "\n", + " * Some parts of the hardware do not always perform with exactly the same performance\n", + "\n", + "2. Make multiple tests and report\n", + "\n", + "3. Easy choices include\n", + "\n", + " * Average tests represent what users might observe over time" + ] + }, + { + "cell_type": "markdown", + "id": "2e1caede", + "metadata": { + "editable": true + }, + "source": [ + "## Problems with measuring performance\n", + "1. Accurate, reproducible performance measurement is hard\n", + "\n", + "2. Think carefully about your experiment:\n", + "\n", + "3. What is it, precisely, that you want to measure?\n", + "\n", + "4. How representative is your test to the situation that you are trying to measure?" + ] + }, + { + "cell_type": "markdown", + "id": "08ecf53f", + "metadata": { + "editable": true + }, + "source": [ + "## Thomas algorithm for tridiagonal linear algebra equations" + ] + }, + { + "cell_type": "markdown", + "id": "1d4b1934", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "\\left( \\begin{array}{ccccc}\n", + " b_0 & c_0 & & & \\\\\n", + "\ta_0 & b_1 & c_1 & & \\\\\n", + "\t & & \\ddots & & \\\\\n", + "\t &\t & a_{m-3} & b_{m-2} & c_{m-2} \\\\\n", + "\t & & & a_{m-2} & b_{m-1}\n", + " \\end{array} \\right)\n", + "\\left( \\begin{array}{c}\n", + " x_0 \\\\\n", + " x_1 \\\\\n", + " \\vdots \\\\\n", + " x_{m-2} \\\\\n", + " x_{m-1}\n", + " \\end{array} \\right)=\\left( \\begin{array}{c}\n", + " f_0 \\\\\n", + " f_1 \\\\\n", + " \\vdots \\\\\n", + " f_{m-2} \\\\\n", + " f_{m-1} \\\\\n", + " \\end{array} \\right)\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "ea777020", + "metadata": { + "editable": true + }, + "source": [ + "## Thomas algorithm, forward substitution\n", + "The first step is to multiply the first row by $a_0/b_0$ and subtract it from the second row. This is known as the forward substitution step. 
We obtain then" + ] + }, + { + "cell_type": "markdown", + "id": "41c4860a", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "a_i = 0,\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "d3d46adc", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "b_i = b_i - \\frac{a_{i-1}}{b_{i-1}}c_{i-1},\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "75b8a579", + "metadata": { + "editable": true + }, + "source": [ + "and" + ] + }, + { + "cell_type": "markdown", + "id": "d554b123", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "f_i = f_i - \\frac{a_{i-1}}{b_{i-1}}f_{i-1}.\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "936fd39c", + "metadata": { + "editable": true + }, + "source": [ + "At this point the simplified equation, with only an upper triangular matrix takes the form" + ] + }, + { + "cell_type": "markdown", + "id": "a862d627", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "\\left( \\begin{array}{ccccc}\n", + " b_0 & c_0 & & & \\\\\n", + " & b_1 & c_1 & & \\\\\n", + " & & \\ddots & & \\\\\n", + "\t & & & b_{m-2} & c_{m-2} \\\\\n", + "\t & & & & b_{m-1}\n", + " \\end{array} \\right)\\left( \\begin{array}{c}\n", + " x_0 \\\\\n", + " x_1 \\\\\n", + " \\vdots \\\\\n", + " x_{m-2} \\\\\n", + " x_{m-1}\n", + " \\end{array} \\right)=\\left( \\begin{array}{c}\n", + " f_0 \\\\\n", + " f_1 \\\\\n", + " \\vdots \\\\\n", + " f_{m-2} \\\\\n", + " f_{m-1} \\\\\n", + " \\end{array} \\right)\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "5684ece5", + "metadata": { + "editable": true + }, + "source": [ + "## Thomas algorithm, backward substitution\n", + "The next step is the backward substitution step. The last row is multiplied by $c_{N-3}/b_{N-2}$ and subtracted from the second to last row, thus eliminating $c_{N-3}$ from the last row. The general backward substitution procedure is" + ] + }, + { + "cell_type": "markdown", + "id": "ef75d9c4", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "c_i = 0,\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "9cce31b6", + "metadata": { + "editable": true + }, + "source": [ + "and" + ] + }, + { + "cell_type": "markdown", + "id": "f9769935", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "f_{i-1} = f_{i-1} - \\frac{c_{i-1}}{b_i}f_i\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "5f461a3e", + "metadata": { + "editable": true + }, + "source": [ + "All that ramains to be computed is the solution, which is the very straight forward process of" + ] + }, + { + "cell_type": "markdown", + "id": "3cdbaea1", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "x_i = \\frac{f_i}{b_i}\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "23e1510f", + "metadata": { + "editable": true + }, + "source": [ + "## Thomas algorithm and counting of operations (floating point and memory)\n", + "\n", + "We have in specific case the following operations with the floating operations\n", + "\n", + "* Memory Reads: $14(N-2)$;\n", + "\n", + "* Memory Writes: $4(N-2)$; \n", + "\n", + "* Subtractions: $3(N-2)$; \n", + "\n", + "* Multiplications: $3(N-2)$;\n", + "\n", + "* Divisions: $4(N-2)$." 
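+    ,
+    "\n",
+    "As the comment in the code cell below also points out, the common factor $a_{i-1}/b_{i-1}$ can be precomputed in the forward substitution. A sketch of ours of that variant, which replaces the two divisions per iteration by a single division, reads\n",
+    "\n",
+    "        // Forward substitution with the common factor precomputed\n",
+    "        for (int i = 1; i < n; i++) {\n",
+    "          double factor = a[i-1]/b[i-1];   // one division instead of two\n",
+    "          b[i] = b[i] - factor*c[i-1];\n",
+    "          f[i] = g[i] - factor*f[i-1];\n",
+    "        }\n"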
+ ] + }, + { + "cell_type": "markdown", + "id": "0856ac03", + "metadata": { + "editable": true + }, + "source": [ + " // Forward substitution \n", + " // Note that we can simplify by precalculating a[i-1]/b[i-1]\n", + " for (int i=1; i < n; i++) {\n", + " b[i] = b[i] - (a[i-1]*c[i-1])/b[i-1];\n", + " f[i] = g[i] - (a[i-1]*f[i-1])/b[i-1];\n", + " }\n", + " x[n-1] = f[n-1] / b[n-1];\n", + " // Backwards substitution \n", + " for (int i = n-2; i >= 0; i--) {\n", + " f[i] = f[i] - c[i]*f[i+1]/b[i+1];\n", + " x[i] = f[i]/b[i];\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "454ee61c", + "metadata": { + "editable": true + }, + "source": [ + "## [Example: Transpose of a matrix](https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program8.cpp)" + ] + }, + { + "cell_type": "markdown", + "id": "e44cdc6b", + "metadata": { + "editable": true + }, + "source": [ + " #include \n", + " #include \n", + " #include \n", + " #include \n", + " #include \"time.h\"\n", + " \n", + " using namespace std; // note use of namespace\n", + " int main (int argc, char* argv[])\n", + " {\n", + " // read in dimension of square matrix\n", + " int n = atoi(argv[1]);\n", + " double **A, **B;\n", + " // Allocate space for the two matrices\n", + " A = new double*[n]; B = new double*[n];\n", + " for (int i = 0; i < n; i++){\n", + " A[i] = new double[n];\n", + " B[i] = new double[n];\n", + " }\n", + " // Set up values for matrix A\n", + " for (int i = 0; i < n; i++){\n", + " for (int j = 0; j < n; j++) {\n", + " A[i][j] = cos(i*1.0)*sin(j*3.0);\n", + " }\n", + " }\n", + " clock_t start, finish;\n", + " start = clock();\n", + " // Then compute the transpose\n", + " for (int i = 0; i < n; i++){\n", + " for (int j = 0; j < n; j++) {\n", + " B[i][j]= A[j][i];\n", + " }\n", + " }\n", + " \n", + " finish = clock();\n", + " double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );\n", + " cout << setiosflags(ios::showpoint | ios::uppercase);\n", + " cout << setprecision(10) << setw(20) << \"Time used for setting up transpose of matrix=\" << timeused << endl;\n", + " \n", + " // Free up space\n", + " for (int i = 0; i < n; i++){\n", + " delete[] A[i];\n", + " delete[] B[i];\n", + " }\n", + " delete[] A;\n", + " delete[] B;\n", + " return 0;\n", + " }\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "id": "a9959a86", + "metadata": { + "editable": true + }, + "source": [ + "## [Matrix-matrix multiplication](https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/LecturePrograms/programs/Classes/cpp/program9.cpp)\n", + "This the matrix-matrix multiplication code with plain c++ memory allocation. It computes at the end the Frobenius norm." 
+ ] + }, + { + "cell_type": "markdown", + "id": "da017900", + "metadata": { + "editable": true + }, + "source": [ + " #include \n", + " #include \n", + " #include \n", + " #include \n", + " #include \"time.h\"\n", + " \n", + " using namespace std; // note use of namespace\n", + " int main (int argc, char* argv[])\n", + " {\n", + " // read in dimension of square matrix\n", + " int n = atoi(argv[1]);\n", + " double s = 1.0/sqrt( (double) n);\n", + " double **A, **B, **C;\n", + " // Start timing\n", + " clock_t start, finish;\n", + " start = clock();\n", + " // Allocate space for the two matrices\n", + " A = new double*[n]; B = new double*[n]; C = new double*[n];\n", + " for (int i = 0; i < n; i++){\n", + " A[i] = new double[n];\n", + " B[i] = new double[n];\n", + " C[i] = new double[n];\n", + " }\n", + " // Set up values for matrix A and B and zero matrix C\n", + " for (int i = 0; i < n; i++){\n", + " for (int j = 0; j < n; j++) {\n", + " double angle = 2.0*M_PI*i*j/ (( double ) n);\n", + " A[i][j] = s * ( sin ( angle ) + cos ( angle ) );\n", + " B[j][i] = A[i][j];\n", + " }\n", + " }\n", + " // Then perform the matrix-matrix multiplication\n", + " for (int i = 0; i < n; i++){\n", + " for (int j = 0; j < n; j++) {\n", + " double sum = 0.0;\n", + " for (int k = 0; k < n; k++) {\n", + " sum += B[i][k]*A[k][j];\n", + " }\n", + " C[i][j] = sum;\n", + " }\n", + " }\n", + " // Compute now the Frobenius norm\n", + " double Fsum = 0.0;\n", + " for (int i = 0; i < n; i++){\n", + " for (int j = 0; j < n; j++) {\n", + " Fsum += C[i][j]*C[i][j];\n", + " }\n", + " }\n", + " Fsum = sqrt(Fsum);\n", + " finish = clock();\n", + " double timeused = (double) (finish - start)/(CLOCKS_PER_SEC );\n", + " cout << setiosflags(ios::showpoint | ios::uppercase);\n", + " cout << setprecision(10) << setw(20) << \"Time used for matrix-matrix multiplication=\" << timeused << endl;\n", + " cout << \" Frobenius norm = \" << Fsum << endl;\n", + " // Free up space\n", + " for (int i = 0; i < n; i++){\n", + " delete[] A[i];\n", + " delete[] B[i];\n", + " delete[] C[i];\n", + " }\n", + " delete[] A;\n", + " delete[] B;\n", + " delete[] C;\n", + " return 0;\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "8cc611e7", + "metadata": { + "editable": true + }, + "source": [ + "## How do we define speedup? Simplest form\n", + "* Speedup measures the ratio of performance between two objects\n", + "\n", + "* Versions of same code, with different number of processors\n", + "\n", + "* Serial and vector versions\n", + "\n", + "* Try different programing languages, c++ and Fortran\n", + "\n", + "* Two algorithms computing the **same** result" + ] + }, + { + "cell_type": "markdown", + "id": "6f8e7fda", + "metadata": { + "editable": true + }, + "source": [ + "## How do we define speedup? Correct baseline\n", + "The key is choosing the correct baseline for comparison\n", + "* For our serial vs. vectorization examples, using compiler-provided vectorization, the baseline is simple; the same code, with vectorization turned off\n", + "\n", + " * For parallel applications, this is much harder:\n", + "\n", + " * Choice of algorithm, decomposition, performance of baseline case etc." 
+ ] + }, + { + "cell_type": "markdown", + "id": "fc160e54", + "metadata": { + "editable": true + }, + "source": [ + "## Parallel speedup\n", + "For parallel applications, speedup is typically defined as\n", + "* Speedup $=T_1/T_p$\n", + "\n", + "Here $T_1$ is the time on one processor and $T_p$ is the time using $p$ processors.\n", + " * Can the speedup become larger than $p$? That means using $p$ processors is more than $p$ times faster than using one processor." + ] + }, + { + "cell_type": "markdown", + "id": "35d4c6fc", + "metadata": { + "editable": true + }, + "source": [ + "## Speedup and memory\n", + "The speedup on $p$ processors can\n", + "be greater than $p$ if memory usage is optimal!\n", + "Consider the case of a memorybound computation with $M$ words of memory\n", + " * If $M/p$ fits into cache while $M$ does not, the time to access memory will be different in the two cases:\n", + "\n", + " * $T_1$ uses the main memory bandwidth\n", + "\n", + " * $T_p$ uses the appropriate cache bandwidth" + ] + }, + { + "cell_type": "markdown", + "id": "42361424", + "metadata": { + "editable": true + }, + "source": [ + "## Upper bounds on speedup\n", + "Assume that almost all parts of a code are perfectly\n", + "parallelizable (fraction $f$). The remainder,\n", + "fraction $(1-f)$ cannot be parallelized at all.\n", + "\n", + "That is, there is work that takes time $W$ on one process; a fraction $f$ of that work will take\n", + "time $Wf/p$ on $p$ processors. \n", + "* What is the maximum possible speedup as a function of $f$?" + ] + }, + { + "cell_type": "markdown", + "id": "1aaeb684", + "metadata": { + "editable": true + }, + "source": [ + "## Amdahl's law\n", + "On one processor we have" + ] + }, + { + "cell_type": "markdown", + "id": "b81bb408", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "T_1 = (1-f)W + fW = W\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "b0311044", + "metadata": { + "editable": true + }, + "source": [ + "On $p$ processors we have" + ] + }, + { + "cell_type": "markdown", + "id": "5804b8d3", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "T_p = (1-f)W + \\frac{fW}{p},\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "ee810179", + "metadata": { + "editable": true + }, + "source": [ + "resulting in a speedup of" + ] + }, + { + "cell_type": "markdown", + "id": "ec2840b5", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "\\frac{T_1}{T_p} = \\frac{W}{(1-f)W+fW/p}\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "ae2e230a", + "metadata": { + "editable": true + }, + "source": [ + "As $p$ goes to infinity, $fW/p$ goes to zero, and the maximum speedup is" + ] + }, + { + "cell_type": "markdown", + "id": "d67d97e1", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "\\frac{1}{1-f},\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "1525f768", + "metadata": { + "editable": true + }, + "source": [ + "meaning that if \n", + "if $f = 0.99$ (all but $1\\%$ parallelizable), the maximum speedup\n", + "is $1/(1-.99)=100$!" + ] + }, + { + "cell_type": "markdown", + "id": "3e5a0366", + "metadata": { + "editable": true + }, + "source": [ + "## How much is parallelizable\n", + "If any non-parallel code slips into the\n", + "application, the parallel\n", + "performance is limited. \n", + "\n", + "In many simulations, however, the fraction of non-parallelizable work\n", + "is $10^{-6}$ or less due to large arrays or objects that are perfectly parallelizable." 
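+    ,
+    "\n",
+    "A small sketch of ours that simply evaluates Amdahl's law, speedup $=1/((1-f)+f/p)$, for a parallelizable fraction $f=0.99$ and an increasing number of processes $p$:\n",
+    "\n",
+    "        #include <iostream>\n",
+    "        \n",
+    "        int main() {\n",
+    "          double f = 0.99;                      // parallelizable fraction\n",
+    "          for (int p = 2; p <= 1024; p *= 2) {\n",
+    "            double speedup = 1.0/((1.0-f) + f/p);\n",
+    "            std::cout << p << \" processes: speedup \" << speedup << std::endl;\n",
+    "          }\n",
+    "          return 0;\n",
+    "        }\n",
+    "\n",
+    "The printed numbers approach the limit $1/(1-f)=100$ discussed above as $p$ grows.\n"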
+ ] + }, + { + "cell_type": "markdown", + "id": "0a3d70dd", + "metadata": { + "editable": true + }, + "source": [ + "## Today's situation of parallel computing\n", + "\n", + "* Distributed memory is the dominant hardware configuration. There is a large diversity in these machines, from MPP (massively parallel processing) systems to clusters of off-the-shelf PCs, which are very cost-effective.\n", + "\n", + "* Message-passing is a mature programming paradigm and widely accepted. It often provides an efficient match to the hardware. It is primarily used for the distributed memory systems, but can also be used on shared memory systems.\n", + "\n", + "* Modern nodes have nowadays several cores, which makes it interesting to use both shared memory (the given node) and distributed memory (several nodes with communication). This leads often to codes which use both MPI and OpenMP.\n", + "\n", + "Our lectures will focus on both MPI and OpenMP." + ] + }, + { + "cell_type": "markdown", + "id": "3693a713", + "metadata": { + "editable": true + }, + "source": [ + "## Overhead present in parallel computing\n", + "\n", + "* **Uneven load balance**: not all the processors can perform useful work at all time.\n", + "\n", + "* **Overhead of synchronization**\n", + "\n", + "* **Overhead of communication**\n", + "\n", + "* **Extra computation due to parallelization**\n", + "\n", + "Due to the above overhead and that certain parts of a sequential\n", + "algorithm cannot be parallelized we may not achieve an optimal parallelization." + ] + }, + { + "cell_type": "markdown", + "id": "5a5ccdc1", + "metadata": { + "editable": true + }, + "source": [ + "## Parallelizing a sequential algorithm\n", + "\n", + "* Identify the part(s) of a sequential algorithm that can be executed in parallel. This is the difficult part,\n", + "\n", + "* Distribute the global work and data among $P$ processors." + ] + }, + { + "cell_type": "markdown", + "id": "c59cac43", + "metadata": { + "editable": true + }, + "source": [ + "## Strategies\n", + "* Develop codes locally, run with some few processes and test your codes. Do benchmarking, timing and so forth on local nodes, for example your laptop or PC. \n", + "\n", + "* When you are convinced that your codes run correctly, you can start your production runs on available supercomputers." + ] + }, + { + "cell_type": "markdown", + "id": "4f099832", + "metadata": { + "editable": true + }, + "source": [ + "## How do I run MPI on a PC/Laptop? MPI\n", + "To install MPI is rather easy on hardware running unix/linux as operating systems, follow simply the instructions from the [OpenMPI website](https://www.open-mpi.org/). See also subsequent slides.\n", + "When you have made sure you have installed MPI on your PC/laptop, \n", + "* Compile with mpicxx/mpic++ or mpif90" + ] + }, + { + "cell_type": "markdown", + "id": "f34cc9cb", + "metadata": { + "editable": true + }, + "source": [ + " # Compile and link\n", + " mpic++ -O3 -o nameofprog.x nameofprog.cpp\n", + " # run code with for example 8 processes using mpirun/mpiexec\n", + " mpiexec -n 8 ./nameofprog.x\n" + ] + }, + { + "cell_type": "markdown", + "id": "991094cb", + "metadata": { + "editable": true + }, + "source": [ + "## Can I do it on my own PC/laptop? OpenMP installation\n", + "If you wish to install MPI and OpenMP \n", + "on your laptop/PC, we recommend the following:\n", + "\n", + "* For OpenMP, the compile option **-fopenmp** is included automatically in recent versions of the C++ compiler and Fortran compilers. 
For users of different Linux distributions, simply use the available C++ or Fortran compilers and add the above compiler instructions, see also code examples below.\n", + "\n", + "* For OS X users however, install **libomp**" + ] + }, + { + "cell_type": "markdown", + "id": "68e14aec", + "metadata": { + "editable": true + }, + "source": [ + " brew install libomp\n" + ] + }, + { + "cell_type": "markdown", + "id": "947df6e7", + "metadata": { + "editable": true + }, + "source": [ + "and compile and link as" + ] + }, + { + "cell_type": "markdown", + "id": "3b580a58", + "metadata": { + "editable": true + }, + "source": [ + " c++ -o -lomp\n" + ] + }, + { + "cell_type": "markdown", + "id": "cf57172d", + "metadata": { + "editable": true + }, + "source": [ + "## Installing MPI\n", + "For linux/ubuntu users, you need to install two packages (alternatively use the synaptic package manager)" + ] + }, + { + "cell_type": "markdown", + "id": "8954eb04", + "metadata": { + "editable": true + }, + "source": [ + " sudo apt-get install libopenmpi-dev\n", + " sudo apt-get install openmpi-bin\n" + ] + }, + { + "cell_type": "markdown", + "id": "4671e326", + "metadata": { + "editable": true + }, + "source": [ + "For OS X users, install brew (after having installed xcode and gcc, needed for the \n", + "gfortran compiler of openmpi) and then install with brew" + ] + }, + { + "cell_type": "markdown", + "id": "b08a52e8", + "metadata": { + "editable": true + }, + "source": [ + " brew install openmpi\n" + ] + }, + { + "cell_type": "markdown", + "id": "a861537e", + "metadata": { + "editable": true + }, + "source": [ + "When running an executable (code.x), run as" + ] + }, + { + "cell_type": "markdown", + "id": "d4185b13", + "metadata": { + "editable": true + }, + "source": [ + " mpirun -n 10 ./code.x\n" + ] + }, + { + "cell_type": "markdown", + "id": "fdbf1c9f", + "metadata": { + "editable": true + }, + "source": [ + "where we indicate that we want the number of processes to be 10." + ] + }, + { + "cell_type": "markdown", + "id": "4fa81e38", + "metadata": { + "editable": true + }, + "source": [ + "## Installing MPI and using Qt\n", + "With openmpi installed, when using Qt, add to your .pro file the instructions [here](http://dragly.org/2012/03/14/developing-mpi-applications-in-qt-creator/)\n", + "\n", + "You may need to tell Qt where openmpi is stored." + ] + }, + { + "cell_type": "markdown", + "id": "5fda1f95", + "metadata": { + "editable": true + }, + "source": [ + "## What is Message Passing Interface (MPI)?\n", + "\n", + "**MPI** is a library, not a language. It specifies the names, calling sequences and results of functions\n", + "or subroutines to be called from C/C++ or Fortran programs, and the classes and methods that make up the MPI C++\n", + "library. The programs that users write in Fortran, C or C++ are compiled with ordinary compilers and linked\n", + "with the MPI library.\n", + "\n", + "MPI programs should be able to run\n", + "on all possible machines and run all MPI implementetations without change.\n", + "\n", + "An MPI computation is a collection of processes communicating with messages." + ] + }, + { + "cell_type": "markdown", + "id": "5a56b5b5", + "metadata": { + "editable": true + }, + "source": [ + "## Going Parallel with MPI\n", + "**Task parallelism**: the work of a global problem can be divided\n", + "into a number of independent tasks, which rarely need to synchronize. 
\n", + "Monte Carlo simulations or numerical integration are examples of this.\n", + "\n", + "MPI is a message-passing library where all the routines\n", + "have corresponding C/C++-binding" + ] + }, + { + "cell_type": "markdown", + "id": "e9bda343", + "metadata": { + "editable": true + }, + "source": [ + " MPI_Command_name\n" + ] + }, + { + "cell_type": "markdown", + "id": "db0d73df", + "metadata": { + "editable": true + }, + "source": [ + "and Fortran-binding (routine names are in uppercase, but can also be in lower case)" + ] + }, + { + "cell_type": "markdown", + "id": "f3a3c4b5", + "metadata": { + "editable": true + }, + "source": [ + " MPI_COMMAND_NAME\n" + ] + }, + { + "cell_type": "markdown", + "id": "373dc478", + "metadata": { + "editable": true + }, + "source": [ + "## MPI is a library\n", + "MPI is a library specification for the message passing interface,\n", + "proposed as a standard.\n", + "\n", + "* independent of hardware;\n", + "\n", + "* not a language or compiler specification;\n", + "\n", + "* not a specific implementation or product.\n", + "\n", + "A message passing standard for portability and ease-of-use. \n", + "Designed for high performance.\n", + "\n", + "Insert communication and synchronization functions where necessary." + ] + }, + { + "cell_type": "markdown", + "id": "6958b4e5", + "metadata": { + "editable": true + }, + "source": [ + "## Bindings to MPI routines\n", + "\n", + "MPI is a message-passing library where all the routines\n", + "have corresponding C/C++-binding" + ] + }, + { + "cell_type": "markdown", + "id": "b33344da", + "metadata": { + "editable": true + }, + "source": [ + " MPI_Command_name\n" + ] + }, + { + "cell_type": "markdown", + "id": "fdeedc35", + "metadata": { + "editable": true + }, + "source": [ + "and Fortran-binding (routine names are in uppercase, but can also be in lower case)" + ] + }, + { + "cell_type": "markdown", + "id": "eee795e9", + "metadata": { + "editable": true + }, + "source": [ + " MPI_COMMAND_NAME\n" + ] + }, + { + "cell_type": "markdown", + "id": "ad0bbfc5", + "metadata": { + "editable": true + }, + "source": [ + "The discussion in these slides focuses on the C++ binding." + ] + }, + { + "cell_type": "markdown", + "id": "6a84fe02", + "metadata": { + "editable": true + }, + "source": [ + "## Communicator\n", + "* A group of MPI processes with a name (context).\n", + "\n", + "* Any process is identified by its rank. The rank is only meaningful within a particular communicator.\n", + "\n", + "* By default the communicator contains all the MPI processes." + ] + }, + { + "cell_type": "markdown", + "id": "71b44913", + "metadata": { + "editable": true + }, + "source": [ + " MPI_COMM_WORLD \n" + ] + }, + { + "cell_type": "markdown", + "id": "042775ec", + "metadata": { + "editable": true + }, + "source": [ + "* Mechanism to identify subset of processes.\n", + "\n", + "* Promotes modular design of parallel libraries." + ] + }, + { + "cell_type": "markdown", + "id": "03b93bde", + "metadata": { + "editable": true + }, + "source": [ + "## Some of the most important MPI functions\n", + "\n", + "* $MPI\\_Init$ - initiate an MPI computation\n", + "\n", + "* $MPI\\_Finalize$ - terminate the MPI computation and clean up\n", + "\n", + "* $MPI\\_Comm\\_size$ - how many processes participate in a given MPI communicator?\n", + "\n", + "* $MPI\\_Comm\\_rank$ - which one am I? 
(A number between 0 and size-1.)\n", + "\n", + "* $MPI\\_Send$ - send a message to a particular process within an MPI communicator\n", + "\n", + "* $MPI\\_Recv$ - receive a message from a particular process within an MPI communicator\n", + "\n", + "* $MPI\\_reduce$ or $MPI\\_Allreduce$, send and receive messages" + ] + }, + { + "cell_type": "markdown", + "id": "8f65b101", + "metadata": { + "editable": true + }, + "source": [ + "## [The first MPI C/C++ program](https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program2.cpp)\n", + "\n", + "Let every process write \"Hello world\" (oh not this program again!!) on the standard output." + ] + }, + { + "cell_type": "markdown", + "id": "4d44f32f", + "metadata": { + "editable": true + }, + "source": [ + " using namespace std;\n", + " #include \n", + " #include \n", + " int main (int nargs, char* args[])\n", + " {\n", + " int numprocs, my_rank;\n", + " // MPI initializations\n", + " MPI_Init (&nargs, &args);\n", + " MPI_Comm_size (MPI_COMM_WORLD, &numprocs);\n", + " MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);\n", + " cout << \"Hello world, I have rank \" << my_rank << \" out of \" \n", + " << numprocs << endl;\n", + " // End MPI\n", + " MPI_Finalize ();\n" + ] + }, + { + "cell_type": "markdown", + "id": "f5fca32b", + "metadata": { + "editable": true + }, + "source": [ + "## The Fortran program" + ] + }, + { + "cell_type": "markdown", + "id": "6e7dbd2c", + "metadata": { + "editable": true + }, + "source": [ + " PROGRAM hello\n", + " INCLUDE \"mpif.h\"\n", + " INTEGER:: size, my_rank, ierr\n", + " \n", + " CALL MPI_INIT(ierr)\n", + " CALL MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)\n", + " CALL MPI_COMM_RANK(MPI_COMM_WORLD, my_rank, ierr)\n", + " WRITE(*,*)\"Hello world, I've rank \",my_rank,\" out of \",size\n", + " CALL MPI_FINALIZE(ierr)\n", + " \n", + " END PROGRAM hello\n" + ] + }, + { + "cell_type": "markdown", + "id": "5c0f2635", + "metadata": { + "editable": true + }, + "source": [ + "## Note 1\n", + "\n", + "* The output to screen is not ordered since all processes are trying to write to screen simultaneously.\n", + "\n", + "* It is the operating system which opts for an ordering. \n", + "\n", + "* If we wish to have an organized output, starting from the first process, we may rewrite our program as in the next example." 
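+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a0c1d2e3",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "A practical remark: in this and several of the later C++ listings the header names inside the include statements were lost in the conversion to HTML. For the Hello World program above the two includes should presumably read as follows."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a0c1d2e4",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "    #include <iostream>\n",
+ "    #include <mpi.h>\n"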
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "61669cf2",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "## [Ordered output with MPIBarrier](https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program3.cpp)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b067e83a",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "    int main (int nargs, char* args[])\n",
+ "    {\n",
+ "      int numprocs, my_rank, i;\n",
+ "      MPI_Init (&nargs, &args);\n",
+ "      MPI_Comm_size (MPI_COMM_WORLD, &numprocs);\n",
+ "      MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);\n",
+ "      // loop over the ranks; at each pass only the matching rank prints\n",
+ "      for (i = 0; i < numprocs; i++) {\n",
+ "        MPI_Barrier (MPI_COMM_WORLD);\n",
+ "        if (i == my_rank) {\n",
+ "          cout << \"Hello world, I have rank \" << my_rank << \n",
+ "          \" out of \" << numprocs << endl;}\n",
+ "      }\n",
+ "      MPI_Finalize ();\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "76158876",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "## Note 2\n",
+ "* Here we have used the $MPI\\_Barrier$ function to ensure that every process has completed its set of instructions in a particular order.\n",
+ "\n",
+ "* A barrier is a special collective operation that does not allow the processes to continue until all processes in the communicator (here $MPI\\_COMM\\_WORLD$) have called $MPI\\_Barrier$. \n",
+ "\n",
+ "* The barriers make sure that all processes have reached the same point in the code. Many of the collective operations, like $MPI\\_ALLREDUCE$ to be discussed later, have the same property; that is, no process can exit the operation until all processes have started. \n",
+ "\n",
+ "However, this is slightly more time-consuming since the processes synchronize between themselves as many times as there\n",
+ "are processes. In the next Hello world example we use the send and receive functions in order to have a synchronized\n",
+ "action."
+ ] + }, + { + "cell_type": "markdown", + "id": "f50ed3d1", + "metadata": { + "editable": true + }, + "source": [ + "## [Ordered output](https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program4.cpp)" + ] + }, + { + "cell_type": "markdown", + "id": "b548728b", + "metadata": { + "editable": true + }, + "source": [ + " .....\n", + " int numprocs, my_rank, flag;\n", + " MPI_Status status;\n", + " MPI_Init (&nargs, &args);\n", + " MPI_Comm_size (MPI_COMM_WORLD, &numprocs);\n", + " MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);\n", + " if (my_rank > 0)\n", + " MPI_Recv (&flag, 1, MPI_INT, my_rank-1, 100, \n", + " MPI_COMM_WORLD, &status);\n", + " cout << \"Hello world, I have rank \" << my_rank << \" out of \" \n", + " << numprocs << endl;\n", + " if (my_rank < numprocs-1)\n", + " MPI_Send (&my_rank, 1, MPI_INT, my_rank+1, \n", + " 100, MPI_COMM_WORLD);\n", + " MPI_Finalize ();\n" + ] + }, + { + "cell_type": "markdown", + "id": "8085bb44", + "metadata": { + "editable": true + }, + "source": [ + "## Note 3\n", + "\n", + "The basic sending of messages is given by the function $MPI\\_SEND$, which in C/C++\n", + "is defined as" + ] + }, + { + "cell_type": "markdown", + "id": "20b2a3b9", + "metadata": { + "editable": true + }, + "source": [ + " int MPI_Send(void *buf, int count, \n", + " MPI_Datatype datatype, \n", + " int dest, int tag, MPI_Comm comm)}\n" + ] + }, + { + "cell_type": "markdown", + "id": "04cce86a", + "metadata": { + "editable": true + }, + "source": [ + "This single command allows the passing of any kind of variable, even a large array, to any group of tasks. \n", + "The variable **buf** is the variable we wish to send while **count**\n", + "is the number of variables we are passing. If we are passing only a single value, this should be 1. \n", + "\n", + "If we transfer an array, it is the overall size of the array. \n", + "For example, if we want to send a 10 by 10 array, count would be $10\\times 10=100$ \n", + "since we are actually passing 100 values." + ] + }, + { + "cell_type": "markdown", + "id": "158cd5e2", + "metadata": { + "editable": true + }, + "source": [ + "## Note 4\n", + "\n", + "Once you have sent a message, you must receive it on another task. The function $MPI\\_RECV$\n", + "is similar to the send call." + ] + }, + { + "cell_type": "markdown", + "id": "10926207", + "metadata": { + "editable": true + }, + "source": [ + " int MPI_Recv( void *buf, int count, MPI_Datatype datatype, \n", + " int source, \n", + " int tag, MPI_Comm comm, MPI_Status *status )\n" + ] + }, + { + "cell_type": "markdown", + "id": "d0dece69", + "metadata": { + "editable": true + }, + "source": [ + "The arguments that are different from those in MPI\\_SEND are\n", + "**buf** which is the name of the variable where you will be storing the received data, \n", + "**source** which replaces the destination in the send command. This is the return ID of the sender.\n", + "\n", + "Finally, we have used $MPI\\_Status\\_status$, \n", + "where one can check if the receive was completed.\n", + "\n", + "The output of this code is the same as the previous example, but now\n", + "process 0 sends a message to process 1, which forwards it further\n", + "to process 2, and so forth." 
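+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1c2d3e4",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "The status argument can also be inspected after the receive, which is particularly useful together with the wildcards MPI_ANY_SOURCE and MPI_ANY_TAG. The fragment below is a sketch added for illustration; the variable names value and count are not from the linked program."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1c2d3e5",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "    MPI_Status status;\n",
+ "    int value, count;\n",
+ "    MPI_Recv (&value, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,\n",
+ "              MPI_COMM_WORLD, &status);\n",
+ "    // how many items of the given datatype actually arrived\n",
+ "    MPI_Get_count (&status, MPI_INT, &count);\n",
+ "    cout << count << \" integer(s) from rank \" << status.MPI_SOURCE\n",
+ "         << \" with tag \" << status.MPI_TAG << endl;\n"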
+ ] + }, + { + "cell_type": "markdown", + "id": "3d053e2f", + "metadata": { + "editable": true + }, + "source": [ + "## [Numerical integration in parallel](https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp)\n", + "**Integrating $\\pi$.**\n", + "\n", + "* The code example computes $\\pi$ using the trapezoidal rules.\n", + "\n", + "* The trapezoidal rule" + ] + }, + { + "cell_type": "markdown", + "id": "9668b407", + "metadata": { + "editable": true + }, + "source": [ + "$$\n", + "I=\\int_a^bf(x) dx\\approx h\\left(f(a)/2 + f(a+h) +f(a+2h)+\\dots +f(b-h)+ f(b)/2\\right).\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "id": "12eb35b9", + "metadata": { + "editable": true + }, + "source": [ + "Click [on this link](https://github.com/CompPhysics/ComputationalPhysics2/blob/gh-pages/doc/Programs/LecturePrograms/programs/MPI/chapter07/program6.cpp) for the full program." + ] + }, + { + "cell_type": "markdown", + "id": "02f90f49", + "metadata": { + "editable": true + }, + "source": [ + "## Dissection of trapezoidal rule with $MPI\\_reduce$" + ] + }, + { + "cell_type": "markdown", + "id": "24fb6f69", + "metadata": { + "editable": true + }, + "source": [ + " // Trapezoidal rule and numerical integration usign MPI\n", + " using namespace std;\n", + " #include \n", + " #include \n", + " \n", + " // Here we define various functions called by the main program\n", + " \n", + " double int_function(double );\n", + " double trapezoidal_rule(double , double , int , double (*)(double));\n", + " \n", + " // Main function begins here\n", + " int main (int nargs, char* args[])\n", + " {\n", + " int n, local_n, numprocs, my_rank; \n", + " double a, b, h, local_a, local_b, total_sum, local_sum; \n", + " double time_start, time_end, total_time;\n" + ] + }, + { + "cell_type": "markdown", + "id": "1eda1f15", + "metadata": { + "editable": true + }, + "source": [ + "## Dissection of trapezoidal rule" + ] + }, + { + "cell_type": "markdown", + "id": "62c86791", + "metadata": { + "editable": true + }, + "source": [ + " // MPI initializations\n", + " MPI_Init (&nargs, &args);\n", + " MPI_Comm_size (MPI_COMM_WORLD, &numprocs);\n", + " MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);\n", + " time_start = MPI_Wtime();\n", + " // Fixed values for a, b and n \n", + " a = 0.0 ; b = 1.0; n = 1000;\n", + " h = (b-a)/n; // h is the same for all processes \n", + " local_n = n/numprocs; \n", + " // make sure n > numprocs, else integer division gives zero\n", + " // Length of each process' interval of\n", + " // integration = local_n*h. 
\n", + " local_a = a + my_rank*local_n*h;\n", + " local_b = local_a + local_n*h;\n" + ] + }, + { + "cell_type": "markdown", + "id": "d203a19a", + "metadata": { + "editable": true + }, + "source": [ + "## Integrating with **MPI**" + ] + }, + { + "cell_type": "markdown", + "id": "be8560b4", + "metadata": { + "editable": true + }, + "source": [ + " total_sum = 0.0;\n", + " local_sum = trapezoidal_rule(local_a, local_b, local_n, \n", + " &int_function); \n", + " MPI_Reduce(&local_sum, &total_sum, 1, MPI_DOUBLE, \n", + " MPI_SUM, 0, MPI_COMM_WORLD);\n", + " time_end = MPI_Wtime();\n", + " total_time = time_end-time_start;\n", + " if ( my_rank == 0) {\n", + " cout << \"Trapezoidal rule = \" << total_sum << endl;\n", + " cout << \"Time = \" << total_time \n", + " << \" on number of processors: \" << numprocs << endl;\n", + " }\n", + " // End MPI\n", + " MPI_Finalize (); \n", + " return 0;\n", + " } // end of main program\n" + ] + }, + { + "cell_type": "markdown", + "id": "683a1a3a", + "metadata": { + "editable": true + }, + "source": [ + "## How do I use $MPI\\_reduce$?\n", + "\n", + "Here we have used" + ] + }, + { + "cell_type": "markdown", + "id": "4a3d1ec8", + "metadata": { + "editable": true + }, + "source": [ + " MPI_reduce( void *senddata, void* resultdata, int count, \n", + " MPI_Datatype datatype, MPI_Op, int root, MPI_Comm comm)\n" + ] + }, + { + "cell_type": "markdown", + "id": "b96a8840", + "metadata": { + "editable": true + }, + "source": [ + "The two variables $senddata$ and $resultdata$ are obvious, besides the fact that one sends the address\n", + "of the variable or the first element of an array. If they are arrays they need to have the same size. \n", + "The variable $count$ represents the total dimensionality, 1 in case of just one variable, \n", + "while $MPI\\_Datatype$ \n", + "defines the type of variable which is sent and received. \n", + "\n", + "The new feature is $MPI\\_Op$. It defines the type\n", + "of operation we want to do." + ] + }, + { + "cell_type": "markdown", + "id": "bcc235b8", + "metadata": { + "editable": true + }, + "source": [ + "## More on $MPI\\_Reduce$\n", + "In our case, since we are summing\n", + "the rectangle contributions from every process we define $MPI\\_Op = MPI\\_SUM$.\n", + "If we have an array or matrix we can search for the largest og smallest element by sending either $MPI\\_MAX$ or \n", + "$MPI\\_MIN$. If we want the location as well (which array element) we simply transfer \n", + "$MPI\\_MAXLOC$ or $MPI\\_MINOC$. If we want the product we write $MPI\\_PROD$. \n", + "\n", + "$MPI\\_Allreduce$ is defined as" + ] + }, + { + "cell_type": "markdown", + "id": "6f521ed1", + "metadata": { + "editable": true + }, + "source": [ + " MPI_Allreduce( void *senddata, void* resultdata, int count, \n", + " MPI_Datatype datatype, MPI_Op, MPI_Comm comm) \n" + ] + }, + { + "cell_type": "markdown", + "id": "307ef4f1", + "metadata": { + "editable": true + }, + "source": [ + "## Dissection of trapezoidal rule\n", + "\n", + "We use $MPI\\_reduce$ to collect data from each process. Note also the use of the function \n", + "$MPI\\_Wtime$." 
+ ] + }, + { + "cell_type": "markdown", + "id": "1a2f72b4", + "metadata": { + "editable": true + }, + "source": [ + " // this function defines the function to integrate\n", + " double int_function(double x)\n", + " {\n", + " double value = 4./(1.+x*x);\n", + " return value;\n", + " } // end of function to evaluate\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "id": "214ebedd", + "metadata": { + "editable": true + }, + "source": [ + "## Dissection of trapezoidal rule" + ] + }, + { + "cell_type": "markdown", + "id": "96619240", + "metadata": { + "editable": true + }, + "source": [ + " // this function defines the trapezoidal rule\n", + " double trapezoidal_rule(double a, double b, int n, \n", + " double (*func)(double))\n", + " {\n", + " double trapez_sum;\n", + " double fa, fb, x, step;\n", + " int j;\n", + " step=(b-a)/((double) n);\n", + " fa=(*func)(a)/2. ;\n", + " fb=(*func)(b)/2. ;\n", + " trapez_sum=0.;\n", + " for (j=1; j <= n-1; j++){\n", + " x=j*step+a;\n", + " trapez_sum+=(*func)(x);\n", + " }\n", + " trapez_sum=(trapez_sum+fb+fa)*step;\n", + " return trapez_sum;\n", + " } // end trapezoidal_rule \n" + ] + }, + { + "cell_type": "markdown", + "id": "740865e4", + "metadata": { + "editable": true + }, + "source": [ + "## [The quantum dot program for two electrons](https://github.com/CompPhysics/ComputationalPhysics2/blob/master/doc/Programs/ParallelizationMPI/MPIvmcqdot.cpp)" + ] + }, + { + "cell_type": "markdown", + "id": "484916ee", + "metadata": { + "editable": true + }, + "source": [ + " // Variational Monte Carlo for atoms with importance sampling, slater det\n", + " // Test case for 2-electron quantum dot, no classes using Mersenne-Twister RNG\n", + " #include \"mpi.h\"\n", + " #include \n", + " #include \n", + " #include \n", + " #include \n", + " #include \n", + " #include \n", + " #include \"vectormatrixclass.h\"\n", + " \n", + " using namespace std;\n", + " // output file as global variable\n", + " ofstream ofile; \n", + " // the step length and its squared inverse for the second derivative \n", + " // Here we define global variables used in various functions\n", + " // These can be changed by using classes\n", + " int Dimension = 2; \n", + " int NumberParticles = 2; // we fix also the number of electrons to be 2\n", + " \n", + " // declaration of functions \n", + " \n", + " // The Mc sampling for the variational Monte Carlo \n", + " void MonteCarloSampling(int, double &, double &, Vector &);\n", + " \n", + " // The variational wave function\n", + " double WaveFunction(Matrix &, Vector &);\n", + " \n", + " // The local energy \n", + " double LocalEnergy(Matrix &, Vector &);\n", + " \n", + " // The quantum force\n", + " void QuantumForce(Matrix &, Matrix &, Vector &);\n", + " \n", + " \n", + " // inline function for single-particle wave function\n", + " inline double SPwavefunction(double r, double alpha) { \n", + " return exp(-alpha*r*0.5);\n", + " }\n", + " \n", + " // inline function for derivative of single-particle wave function\n", + " inline double DerivativeSPwavefunction(double r, double alpha) { \n", + " return -r*alpha;\n", + " }\n", + " \n", + " // function for absolute value of relative distance\n", + " double RelativeDistance(Matrix &r, int i, int j) { \n", + " double r_ij = 0; \n", + " for (int k = 0; k < Dimension; k++) { \n", + " \tr_ij += (r(i,k)-r(j,k))*(r(i,k)-r(j,k));\n", + " }\n", + " return sqrt(r_ij); \n", + " }\n", + " \n", + " // inline function for derivative of Jastrow factor\n", + " inline double JastrowDerivative(Matrix &r, double beta, 
int i, int j, int k){\n", + " return (r(i,k)-r(j,k))/(RelativeDistance(r, i, j)*pow(1.0+beta*RelativeDistance(r, i, j),2));\n", + " }\n", + " \n", + " // function for square of position of single particle\n", + " double singleparticle_pos2(Matrix &r, int i) { \n", + " double r_single_particle = 0;\n", + " for (int j = 0; j < Dimension; j++) { \n", + " r_single_particle += r(i,j)*r(i,j);\n", + " }\n", + " return r_single_particle;\n", + " }\n", + " \n", + " void lnsrch(int n, Vector &xold, double fold, Vector &g, Vector &p, Vector &x,\n", + " \t\t double *f, double stpmax, int *check, double (*func)(Vector &p));\n", + " \n", + " void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret,\n", + " \t double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g));\n", + " \n", + " static double sqrarg;\n", + " #define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)\n", + " \n", + " \n", + " static double maxarg1,maxarg2;\n", + " #define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ?\\\n", + " (maxarg1) : (maxarg2))\n", + " \n", + " \n", + " // Begin of main program \n", + " \n", + " int main(int argc, char* argv[])\n", + " {\n", + " \n", + " // MPI initializations\n", + " int NumberProcesses, MyRank, NumberMCsamples;\n", + " MPI_Init (&argc, &argv);\n", + " MPI_Comm_size (MPI_COMM_WORLD, &NumberProcesses);\n", + " MPI_Comm_rank (MPI_COMM_WORLD, &MyRank);\n", + " double StartTime = MPI_Wtime();\n", + " if (MyRank == 0 && argc <= 1) {\n", + " cout << \"Bad Usage: \" << argv[0] << \n", + " \" Read also output file on same line and number of Monte Carlo cycles\" << endl;\n", + " }\n", + " // Read filename and number of Monte Carlo cycles from the command line\n", + " if (MyRank == 0 && argc > 2) {\n", + " string filename = argv[1]; // first command line argument after name of program\n", + " NumberMCsamples = atoi(argv[2]);\n", + " string fileout = filename;\n", + " string argument = to_string(NumberMCsamples);\n", + " // Final filename as filename+NumberMCsamples\n", + " fileout.append(argument);\n", + " ofile.open(fileout);\n", + " }\n", + " // broadcast the number of Monte Carlo samples\n", + " MPI_Bcast (&NumberMCsamples, 1, MPI_INT, 0, MPI_COMM_WORLD);\n", + " // Two variational parameters only\n", + " Vector VariationalParameters(2);\n", + " int TotalNumberMCsamples = NumberMCsamples*NumberProcesses; \n", + " // Loop over variational parameters\n", + " for (double alpha = 0.5; alpha <= 1.5; alpha +=0.1){\n", + " for (double beta = 0.1; beta <= 0.5; beta +=0.05){\n", + " VariationalParameters(0) = alpha; // value of alpha\n", + " VariationalParameters(1) = beta; // value of beta\n", + " // Do the mc sampling and accumulate data with MPI_Reduce\n", + " double TotalEnergy, TotalEnergySquared, LocalProcessEnergy, LocalProcessEnergy2;\n", + " LocalProcessEnergy = LocalProcessEnergy2 = 0.0;\n", + " MonteCarloSampling(NumberMCsamples, LocalProcessEnergy, LocalProcessEnergy2, VariationalParameters);\n", + " // Collect data in total averages\n", + " MPI_Reduce(&LocalProcessEnergy, &TotalEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n", + " MPI_Reduce(&LocalProcessEnergy2, &TotalEnergySquared, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n", + " // Print out results in case of Master node, set to MyRank = 0\n", + " if ( MyRank == 0) {\n", + " \tdouble Energy = TotalEnergy/( (double)NumberProcesses);\n", + " \tdouble Variance = TotalEnergySquared/( (double)NumberProcesses)-Energy*Energy;\n", + " \tdouble StandardDeviation = sqrt(Variance/((double)TotalNumberMCsamples)); // 
over optimistic error\n", + " \tofile << setiosflags(ios::showpoint | ios::uppercase);\n", + " \tofile << setw(15) << setprecision(8) << VariationalParameters(0);\n", + " \tofile << setw(15) << setprecision(8) << VariationalParameters(1);\n", + " \tofile << setw(15) << setprecision(8) << Energy;\n", + " \tofile << setw(15) << setprecision(8) << Variance;\n", + " \tofile << setw(15) << setprecision(8) << StandardDeviation << endl;\n", + " }\n", + " }\n", + " }\n", + " double EndTime = MPI_Wtime();\n", + " double TotalTime = EndTime-StartTime;\n", + " if ( MyRank == 0 ) cout << \"Time = \" << TotalTime << \" on number of processors: \" << NumberProcesses << endl;\n", + " if (MyRank == 0) ofile.close(); // close output file\n", + " // End MPI\n", + " MPI_Finalize (); \n", + " return 0;\n", + " } // end of main function\n", + " \n", + " \n", + " // Monte Carlo sampling with the Metropolis algorithm \n", + " \n", + " void MonteCarloSampling(int NumberMCsamples, double &cumulative_e, double &cumulative_e2, Vector &VariationalParameters)\n", + " {\n", + " \n", + " // Initialize the seed and call the Mersienne algo\n", + " std::random_device rd;\n", + " std::mt19937_64 gen(rd());\n", + " // Set up the uniform distribution for x \\in [[0, 1]\n", + " std::uniform_real_distribution UniformNumberGenerator(0.0,1.0);\n", + " std::normal_distribution Normaldistribution(0.0,1.0);\n", + " // diffusion constant from Schroedinger equation\n", + " double D = 0.5; \n", + " double timestep = 0.05; // we fix the time step for the gaussian deviate\n", + " // allocate matrices which contain the position of the particles \n", + " Matrix OldPosition( NumberParticles, Dimension), NewPosition( NumberParticles, Dimension);\n", + " Matrix OldQuantumForce(NumberParticles, Dimension), NewQuantumForce(NumberParticles, Dimension);\n", + " double Energy = 0.0; double EnergySquared = 0.0; double DeltaE = 0.0;\n", + " // initial trial positions\n", + " for (int i = 0; i < NumberParticles; i++) { \n", + " for (int j = 0; j < Dimension; j++) {\n", + " OldPosition(i,j) = Normaldistribution(gen)*sqrt(timestep);\n", + " }\n", + " }\n", + " double OldWaveFunction = WaveFunction(OldPosition, VariationalParameters);\n", + " QuantumForce(OldPosition, OldQuantumForce, VariationalParameters);\n", + " // loop over monte carlo cycles \n", + " for (int cycles = 1; cycles <= NumberMCsamples; cycles++){ \n", + " // new position \n", + " for (int i = 0; i < NumberParticles; i++) { \n", + " for (int j = 0; j < Dimension; j++) {\n", + " \t// gaussian deviate to compute new positions using a given timestep\n", + " \tNewPosition(i,j) = OldPosition(i,j) + Normaldistribution(gen)*sqrt(timestep)+OldQuantumForce(i,j)*timestep*D;\n", + " \t//\tNewPosition(i,j) = OldPosition(i,j) + gaussian_deviate(&idum)*sqrt(timestep)+OldQuantumForce(i,j)*timestep*D;\n", + " } \n", + " // for the other particles we need to set the position to the old position since\n", + " // we move only one particle at the time\n", + " for (int k = 0; k < NumberParticles; k++) {\n", + " \tif ( k != i) {\n", + " \t for (int j = 0; j < Dimension; j++) {\n", + " \t NewPosition(k,j) = OldPosition(k,j);\n", + " \t }\n", + " \t} \n", + " }\n", + " double NewWaveFunction = WaveFunction(NewPosition, VariationalParameters); \n", + " QuantumForce(NewPosition, NewQuantumForce, VariationalParameters);\n", + " // we compute the log of the ratio of the greens functions to be used in the \n", + " // Metropolis-Hastings algorithm\n", + " double GreensFunction = 0.0; \n", + " for (int j = 0; j < 
Dimension; j++) {\n", + " \tGreensFunction += 0.5*(OldQuantumForce(i,j)+NewQuantumForce(i,j))*\n", + " \t (D*timestep*0.5*(OldQuantumForce(i,j)-NewQuantumForce(i,j))-NewPosition(i,j)+OldPosition(i,j));\n", + " }\n", + " GreensFunction = exp(GreensFunction);\n", + " // The Metropolis test is performed by moving one particle at the time\n", + " if(UniformNumberGenerator(gen) <= GreensFunction*NewWaveFunction*NewWaveFunction/OldWaveFunction/OldWaveFunction ) { \n", + " \tfor (int j = 0; j < Dimension; j++) {\n", + " \t OldPosition(i,j) = NewPosition(i,j);\n", + " \t OldQuantumForce(i,j) = NewQuantumForce(i,j);\n", + " \t}\n", + " \tOldWaveFunction = NewWaveFunction;\n", + " }\n", + " } // end of loop over particles\n", + " // compute local energy \n", + " double DeltaE = LocalEnergy(OldPosition, VariationalParameters);\n", + " // update energies\n", + " Energy += DeltaE;\n", + " EnergySquared += DeltaE*DeltaE;\n", + " } // end of loop over MC trials \n", + " // update the energy average and its squared \n", + " cumulative_e = Energy/NumberMCsamples;\n", + " cumulative_e2 = EnergySquared/NumberMCsamples;\n", + " } // end MonteCarloSampling function \n", + " \n", + " \n", + " // Function to compute the squared wave function and the quantum force\n", + " \n", + " double WaveFunction(Matrix &r, Vector &VariationalParameters)\n", + " {\n", + " double wf = 0.0;\n", + " // full Slater determinant for two particles, replace with Slater det for more particles \n", + " wf = SPwavefunction(singleparticle_pos2(r, 0), VariationalParameters(0))*SPwavefunction(singleparticle_pos2(r, 1),VariationalParameters(0));\n", + " // contribution from Jastrow factor\n", + " for (int i = 0; i < NumberParticles-1; i++) { \n", + " for (int j = i+1; j < NumberParticles; j++) {\n", + " wf *= exp(RelativeDistance(r, i, j)/((1.0+VariationalParameters(1)*RelativeDistance(r, i, j))));\n", + " }\n", + " }\n", + " return wf;\n", + " }\n", + " \n", + " // Function to calculate the local energy without numerical derivation of kinetic energy\n", + " \n", + " double LocalEnergy(Matrix &r, Vector &VariationalParameters)\n", + " {\n", + " \n", + " // compute the kinetic and potential energy from the single-particle part\n", + " // for a many-electron system this has to be replaced by a Slater determinant\n", + " // The absolute value of the interparticle length\n", + " Matrix length( NumberParticles, NumberParticles);\n", + " // Set up interparticle distance\n", + " for (int i = 0; i < NumberParticles-1; i++) { \n", + " for(int j = i+1; j < NumberParticles; j++){\n", + " length(i,j) = RelativeDistance(r, i, j);\n", + " length(j,i) = length(i,j);\n", + " }\n", + " }\n", + " double KineticEnergy = 0.0;\n", + " // Set up kinetic energy from Slater and Jastrow terms\n", + " for (int i = 0; i < NumberParticles; i++) { \n", + " for (int k = 0; k < Dimension; k++) {\n", + " double sum1 = 0.0; \n", + " for(int j = 0; j < NumberParticles; j++){\n", + " \tif ( j != i) {\n", + " \t sum1 += JastrowDerivative(r, VariationalParameters(1), i, j, k);\n", + " \t}\n", + " }\n", + " KineticEnergy += (sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0)))*(sum1+DerivativeSPwavefunction(r(i,k),VariationalParameters(0)));\n", + " }\n", + " }\n", + " KineticEnergy += -2*VariationalParameters(0)*NumberParticles;\n", + " for (int i = 0; i < NumberParticles-1; i++) {\n", + " for (int j = i+1; j < NumberParticles; j++) {\n", + " KineticEnergy += 2.0/(pow(1.0 + 
VariationalParameters(1)*length(i,j),2))*(1.0/length(i,j)-2*VariationalParameters(1)/(1+VariationalParameters(1)*length(i,j)) );\n", + " }\n", + " }\n", + " KineticEnergy *= -0.5;\n", + " // Set up potential energy, external potential + eventual electron-electron repulsion\n", + " double PotentialEnergy = 0;\n", + " for (int i = 0; i < NumberParticles; i++) { \n", + " double DistanceSquared = singleparticle_pos2(r, i);\n", + " PotentialEnergy += 0.5*DistanceSquared; // sp energy HO part, note it has the oscillator frequency set to 1!\n", + " }\n", + " // Add the electron-electron repulsion\n", + " for (int i = 0; i < NumberParticles-1; i++) { \n", + " for (int j = i+1; j < NumberParticles; j++) {\n", + " PotentialEnergy += 1.0/length(i,j); \n", + " }\n", + " }\n", + " double LocalE = KineticEnergy+PotentialEnergy;\n", + " return LocalE;\n", + " }\n", + " \n", + " // Compute the analytical expression for the quantum force\n", + " void QuantumForce(Matrix &r, Matrix &qforce, Vector &VariationalParameters)\n", + " {\n", + " // compute the first derivative \n", + " for (int i = 0; i < NumberParticles; i++) {\n", + " for (int k = 0; k < Dimension; k++) {\n", + " // single-particle part, replace with Slater det for larger systems\n", + " double sppart = DerivativeSPwavefunction(r(i,k),VariationalParameters(0));\n", + " // Jastrow factor contribution\n", + " double Jsum = 0.0;\n", + " for (int j = 0; j < NumberParticles; j++) {\n", + " \tif ( j != i) {\n", + " \t Jsum += JastrowDerivative(r, VariationalParameters(1), i, j, k);\n", + " \t}\n", + " }\n", + " qforce(i,k) = 2.0*(Jsum+sppart);\n", + " }\n", + " }\n", + " } // end of QuantumForce function\n", + " \n", + " \n", + " #define ITMAX 200\n", + " #define EPS 3.0e-8\n", + " #define TOLX (4*EPS)\n", + " #define STPMX 100.0\n", + " \n", + " void dfpmin(Vector &p, int n, double gtol, int *iter, double *fret,\n", + " \t double(*func)(Vector &p), void (*dfunc)(Vector &p, Vector &g))\n", + " {\n", + " \n", + " int check,i,its,j;\n", + " double den,fac,fad,fae,fp,stpmax,sum=0.0,sumdg,sumxi,temp,test;\n", + " Vector dg(n), g(n), hdg(n), pnew(n), xi(n);\n", + " Matrix hessian(n,n);\n", + " \n", + " fp=(*func)(p);\n", + " (*dfunc)(p,g);\n", + " for (i = 0;i < n;i++) {\n", + " for (j = 0; j< n;j++) hessian(i,j)=0.0;\n", + " hessian(i,i)=1.0;\n", + " xi(i) = -g(i);\n", + " sum += p(i)*p(i);\n", + " }\n", + " stpmax=STPMX*FMAX(sqrt(sum),(double)n);\n", + " for (its=1;its<=ITMAX;its++) {\n", + " *iter=its;\n", + " lnsrch(n,p,fp,g,xi,pnew,fret,stpmax,&check,func);\n", + " fp = *fret;\n", + " for (i = 0; i< n;i++) {\n", + " xi(i)=pnew(i)-p(i);\n", + " p(i)=pnew(i);\n", + " }\n", + " test=0.0;\n", + " for (i = 0;i< n;i++) {\n", + " temp=fabs(xi(i))/FMAX(fabs(p(i)),1.0);\n", + " if (temp > test) test=temp;\n", + " }\n", + " if (test < TOLX) {\n", + " return;\n", + " }\n", + " for (i=0;i test) test=temp;\n", + " }\n", + " if (test < gtol) {\n", + " return;\n", + " }\n", + " for (i=0;i EPS*sumdg*sumxi) {\n", + " fac=1.0/fac;\n", + " fad=1.0/fae;\n", + " for (i=0;i stpmax)\n", + " for (i=0;i test) test=temp;\n", + " }\n", + " alamin=TOLX/test;\n", + " alam=1.0;\n", + " for (;;) {\n", + " for (i=0;i0.5*alam)\n", + " \t tmplam=0.5*alam;\n", + " }\n", + " }\n", + " alam2=alam;\n", + " f2 = *f;\n", + " fold2=fold;\n", + " alam=FMAX(tmplam,0.1*alam);\n", + " }\n", + " }\n", + " #undef ALF\n", + " #undef TOLX\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "id": "a2672ee2", + "metadata": { + "editable": true + }, + "source": [ + "## What is OpenMP\n", + "* OpenMP 
provides high-level thread programming\n", + "\n", + "* Multiple cooperating threads are allowed to run simultaneously\n", + "\n", + "* Threads are created and destroyed dynamically in a fork-join pattern\n", + "\n", + " * An OpenMP program consists of a number of parallel regions\n", + "\n", + " * Between two parallel regions there is only one master thread\n", + "\n", + " * In the beginning of a parallel region, a team of new threads is spawned\n", + "\n", + " * The newly spawned threads work simultaneously with the master thread\n", + "\n", + " * At the end of a parallel region, the new threads are destroyed\n", + "\n", + "Many good tutorials online and excellent textbook\n", + "1. [Using OpenMP, by B. Chapman, G. Jost, and A. van der Pas](http://mitpress.mit.edu/books/using-openmp)\n", + "\n", + "2. Many tutorials online like [OpenMP official site](http://www.openmp.org)" + ] + }, + { + "cell_type": "markdown", + "id": "ab7b0b53", + "metadata": { + "editable": true + }, + "source": [ + "## Getting started, things to remember\n", + " * Remember the header file" + ] + }, + { + "cell_type": "markdown", + "id": "50cd6880", + "metadata": { + "editable": true + }, + "source": [ + " #include \n" + ] + }, + { + "cell_type": "markdown", + "id": "4b877c85", + "metadata": { + "editable": true + }, + "source": [ + "* Insert compiler directives in C++ syntax as" + ] + }, + { + "cell_type": "markdown", + "id": "edc15526", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp...\n" + ] + }, + { + "cell_type": "markdown", + "id": "4f80c44e", + "metadata": { + "editable": true + }, + "source": [ + "* Compile with for example *c++ -fopenmp code.cpp*\n", + "\n", + "* Execute\n", + "\n", + " * Remember to assign the environment variable **OMP NUM THREADS**\n", + "\n", + " * It specifies the total number of threads inside a parallel region, if not otherwise overwritten" + ] + }, + { + "cell_type": "markdown", + "id": "ae290278", + "metadata": { + "editable": true + }, + "source": [ + "## OpenMP syntax\n", + "* Mostly directives" + ] + }, + { + "cell_type": "markdown", + "id": "0b47582b", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp construct [ clause ...]\n" + ] + }, + { + "cell_type": "markdown", + "id": "29680064", + "metadata": { + "editable": true + }, + "source": [ + "* Some functions and types" + ] + }, + { + "cell_type": "markdown", + "id": "add80e8f", + "metadata": { + "editable": true + }, + "source": [ + " #include \n" + ] + }, + { + "cell_type": "markdown", + "id": "badeef32", + "metadata": { + "editable": true + }, + "source": [ + "* Most apply to a block of code\n", + "\n", + " * Specifically, a **structured block**\n", + "\n", + " * Enter at top, exit at bottom only, exit(), abort() permitted" + ] + }, + { + "cell_type": "markdown", + "id": "1655cea6", + "metadata": { + "editable": true + }, + "source": [ + "## Different OpenMP styles of parallelism\n", + "OpenMP supports several different ways to specify thread parallelism\n", + "\n", + "* General parallel regions: All threads execute the code, roughly as if you made a routine of that region and created a thread to run that code\n", + "\n", + "* Parallel loops: Special case for loops, simplifies data parallel code\n", + "\n", + "* Task parallelism, new in OpenMP 3\n", + "\n", + "* Several ways to manage thread coordination, including Master regions and Locks\n", + "\n", + "* Memory model for shared data" + ] + }, + { + "cell_type": "markdown", + "id": "542d46d6", + "metadata": { + "editable": true + 
}, + "source": [ + "## General code structure" + ] + }, + { + "cell_type": "markdown", + "id": "5f3a057b", + "metadata": { + "editable": true + }, + "source": [ + " #include \n", + " main ()\n", + " {\n", + " int var1, var2, var3;\n", + " /* serial code */\n", + " /* ... */\n", + " /* start of a parallel region */\n", + " #pragma omp parallel private(var1, var2) shared(var3)\n", + " {\n", + " /* ... */\n", + " }\n", + " /* more serial code */\n", + " /* ... */\n", + " /* another parallel region */\n", + " #pragma omp parallel\n", + " {\n", + " /* ... */\n", + " }\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "e600a25b", + "metadata": { + "editable": true + }, + "source": [ + "## Parallel region\n", + "* A parallel region is a block of code that is executed by a team of threads\n", + "\n", + "* The following compiler directive creates a parallel region" + ] + }, + { + "cell_type": "markdown", + "id": "7cd27256", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel { ... }\n" + ] + }, + { + "cell_type": "markdown", + "id": "4af8979b", + "metadata": { + "editable": true + }, + "source": [ + "* Clauses can be added at the end of the directive\n", + "\n", + "* Most often used clauses:\n", + "\n", + " * **default(shared)** or **default(none)**\n", + "\n", + " * **public(list of variables)**\n", + "\n", + " * **private(list of variables)**" + ] + }, + { + "cell_type": "markdown", + "id": "d314f63a", + "metadata": { + "editable": true + }, + "source": [ + "## Hello world, not again, please!" + ] + }, + { + "cell_type": "markdown", + "id": "476b27d6", + "metadata": { + "editable": true + }, + "source": [ + " #include \n", + " #include \n", + " int main (int argc, char *argv[])\n", + " {\n", + " int th_id, nthreads;\n", + " #pragma omp parallel private(th_id) shared(nthreads)\n", + " {\n", + " th_id = omp_get_thread_num();\n", + " printf(\"Hello World from thread %d\\n\", th_id);\n", + " #pragma omp barrier\n", + " if ( th_id == 0 ) {\n", + " nthreads = omp_get_num_threads();\n", + " printf(\"There are %d threads\\n\",nthreads);\n", + " }\n", + " }\n", + " return 0;\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "b3150e25", + "metadata": { + "editable": true + }, + "source": [ + "## Hello world, yet another variant" + ] + }, + { + "cell_type": "markdown", + "id": "d7994f3d", + "metadata": { + "editable": true + }, + "source": [ + " #include \n", + " #include \n", + " int main(int argc, char *argv[]) \n", + " {\n", + " omp_set_num_threads(4); \n", + " #pragma omp parallel\n", + " {\n", + " int id = omp_get_thread_num();\n", + " int nproc = omp_get_num_threads(); \n", + " cout << \"Hello world with id number and processes \" << id << nproc << endl;\n", + " } \n", + " return 0;\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "a1b0ca46", + "metadata": { + "editable": true + }, + "source": [ + "Variables declared outside of the parallel region are shared by all threads\n", + "If a variable like **id** is declared outside of the" + ] + }, + { + "cell_type": "markdown", + "id": "af8f316a", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel, \n" + ] + }, + { + "cell_type": "markdown", + "id": "f2def2ed", + "metadata": { + "editable": true + }, + "source": [ + "it would have been shared by various the threads, possibly causing erroneous output\n", + " * Why? What would go wrong? Why do we add possibly?" 
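+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d3e4f5a6",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "A short sketch of what can go wrong (added for illustration, not from the original slides): with the declaration moved outside the parallel region and no private clause, all threads write to the same memory location, so a thread may print an id that another thread just stored."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d3e4f5a7",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "    #include <omp.h>\n",
+ "    #include <iostream>\n",
+ "    using namespace std;\n",
+ "    int main ()\n",
+ "    {\n",
+ "      int id;                        // one shared variable for all threads\n",
+ "      #pragma omp parallel           // note: no private(id) clause\n",
+ "      {\n",
+ "        id = omp_get_thread_num();   // data race: threads overwrite each other\n",
+ "        cout << id << endl;          // may print the value written by another thread\n",
+ "      }\n",
+ "      return 0;\n",
+ "    }\n"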
+ ] + }, + { + "cell_type": "markdown", + "id": "5a2e2450", + "metadata": { + "editable": true + }, + "source": [ + "## Important OpenMP library routines\n", + "\n", + "* **int omp get num threads ()**, returns the number of threads inside a parallel region\n", + "\n", + "* **int omp get thread num ()**, returns the a thread for each thread inside a parallel region\n", + "\n", + "* **void omp set num threads (int)**, sets the number of threads to be used\n", + "\n", + "* **void omp set nested (int)**, turns nested parallelism on/off" + ] + }, + { + "cell_type": "markdown", + "id": "66bbd83e", + "metadata": { + "editable": true + }, + "source": [ + "## Private variables\n", + "Private clause can be used to make thread- private versions of such variables:" + ] + }, + { + "cell_type": "markdown", + "id": "6d307151", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel private(id)\n", + " {\n", + " int id = omp_get_thread_num();\n", + " cout << \"My thread num\" << id << endl; \n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "2a35060f", + "metadata": { + "editable": true + }, + "source": [ + "* What is their value on entry? Exit?\n", + "\n", + "* OpenMP provides ways to control that\n", + "\n", + "* Can use default(none) to require the sharing of each variable to be described" + ] + }, + { + "cell_type": "markdown", + "id": "9721e8f2", + "metadata": { + "editable": true + }, + "source": [ + "## Master region\n", + "It is often useful to have only one thread execute some of the code in a parallel region. I/O statements are a common example" + ] + }, + { + "cell_type": "markdown", + "id": "a99bffb1", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel \n", + " {\n", + " #pragma omp master\n", + " {\n", + " int id = omp_get_thread_num();\n", + " cout << \"My thread num\" << id << endl; \n", + " } \n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "bf58be92", + "metadata": { + "editable": true + }, + "source": [ + "## Parallel for loop\n", + " * Inside a parallel region, the following compiler directive can be used to parallelize a for-loop:" + ] + }, + { + "cell_type": "markdown", + "id": "b9ad7fc0", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp for\n" + ] + }, + { + "cell_type": "markdown", + "id": "dfd937c8", + "metadata": { + "editable": true + }, + "source": [ + "* Clauses can be added, such as\n", + "\n", + " * **schedule(static, chunk size)**\n", + "\n", + " * **schedule(dynamic, chunk size)** \n", + "\n", + " * **schedule(guided, chunk size)** (non-deterministic allocation)\n", + "\n", + " * **schedule(runtime)**\n", + "\n", + " * **private(list of variables)**\n", + "\n", + " * **reduction(operator:variable)**\n", + "\n", + " * **nowait**" + ] + }, + { + "cell_type": "markdown", + "id": "39645b2c", + "metadata": { + "editable": true + }, + "source": [ + "## Parallel computations and loops\n", + "\n", + "OpenMP provides an easy way to parallelize a loop" + ] + }, + { + "cell_type": "markdown", + "id": "31bf0645", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel for\n", + " for (i=0; i\n", + " #define CHUNKSIZE 100\n", + " #define N 1000\n", + " int main (int argc, char *argv[])\n", + " {\n", + " int i, chunk;\n", + " float a[N], b[N], c[N];\n", + " for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;\n", + " chunk = CHUNKSIZE;\n", + " #pragma omp parallel shared(a,b,c,chunk) private(i)\n", + " {\n", + " #pragma omp for schedule(dynamic,chunk)\n", + " for (i=0; i < N; 
i++) c[i] = a[i] + b[i];\n", + " } /* end of parallel region */\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "bcbc9208", + "metadata": { + "editable": true + }, + "source": [ + "## Example code for loop scheduling, guided instead of dynamic" + ] + }, + { + "cell_type": "markdown", + "id": "a9bcffe4", + "metadata": { + "editable": true + }, + "source": [ + " #include \n", + " #define CHUNKSIZE 100\n", + " #define N 1000\n", + " int main (int argc, char *argv[])\n", + " {\n", + " int i, chunk;\n", + " float a[N], b[N], c[N];\n", + " for (i=0; i < N; i++) a[i] = b[i] = i * 1.0;\n", + " chunk = CHUNKSIZE;\n", + " #pragma omp parallel shared(a,b,c,chunk) private(i)\n", + " {\n", + " #pragma omp for schedule(guided,chunk)\n", + " for (i=0; i < N; i++) c[i] = a[i] + b[i];\n", + " } /* end of parallel region */\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "2e3fc84f", + "metadata": { + "editable": true + }, + "source": [ + "## More on Parallel for loop\n", + "* The number of loop iterations cannot be non-deterministic; break, return, exit, goto not allowed inside the for-loop\n", + "\n", + "* The loop index is private to each thread\n", + "\n", + "* A reduction variable is special\n", + "\n", + " * During the for-loop there is a local private copy in each thread\n", + "\n", + " * At the end of the for-loop, all the local copies are combined together by the reduction operation\n", + "\n", + "* Unless the nowait clause is used, an implicit barrier synchronization will be added at the end by the compiler" + ] + }, + { + "cell_type": "markdown", + "id": "f8bc1f7a", + "metadata": { + "editable": true + }, + "source": [ + " // #pragma omp parallel and #pragma omp for\n" + ] + }, + { + "cell_type": "markdown", + "id": "b573c665", + "metadata": { + "editable": true + }, + "source": [ + "can be combined into" + ] + }, + { + "cell_type": "markdown", + "id": "1549ba33", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel for\n" + ] + }, + { + "cell_type": "markdown", + "id": "2c132037", + "metadata": { + "editable": true + }, + "source": [ + "## What can happen with this loop?\n", + "\n", + "What happens with code like this" + ] + }, + { + "cell_type": "markdown", + "id": "823a30b6", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel for\n", + " for (i=0; i r) {\n", + " #pragma omp task\n", + " do_work (p_vec[i]);\n" + ] + }, + { + "cell_type": "markdown", + "id": "790aa2d3", + "metadata": { + "editable": true + }, + "source": [ + "## Common mistakes\n", + "Race condition" + ] + }, + { + "cell_type": "markdown", + "id": "30b561e2", + "metadata": { + "editable": true + }, + "source": [ + " int nthreads;\n", + " #pragma omp parallel shared(nthreads)\n", + " {\n", + " nthreads = omp_get_num_threads();\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "4d2a0797", + "metadata": { + "editable": true + }, + "source": [ + "Deadlock" + ] + }, + { + "cell_type": "markdown", + "id": "3cd95899", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel\n", + " {\n", + " ...\n", + " #pragma omp critical\n", + " {\n", + " ...\n", + " #pragma omp barrier\n", + " }\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "68c2a4fe", + "metadata": { + "editable": true + }, + "source": [ + "## Not all computations are simple\n", + "Not all computations are simple loops where the data can be evenly \n", + "divided among threads without any dependencies between threads\n", + "\n", + "An example is finding 
the location and value of the largest element in an array" + ] + }, + { + "cell_type": "markdown", + "id": "047f7c41", + "metadata": { + "editable": true + }, + "source": [ + " for (i=0; i maxval) {\n", + " maxval = x[i];\n", + " maxloc = i; \n", + " }\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "e60ece38", + "metadata": { + "editable": true + }, + "source": [ + "## Not all computations are simple, competing threads\n", + "All threads are potentially accessing and changing the same values, **maxloc** and **maxval**.\n", + "1. OpenMP provides several ways to coordinate access to shared values" + ] + }, + { + "cell_type": "markdown", + "id": "f56129e7", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp atomic\n" + ] + }, + { + "cell_type": "markdown", + "id": "7b02497f", + "metadata": { + "editable": true + }, + "source": [ + "1. Only one thread at a time can execute the following statement (not block). We can use the critical option" + ] + }, + { + "cell_type": "markdown", + "id": "466f17e6", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp critical\n" + ] + }, + { + "cell_type": "markdown", + "id": "35fb343a", + "metadata": { + "editable": true + }, + "source": [ + "1. Only one thread at a time can execute the following block\n", + "\n", + "Atomic may be faster than critical but depends on hardware" + ] + }, + { + "cell_type": "markdown", + "id": "50838011", + "metadata": { + "editable": true + }, + "source": [ + "## How to find the max value using OpenMP\n", + "Write down the simplest algorithm and look carefully for race conditions. How would you handle them? \n", + "The first step would be to parallelize as" + ] + }, + { + "cell_type": "markdown", + "id": "1ac70c07", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel for\n", + " for (i=0; i maxval) {\n", + " maxval = x[i];\n", + " maxloc = i; \n", + " }\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "06de5d43", + "metadata": { + "editable": true + }, + "source": [ + "## Then deal with the race conditions\n", + "Write down the simplest algorithm and look carefully for race conditions. How would you handle them? \n", + "The first step would be to parallelize as" + ] + }, + { + "cell_type": "markdown", + "id": "4674fff0", + "metadata": { + "editable": true + }, + "source": [ + " #pragma omp parallel for\n", + " for (i=0; i maxval) {\n", + " maxval = x[i];\n", + " maxloc = i; \n", + " }\n", + " }\n", + " } \n" + ] + }, + { + "cell_type": "markdown", + "id": "1a21898f", + "metadata": { + "editable": true + }, + "source": [ + "Exercise: write a code which implements this and give an estimate on performance. Perform several runs,\n", + "with a serial code only with and without vectorization and compare the serial code with the one that uses OpenMP. Run on different archictectures if you can." + ] + }, + { + "cell_type": "markdown", + "id": "361afe33", + "metadata": { + "editable": true + }, + "source": [ + "## What can slow down OpenMP performance?\n", + "Give it a thought!" 
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0872a8b1",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "## What can slow down OpenMP performance?\n",
+ "Performance is poor because we insisted on keeping track of the maxval and its location during the execution of the loop.\n",
+ " * We do not care about the value during the execution of the loop, just the value at the end.\n",
+ "\n",
+ "This is a common source of performance issues: the description of the method used to compute a value imposes additional, unnecessary requirements or properties.\n",
+ "\n",
+ "**Idea: Have each thread find the maxloc in its own data, then combine the results; use temporary arrays indexed by thread number to hold the values found by each thread.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "573d56fe",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "## Find the max location for each thread"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "371fbbfd",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "    int maxloc[MAX_THREADS], mloc;\n",
+ "    double maxval[MAX_THREADS], mval; \n",
+ "    #pragma omp parallel shared(maxval,maxloc)\n",
+ "    {\n",
+ "      int id = omp_get_thread_num(); \n",
+ "      maxval[id] = -1.0e30;\n",
+ "    #pragma omp for\n",
+ "      for (int i=0; i < n; i++) {\n",
+ "        if (x[i] > maxval[id]) { \n",
+ "          maxloc[id] = i;\n",
+ "          maxval[id] = x[i]; \n",
+ "        }\n",
+ "      }\n",
+ "    }\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7d7972d7",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "## Combine the values from each thread"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7119f580",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "    #pragma omp flush (maxloc,maxval)\n",
+ "    #pragma omp master\n",
+ "    {\n",
+ "      int nt = omp_get_num_threads(); \n",
+ "      mloc = maxloc[0]; \n",
+ "      mval = maxval[0]; \n",
+ "      for (int i=1; i < nt; i++) {\n",
+ "        if (maxval[i] > mval) { \n",
+ "          mval = maxval[i]; \n",
+ "          mloc = maxloc[i];\n",
+ "        } \n",
+ "      }\n",
+ "    }\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4a3de8bd",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "Note that we let the master thread perform the last operation."
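+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e4f5a6b7",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "A side remark, not in the original slides: if only the maximum value is needed, and not its location, OpenMP 3.1 and later offer a max reduction clause, so that the per-thread copies and the final combination are generated by the compiler. A sketch, assuming an array x of length n, is shown here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e4f5a6b8",
+ "metadata": {
+ "editable": true
+ },
+ "source": [
+ "    double maxval = -1.0e30;\n",
+ "    #pragma omp parallel for reduction(max:maxval)\n",
+ "    for (int i = 0; i < n; i++) {\n",
+ "      if (x[i] > maxval) maxval = x[i];\n",
+ "    }\n"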
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "dace2927",
+    "metadata": {
+     "editable": true
+    },
+    "source": [
+     "## [Vector norm](https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPvectornorm.cpp)\n",
+     "This code computes the norm of a vector using OpenMP"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "fb2465d2",
+    "metadata": {
+     "editable": true
+    },
+    "source": [
+     "        // OpenMP program to compute vector norm by adding two other vectors\n",
+     "        #include <cstdlib>\n",
+     "        #include <iostream>\n",
+     "        #include <cmath>\n",
+     "        #include <iomanip>\n",
+     "        # include <omp.h>\n",
+     "        \n",
+     "        using namespace std; // note use of namespace\n",
+     "        int main (int argc, char* argv[])\n",
+     "        {\n",
+     "          // read in dimension of vector\n",
+     "          int n = atoi(argv[1]);\n",
+     "          double *a, *b, *c;\n",
+     "          int i;\n",
+     "          int thread_num;\n",
+     "          double wtime, Norm2, s, angle;\n",
+     "          cout << \" Perform addition of two vectors and compute the norm-2.\" << endl;\n",
+     "          omp_set_num_threads(4);\n",
+     "          thread_num = omp_get_max_threads ();\n",
+     "          cout << \" The number of processors available = \" << omp_get_num_procs () << endl ;\n",
+     "          cout << \" The number of threads available    = \" << thread_num <<  endl;\n",
+     "          cout << \" The vector dimension n              = \" << n << endl;\n",
+     "          \n",
+     "          s = 1.0/sqrt( (double) n);\n",
+     "          wtime = omp_get_wtime ( );\n",
+     "          // Allocate space for the vectors to be used\n",
+     "          a = new double [n]; b = new double [n]; c = new double [n];\n",
+     "          // Parallel region: note that the pragma applies only to the next loop\n",
+     "          # pragma omp parallel for default(shared) private (angle, i) reduction(+:Norm2)\n",
+     "          // Set up values for vectors a and b\n",
+     "          for (i = 0; i < n; i++){\n",
+     "               angle = 2.0*M_PI*i/ (( double ) n);\n",
+     "               a[i] = s*(sin(angle) + cos(angle));\n",
+     "               b[i] =  s*sin(2.0*angle);\n",
+     "               c[i] = 0.0;\n",
+     "          }\n",
+     "          // Then perform the vector addition\n",
+     "          for (i = 0; i < n; i++){\n",
+     "             c[i] += a[i]+b[i];\n",
+     "          }\n",
+     "          // Compute now the norm-2\n",
+     "          Norm2 = 0.0;\n",
+     "          for (i = 0; i < n; i++){\n",
+     "             Norm2  += c[i]*c[i];\n",
+     "          }\n",
+     "          // end of timed section\n",
+     "          wtime = omp_get_wtime ( ) - wtime;\n",
+     "          cout << setiosflags(ios::showpoint | ios::uppercase);\n",
+     "          cout << setprecision(10) << setw(20) << \"Time used for norm-2 computation=\" << wtime << endl;\n",
+     "          cout << \" Norm-2 = \" << Norm2 << endl;\n",
+     "          // Free up space\n",
+     "          delete[] a;\n",
+     "          delete[] b;\n",
+     "          delete[] c;\n",
+     "          return 0;\n",
+     "        }\n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "7aca0759",
+    "metadata": {
+     "editable": true
+    },
+    "source": [
+     "## [Matrix-matrix multiplication](https://github.com/CompPhysics/ComputationalPhysicsMSU/blob/master/doc/Programs/ParallelizationOpenMP/OpenMPmatrixmatrixmult.cpp)\n",
+     "This is the matrix-matrix multiplication code with plain C++ memory allocation, using OpenMP"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "e4f14771",
+    "metadata": {
+     "editable": true
+    },
+    "source": [
+     "        // Matrix-matrix multiplication and Frobenius norm of a matrix with OpenMP\n",
+     "        #include <cstdlib>\n",
+     "        #include <iostream>\n",
+     "        #include <cmath>\n",
+     "        #include <iomanip>\n",
+     "        # include <omp.h>\n",
+     "        \n",
+     "        using namespace std; // note use of namespace\n",
+     "        int main (int argc, char* argv[])\n",
+     "        {\n",
+     "          // read in dimension of square matrix\n",
+     "          int n = atoi(argv[1]);\n",
+     "          double **A, **B, **C;\n",
+     "          int i, j, k;\n",
+     "          int thread_num;\n",
+     "          double wtime, Fsum, s, angle;\n",
+     "          cout << \" Compute matrix product C = A * B and Frobenius norm.\" << endl;\n",
+     "          omp_set_num_threads(4);\n",
+     "          thread_num = omp_get_max_threads ();\n",
+     "          cout << \" The number of processors available = \" << omp_get_num_procs () << endl ;\n",
+     "          cout << \" The number of threads available    = \" << thread_num <<  endl;\n",
+     "          cout << \" The matrix order n                  = \" << n << endl;\n",
+     "          \n",
+     "          s = 1.0/sqrt( (double) n);\n",
+     "          wtime = omp_get_wtime ( );\n",
+     "          // Allocate space for the two matrices\n",
+     "          A = new double*[n]; B = new double*[n]; C = new double*[n];\n",
+     "          for (i = 0; i < n; i++){\n",
+     "            A[i] = new double[n];\n",
+     "            B[i] = new double[n];\n",
+     "            C[i] = new double[n];\n",
+     "          }\n",
+     "          // Parallel region: note that the pragma applies only to the next loop\n",
+     "          # pragma omp parallel for default(shared) private (angle, i, j, k) reduction(+:Fsum)\n",
+     "          // Set up values for matrix A and B and zero matrix C\n",
+     "          for (i = 0; i < n; i++){\n",
+     "            for (j = 0; j < n; j++) {\n",
+     "               angle = 2.0*M_PI*i*j/ (( double ) n);\n",
+     "               A[i][j] = s * ( sin ( angle ) + cos ( angle ) );\n",
+     "               B[j][i] =  A[i][j];\n",
+     "            }\n",
+     "          }\n",
+     "          // Then perform the matrix-matrix multiplication\n",
+     "          for (i = 0; i < n; i++){\n",
+     "            for (j = 0; j < n; j++) {\n",
+     "               C[i][j] = 0.0;    \n",
+     "               for (k = 0; k < n; k++) {\n",
+     "                    C[i][j] += A[i][k]*B[k][j];\n",
+     "               }\n",
+     "            }\n",
+     "          }\n",
+     "          // Compute now the Frobenius norm\n",
+     "          Fsum = 0.0;\n",
+     "          for (i = 0; i < n; i++){\n",
+     "            for (j = 0; j < n; j++) {\n",
+     "              Fsum += C[i][j]*C[i][j];\n",
+     "            }\n",
+     "          }\n",
+     "          Fsum = sqrt(Fsum);\n",
+     "          // end of timed section; output below is serial\n",
+     "          wtime = omp_get_wtime ( ) - wtime;\n",
+     "          cout << setiosflags(ios::showpoint | ios::uppercase);\n",
+     "          cout << setprecision(10) << setw(20) << \"Time used for matrix-matrix multiplication=\" << wtime << endl;\n",
+     "          cout << \" Frobenius norm = \" << Fsum << endl;\n",
+     "          // Free up space\n",
+     "          for (int i = 0; i < n; i++){\n",
+     "            delete[] A[i];\n",
+     "            delete[] B[i];\n",
+     "            delete[] C[i];\n",
+     "          }\n",
+     "          delete[] A;\n",
+     "          delete[] B;\n",
+     "          delete[] C;\n",
+     "          return 0;\n",
+     "        }\n",
+     "        \n",
+     "        \n"
+    ]
+   }
 ],
 "metadata": {},
diff --git a/doc/pub/week9/pdf/week9-beamer.pdf b/doc/pub/week9/pdf/week9-beamer.pdf
index ffae9f55..1cc1de25 100644
Binary files a/doc/pub/week9/pdf/week9-beamer.pdf and b/doc/pub/week9/pdf/week9-beamer.pdf differ
diff --git a/doc/pub/week9/pdf/week9.pdf b/doc/pub/week9/pdf/week9.pdf
index eac21e0c..a0d81b42 100644
Binary files a/doc/pub/week9/pdf/week9.pdf and b/doc/pub/week9/pdf/week9.pdf differ
diff --git a/doc/src/week9/programs/automersenne.cpp b/doc/src/week9/programs/automersenne.cpp
new file mode 100644
index 00000000..7a5a5bab
--- /dev/null
+++ b/doc/src/week9/programs/automersenne.cpp
@@ -0,0 +1,76 @@
+// This function computes the autocorrelation function for
+// the standard c++ random number generator
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cmath>
+#include <cstdlib>
+#include <random>
+
+using namespace std;
+// output file as global variable
+ofstream ofile;
+
+//     Main function begins here
+int main(int argc, char* argv[])
+{
+     int n;
+     char *outfilename;
+
+     double MCint = 0.;      double MCintsqr2=0.;
+     // Initialize the seed and call the Mersenne algo
+     std::random_device rd;
+     std::mt19937_64 gen(rd());
+     // Set up the uniform distribution for x \in [0, 1]
+     std::uniform_real_distribution<double> RandomNumberGenerator(0.0,1.0);
+     // Compute the variance and the mean value of the uniform distribution
+     // Compute also the specific values x for each cycle in order to be able to compute
+     // the covariance and the correlation function
+     // Read in output file, abort if there are too few command-line arguments
+     if( argc <= 2 ){
+       cout << "Bad Usage: " << argv[0] <<
+         " read also output file and number of cycles on same line" << endl;
+       exit(1);
+     }
+     else{
+       outfilename=argv[1];
+     }
+     ofile.open(outfilename);
+     // Get the number of Monte-Carlo samples
+     n = atoi(argv[2]);
+     double *X;
+     X = new double[n];
+     for (int i = 0; i < n; i++){
+        double x = RandomNumberGenerator(gen);
+        X[i] = x;
+        MCint += x;
+        MCintsqr2 += x*x;
+     }
+     double Mean = MCint/((double) n );
+     MCintsqr2 = MCintsqr2/((double) n );
+     double STDev = sqrt(MCintsqr2-Mean*Mean);
+     double Variance = MCintsqr2-Mean*Mean;
+//   Write mean value and standard deviation
+     cout << " Standard deviation= " << STDev << " Integral = " << Mean << endl;
+
+     // Now we compute the autocorrelation function
+     double *autocor;  autocor = new double[n];
+     for (int j = 0; j < n; j++){
+       double sum = 0.0;
+       for (int k = 0; k < (n-j); k++){
+         sum += (X[k]-Mean)*(X[k+j]-Mean);
+       }
+       autocor[j] = sum/Variance/((double) n );
+       ofile << setiosflags(ios::showpoint | ios::uppercase);
+       ofile << setw(15) << setprecision(8) << j;
+       ofile << setw(15) << setprecision(8) << autocor[j] << endl;
+     }
+     ofile.close();  // close output file
+     return 0;
+}  // end of main program
+
+
+
+
diff --git a/doc/src/week9/programs/mc.py b/doc/src/week9/programs/mc.py
new file mode 100644
index 00000000..225b348e
--- /dev/null
+++ b/doc/src/week9/programs/mc.py
@@ -0,0 +1,54 @@
+from matplotlib import pyplot as plt
+from math import acos, log10
+import numpy as np
+from sympy import Symbol, integrate, exp, oo
+import random
+
+
+# function for the trapezoidal rule
+def TrapezoidalRule(a,b,f,n):
+   h = (b-a)/float(n)
+   s = 0
+   x = a
+   for i in range(1,n,1):
+       x = x+h
+       s = s+ f(x)
+   s = 0.5*(f(a)+f(b))+s
+   return h*s
+# function to perform the Monte Carlo calculations
+def MonteCarloIntegration(f,n):
+    sum = 0
+# Define the seed for the rng
+    random.seed()
+    for i in range(n):
+        x = random.random()
+        sum = sum +f(x)
+    return sum/n
+
+# the function to integrate, f(x) = 4/(1+x*x)
+def function(x):
+    return 4/(1+x*x)
+
+# Integration limits for the Trapezoidal rule
+a = 0.0;  b = 1.0
+# define x as a symbol to be used by sympy
+x = Symbol('x')
+# exact result (the sympy evaluation is kept as a comment)
+#exact = integrate(function(x), (x, a, b))
+exact = acos(-1.0)
+# set up the arrays for plotting the relative error
+log10n = np.zeros(6); Trapez = np.zeros(6); MCint = np.zeros(6);
+# find the relative error as function of integration points
+for i in range(1, 6):
+    npts = 10**(i+1)
+    log10n[i] = log10(npts)
+    Trapez[i] = log10(abs((TrapezoidalRule(a,b,function,npts)-exact)/exact))
+    MCint[i] = log10(abs((MonteCarloIntegration(function,npts)-exact)/exact))
+plt.plot(log10n[1:], Trapez[1:], 'b-', log10n[1:], MCint[1:], 'g-')
+plt.axis([1,6,-14.0, 0.0])
+plt.xlabel(r'$\log_{10}(n)$')
+plt.ylabel('Relative error')
+plt.title('Relative errors for Monte Carlo integration and Trapezoidal rule')
+plt.legend(['Trapezoidal rule', 'Brute force Monte Carlo integration'], loc='best')
+plt.savefig('mcintegration.pdf')
+plt.show()
diff --git a/doc/src/week9/programs/plot.py b/doc/src/week9/programs/plot.py
new file mode 100644
index 00000000..ecea29a7
--- /dev/null
+++ b/doc/src/week9/programs/plot.py
@@ -0,0 +1,16 @@
+import numpy as np
+from matplotlib import pyplot as plt
+# Load in data file
+data = np.loadtxt("autocor.dat")
+data1 = np.loadtxt("automersenne.dat")
+# Make arrays containing the lag d and the autocorrelation function
+x = data[:,0]
+corr = data[:,1]
+corr2 = data1[:,1]
+plt.plot(x, corr ,'ro', x, corr2, 'b')
+plt.axis([0,1000,-0.2, 1.1])
+plt.xlabel(r'$d$')
+plt.ylabel(r'$C_d$')
+plt.title(r'autocorrelation function for RNG')
+plt.savefig('autocorr.pdf')
+plt.show()
diff --git a/doc/src/week9/programs/uniformhisto.py b/doc/src/week9/programs/uniformhisto.py
new file mode 100644
index 00000000..87eb902e
--- /dev/null
+++ b/doc/src/week9/programs/uniformhisto.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+import numpy as np
+import matplotlib.mlab as mlab
+import matplotlib.pyplot as plt
+import random
+
+# initialize the rng with a seed
+random.seed()
+counts = 10000
+values = np.zeros(counts)
+for i in range(counts):
+    values[i] = random.random()
+
+# the histogram of the data
+n, bins, patches = plt.hist(values, 10, facecolor='green')
+
+plt.xlabel('$x$')
+plt.ylabel('Number of counts')
+plt.title(r'Test of uniform distribution')
+plt.axis([0, 1, 0, 1100])
+plt.grid(True)
+
+plt.show()