diff --git a/.eleventy.js b/.eleventy.js index fca1da2a..4f7e205f 100644 --- a/.eleventy.js +++ b/.eleventy.js @@ -19,6 +19,10 @@ str const featuredPosts = (post) => post.data.featured; module.exports = function(eleventyConfig) { + eleventyConfig.setQuietMode(true); + // Reuse 11ty's built-in `slugify` filter. + const slugifyFn = eleventyConfig.getFilter("slugify"); + // Support .yaml extension in _data eleventyConfig.addDataExtension("yaml", contents => yaml.load(contents)); @@ -81,6 +85,29 @@ module.exports = function(eleventyConfig) { .filter(featuredPosts); }); + // for defn things see https://github.com/11ty/eleventy/issues/2565 + eleventyConfig.addCollection("defns", function (collectionApi) { + const res = []; + for (const post of collectionApi.getAll()) { + // see https://github.com/11ty/eleventy/discussions/2153, re: "Tried to use templateContent too early". + const defs = post.template.frontMatter?.content.trim().matchAll(/{% defn (?.*?) %}/gi); + for (const { groups } of defs) { + const [ term, text ] = groups.value.split(",").map(s => s.trim().replace(/"/g, "")); + res.push({ url: post.url, term, text }); + } + } + return res; + }); + + eleventyConfig.addCollection("defnTerms", function (collectionApi) { + const defns = collectionApi.getFilteredByTag("glossary"); + // Possibly over-engineered here. We might really only care about the slugified title and not the unslugged version. 
+ const data = defns.reduce((acc, { data }) => { + return Object.assign(acc, {[slugifyFn(data.title)]: data.title}); + }, {}); + return data; + }); + eleventyConfig.addCollection('glossary', (collection) => { return collection .getFilteredByGlob("./src/doc/reference/glossary/*.md") @@ -151,16 +178,21 @@ module.exports = function(eleventyConfig) { function(source='', size='400') { return `` } ); - // Shortcode for glossary links + // Shortcode for glossary links (see https://github.com/11ty/eleventy/issues/2565#issuecomment-1246106301) eleventyConfig.addShortcode( "defn", - function(term='', text='') { - const url = "/doc/reference/glossary/#" + slugify(term) - if (text=='') - docText = term - else - docText = text - return `${docText}` + function(term='', text='') { + // Some sneaky stuff here to get the context from the `this.ctx` object so we can access + // `collections` and `page` variables from within our shortcode. + const { collections, page } = this.ctx; + const slug = slugifyFn(term); + const glossaryUrl = `/doc/reference/glossary/#${slug}`; + // If our `collections.defnTerms` does NOT include the current slug, it's likely a bad link/defn. 
+ if (!Object.keys(collections.defnTerms).includes(slug)) { + console.error(`[${page.url}] Unknown term: "${term}" => ${glossaryUrl}`); + // process.exitCode = 1; + } + return `${text || term}`; } ); diff --git a/.eleventyignore b/.eleventyignore index 4400aea9..c8b1e76f 100644 --- a/.eleventyignore +++ b/.eleventyignore @@ -1,2 +1,3 @@ node_modules -README.md \ No newline at end of file +README.md +src/sphinx-bootstrap-theme diff --git a/.gitignore b/.gitignore index c2c4cfc5..3487db29 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,16 @@ _site/ _tmp/ .DS_Store +env node_modules/ .vscode/ *git/ package-lock.json .env +src/sphinx +src/sphinx-bootstrap-theme +src/pydata-sphinx-theme +*~ # Local Netlify folder .netlify diff --git a/package.json b/package.json index ecac4070..176d1621 100644 --- a/package.json +++ b/package.json @@ -28,6 +28,7 @@ "@11ty/eleventy": "^1.0.0", "@11ty/eleventy-navigation": "^0.3.2", "@11ty/eleventy-plugin-rss": "^1.1.2", + "dotenv": "^16.0.2", "eleventy-plugin-toc": "^1.1.5", "js-yaml": "^4.1.0", "luxon": "^2.3.1", diff --git a/src/_data/metadata.json b/src/_data/metadata.json index cd7b5101..358ad497 100644 --- a/src/_data/metadata.json +++ b/src/_data/metadata.json @@ -3,7 +3,7 @@ "author": "The OpenCilk Team", "email": "contact@opencilk.org", "attribution": { - "cilkplus": "This work is derived from Cilk Plus documentation with permission of Intel Corporation and Cilk Arts." + "cilkplus": "This work is derived from Cilk++ documentation with permission of Intel Corporation." 
}, "blog": { "title": "Fast Code", diff --git a/src/_includes/code/qsort.cpp b/src/_includes/code/qsort.cpp new file mode 100644 index 00000000..238acc9e --- /dev/null +++ b/src/_includes/code/qsort.cpp @@ -0,0 +1,40 @@ +#include +#include +#include +#include + +constexpr std::ptrdiff_t BASE_CASE_LENGTH = 32; + +template +void sample_qsort(T* begin, T* end) { + if (end - begin < BASE_CASE_LENGTH) { + std::sort(begin, end); // base case: serial sort + } else { + --end; // exclude last element (pivot) from partition + T* middle = std::partition(begin, end, + [pivot=*end](T a) { return a < pivot; }); + std::swap(*end, *middle); // move pivot to middle + cilk_scope { + cilk_spawn sample_qsort(begin, middle); + sample_qsort(++middle, ++end); // exclude pivot and restore end + } + } +} + +int main(int argc, char* argv[]) { + long n = 100 * 1000 * 1000; + if (argc == 2) + n = std::atoi(argv[1]); + + std::default_random_engine rng; + std::uniform_int_distribution dist(0,n); + std::vector a(n); + std::generate(a.begin(), a.end(), [&]() { return dist(rng); }); + + std::cout << "Sorting " << n << " random integers" << std::endl; + sample_qsort(a.data(), a.data() + a.size()); + + bool pass = std::is_sorted(a.cbegin(), a.cend()); + std::cout << "Sort " << ((pass) ? "succeeded" : "failed") << "\n"; + return (pass) ? 0 : 1; +} \ No newline at end of file diff --git a/src/_includes/layouts/base.njk b/src/_includes/layouts/base.njk index d32bea5b..b9d12229 100644 --- a/src/_includes/layouts/base.njk +++ b/src/_includes/layouts/base.njk @@ -1,3 +1,4 @@ +{% if permalink !== false %} {% include 'partials/header.njk' %} @@ -92,3 +93,4 @@ +{% endif %} diff --git a/src/community.md b/src/community.md index 56322e5a..24f901ed 100644 --- a/src/community.md +++ b/src/community.md @@ -9,7 +9,7 @@ eleventyNavigation: order: 5 --- -OpenCilk is a community-driven open source project developed by a diverse group of contributors. [Join us](/contribute)! 
+OpenCilk is a community-driven open source project developed by a diverse group of contributors. [Join us](/community/join-us/)! ## Where can I get help? diff --git a/src/community/community.11tydata.js b/src/community/community.11tydata.js new file mode 100644 index 00000000..9fd586d9 --- /dev/null +++ b/src/community/community.11tydata.js @@ -0,0 +1,54 @@ +require('dotenv').config(); + +const isDevEnv = process.env.ELEVENTY_ENV === 'development'; +const todaysDate = new Date(); + +function showDraft(data) { + const isDraft = 'draft' in data && data.draft !== false; + const isFutureDate = data.page.date > todaysDate; + return isDevEnv || (!isDraft && !isFutureDate); +} + +module.exports = function() { + return { + eleventyComputed: { + eleventyExcludeFromCollections: function(data) { + if(showDraft(data)) { + return data.eleventyExcludeFromCollections; + } + else { + return true; + } + }, + permalink: function(data) { + if(showDraft(data)) { + return data.permalink + } + else { + return false; + } + }, + eleventyNavigation: { + key: function(data) { + if(showDraft(data)) { + return data.title + } + else { + return false; + } + } + }, + sidebar: function(data) { + return 'toc'; + }, + background: function(data) { + if(('draft' in data && data.draft !== false) || (data.page.date > todaysDate)) { + return 'text-white bg-info' + } + else { + return 'bg-white'; + } + } + } + } +} \ No newline at end of file diff --git a/src/community/join-us.md b/src/community/join-us.md new file mode 100644 index 00000000..ec851790 --- /dev/null +++ b/src/community/join-us.md @@ -0,0 +1,19 @@ +--- +layout: layouts/page.njk +sidebar: toc +title: Join us +eleventyNavigation: + key: Join us + order: 1 +--- + +The OpenCilk project welcomes your expertise and enthusiasm. Please fill out the form below to get started. Thank you for your interest! 
+ + + \ No newline at end of file diff --git a/src/community/resources.md b/src/community/resources.md index d094928f..315e02a5 100644 --- a/src/community/resources.md +++ b/src/community/resources.md @@ -2,6 +2,7 @@ layout: layouts/page.njk sidebar: toc title: Resources +draft: true eleventyNavigation: key: Resources parent: Community diff --git a/src/community/software.md b/src/community/software.md new file mode 100644 index 00000000..c1e83665 --- /dev/null +++ b/src/community/software.md @@ -0,0 +1,42 @@ +--- +layout: layouts/page.njk +sidebar: toc +title: Software +eleventyNavigation: + key: Software + parent: Community + order: 99 +--- + +This page lists software contributed by the community that is related to OpenCilk, including OpenCilk-powered applications and libraries and +miscellaneous tools to help developers write Cilk programs. + +## OpenCilk-powered libraries + +The following third-party libraries are known to work with OpenCilk out of the +box for parallel execution. + +- [SG-t-SNE-Π](https://github.com/fcdimitr/sgtsnepi): Low-dimensional embedding + of sparse stochastic graphs. +- [FGLT](https://github.com/ailiop/fglt): Fast graphlet transform. +- [RecFMM](https://github.com/zhang416/recfmm): Adaptive fast multipole method. +- [ParlayLib](https://github.com/cmuparlay/parlaylib): A toolkit for programming parallel algorithms on shared-memory multicore machines. + +## OpenCilk-powered applications + +The following third-party applications are known to work with OpenCilk. + +- [The Problem Based Benchmark Suite (V2)](https://cmuparlay.github.io/pbbsbench/): A collection of over 20 benchmarks defined in terms of their IO characteristics. +- [GBBS: Graph Based Benchmark Suite](https://github.com/ParAlg/gbbs): A collection of fast parallel graph algorithms. +- [mold](https://github.com/wheatman/mold): A port of the mold linker to OpenCilk. 
+ +## Miscellaneous developer tools + +- [cilk-mode.el](https://github.com/ailiop/cilk-mode/): Emacs minor mode for + Cilk source code. + +## Contribute + +Want your OpenCilk-powered software listed here? Contact us at [contact@opencilk.org](mailto:contact@opencilk.org). + +You can find more information on contributing to OpenCilk [here](/contribute). diff --git a/src/community/teach performance.md b/src/community/teach performance.md index 63a4f7d9..657e1fc0 100644 --- a/src/community/teach performance.md +++ b/src/community/teach performance.md @@ -6,47 +6,68 @@ eleventyNavigation: parent: Community --- -OpenCilk is an important component of a portfolio of software technologies and techniques that allow engineers to develop fast code for applications that run on commodity and cloud multicore computers. As the economics of the semiconductor industry, coupled with the end of many years of Moore's-Law and Dennard scaling, spell the end of a long run of historical performance gains due to technology improvements alone, the importance of _performance engineering_ to continue to improve performance is growing in importance. Continued performance growth is vital for continued advances in vital computing domains such as machine learning, simulation of physical phenomena, and computer security. Indeed, this need has made performance engineering a national strategic priority. +The end of Moore's Law makes software performance engineering a priority for modern computer-science curricula. +OpenCilk enables you to teach principles of multicore computing using a state-of-the-art task-parallel platform that is easy to learn. -Traditional computer science and engineering curricula feature courses that concentrate on one particular technology, such as compilers, operating systems, algorithms, or machine learning. 
In contrast, performance engineering brings together technologies and techniques that span many layers of a computing system, including architecture, compilers, programming languages, parallel computing theory and implementation, and algorithms. +## List of classes and workshops -OpenCilk is well-positioned as an important component to help academics teach performance engineering because of its simple language, small codebase, and mathematically provable guarantees of good performance. +To help you develop your own course or module on performance engineering, we are compiling a list of relevant classes and workshops with materials that you can adapt. Do you have your own class or module to add to our list? Please [let us know](/contribute/contact/). +- [Performance engineering of software systems](#performance-engineering-of-software-systems) +- [Modern algorithms workshop: parallel algorithms](#modern-algorithms-workshop:-parallel-algorithms) -The performance engineering community is eager to attract faculty to join this community and teach performance engineering material in new or existing courses. We wish to make adoption of this material as easy as possible and hope faculty with interest in this topic will help grow our community and build and extend its relevant course material. +Each listing includes links to basic materials (e.g., lecture PDFs). Additional materials (e.g., editable slide decks, and solutions for homeworks and quizzes) are often also available, as detailed below, when you [join the OpenCilk community](../join-us/). -## Topics Covered +## Performance engineering of software systems -At MIT, _Performance Engineering of Software Systems_ is an upper-division undergraduate course with prereqs of introductory computer architecture, algorithms, and programming courses. Other faculty have adapted this material as an introductory graduate course. The class uses the C language with OpenCilk task-parallel extensions. 
[Recent _Performance Engineering of Software Systems_ offerings](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/), which provide free and open course material for adoption, have featured roughly 20 lectures on the following topics: +At MIT, _Performance Engineering of Software Systems_ is an upper-division undergraduate course with prereqs of introductory computer architecture, algorithms, and programming courses. Other faculty have adapted this material as an introductory graduate course. The class uses the C language with OpenCilk task-parallel extensions. Materials from Fall 2018 are available on [MIT Open CourseWare](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/), which includes 23 lectures (listed below), [10 homeworks](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/pages/assignments), [4 projects](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/pages/projects), [4 quizzes](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/pages/quizzes), and [practice problems](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/pages/recitation-problems) from selected recitations. -- Intro and Matrix Multiplication -- Bentley Rules -- Bit Hacks -- Architecture and Vectorization -- C to Assembly -- Compilers -- Multicore Programming -- Races and Parallelism -- Analysis of Parallel Algorithms -- Measurement and Timing -- Cilk Runtime System -- Caching and Cache Efficient Algorithms -- Cache Oblivious Algorithms -- Synchronization -- Speculative Parallelism +#### Lectures -## Projects +1. 
Introduction & Matrix Multiplication ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec1/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-1-intro-and-matrix-multiplication/)) +1. Bentley Rules for Optimizing Work ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec2/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-2-bentley-rules-for-optimizing-work/)) +1. Bit Hacks ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec3/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-3-bit-hacks/)) +1. Assembly Language and Computer Architecture ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec4/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-4-assembly-language-computer-architecture/)) +1. C to Assembly ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec5/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-5-c-to-assembly/)) +1. Multicore Programming ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec6/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-6-multicore-programming/)) +1. 
Races and Parallelism ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec7/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-7-races-and-parallelism/)) +1. Analysis of Multithreaded Algorithms ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec8/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-8-analysis-of-multithreaded-algorithms/)) +1. What Compilers Can and Cannot Do ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec9/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-9-what-compilers-can-and-cannot-do/)) +1. Measurement and Timing ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec10/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-10-measurement-and-timing/)) +1. Storage Allocation ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec11/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-11-storage-allocation/)) +1. Parallel Storage Allocation ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec12/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-12-parallel-storage-allocation/)) +1. 
The Cilk Runtime System ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec13/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-13-the-cilk-runtime-system/)) +1. Caching and Cache-Efficient Algorithms ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec14/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-14-caching-and-cache-efficient-algorithms/)) +1. Cache-Oblivious Algorithms ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec15/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-15-cache-oblivious-algorithms/)) +1. Nondeterministic Parallel Programming ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec16/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-16-nondeterministic-parallel-programming/)) +1. Synchronization Without Locks ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec17/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-17-synchronization-without-locks/)) +1. Domain Specific Languages and Autotuning ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec18/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-18-domain-specific-languages-and-autotuning/)) +1. 
Leiserchess Codewalk ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec19/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-19-leiserchess-codewalk/)) +1. Speculative Parallelism & Leiserchess ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec20/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-20-speculative-parallelism-leiserchess/)) +1. Tuning a TSP Algorithm ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec21/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-21-tuning-a-tsp-algorithm/)) +1. Graph Optimization ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec22/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-22-graph-optimization/)) +1. High Performance in Dynamic Languages ([PDF](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/mit6_172f18_lec23/), [video](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/resources/lecture-23-high-performance-in-dynamic-languages/)) MIT's course has a significant homework/project component. Enabling students to achieve high performance on project submissions is a significant goal of the course. MIT typically uses servers from commercial cloud providers (historically AWS) to accept project submissions and measure performance. - Recent projects have included rotating an image, parallelization of a physical simulation, and memory management. 
The traditional MIT capstone project is a bot that plays one side of a 2-player game, where optimized bot performance is a necessity for a competitive submission. -## What can we provide? - -Upon [request](link-here), we can provide: +[Join the OpenCilk community](../join-us/) for access to - PowerPoint source for slides -- Video recordings of a past course offering's lectures - LaTeX source for homework and projects from a past course offering - Reference code for homeworks and projects - A library of past quiz and exam questions -[![](/img/mit-ocw-6-172.png)](https://ocw.mit.edu/courses/6-172-performance-engineering-of-software-systems-fall-2018/) +## Modern algorithms workshop: parallel algorithms + +Originally created as a single full-day class, this workshop includes an introduction and 8 separate modules listed below. + +- Cilk model +- Detecting nondeterminism +- What is parallelism? +- Scheduling theory primer +- Analysis of parallel loops +- Case study: matrix multiplication +- Case study: Jaccard similarity +- Post-Moore software + +[Join the OpenCilk community](../join-us/) for access to editable slide decks. + diff --git a/src/contribute.md index 5a0258dd..1f21e4a9 100644 --- a/src/contribute.md +++ b/src/contribute.md @@ -11,6 +11,20 @@ The OpenCilk project welcomes your expertise and enthusiasm. A few specific opportunities are listed below. If you see anything interesting, or have ideas that we haven't thought of, please [contact us](/contribute/contact/). + +## Teaching + +{% imgLeft "/img/owl.png", 60 %} +Are you teaching with OpenCilk? + +We hope you'll join our [community of educators](/community/teach-performance) who are developing resources for teaching topics in software performance engineering. + +## Documenting +Are you doing something with performance engineering that you want people to hear about? Are you writing how-tos or tutorials to help students with OpenCilk? We would love to hear! 
+ Have you prepared a presentation, video, or other educational materials about OpenCilk? Let us know! If you’re unsure where to start or how your skills fit in, [reach out](/contribute/contact/)! + +## Open projects +We are looking for people to help us with several OpenCilk projects, including developing an OpenCilk language server, improving Cilksan reporting, and fixing relevant debuggers to interface properly with the OpenCilk runtime system. See [open projects](./open-projects) for more. + ## Testing We are interested in your experiences or issues in installing and writing code and running OpenCilk programs. We are also interested in collecting performance figures for different computer systems. @@ -18,8 +32,8 @@ We are also interested in collecting performance figures for different computer - Please report runtime bugs at https://github.com/OpenCilk/cheetah/issues - Please report website and documentation bugs at https://github.com/OpenCilk/www.opencilk.org/issues -## Porting -You can help by porting OpenCilk to other platforms. +## Porting and scripting +You can help by porting OpenCilk to other platforms, and writing scripts to automate release testing. ## Coding Want to customize your own compiler or runtime environment? Check out OpenCilk code that's maintained in these GitHub repositories: @@ -31,18 +45,6 @@ Want to customize your own compiler or runtime environment? Check out OpenCilk c In addition to developing the OpenCilk codebase, we need your help extending existing code libraries to run in parallel with OpenCilk. Notable opportunities include the C++ Standard Template Library (STL) and the [GraphBLAS](https://graphblas.org/) Graph Linear Algebra API. Also improving the productivity tools. -## Scripting -Writing scripts to automate release testing. - -## Documenting -Are you doing something with performance engineering that you want people to hear about? Are you writing how-tos or tutorials to help students with OpenCilk? We would love to hear! 
Have you prepared a presentation, video, or other educational materials about OpenCilk? Let us know! If you’re unsure where to start or how your skills fit in, [reach out](https://github.com/OpenCilk/opencilk-project/discussions)! - -## Educating -Are you teaching with OpenCilk? We welcome your contributions in this area. - -## Forums -Ask or answer questions on the OpenCilk forums. - ## Postdoc Positions The Supertech research group in the MIT Computer Science and Artificial Intelligence Laboratory seeks multiyear Postdoctoral Associates to join the OpenCilk development team led by Professor Charles E. Leiserson, Dr. Tao B. Schardl, and Research Scientist Dorothy Curtis. The open-source OpenCilk software platform, a new implementation of the Cilk parallelprogramming platform, will feature a superior compiler based on the Tapir/LLVM compiler (Best Paper, PPoPP 2017), a new work-stealing runtime system, and a suite of parallelprogramming productivity tools. Candidates should be recent Ph.D. graduates in computer science and engineering with excellent C/C++ programming skills and publications in one or more of the following areas: diff --git a/src/contribute/contact.md b/src/contribute/contact.md index 6c6ca667..f1ba6e67 100644 --- a/src/contribute/contact.md +++ b/src/contribute/contact.md @@ -3,12 +3,12 @@ layout: layouts/page.njk title: Contact us eleventyNavigation: key: Contact - order: 0 + order: 99 --- Do you have questions or comments about OpenCilk, our open-source community, or software performance engineering? -We would love to hear from you. +We would love to hear from you. Please use the form below or email `contact@opencilk.org`. Thank you!
diff --git a/src/contribute/contribute.11tydata.js b/src/contribute/contribute.11tydata.js new file mode 100644 index 00000000..9fd586d9 --- /dev/null +++ b/src/contribute/contribute.11tydata.js @@ -0,0 +1,54 @@ +require('dotenv').config(); + +const isDevEnv = process.env.ELEVENTY_ENV === 'development'; +const todaysDate = new Date(); + +function showDraft(data) { + const isDraft = 'draft' in data && data.draft !== false; + const isFutureDate = data.page.date > todaysDate; + return isDevEnv || (!isDraft && !isFutureDate); +} + +module.exports = function() { + return { + eleventyComputed: { + eleventyExcludeFromCollections: function(data) { + if(showDraft(data)) { + return data.eleventyExcludeFromCollections; + } + else { + return true; + } + }, + permalink: function(data) { + if(showDraft(data)) { + return data.permalink + } + else { + return false; + } + }, + eleventyNavigation: { + key: function(data) { + if(showDraft(data)) { + return data.title + } + else { + return false; + } + } + }, + sidebar: function(data) { + return 'toc'; + }, + background: function(data) { + if(('draft' in data && data.draft !== false) || (data.page.date > todaysDate)) { + return 'text-white bg-info' + } + else { + return 'bg-white'; + } + } + } + } +} \ No newline at end of file diff --git a/src/contribute/open-projects.md b/src/contribute/open-projects.md new file mode 100644 index 00000000..6cdceea6 --- /dev/null +++ b/src/contribute/open-projects.md @@ -0,0 +1,23 @@ +--- +layout: layouts/page.njk +sidebar: toc +title: Open projects +eleventyNavigation: + key: Open projects + order: 1 +--- + +We are looking for people to help us with the following projects with OpenCilk. If you see anything interesting, please [contact us](/contribute/contact/). + +## Language server + +Develop an **OpenCilk [language server](https://microsoft.github.io/language-server-protocol/)** that integrates with OpenCilk's tools. 
For example, the language server would integrate with Cilksan to allow editors to mark locations in the program's source that are involved in a determinacy race. + +## Cilksan reporting + +Improve the output of Cilksan to syntax-highlight names of functions in the call stack, especially the namespaces and types in C++ function names. + +## Debuggers + +- Fix GDB to correctly identify stack frames in the cactus stack of a Cilk program. +- Fix the RR debugger's behavior to handle Cilk's stack switching when rewinding the parallel execution of a Cilk program. \ No newline at end of file diff --git a/src/doc/doc.11tydata.js b/src/doc/doc.11tydata.js index 0ede035c..9fd586d9 100644 --- a/src/doc/doc.11tydata.js +++ b/src/doc/doc.11tydata.js @@ -1,9 +1,54 @@ -module.exports = { - eleventyComputed: { +require('dotenv').config(); + +const isDevEnv = process.env.ELEVENTY_ENV === 'development'; +const todaysDate = new Date(); + +function showDraft(data) { + const isDraft = 'draft' in data && data.draft !== false; + const isFutureDate = data.page.date > todaysDate; + return isDevEnv || (!isDraft && !isFutureDate); +} + +module.exports = function() { + return { + eleventyComputed: { + eleventyExcludeFromCollections: function(data) { + if(showDraft(data)) { + return data.eleventyExcludeFromCollections; + } + else { + return true; + } + }, + permalink: function(data) { + if(showDraft(data)) { + return data.permalink + } + else { + return false; + } + }, eleventyNavigation: { - key: data => data.title + key: function(data) { + if(showDraft(data)) { + return data.title + } + else { + return false; + } + } }, - sidebar: 'toc', - background: 'bg-white' - } - }; \ No newline at end of file + sidebar: function(data) { + return 'toc'; + }, + background: function(data) { + if(('draft' in data && data.draft !== false) || (data.page.date > todaysDate)) { + return 'text-white bg-info' + } + else { + return 'bg-white'; + } + } + } + } +} \ No newline at end of file diff --git 
a/src/doc/reference/cilkscale.md b/src/doc/reference/cilkscale.md new file mode 100644 index 00000000..8d2c1f41 --- /dev/null +++ b/src/doc/reference/cilkscale.md @@ -0,0 +1,436 @@ +--- +title: Cilkscale reference +tags: + - cilkscale + - tools +date: 2022-09-01 +author: Alexandros-Stavros Iliopoulos +eleventyNavigation: + key: Cilkscale reference +--- + +The OpenCilk Cilkscale tool comprises three main components: + +- Infrastructure in the OpenCilk compiler and runtime system for work/span + analysis, +- A C/C++ API for fine-grained analysis of program regions, and +- A Python script that automates scalability analysis, benchmarking on multiple + cores, and visualization of parallel performance results. + +This reference page summarizes the work/span analysis measurements reported by +Cilkscale, and details the interface, options, and output of each component. +To learn more about how to use Cilkscale to analyze the parallel performance of +your Cilk program, see the [Cilkscale user's +guide](/doc/users-guide/cilkscale). + +{% alert "info" %} + +_**Note:**_ The terminal command examples in this page assume that OpenCilk is +installed within `/opt/opencilk/`, as shown in the [Install +page](/doc/users-guide/install/#example). + +{% endalert %} + + +## Work/span analysis measurements + +Cilkscale work/span analysis reports contain the following measurements for +each analyzed program region. + +- {% defn "Work" %}: the CPU time of the computation when run on one processor, + sometimes denoted $T_1$. The actual wall-clock time it takes to run the + computation in parallel will generally be smaller than the work, since the + latter adds together the time spent on different CPU cores. + +- {% defn "Span" %}: the theoretically fastest CPU time of the computation when + run on an infinite number of parallel processors (discounting overheads for + communication and scheduling), sometimes denoted $T_{\infty}$. 
The span is + the maximum amount of work along any path in the {% defn "parallel trace" %} + of the computation. + +- {% defn "Parallelism" %}: the ratio of work to span for a computation $(T_1 / + T_{\infty})$. Parallelism can be interpreted as the maximum possible speedup + of the computation, or as the maximum number of processors that could + theoretically yield {% defn "perfect linear speedup" %}. + +- ***Burdened span***: similar to span after accounting for worst-case + scheduling overhead or "burden". The scheduling burden is based on a + heuristic estimate of the costs associated with migrating and synchronizing + parallel tasks among processors. The worst-case scenario is when every time + it is possible for a task to be migrated, the scheduler does migrate it. (In + practice, there are additional factors besides scheduling overhead that can + slow down parallel execution, such as insufficient memory bandwidth, + contention on parallel resources, false sharing, etc.) + +- ***Burdened parallelism***: the ratio of work to burdened span. It can be + interpreted as a lower bound for the parallelism of the computation assuming + worst-case parallel scheduling (and ignoring other possible factors of + parallel slowdown). + +{% alert "info" %} + +_**References:**_ + +- Y. He, C.E. Leiserson, and W.M. Leiserson, [_The Cilkview scalability + analyzer_](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cilkview.pdf), + SPAA 2010, pp. 145–156. +- T.B. Schardl, B.C. Kuszmaul, I.T.A. Lee, W.M. Leiserson, and C.E. Leiserson, + [_The Cilkprof scalability + profiler_](http://supertech.csail.mit.edu/papers/cilkprof.pdf), SPAA 2015, + pp. 89–100. + +{% endalert %} + + +## Compiler options + +- `-fcilktool=cilkscale` + Instrument the program to measure work and span in + seconds. Work/span measurements in seconds are non-deterministic. 
+ +- `-fcilktool=cilkscale-instructions` + Instrument the program to measure work and span in LLVM pseudo-instructions: + weighted counts of architecture-independent instructions based on LLVM's + internal representation and cost model. Work/span measurements in + pseudo-instructions are deterministic but introduce higher runtime overhead + than measurements in seconds. + +- `-fcilktool=cilkscale-benchmark` + Instrument the program to measure wall-clock execution time (in seconds) + instead of work and span. + +{% alert "primary" %} + +_**Example:**_ +```shell-session +$ /opt/opencilk/bin/clang qsort.c -fopencilk -fcilktool=cilkscale -O3 -o qsort_cs +$ /opt/opencilk/bin/clang qsort.c -fopencilk -fcilktool=cilkscale-instructions -O3 -o qsort_csinstr +$ /opt/opencilk/bin/clang qsort.c -fopencilk -fcilktool=cilkscale-benchmark -O3 -o qsort_cs_bench +``` + +{% endalert %} + +{% alert "info" %} + +_**Note:**_ The Cilkscale instrumentation flags must be used for both +compilation and linking. + +{% endalert %} + + +## Analysis report file + +When a Cilkscale-instrumented program is executed, Cilkscale reports its +measurements by printing them to the standard output stream by default. To +output Cilkscale measurements into a file instead of the standard output, set +the desired file path as the value of the environment variable `CILKSCALE_OUT`. + +{% alert "primary" %} + +_**Example:**_ + +```shell-session +$ CILKSCALE_OUT=qsort_workspan_report.csv ./qsort_cs 100000000 +[...program output without Cilkscale's report...] +$ cat qsort_workspan_report.csv +tag,work (seconds),span (seconds),parallelism,burdened_span (seconds),burdened_parallelism +,26.3454,2.22239,11.8545,2.22271,11.8528 +``` + +{% endalert %} + +{% alert "info" %} + +_**Note:**_ Cilkscale assumes that the path in `CILKSCALE_OUT` points to a file +in an existing directory. + +- If the directory does not exist, the report is printed to the standard output + instead. 
+- If the file already exists, the report will overwrite the file's contents. + +{% endalert %} + + +## C/C++ API for fine-grained analysis + +The Cilkscale C/C++ API enables fine-grained analysis of specific code regions. +If the program is compiled with the flag `-fcilktool=cilkscale` or +`-fcilktool=cilkscale-instructions`, the Cilkscale API functions measure work +and span (in seconds or pseudo-instructions, respectively) as described below. +If, however, the program is compiled with the flag +`-fcilktool=cilkscale-benchmark`, then the functions below measure wall-clock +execution time instead of work and span. + +{% alert "info" %} + +_**Note:**_ Calls to the Cilkscale API functions are elided if the program is +compiled without any of the Cilkscale instrumentation flags. + +{% endalert %} + +### Cilkscale API header file + +```c +#include +``` + +### Work/span measurement type + +```c +wsp_t workspan_measurement_var; +``` + +### Get work/span measurement at point + +```c +wsp_t wsp_getworkspan() +``` + +Return the work and span of the computation from the beginning of the program +up to the point of the `wsp_getworkspan()` function call in the program's +parallel trace. + +### Zero-initialized work/span variable + +```c +wsp_t wsp_zero() +``` + +Return a value with zero work and span measurements. Needed to initialize +work/span accumulation variables. + +### Work/span difference + +```c +wsp_t wsp_sub(wsp_t lhs, wsp_t rhs) +``` + +Return the work/span difference between the `lhs` and `rhs` measurements. +Useful for measuring the work and span of program regions between +`wsp_getworkspan()` calls. + +### Work/span sum + +```c +wsp_t wsp_add(wsp_t lhs, wsp_t rhs) +``` + +Return the work/span sum of the `lhs` and `rhs` measurements. Useful when +accumulating work/span measurements of non-contiguous program regions. 
+ +### Print CSV row of work/span measurements + +```c +void wsp_dump(wsp_t wsp, const char *tag) +``` + +Print a tagged row with the measurements of `wsp` in [CSV +format](https://en.wikipedia.org/wiki/Comma-separated_values). The fields in +the printed row are, in order: the `tag` string, work, span, parallelism, +burdened span, and burdened parallelism. + +See also: [Cilkscale work/span analysis +measurements](#workspan-analysis-measurements). + +### C++ operator overloads + +In C++, the Cilkscale API also defines operator overloads for `wsp_t` +variables: + +- The `-` and `-=` operators correspond to calls to `wsp_sub()`. +- The `+` and `+=` operators correspond to calls to `wsp_add()`. +- The `<<` operator can be used with a prefix argument of type `std::ostream` + or `std::ofstream` to print work/span measurements. The `<<` operator + behaves similarly to `wsp_dump()`, except that (1) it does not print a tag + field, and (2) its output stream is unaffected by the `CILKSCALE_OUT` + environment variable. + +### Examples + +{% alert "primary" %} + +_**Example 1:**_ Measure the work and span of a computation in a contiguous +code region. + +```c +wsp_t start = wsp_getworkspan(); +/* ...analyzed computation... */ +wsp_t end = wsp_getworkspan(); +wsp_t elapsed = wsp_sub(end, start); +wsp_dump(elapsed, "my computation"); +``` + +{% endalert %} + +{% alert "primary" %} + +_**Example 2:**_ Measure the work and span of an iterative computation. The +analyzed code region is non-contiguous in the program's execution trace. + +```c +wsp_t wsp_iter = wsp_zero(); +while (iteration_condition) { + /* ...non-analyzed code... (e.g., printing to logs, diagnostic computations, etc) */ + wsp_t start = wsp_getworkspan(); + /* ...analyzed iterative computation... 
*/ + wsp_t end = wsp_getworkspan(); + wsp_t elapsed = wsp_sub(end, start); + wsp_iter = wsp_add(wsp_iter, elapsed); +} +wsp_dump(wsp_iter, "iterative computation"); +``` + +{% endalert %} + + +## Benchmarking and visualization Python script + +The Cilkscale benchmarking and visualization Python script is found at +`share/Cilkscale_vis/cilkscale.py` within the OpenCilk installation directory. +It takes as input two Cilkscale-instrumented binaries of the same application +and a number of optional arguments. Its output is a table and set of graphical +plots of parallel performance and scalability measurements. + +{% alert "warning" %} + +_**Prerequisites:**_ To use the `cilkscale.py` script, you need: + +- [Python](https://www.python.org/downloads/) 3.8 or later. +- (Optional) [matplotlib](https://pypi.org/project/matplotlib/) 3.5.0 or later; + only required if producing graphical plots. + +{% endalert %} + +### Running the `cilkscale.py` script + +```shell-session +$ python3 /opt/opencilk/share/Cilkscale_vis/cilkscale.py ARGUMENTS +``` + +#### Arguments + +- `-h`, `--help` + Print usage documentation and exit. + +- `-c BIN_CILKSCALE`, `--cilkscale BIN_CILKSCALE` + _(Required)_ Path to program binary instrumented with `-fcilktool=cilkscale`. + +- `-b BIN_CILKSCALE_BENCH`, `--cilkscale-benchmark BIN_CILKSCALE_BENCH` + _(Required)_ Path to program binary instrumented with + `-fcilktool=cilkscale-benchmark`. + +- `-cpus CPU_COUNTS`, `--cpu-counts CPU_COUNTS` + _(Optional)_ Comma-separated list of how many cores to use when running + empirical performance benchmarks. Cilkscale runs the benchmark binary once + for each core count in the list. For each run, the number of parallel Cilk + workers is equal to the number of cores, and the latter are specified + explicitly by setting the [processor + affinity](https://en.wikipedia.org/wiki/Processor_affinity) of the run. 
On + systems with [simultaneous multithreading + (SMT)](https://en.wikipedia.org/wiki/Simultaneous_multithreading) (aka + "hyper-threading" on Intel CPUs), Cilkscale only uses distinct physical + cores. On systems with multiple processor nodes with [non-uniform memory + access (NUMA)](https://en.wikipedia.org/wiki/Non-uniform_memory_access), + Cilkscale + selects all cores from one processor before moving on to other processors. + _Default:_ `1,2,...,P`, where `P` is the total number of available physical + cores. + +- `-ocsv OUTPUT_CSV`, `--output-csv OUTPUT_CSV` + _(Optional)_ Path to CSV file for table of work/span and benchmarking measurements. + _Default:_ `out.csv`. + +- `-oplot OUTPUT_PLOT`, `--output-plot OUTPUT_PLOT` + _(Optional)_ Path to PDF file for graphical plots of work/span and + benchmarking measurements. + _Default:_ `plot.pdf` + +- `-rplot ROWS_TO_PLOT`, `--rows-to-plot ROWS_TO_PLOT` + _(Optional)_ Comma-separated list of rows (0-indexed) in the CSV table for + which to generate plots; or `all` to plot all rows. + _Default:_ `all`. + +- `-a ARG1 ARG2 ...`, `--args ARG1 ARG2 ...` + _(Optional)_ Space-separated list of command-line arguments to pass to the + program binary. + _Default:_ no arguments. 
+ +{% alert "primary" %} + +_**Example:**_ + +```shell-session +$ /opt/opencilk/bin/clang qsort.c -fopencilk -fcilktool=cilkscale -O3 -o qsort_cs +$ /opt/opencilk/bin/clang qsort.c -fopencilk -fcilktool=cilkscale-benchmark -O3 -o qsort_cs_bench +$ python3 /opt/opencilk/share/Cilkscale_vis/cilkscale.py \ + -c ./qsort_cs -b ./qsort_cs_bench \ + -ocsv qsort-bench.csv -oplot qsort-scalability-plots.pdf \ + --args 100000000 +Namespace(args=['100000000'], cilkscale='./qsort_cs', cilkscale_benchmark='./qsort_cs_bench', cpu_counts=None, output_csv='qsort-bench.csv', output_plot='qsort-scalability-plots.pdf', rows_to_plot='all') + +\>> STDOUT (./qsort_cilkscale 100000000) +Sorting 100000000 random integers +Sort succeeded +<< END STDOUT + +\>> STDERR (./qsort_cilkscale 100000000) +<< END STDERR + +INFO:runner:Generating scalability data for 8 cpus. +INFO:runner:CILK_NWORKERS=1 taskset -c 0 ./qsort_cilkscale_bench 100000000 +INFO:runner:CILK_NWORKERS=2 taskset -c 0,2 ./qsort_cilkscale_bench 100000000 +INFO:runner:CILK_NWORKERS=3 taskset -c 0,2,4 ./qsort_cilkscale_bench 100000000 +INFO:runner:CILK_NWORKERS=4 taskset -c 0,2,4,6 ./qsort_cilkscale_bench 100000000 +INFO:runner:CILK_NWORKERS=5 taskset -c 0,2,4,6,8 ./qsort_cilkscale_bench 100000000 +INFO:runner:CILK_NWORKERS=6 taskset -c 0,2,4,6,8,10 ./qsort_cilkscale_bench 100000000 +INFO:runner:CILK_NWORKERS=7 taskset -c 0,2,4,6,8,10,12 ./qsort_cilkscale_bench 100000000 +INFO:runner:CILK_NWORKERS=8 taskset -c 0,2,4,6,8,10,12,14 ./qsort_cilkscale_bench 100000000 +INFO:plotter:Generating plot (2 subplots) +``` + +{% endalert %} + +### Performance and scalability analysis plots + +An example set of plots produced by the `cilkscale.py` script is shown +below. In this example, the instrumented application is a parallel quicksort +and the Cilkscale API was used to analyze one program region (tagged as +"sampled_qsort" in the relevant call to `wsp_dump()`) in addition to the whole +program which is always analyzed by Cilkscale. 
Details on how these plots were +generated can be found in the [Cilkscale user's +guide](/doc/users-guide/cilkscale). + +{% img "/img/qsort-cilkscale-scalability-plots.png", "100%" %} + +The Cilkscale visualization plots are arranged in two columns and as many rows +as calls to the Cilkscale API `wsp_dump()` function (plus one untagged row for +the whole-program execution). + +**Wall-clock execution time.** The left-column plots show wall-clock execution +time in seconds as a function of the number of parallel Cilk workers. +Specifically, these figures plot four types of measurements: + +- Magenta-colored dots show the _observed_ timing measurements for the + benchmarking runs. Benchmarking runs do not involve work/span analysis + measurement overheads. +- A dark green line shows what the execution time would be if the computation + exhibited _perfect linear speedup_, that is, if the time on $P$ cores were to + be $P$ times smaller than the time it took on one core. +- A teal line shows the heuristic _burdened-dag bound_ of the execution time + (the parallel trace of the computation is sometimes also referred to as its + directed acyclic graph or dag). In the absence of other sources of parallel + slowdown such as insufficient memory bandwidth, contention, etc., the + burdened-dag bound serves as a heuristic lower bound for the execution time + if the parallel computation does not exhibit sufficient parallelism and is + not too fine-grained. +- A mustard-yellow horizontal line shows the _span bound_, that is, the minimum + possible execution time if the computation was run on infinitely many + processing cores and there were no additional overheads for parallel + scheduling, etc. + +**Parallel speedup.** The right-column plots contain the same information as +those in the left column, except that the y-axis shows parallel speedup. That +is, all execution time measurements are divided by the execution time of the +computation on one core. 
The horizontal line for parallelism (serial execution +time divided by span) is not visible in the speedup plots if its value falls +outside the range of the y-axis. diff --git a/src/doc/reference/glossary.md b/src/doc/reference/glossary.md index 7e3e9667..1fcc42bc 100644 --- a/src/doc/reference/glossary.md +++ b/src/doc/reference/glossary.md @@ -1,6 +1,7 @@ --- title: Glossary stylesheet: glossary.css +attribution: true eleventyNavigation: key: Glossary parent: Reference diff --git a/src/doc/reference/glossary/atomic.md b/src/doc/reference/glossary/atomic.md new file mode 100644 index 00000000..77da80ef --- /dev/null +++ b/src/doc/reference/glossary/atomic.md @@ -0,0 +1,11 @@ +--- +title: Atomic +tags: atomic +--- +Indivisible. An {% defn "instruction" %} sequence +executed by a {% defn "strand" %} is atomic if it +appears at any moment to any other +strand as if either no instructions in the +sequence have been executed or all +instructions in the sequence have been +executed. \ No newline at end of file diff --git a/src/doc/reference/glossary/centralized.md b/src/doc/reference/glossary/centralized.md deleted file mode 100644 index 2ff7b2ff..00000000 --- a/src/doc/reference/glossary/centralized.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Centralized -tags: Centralized ---- - -A good scheduler operates in a distributed fashion, where -the threads implementing the scheduler cooperate to load-balance the computation. -Provably good online, distributed schedulers exist, but analyzing them is complicated. -To keep the analysis simple, one may consider an online *centralized* -scheduler that knows the global state of the computation at any moment. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/child.md b/src/doc/reference/glossary/child.md deleted file mode 100644 index cf7a483a..00000000 --- a/src/doc/reference/glossary/child.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Child -tags: child ---- - -A *child* process is one that results from a spawn. -It may run in parallel with the procedure that executed the spawn—its {% defn "parent" %}. diff --git a/src/doc/reference/glossary/chip_multiprocessor.md b/src/doc/reference/glossary/chip_multiprocessor.md new file mode 100644 index 00000000..38ca76a6 --- /dev/null +++ b/src/doc/reference/glossary/chip_multiprocessor.md @@ -0,0 +1,6 @@ +--- +title: Chip multiprocessor +tags: Chip multiprocessor +--- +A general-purpose {% defn "multiprocessor" %} +implemented as a single {% defn "multicore" %} chip. \ No newline at end of file diff --git a/src/doc/reference/glossary/cilk.md b/src/doc/reference/glossary/cilk.md new file mode 100644 index 00000000..f64e73d1 --- /dev/null +++ b/src/doc/reference/glossary/cilk.md @@ -0,0 +1,11 @@ +--- +title: Cilk +tags: cilk +--- +An extension to the C and C++ +programming languages that allows a +programmer to express {% defn "task-parallel", "task parallelism" %}. +Important milestones in the history of Cilk include +{% defn "Cilk-1" %}, {% defn "Cilk-5" %}, +{% defn "Cilkpp", "Cilk++" %}, {% defn "Cilk Plus" %}, +and {% defn "OpenCilk" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/cilk_1.md b/src/doc/reference/glossary/cilk_1.md new file mode 100644 index 00000000..5a09fa52 --- /dev/null +++ b/src/doc/reference/glossary/cilk_1.md @@ -0,0 +1,6 @@ +--- +title: Cilk-1 +tags: cilk +--- +The first important milestone in the history of {% defn "Cilk" %}, +Cilk-1 was developed at MIT and provided provably efficient work-stealing runtime support but little linguistic support. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/cilk_5.md b/src/doc/reference/glossary/cilk_5.md new file mode 100644 index 00000000..cf6c1692 --- /dev/null +++ b/src/doc/reference/glossary/cilk_5.md @@ -0,0 +1,8 @@ +--- +title: Cilk-5 +tags: cilk +--- +The next important milestone after {% defn "Cilk-1" %} +in the history of {% defn "Cilk" %}, +Cilk-5 was developed at MIT and provided simple +linguistic extensions to ANSI C for multithreading. \ No newline at end of file diff --git a/src/doc/reference/glossary/cilk_for.md b/src/doc/reference/glossary/cilk_for.md new file mode 100644 index 00000000..b0e09979 --- /dev/null +++ b/src/doc/reference/glossary/cilk_for.md @@ -0,0 +1,8 @@ +--- +title: cilk_for +tags: cilk_for +--- +A keyword in the Cilk language that +indicates a `for` loop whose iterations +can be executed independently in +parallel. \ No newline at end of file diff --git a/src/doc/reference/glossary/cilk_plus.md b/src/doc/reference/glossary/cilk_plus.md new file mode 100644 index 00000000..0fe3209d --- /dev/null +++ b/src/doc/reference/glossary/cilk_plus.md @@ -0,0 +1,7 @@ +--- +title: Cilk Plus +tags: cilk +--- +An important milestone in the history of {% defn "Cilk" %} +developed by Intel Corporation, Cilk Plus extended {% defn "Cilk++" %} +with transparent interoperability with legacy C/C++ binary executables. \ No newline at end of file diff --git a/src/doc/reference/glossary/cilk_scope.md b/src/doc/reference/glossary/cilk_scope.md new file mode 100644 index 00000000..aa6b4fc1 --- /dev/null +++ b/src/doc/reference/glossary/cilk_scope.md @@ -0,0 +1,5 @@ +--- +title: cilk_scope +tags: cilk_scope +--- +A keyword in the Cilk language that indicates that all spawned children within the scoped region must complete before proceeding. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/cilk_spawn.md b/src/doc/reference/glossary/cilk_spawn.md new file mode 100644 index 00000000..3fdde6c5 --- /dev/null +++ b/src/doc/reference/glossary/cilk_spawn.md @@ -0,0 +1,8 @@ +--- +title: cilk_spawn +tags: cilk_spawn +--- +A keyword in the Cilk language that +indicates that the named subroutine can +execute independently and in parallel +with the caller. \ No newline at end of file diff --git a/src/doc/reference/glossary/cilk_sync.md b/src/doc/reference/glossary/cilk_sync.md new file mode 100644 index 00000000..1a1a699f --- /dev/null +++ b/src/doc/reference/glossary/cilk_sync.md @@ -0,0 +1,9 @@ +--- +title: cilk_sync +tags: cilk_sync +--- +A keyword in the Cilk language that +indicates that all functions spawned +within the current function must complete +before statements following the +`cilk_sync` can be executed. \ No newline at end of file diff --git a/src/doc/reference/glossary/cilkpp.md b/src/doc/reference/glossary/cilkpp.md new file mode 100644 index 00000000..91ea4e80 --- /dev/null +++ b/src/doc/reference/glossary/cilkpp.md @@ -0,0 +1,8 @@ +--- +title: Cilk++ +permalink: /doc/reference/glossary/cilkpp/ +tags: cilk +--- +An important milestone in the history of {% defn "Cilk" %} +developed by CilkArts, Inc., Cilk++ extended {% defn "Cilk-5" %} +to C++ and introduced {% defn "reducer", "reducer hyperobjects" %} as an efficient means for resolving races on nonlocal variables. \ No newline at end of file diff --git a/src/doc/reference/glossary/cilksan.md b/src/doc/reference/glossary/cilksan.md new file mode 100644 index 00000000..d74a0cc1 --- /dev/null +++ b/src/doc/reference/glossary/cilksan.md @@ -0,0 +1,7 @@ +--- +title: Cilksan +tags: Cilksan +--- +The Cilksan race detector is a tool +provided in OpenCilk for +finding {% defn "race condition" %} defects in Cilk code. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/coarsen.md b/src/doc/reference/glossary/coarsen.md deleted file mode 100644 index 7f9bcd2a..00000000 --- a/src/doc/reference/glossary/coarsen.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Coarsen -tags: coarsen ---- - -To reduce the overhead of recursive spawning, task-parallel platforms sometimes -*coarsen* the leaves of the recursion by executing several iterations in a single leaf, -either automatically or under programmer control. This optimization comes at -the expense of reducing the parallelism. If the computation has sufficient parallel -slackness, however, near-perfect linear speedup won’t be sacrificed. \ No newline at end of file diff --git a/src/doc/reference/glossary/commutative_operation.md b/src/doc/reference/glossary/commutative_operation.md new file mode 100644 index 00000000..969f8789 --- /dev/null +++ b/src/doc/reference/glossary/commutative_operation.md @@ -0,0 +1,10 @@ +--- +title: Commutative operation +tags: commutative operation +--- +An operation $\circ$ over a type $T$ is +commutative if $a \circ b = b \circ a$ for +any two objects $a$ and $b$ of type $T$. +Integer addition and set union are +commutative, but string concatenation is +not. \ No newline at end of file diff --git a/src/doc/reference/glossary/complete-step.md b/src/doc/reference/glossary/complete-step.md deleted file mode 100644 index 7fc98708..00000000 --- a/src/doc/reference/glossary/complete-step.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Complete step -tags: complete step ---- - -As a {% defn "greedy scheduler" %} manages a computation on $P$ processors, each step is classified as *complete* or {% defn "incomplete step", "incomplete" %}. -In a complete step, at least $P$ strands are ready to execute, meaning that all strands -on which they depend have finished execution. A greedy scheduler assigns -any $P$ of the ready strands to the processors, completely utilizing all the processor resources. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/computation-dag.md b/src/doc/reference/glossary/computation-dag.md deleted file mode 100644 index 1ccadecc..00000000 --- a/src/doc/reference/glossary/computation-dag.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: Computation dag -tags: computation dag ---- - -See {% defn "parallel trace" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/concurrency.md b/src/doc/reference/glossary/concurrency.md deleted file mode 100644 index bd482481..00000000 --- a/src/doc/reference/glossary/concurrency.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Concurrency -tags: concurrency ---- - -With some parallel-programming models, a programmer cannot -accomplish anything significant without dealing with -*concurrency*, where the programmer herself mitigates -interactions between otherwise independent subcomputations. \ No newline at end of file diff --git a/src/doc/reference/glossary/concurrent_agent.md b/src/doc/reference/glossary/concurrent_agent.md new file mode 100644 index 00000000..3f85af47 --- /dev/null +++ b/src/doc/reference/glossary/concurrent_agent.md @@ -0,0 +1,8 @@ +--- +title: Concurrent agent +tags: concurrent agent +--- +A {% defn "processor" %}, {% defn "process" %}, {% defn "thread" %}, {% defn "strand" %}, or other entity that executes a program +instruction sequence in a computing +environment containing other such +entities. \ No newline at end of file diff --git a/src/doc/reference/glossary/core.md b/src/doc/reference/glossary/core.md new file mode 100644 index 00000000..92f2a273 --- /dev/null +++ b/src/doc/reference/glossary/core.md @@ -0,0 +1,10 @@ +--- +title: Core +tags: core +--- +A single {% defn "processor" %} unit of a {% defn "multicore" %} +chip. The terms "processor" and "{% defn "CPU" %}" +are often used in place of "core," +although industry usage varies. +*Archaic*: A solid-state memory made of +magnetized toroidal memory elements. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/cpu.md b/src/doc/reference/glossary/cpu.md new file mode 100644 index 00000000..796e36a7 --- /dev/null +++ b/src/doc/reference/glossary/cpu.md @@ -0,0 +1,7 @@ +--- +title: CPU +tags: cpu +--- +Central Processing Unit. We use this +term as a synonym for "{% defn "core" %}," or a single +processor of a multicore chip. \ No newline at end of file diff --git a/src/doc/reference/glossary/critical-path.md b/src/doc/reference/glossary/critical-path.md deleted file mode 100644 index 2b21b0cf..00000000 --- a/src/doc/reference/glossary/critical-path.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Critical path -tags: critical path ---- - -The *critical path* of a task-parallel trace is the -longest path, weighted by execution time. -The weight of the critical path is called the {% defn "span" %} of the trace. \ No newline at end of file diff --git a/src/doc/reference/glossary/critical_path_length.md b/src/doc/reference/glossary/critical_path_length.md new file mode 100644 index 00000000..11b131b9 --- /dev/null +++ b/src/doc/reference/glossary/critical_path_length.md @@ -0,0 +1,5 @@ +--- +title: Critical-path length +tags: span +--- +See {% defn "span" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/critical_section.md b/src/doc/reference/glossary/critical_section.md new file mode 100644 index 00000000..8365266c --- /dev/null +++ b/src/doc/reference/glossary/critical_section.md @@ -0,0 +1,5 @@ +--- +title: Critical section +tags: critical section +--- +The code executed by a {% defn "strand" %} while holding a {% defn "lock" %}. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/data_race.md b/src/doc/reference/glossary/data_race.md new file mode 100644 index 00000000..14784ee4 --- /dev/null +++ b/src/doc/reference/glossary/data_race.md @@ -0,0 +1,10 @@ +--- +title: Data race +tags: data race +--- +A {% defn "race condition" %} that occurs when two or +more parallel strands, holding no {% defn "lock" %} in +common, access the same memory +location and at least one of the strands +performs a write. Compare with +{% defn "determinacy race" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/deadlock.md b/src/doc/reference/glossary/deadlock.md new file mode 100644 index 00000000..7eac250c --- /dev/null +++ b/src/doc/reference/glossary/deadlock.md @@ -0,0 +1,9 @@ +--- +title: Deadlock +tags: deadlock +--- +A situation when two or more {% defn "strand", "strands" %} +are each waiting for another to +release a resource, and the "waiting-for" +relation forms a cycle so that none can +ever proceed. \ No newline at end of file diff --git a/src/doc/reference/glossary/determinacy-race.md b/src/doc/reference/glossary/determinacy-race.md deleted file mode 100644 index 4bc05e62..00000000 --- a/src/doc/reference/glossary/determinacy-race.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -title: Determinacy race -tags: determinacy race ---- - -A *determinacy race* occurs when two logically parallel instructions access the -same memory location and at least one of the instructions modifies the value stored -in the location. For a computation with a determinacy race, -the results can vary depending on the how the instructions are scheduled on the multicore computer. -Often in practice, most instruction orderings produce correct results, -but some orderings generate improper results when the instructions interleave. -Consequently, races can be extremely hard to test for. 
-Task-parallel programming environments often provide race-detection -productivity tools to help you isolate race bugs. \ No newline at end of file diff --git a/src/doc/reference/glossary/determinacy_race.md b/src/doc/reference/glossary/determinacy_race.md new file mode 100644 index 00000000..cecfdd80 --- /dev/null +++ b/src/doc/reference/glossary/determinacy_race.md @@ -0,0 +1,8 @@ +--- +title: Determinacy race +tags: determinacy race +--- +A {% defn "race condition" %} that occurs when two +logically parallel {% defn "strand", "strands" %} access the same +memory location and at least one strand +performs a write. \ No newline at end of file diff --git a/src/doc/reference/glossary/determinism.md b/src/doc/reference/glossary/determinism.md new file mode 100644 index 00000000..fbb5d886 --- /dev/null +++ b/src/doc/reference/glossary/determinism.md @@ -0,0 +1,9 @@ +--- +title: Determinism +tags: determinism +--- +The property of a program when it +behaves identically from run to run when +executed on the same inputs. +Deterministic programs are usually +easier to debug. \ No newline at end of file diff --git a/src/doc/reference/glossary/deterministic.md b/src/doc/reference/glossary/deterministic.md deleted file mode 100644 index 3f2ca6da..00000000 --- a/src/doc/reference/glossary/deterministic.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Deterministic -tags: deterministic ---- - -A parallel algorithm is *deterministic* if it always does the same thing on the same -input, no matter how the instructions are scheduled on the multicore computer. -It is nondeterministic if its behavior might vary from run to run when the input is the same. -A parallel algorithm that is intended to be deterministic may nevertheless -act nondeterministically, however, if it contains a difficult-to-diagnose bug called a -determinacy race. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/distributed-memory.md b/src/doc/reference/glossary/distributed-memory.md deleted file mode 100644 index fea35576..00000000 --- a/src/doc/reference/glossary/distributed-memory.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Distributed memory -tags: distributed memory ---- - -Multicore clusters usually have a distributed memory, where one multicore's memory -cannot be accessed directly by a processor in another multicore. -Instead, the processor must explicitly send a message over the cluster -network to a processor in the remote multicore to request any data it requires. \ No newline at end of file diff --git a/src/doc/reference/glossary/distributed_memory.md b/src/doc/reference/glossary/distributed_memory.md new file mode 100644 index 00000000..0b3fb701 --- /dev/null +++ b/src/doc/reference/glossary/distributed_memory.md @@ -0,0 +1,10 @@ +--- +title: Distributed memory +tags: distributed memory +--- +Computer storage that is partitioned +among several {% defn "processor", "processors" %}. A distributed-memory {% defn "multiprocessor" %} is a computer in +which processors must send messages +to remote processors to access data in +remote processor memory. Contrast with +{% defn "shared memory" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/execution_time.md b/src/doc/reference/glossary/execution_time.md new file mode 100644 index 00000000..4685354c --- /dev/null +++ b/src/doc/reference/glossary/execution_time.md @@ -0,0 +1,6 @@ +--- +title: Execution time +tags: execution time +--- +How long a program takes to execute on a given computer system. +Also called {% defn "running time" %}. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/fake_lock.md b/src/doc/reference/glossary/fake_lock.md new file mode 100644 index 00000000..e91a21bc --- /dev/null +++ b/src/doc/reference/glossary/fake_lock.md @@ -0,0 +1,9 @@ +--- +title: Fake lock +tags: fake lock +--- +A construct that `Cilksan` treats as +a lock but which behaves like a no-op +during actual running of the program. A +fake lock can be used to suppress the +reporting of an intentional {% defn "race condition" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/false_sharing.md b/src/doc/reference/glossary/false_sharing.md new file mode 100644 index 00000000..bb838efd --- /dev/null +++ b/src/doc/reference/glossary/false_sharing.md @@ -0,0 +1,11 @@ +--- +title: False sharing +tags: false sharing +--- +The situation that occurs when two +{% defn "strand", "strands" %} access different memory +locations residing on the same cache +block, thereby contending for the cache +block. For more information, see the +Wikipedia entry +[https://en.wikipedia.org/wiki/False_sharing](https://en.wikipedia.org/wiki/False_sharing). \ No newline at end of file diff --git a/src/doc/reference/glossary/fork-join-parallelism.md b/src/doc/reference/glossary/fork-join-parallelism.md deleted file mode 100644 index 645e2300..00000000 --- a/src/doc/reference/glossary/fork-join-parallelism.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -title: Fork-join parallelism -tags: fork-join parallelism ---- - -Almost all task-parallel environments support *fork-join parallelism*, which is typically embodied -in two linguistic features: *spawning* and *parallel loops*. Spawning allows a subroutine to be “forked”: executed like a subroutine call, except that the caller can continue to execute while the spawned subroutine computes its result. A parallel -loop is like an ordinary `for` loop, except that multiple iterations of the loop can -execute at the same time. 
- -*Fork-join* parallel algorithms employ spawning and parallel loops to describe -parallelism. A key aspect of this parallel model, inherited from the task-parallel -model but different from the thread model, is that the programmer does not specify -which tasks in a computation must run in parallel, only which tasks may run in -parallel. The underlying runtime system uses threads to load-balance the tasks -across the processors. \ No newline at end of file diff --git a/src/doc/reference/glossary/global_variable.md b/src/doc/reference/glossary/global_variable.md new file mode 100644 index 00000000..81b52e90 --- /dev/null +++ b/src/doc/reference/glossary/global_variable.md @@ -0,0 +1,6 @@ +--- +title: Global variable +tags: global variable +--- +A variable that is bound outside of all local scopes. +See also {% defn "nonlocal variable" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/glossary.json b/src/doc/reference/glossary/glossary.json index d6b4ed60..0c27bcb5 100644 --- a/src/doc/reference/glossary/glossary.json +++ b/src/doc/reference/glossary/glossary.json @@ -4,5 +4,6 @@ "permalink": "/doc/reference/glossary/{{ title | slugify }}/", "eleventyNavigation": { "parent": "Glossary" - } + }, + "tags": "glossary" } \ No newline at end of file diff --git a/src/doc/reference/glossary/greedy-scheduler.md b/src/doc/reference/glossary/greedy-scheduler.md deleted file mode 100644 index 23294f05..00000000 --- a/src/doc/reference/glossary/greedy-scheduler.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Greedy scheduler -tags: greedy scheduler ---- - -A *greedy scheduler* assigns as many strands to -processors as possible in each time step, never leaving a processor idle if there is -work that can be done. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/hyperobject.md b/src/doc/reference/glossary/hyperobject.md new file mode 100644 index 00000000..41ba6525 --- /dev/null +++ b/src/doc/reference/glossary/hyperobject.md @@ -0,0 +1,13 @@ +--- +title: Hyperobject +tags: hyperobject +--- +A linguistic construct supported by the +OpenCilk runtime system +that allows many {% defn "strand", "strands" %} +to coordinate in updating a shared +variable or data structure independently +by providing different {% defn "view", "views" %} +of the hyperobject to different strands at +the same time. The {% defn "reducer" %} is the only +hyperobject currently provided by OpenCilk. \ No newline at end of file diff --git a/src/doc/reference/glossary/ideal-parallel-computer.md b/src/doc/reference/glossary/ideal-parallel-computer.md deleted file mode 100644 index 212d1400..00000000 --- a/src/doc/reference/glossary/ideal-parallel-computer.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Ideal parallel computer -tags: ideal parallel computer ---- - -Our analyses generally assume that parallel algorithms execute on an *ideal parallel -computer*, which consists of a set of processors and a [sequentially consistent](/doc/reference/glossary/#sequentially-consistent) -shared memory. -The ideal parallel-computer model also assumes that each processor in the machine has equal computing power, -and it ignores the cost of scheduling. -This last assumption may sound optimistic, but it turns out that -for algorithms with sufficient [parallelism](/doc/reference/glossary/#parallelism), -the overhead of scheduling is generally minimal in practice. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/incomplete-step.md b/src/doc/reference/glossary/incomplete-step.md deleted file mode 100644 index 5909e081..00000000 --- a/src/doc/reference/glossary/incomplete-step.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Incomplete step -tags: incomplete step ---- - -As a greedy scheduler manages a computation on $P$ processors, each step is classified as complete or incomplete. -In an *incomplete step*, fewer than $P$ strands are ready to execute. -A greedy scheduler assigns each ready strand to its own processor, leaving some processors -idle for the step, but executing all the ready strands. \ No newline at end of file diff --git a/src/doc/reference/glossary/instruction.md b/src/doc/reference/glossary/instruction.md new file mode 100644 index 00000000..b559073c --- /dev/null +++ b/src/doc/reference/glossary/instruction.md @@ -0,0 +1,5 @@ +--- +title: Instruction +tags: instruction +--- +A single operation executed by a {% defn "processor" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/invocation-tree.md b/src/doc/reference/glossary/invocation-tree.md deleted file mode 100644 index 009db933..00000000 --- a/src/doc/reference/glossary/invocation-tree.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Invocation tree -tags: invocation tree ---- - -A fork-join parallel trace can be pictured as a dag of strands embedded in an -*invocation tree* of procedure instances. -All directed edges connecting strands -run either within a procedure or along undirected edges of the invocation tree. -More general task-parallel traces that are not fork-join traces may -contain some directed edges that do not run along the undirected tree edges. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/knot.md b/src/doc/reference/glossary/knot.md new file mode 100644 index 00000000..0b0e9a42 --- /dev/null +++ b/src/doc/reference/glossary/knot.md @@ -0,0 +1,15 @@ +--- +title: Knot +tags: knot +--- +A point at which the end of one {% defn "strand" %} +meets the end of another. If a knot has +one incoming strand and one outgoing +strand, it is a *serial knot*. If it has one +incoming strand and two outgoing +strands, it is a *spawn knot*. If it has +multiple incoming strands and one +outgoing strand, it is a *sync knot*. A +Cilk program does not produce serial +knots or knots with both multiple +incoming and multiple outgoing strands. \ No newline at end of file diff --git a/src/doc/reference/glossary/linear-speedup.md b/src/doc/reference/glossary/linear-speedup.md deleted file mode 100644 index c061dc6c..00000000 --- a/src/doc/reference/glossary/linear-speedup.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Linear speedup -tags: linear speedup ---- - -When the speedup of a computation on $P$ processesors -is linear in the number of processors, that is, when -$T_1/T_P = \Theta(P)$, the computation exhibits *linear speedup*. \ No newline at end of file diff --git a/src/doc/reference/glossary/linear_speedup.md b/src/doc/reference/glossary/linear_speedup.md new file mode 100644 index 00000000..b90d23c0 --- /dev/null +++ b/src/doc/reference/glossary/linear_speedup.md @@ -0,0 +1,6 @@ +--- +title: Linear speedup +tags: linear speedup +--- +{% defn "Speedup" %} proportional to the {% defn "processor" %} count. +See also {% defn "perfect linear speedup" %}. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/load-instructions.md b/src/doc/reference/glossary/load-instructions.md deleted file mode 100644 index 4c788893..00000000 --- a/src/doc/reference/glossary/load-instructions.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Load instructions -tags: load instructions ---- - -Memory is accessed by load instructions and by store instructions. -*Load instructions* copy data from a location in -memory to a register within a processor. \ No newline at end of file diff --git a/src/doc/reference/glossary/lock.md b/src/doc/reference/glossary/lock.md new file mode 100644 index 00000000..512323af --- /dev/null +++ b/src/doc/reference/glossary/lock.md @@ -0,0 +1,12 @@ +--- +title: Lock +tags: lock +--- +A synchronization mechanism for +providing {% defn "atomic" %} operation by limiting +concurrent access to a resource. +Important operations on locks include +acquire (lock) and release (unlock). +Many locks are implemented as a {% defn "mutex" %}, +whereby only one {% defn "strand" %} can hold the +lock at any time. \ No newline at end of file diff --git a/src/doc/reference/glossary/lock_contention.md b/src/doc/reference/glossary/lock_contention.md new file mode 100644 index 00000000..067119c9 --- /dev/null +++ b/src/doc/reference/glossary/lock_contention.md @@ -0,0 +1,5 @@ +--- +title: Lock contention +tags: lock contention +--- +The situation wherein multiple {% defn "strand", "strands" %} vie for the same {% defn "lock" %}. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/logical-parallelism.md b/src/doc/reference/glossary/logical-parallelism.md deleted file mode 100644 index 576c9d60..00000000 --- a/src/doc/reference/glossary/logical-parallelism.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Logical parallelism -tags: logical parallelism ---- - -Parallel keywords like `cilk_spawn`, `cilk_scope`, `cilk_sync`, and `cilk_for` express -the *logical parallelism* of a computation, indicating which parts of the computation -*may* proceed in parallel (without requiring that they *must* do so). At runtime, it is up to a scheduler to determine -which subcomputations actually run in parallel by assigning them to available processors -as the computation unfolds. \ No newline at end of file diff --git a/src/doc/reference/glossary/logically-in-parallel.md b/src/doc/reference/glossary/logically-in-parallel.md deleted file mode 100644 index e9ec06ff..00000000 --- a/src/doc/reference/glossary/logically-in-parallel.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Logically in parallel -tags: logically in parallel ---- - -If $u$ and $v$ are strands in [parallel trace](/doc/reference/glossary/#parallel-trace) $G$, -and $G$ contains no directed path from $u$ to $v$ or from $v$ to $u$, -then the strands are *(logically) in parallel*. \ No newline at end of file diff --git a/src/doc/reference/glossary/logically-in-series.md b/src/doc/reference/glossary/logically-in-series.md deleted file mode 100644 index 9a771b1b..00000000 --- a/src/doc/reference/glossary/logically-in-series.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Logically in series -tags: logically in series ---- - -If $u$ and $v$ are strands in [parallel trace](/doc/reference/glossary/#parallel-trace) $G$, -and $G$ contains a directed path from $u$ to $v$, -then the strands are *(logically) in series*. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/multicore.md b/src/doc/reference/glossary/multicore.md new file mode 100644 index 00000000..6b452825 --- /dev/null +++ b/src/doc/reference/glossary/multicore.md @@ -0,0 +1,5 @@ +--- +title: Multicore +tags: multicore +--- +A semiconductor chip containing more than one {% defn "processor" %} {% defn "core" %}. diff --git a/src/doc/reference/glossary/multicores.md b/src/doc/reference/glossary/multicores.md deleted file mode 100644 index c15f46b7..00000000 --- a/src/doc/reference/glossary/multicores.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Multicores -tags: multicores ---- - -Parallel computers—computers with multiple processing units—are ubiquitous. Handheld, laptop, desktop, and cloud machines are all -multicore computers, or simply, multicores, containing multiple processing "cores." Each processing core is a -full-fledged processor that can directly access any location in a common shared memory. \ No newline at end of file diff --git a/src/doc/reference/glossary/multiprocessor.md b/src/doc/reference/glossary/multiprocessor.md new file mode 100644 index 00000000..30228f03 --- /dev/null +++ b/src/doc/reference/glossary/multiprocessor.md @@ -0,0 +1,5 @@ +--- +title: Multiprocessor +tags: multiprocessor +--- +A computer containing multiple general-purpose {% defn "processor", "processors" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/mutex.md b/src/doc/reference/glossary/mutex.md new file mode 100644 index 00000000..0db35839 --- /dev/null +++ b/src/doc/reference/glossary/mutex.md @@ -0,0 +1,10 @@ +--- +title: Mutex +tags: mutex +--- +A "mutually exclusive" {% defn "lock" %} that only one +{% defn "strand" %} can acquire at a time, thereby +ensuring that only one strand executes +the {% defn "critical section" %} protected by the +mutex at a time. +For example, Linux* OS supports Pthreads `pthread_mutex_t` objects. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/nondeterminism.md b/src/doc/reference/glossary/nondeterminism.md new file mode 100644 index 00000000..f514b05d --- /dev/null +++ b/src/doc/reference/glossary/nondeterminism.md @@ -0,0 +1,9 @@ +--- +title: Nondeterminism +tags: nondeterminism +--- +The property of a program when it +behaves differently from run to run when +executed on exactly the same inputs. +Nondeterministic programs are usually +hard to debug. \ No newline at end of file diff --git a/src/doc/reference/glossary/nondeterministic.md b/src/doc/reference/glossary/nondeterministic.md deleted file mode 100644 index 104760fa..00000000 --- a/src/doc/reference/glossary/nondeterministic.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Nondeterministic -tags: nondeterministic ---- - -A parallel algorithm is *nondeterministic* if its behavior might vary from run to run when the input is the same. -It is *deterministic* if it always does the same thing on the same -input, no matter how the instructions are scheduled on the multicore computer. -A parallel algorithm that is intended to be deterministic may nevertheless -act nondeterministically, however, if it contains a difficult-to-diagnose bug called a -determinacy race. \ No newline at end of file diff --git a/src/doc/reference/glossary/nonlocal_variable.md b/src/doc/reference/glossary/nonlocal_variable.md new file mode 100644 index 00000000..bf33ab4d --- /dev/null +++ b/src/doc/reference/glossary/nonlocal_variable.md @@ -0,0 +1,10 @@ +--- +title: Nonlocal variable +tags: nonlocal variable +--- +A program variable that is bound outside +of the scope of the function, method, or +class in which it is used. In Cilk +programs, we also use this term to refer +to variables with a scope outside a +`cilk_for` loop. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/online.md b/src/doc/reference/glossary/online.md deleted file mode 100644 index 6eb675bb..00000000 --- a/src/doc/reference/glossary/online.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Online -tags: online ---- - -A task-parallel scheduler must must operate *online*, -scheduling the computation without knowing in advance -when procedures will be spawned or when they will finish. \ No newline at end of file diff --git a/src/doc/reference/glossary/opencilk.md b/src/doc/reference/glossary/opencilk.md new file mode 100644 index 00000000..b1a06f7a --- /dev/null +++ b/src/doc/reference/glossary/opencilk.md @@ -0,0 +1,5 @@ +--- +title: OpenCilk +tags: opencilk +--- +A {% defn "task-parallel" %} programming platform for multicore computers based on {% defn "Cilk" %} technology. \ No newline at end of file diff --git a/src/doc/reference/glossary/parallel-algorithms.md b/src/doc/reference/glossary/parallel-algorithms.md deleted file mode 100644 index 06b060b3..00000000 --- a/src/doc/reference/glossary/parallel-algorithms.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: Parallel algorithms -tags: parallel algorithms ---- - -Algorithms where multiple instructions can execute simultaneously. \ No newline at end of file diff --git a/src/doc/reference/glossary/parallel-loops.md b/src/doc/reference/glossary/parallel-loops.md deleted file mode 100644 index ac3ed678..00000000 --- a/src/doc/reference/glossary/parallel-loops.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Parallel loops -tags: parallel loops ---- - -A *parallel loop* is like an ordinary `for` loop, -except that multiple iterations of the loop can execute at the same time. -You can write parallel loops in OpenCilk with `cilk_for`. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/parallel-slackness.md b/src/doc/reference/glossary/parallel-slackness.md deleted file mode 100644 index ded9906e..00000000 --- a/src/doc/reference/glossary/parallel-slackness.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Parallel slackness -tags: parallel slackness ---- - -We define the *(parallel) slackness* of a task-parallel computation executed on -an ideal parallel computer with $P$ processors to be the ratio $(T_1/T_{\infty})/P = T_1/(PT_{\infty})$, -which is the factor by which the parallelism of the computation exceeds the number of processors in the machine. -If the slackness is less than 1, perfect linear speedup is impossible. \ No newline at end of file diff --git a/src/doc/reference/glossary/parallel-trace.md b/src/doc/reference/glossary/parallel-trace.md deleted file mode 100644 index 92831cf7..00000000 --- a/src/doc/reference/glossary/parallel-trace.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Parallel trace -tags: parallel trace ---- - -It helps to view the execution of a parallel computation—the dynamic stream of -runtime instructions executed by processors under the direction of a parallel program—as -a directed acyclic graph $G=(V,E)$, called a *(parallel) trace*. Conceptually, -the vertices in $V$ are executed instructions, and the edges in $E$ represent -dependencies between instructions, where $(u,v)\in E$ means that the parallel -program required instruction $u$ to execute before instruction $v$. \ No newline at end of file diff --git a/src/doc/reference/glossary/parallel_loop.md b/src/doc/reference/glossary/parallel_loop.md new file mode 100644 index 00000000..1036c26e --- /dev/null +++ b/src/doc/reference/glossary/parallel_loop.md @@ -0,0 +1,8 @@ +--- +title: Parallel loop +tags: parallel loop +--- +A `for` loop all of whose iterations can be +run independently in parallel. The +`cilk_for` keyword designates a parallel +loop. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/parallelism.md b/src/doc/reference/glossary/parallelism.md index eed192ef..d4fffd42 100644 --- a/src/doc/reference/glossary/parallelism.md +++ b/src/doc/reference/glossary/parallelism.md @@ -2,12 +2,7 @@ title: Parallelism tags: parallelism --- - -The ratio $T_1/T_{\infty}$ of the work to the span gives the *parallelism* of a parallel -computation. We can view the parallelism from three perspectives. As a ratio, the -parallelism denotes the average amount of work that can be performed in parallel -for each step along the critical path. As an upper bound, the parallelism gives the -maximum possible speedup that can be achieved on any number of processors. Perhaps -most important, the parallelism provides a limit on the possibility of attaining -perfect linear speedup. Specifically, once the number of processors exceeds the -parallelism, the computation cannot possibly achieve perfect linear speedup. \ No newline at end of file +The ratio of {% defn "work" %} to {% defn "span" %}, which is the +largest speedup an application could +possibly attain when run on an infinite +number of {% defn "processor", "processors" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/parent.md b/src/doc/reference/glossary/parent.md deleted file mode 100644 index 69409e8c..00000000 --- a/src/doc/reference/glossary/parent.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Parent -tags: parent ---- - -A *parent* process is one that executes a spawn, -after which it may continue to execute in parallel with the spawned subroutine—its child. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/perfect-linear-speedup.md b/src/doc/reference/glossary/perfect-linear-speedup.md deleted file mode 100644 index 597954a2..00000000 --- a/src/doc/reference/glossary/perfect-linear-speedup.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Perfect linear speedup -tags: perfect linear speedup ---- - -The maximum possible speedup of a computation on $P$ processors is - -$T_1/T_P = P$, - -which is called *perfect linear speedup*. \ No newline at end of file diff --git a/src/doc/reference/glossary/perfect_linear_speedup.md b/src/doc/reference/glossary/perfect_linear_speedup.md new file mode 100644 index 00000000..6ef53dff --- /dev/null +++ b/src/doc/reference/glossary/perfect_linear_speedup.md @@ -0,0 +1,6 @@ +--- +title: Perfect linear speedup +tags: perfect linear speedup +--- +{% defn "Speedup" %} equal to the {% defn "processor" %} count. +See also {% defn "linear speedup" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/process.md b/src/doc/reference/glossary/process.md new file mode 100644 index 00000000..1019352d --- /dev/null +++ b/src/doc/reference/glossary/process.md @@ -0,0 +1,11 @@ +--- +title: Process +tags: process +--- +A self-contained {% defn "concurrent agent" %} that by +default executes a serial chain of +{% defn "instruction", "instructions" %}. More than one {% defn "thread" %} may +run within a process, but a process does +not usually share memory with other +processes. Scheduling of processes is +typically managed by the operating system. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/processor.md b/src/doc/reference/glossary/processor.md new file mode 100644 index 00000000..d6bd868b --- /dev/null +++ b/src/doc/reference/glossary/processor.md @@ -0,0 +1,12 @@ +--- +title: Processor +tags: processor +--- +A processor implements the logic to +execute program {% defn "instruction", "instructions" %} +sequentially; we use the term "{% defn "core" %}" as a +synonym. This document does not use +the term "processor" to refer to multiple +processing units on the same or multiple +chips, although other documents may +use the term that way. \ No newline at end of file diff --git a/src/doc/reference/glossary/race_condition.md b/src/doc/reference/glossary/race_condition.md new file mode 100644 index 00000000..8d726125 --- /dev/null +++ b/src/doc/reference/glossary/race_condition.md @@ -0,0 +1,9 @@ +--- +title: Race condition +tags: race condition +--- +A source of {% defn "nondeterminism" %} whereby the +result of a concurrent computation +depends on the timing or relative order of +the execution of instructions in each +individual {% defn "strand" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/ready.md b/src/doc/reference/glossary/ready.md deleted file mode 100644 index 66dbab35..00000000 --- a/src/doc/reference/glossary/ready.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Ready -tags: ready ---- - -A strand is *ready* to execute when all strands on which it -depends have finished execution. \ No newline at end of file diff --git a/src/doc/reference/glossary/receiver.md b/src/doc/reference/glossary/receiver.md new file mode 100644 index 00000000..6d226386 --- /dev/null +++ b/src/doc/reference/glossary/receiver.md @@ -0,0 +1,5 @@ +--- +title: Receiver +tags: receiver +--- +A variable to receive the result of a function call. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/reducer.md b/src/doc/reference/glossary/reducer.md new file mode 100644 index 00000000..4d5840ce --- /dev/null +++ b/src/doc/reference/glossary/reducer.md @@ -0,0 +1,15 @@ +--- +title: Reducer +tags: reducer +--- +A {% defn "hyperobject" %} with a defined (usually +associative) `reduce()` binary operator +which the OpenCilk runtime system uses to +combine each {% defn "view" %} of each separate +{% defn "strand" %}. +A reducer must have two methods: +- A default constructor which initializes the +reducer to its identity value +- A `reduce()` method which merges the +value of right reducer into the left (this) +reducer. \ No newline at end of file diff --git a/src/doc/reference/glossary/response_time.md b/src/doc/reference/glossary/response_time.md new file mode 100644 index 00000000..0b345009 --- /dev/null +++ b/src/doc/reference/glossary/response_time.md @@ -0,0 +1,8 @@ +--- +title: Response time +tags: response time +--- +The time it takes to execute a +computation from the time a human user +provides an input to the time the user +gets the result. \ No newline at end of file diff --git a/src/doc/reference/glossary/running_time.md b/src/doc/reference/glossary/running_time.md new file mode 100644 index 00000000..db1b0ba8 --- /dev/null +++ b/src/doc/reference/glossary/running_time.md @@ -0,0 +1,6 @@ +--- +title: Running time +tags: running time +--- +How long a program takes to execute on a given computer system. +Also called {% defn "execution time" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/scale_down.md b/src/doc/reference/glossary/scale_down.md new file mode 100644 index 00000000..25cce075 --- /dev/null +++ b/src/doc/reference/glossary/scale_down.md @@ -0,0 +1,6 @@ +--- +title: Scale down
+tags: scale down
+--- +The ability of a parallel application to run efficiently on one +or a small number of {% defn "processor", "processors" %}. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/scale_up.md b/src/doc/reference/glossary/scale_up.md new file mode 100644 index 00000000..411417c3 --- /dev/null +++ b/src/doc/reference/glossary/scale_up.md @@ -0,0 +1,7 @@ +--- +title: Scale up +tags: scale up +--- +The ability of a parallel application to run efficiently +on a large number of {% defn "processor", "processors" %}. +See also {% defn "linear speedup" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/scheduler.md b/src/doc/reference/glossary/scheduler.md deleted file mode 100644 index 5e2d806f..00000000 --- a/src/doc/reference/glossary/scheduler.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Scheduler -tags: scheduler ---- - -The *scheduler* for task-parallel computations -determines at runtime which subcomputations actually execute in parallel by assigning -them to available processors as the computation unfolds. \ No newline at end of file diff --git a/src/doc/reference/glossary/sequential_consistency.md b/src/doc/reference/glossary/sequential_consistency.md new file mode 100644 index 00000000..d7899248 --- /dev/null +++ b/src/doc/reference/glossary/sequential_consistency.md @@ -0,0 +1,11 @@ +--- +title: Sequential consistency +tags: sequential consistency +--- +The memory model for concurrency +wherein the effect of {% defn "concurrent agents" %} is +as if their operations on {% defn "shared memory" %} +were interleaved in a global order +consistent with the orders in which each +agent executed them. This model was +advanced in 1976 by [Leslie Lamport](https://research.microsoft.com/en-us/um/people/lamport/). 
\ No newline at end of file diff --git a/src/doc/reference/glossary/sequentially-consistent.md b/src/doc/reference/glossary/sequentially-consistent.md deleted file mode 100644 index 6c16f2f4..00000000 --- a/src/doc/reference/glossary/sequentially-consistent.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -title: Sequential consistency -tags: sequential consistency ---- - -*Sequential consistency* means that even if -multiple processors attempt to access memory simultaneously, -the shared memory behaves as if exactly one instruction from one of the processors is executed at -a time, even though the actual transfer of data may happen at the same time. It is -as if the instructions were executed one at a time sequentially according to some -global linear order among all the processors that preserves the individual orders in -which each processor executes its own instructions. \ No newline at end of file diff --git a/src/doc/reference/glossary/serial-algorithms.md b/src/doc/reference/glossary/serial-algorithms.md deleted file mode 100644 index f7188a5c..00000000 --- a/src/doc/reference/glossary/serial-algorithms.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Serial algorithms -tags: serial algorithms ---- - -*Serial algorithms* are suitable for running on a uniprocessor computer that executes only one -instruction at a time. \ No newline at end of file diff --git a/src/doc/reference/glossary/serial-projection.md b/src/doc/reference/glossary/serial-projection.md deleted file mode 100644 index 527dd1f1..00000000 --- a/src/doc/reference/glossary/serial-projection.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Serial projection -tags: serial projection ---- - -The *serial projection* of a parallel algorithm is the serial algorithm that results from ignoring the parallel directives, -such as `cilk_spawn`, `cilk-sync`, and `cilk_for`. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/serial_execution.md b/src/doc/reference/glossary/serial_execution.md new file mode 100644 index 00000000..5ea931a3 --- /dev/null +++ b/src/doc/reference/glossary/serial_execution.md @@ -0,0 +1,5 @@ +--- +title: Serial execution +tags: serial execution +--- +Execution of the {% defn "serial projection" %} of a Cilk program. \ No newline at end of file diff --git a/src/doc/reference/glossary/serial_projection.md b/src/doc/reference/glossary/serial_projection.md new file mode 100644 index 00000000..aea60582 --- /dev/null +++ b/src/doc/reference/glossary/serial_projection.md @@ -0,0 +1,14 @@ +--- +title: Serial projection +tags: serial projection +--- +The C or C++ program that results from +stubbing out the keywords of a Cilk +program, where `cilk_spawn`, `cilk_scope`, and +`cilk_sync` are elided and `cilk_for` is +replaced with an ordinary `for`. The +serial projection can be used for debugging +and, in the case of a converted C/C++ +program, will behave exactly as the +original C/C++ program. The terms "*serialization*" and "*serial elision*" are used in some of the literature. +Also, see "{% defn "serial semantics" %}". \ No newline at end of file diff --git a/src/doc/reference/glossary/serial_semantics.md b/src/doc/reference/glossary/serial_semantics.md new file mode 100644 index 00000000..bd842fd7 --- /dev/null +++ b/src/doc/reference/glossary/serial_semantics.md @@ -0,0 +1,8 @@ +--- +title: Serial semantics +tags: serial semantics +--- +The behavior of a Cilk program when executed as the +{% defn "serial projection" %} of the program. +See the following article: +Four Reasons Why Parallel Programs Should Have Serial Semantics. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/serialization.md b/src/doc/reference/glossary/serialization.md new file mode 100644 index 00000000..11a070d1 --- /dev/null +++ b/src/doc/reference/glossary/serialization.md @@ -0,0 +1,5 @@ +--- +title: Serialization +tags: serial projection +--- +See {% defn "serial projection" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/shared-memory.md b/src/doc/reference/glossary/shared-memory.md deleted file mode 100644 index 3153d127..00000000 --- a/src/doc/reference/glossary/shared-memory.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Shared memory -tags: shared memory ---- - -In a multicore computer, *shared memory* can be directly accessed at any location -by any of the processing cores. \ No newline at end of file diff --git a/src/doc/reference/glossary/shared_memory.md b/src/doc/reference/glossary/shared_memory.md new file mode 100644 index 00000000..c1b6be19 --- /dev/null +++ b/src/doc/reference/glossary/shared_memory.md @@ -0,0 +1,10 @@ +--- +title: Shared memory +tags: shared memory +--- +Computer storage that is shared among +several processors. A shared-memory +{% defn "multiprocessor" %} is a computer in which +each {% defn "processor" %} can directly address any +memory location. Contrast with +{% defn "distributed memory" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/span-law.md b/src/doc/reference/glossary/span-law.md deleted file mode 100644 index a9e4feb0..00000000 --- a/src/doc/reference/glossary/span-law.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -title: Span law -tags: span ---- - -The span provides a lower bound on the running time $T_P$ of a task-parallel computation on $P$ processors. -A $P$-processor ideal parallel computer cannot run any faster than a machine -with an unlimited number of processors. 
Looked at another way, a machine -with an unlimited number of processors can emulate a $P$-processor machine by -using just $P$ of its processors. Thus, the *span law* follows: - -$T_P \geq T_{\infty}$. \ No newline at end of file diff --git a/src/doc/reference/glossary/span.md b/src/doc/reference/glossary/span.md index 83e54d38..6383dfca 100644 --- a/src/doc/reference/glossary/span.md +++ b/src/doc/reference/glossary/span.md @@ -2,10 +2,10 @@ title: Span tags: span --- - -The *span* is the fastest possible time to execute the computation on an -unlimited number of processors, which corresponds to the sum of the times taken -by the strands along a longest path in the trace, where “longest” means that each -strand is weighted by its execution time. Such a longest path is called the *critical -path* of the trace, and thus the span is the weight of the longest (weighted) path -in the trace. \ No newline at end of file +The theoretically fastest execution time +for a parallel program when run on an +infinite number of {% defn "processor", "processors" %}, +discounting overheads for +communication and scheduling. Often +denoted by $T_{\infty}$ in the literature, and +sometimes called *critical-path length*. \ No newline at end of file diff --git a/src/doc/reference/glossary/spawn.md b/src/doc/reference/glossary/spawn.md new file mode 100644 index 00000000..bda60de5 --- /dev/null +++ b/src/doc/reference/glossary/spawn.md @@ -0,0 +1,8 @@ +--- +title: Spawn +tags: spawn +--- +To call a function without waiting for it to +return, as in a normal call. The caller can +continue to execute in parallel with the +called function. See also {% defn "cilk_spawn" %}. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/spawning.md b/src/doc/reference/glossary/spawning.md deleted file mode 100644 index ff88998c..00000000 --- a/src/doc/reference/glossary/spawning.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Spawning -tags: spawning ---- - -*Spawning* occurs when the keyword `cilk_spawn` precedes a procedure call. -The semantics of a spawn differs from an ordinary procedure call in -that the procedure instance that executes the spawn—the *parent*—may continue -to execute in parallel with the spawned subroutine—its *child*—instead of waiting -for the child to finish, as would happen in a serial execution. \ No newline at end of file diff --git a/src/doc/reference/glossary/speedup.md b/src/doc/reference/glossary/speedup.md index a230b66d..4a0d05d4 100644 --- a/src/doc/reference/glossary/speedup.md +++ b/src/doc/reference/glossary/speedup.md @@ -2,9 +2,9 @@ title: Speedup tags: speedup --- - -We define the *speedup* of a computation on $P$ processors by the ratio $T_1/T_P$, -which says how many times faster the computation runs on $P$ processors than -on one processor. By the work law, we have $T_P \geq T_1/P$, which implies that -$T_1/T_P \leq P$. Thus, the speedup on a $P$-processor ideal parallel computer can be -at most $P$. \ No newline at end of file +How many times faster a program is +when run in parallel than when run on +one {% defn "processor" %}. Speedup can be +computed by dividing the running time $T_1$ +of the program on one processor by its +running time $T_P$ on $P$ processors. \ No newline at end of file diff --git a/src/doc/reference/glossary/store-instructions.md b/src/doc/reference/glossary/store-instructions.md deleted file mode 100644 index 2c6a0110..00000000 --- a/src/doc/reference/glossary/store-instructions.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Store instructions -tags: store instructions ---- - -Memory is accessed by store instructions and by load instructions. 
-*Store instructions* copy data -from a processor register to a location in the memory. \ No newline at end of file diff --git a/src/doc/reference/glossary/strand.md b/src/doc/reference/glossary/strand.md index f03af391..23a89361 100644 --- a/src/doc/reference/glossary/strand.md +++ b/src/doc/reference/glossary/strand.md @@ -2,12 +2,6 @@ title: Strand tags: strand --- - -It’s sometimes inconvenient, especially if we want to focus on the parallel structure -of a computation, for a vertex of a [**trace**](/doc/reference/glossary/#parallel-trace) to represent only one executed instruction. -Consequently, if a chain of instructions contains no parallel or procedural -control (no `cilk_spawn`, `cilk_sync`, procedure call, or return—via either an explicit `return` -statement or the return that happens implicitly upon reaching the end of a procedure), -we group the entire chain into a single *strand*. -Strands do not include instructions that involve parallel or procedural -control. These control dependencies must be represented as edges in the trace. +A serial chain of executed {% defn "instruction", "instructions" %} without any parallel +control (such as a {% defn "spawn" %}, {% defn "sync" %}, return +from a spawn, etc.) \ No newline at end of file diff --git a/src/doc/reference/glossary/sync.md b/src/doc/reference/glossary/sync.md new file mode 100644 index 00000000..3dd62b7a --- /dev/null +++ b/src/doc/reference/glossary/sync.md @@ -0,0 +1,9 @@ +--- +title: Sync +tags: sync +--- +To wait for a set of {% defn "spawn", "spawned" %} functions to +return before proceeding. The current +function is dependent upon the spawned +functions and cannot proceed in parallel +with them. See also {% defn "cilk_sync" %}. 
\ No newline at end of file diff --git a/src/doc/reference/glossary/task-parallel-programming.md b/src/doc/reference/glossary/task-parallel-programming.md deleted file mode 100644 index f7124be0..00000000 --- a/src/doc/reference/glossary/task-parallel-programming.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Task-parallel platforms, programming, and algorithms -tags: task-parallel ---- - -*Task-parallel platforms* provide a layer of software on top of threads to coordinate, schedule, -and manage the processors of a multicore. Some task-parallel platforms are built as -runtime libraries, but others provide full-fledged parallel languages with compiler and runtime support. - -*Task-parallel programming* allows parallelism to be specified in a “processor-oblivious” fashion, where the programmer identifies what computational tasks may run in parallel but does not indicate which thread or processor performs the task. -Thus, the programmer is freed from worrying about communication protocols, load -balancing, and other vagaries of thread programming. The task-parallel platform -contains a scheduler, which automatically load-balances the tasks across the processors, thereby greatly simplifying the programmer’s chore. *Task-parallel algorithms* provide a natural extension to ordinary serial algorithms, allowing performance to be reasoned about mathematically using “work/span analysis.” \ No newline at end of file diff --git a/src/doc/reference/glossary/task-parallel.md b/src/doc/reference/glossary/task-parallel.md new file mode 100644 index 00000000..7427d15a --- /dev/null +++ b/src/doc/reference/glossary/task-parallel.md @@ -0,0 +1,8 @@ +--- +title: Task-parallel +tags: [task-parallel] +--- + +*Task-parallel platforms* provide a layer of software on top of {% defn "thread", "threads" %} to coordinate, schedule, and manage the {% defn "processor", "processors" %} of a {% defn "multicore" %}. 
Some task-parallel platforms are built as runtime libraries, but others provide full-fledged parallel languages with compiler and runtime support. + +*Task-parallel programming* allows parallelism to be specified in a "processor-oblivious" fashion, where the programmer identifies what computational tasks may run in parallel but does not indicate which thread or processor performs the task. Thus, the programmer is freed from worrying about communication protocols, load balancing, and other vagaries of thread programming. The task-parallel platform contains a scheduler, which automatically load-balances the tasks across the processors, thereby greatly simplifying the programmer’s chore. Task-parallel algorithms provide a natural extension to ordinary serial algorithms, allowing performance to be reasoned about mathematically using "work/span analysis." \ No newline at end of file diff --git a/src/doc/reference/glossary/thread-parallelism.md b/src/doc/reference/glossary/thread-parallelism.md deleted file mode 100644 index ec8ecefa..00000000 --- a/src/doc/reference/glossary/thread-parallelism.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Threads and thread parallelism -tags: threads ---- - -One approach to programming multicores is *thread parallelism*. This processor-centric -parallel-programming model employs a software abstraction of "virtual -processors," or *threads* that share a common memory. Each thread maintains its -own program counter and can execute code independently of the other threads. The -operating system loads a thread onto a processing core for execution and switches -it out when another thread needs to run. diff --git a/src/doc/reference/glossary/thread.md b/src/doc/reference/glossary/thread.md new file mode 100644 index 00000000..717fde3e --- /dev/null +++ b/src/doc/reference/glossary/thread.md @@ -0,0 +1,7 @@ +--- +title: Thread +tags: thread +--- +A thread executes a serial {% defn "instruction" %} chain. 
+Scheduling of threads is typically managed by the operating +system. \ No newline at end of file diff --git a/src/doc/reference/glossary/throughput.md b/src/doc/reference/glossary/throughput.md new file mode 100644 index 00000000..c13888ce --- /dev/null +++ b/src/doc/reference/glossary/throughput.md @@ -0,0 +1,5 @@ +--- +title: Throughput +tags: throughput +--- +A number of operations performed per unit time. \ No newline at end of file diff --git a/src/doc/reference/glossary/view.md b/src/doc/reference/glossary/view.md new file mode 100644 index 00000000..7ffd53dc --- /dev/null +++ b/src/doc/reference/glossary/view.md @@ -0,0 +1,5 @@ +--- +title: View +tags: view +--- +The state of a {% defn "hyperobject" %} as seen by a given {% defn "strand" %}. \ No newline at end of file diff --git a/src/doc/reference/glossary/work-law.md b/src/doc/reference/glossary/work-law.md deleted file mode 100644 index aec2b196..00000000 --- a/src/doc/reference/glossary/work-law.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Work law -tags: work ---- - -The work provides a lower bound on the running time $T_P$ of a task-parallel computation on $P$ processors: -In one step, an ideal parallel computer with $P$ processors can do at most $P$ -units of work, and thus in $T_P$ time, it can perform at most $PT_P$ work. -Since the total work to do is $T_1$, we have $PT_P \geq T_1$. Dividing by $P$ yields the *work law*: - -$T_P \geq T_1/P$. diff --git a/src/doc/reference/glossary/work.md b/src/doc/reference/glossary/work.md index 4a76b3a3..86a989e9 100644 --- a/src/doc/reference/glossary/work.md +++ b/src/doc/reference/glossary/work.md @@ -2,8 +2,5 @@ title: Work tags: work --- - -The *work* of a task-parallel computation is the total time to execute the entire computation on -one processor. In other words, the work is the sum of the times taken by each of -the strands. If each strand takes unit time, the work is just the number of vertices -in the trace. 
\ No newline at end of file +The running time of a program when run on one {% defn "processor" %}, +sometimes denoted by $T_1$. \ No newline at end of file diff --git a/src/doc/reference/glossary/work_stealing.md b/src/doc/reference/glossary/work_stealing.md new file mode 100644 index 00000000..67a2113d --- /dev/null +++ b/src/doc/reference/glossary/work_stealing.md @@ -0,0 +1,12 @@ +--- +title: Work stealing +tags: work stealing +--- +A scheduling strategy where {% defn "processor", "processors" %} +post parallel work locally and, when a +processor runs out of local work, it steals +work from another processor. Work-stealing schedulers are notable for their efficiency, because they incur no +communication or synchronization +overhead when there is ample +{% defn "parallelism" %}. The OpenCilk runtime system +employs a work-stealing scheduler. \ No newline at end of file diff --git a/src/doc/reference/glossary/worker.md b/src/doc/reference/glossary/worker.md new file mode 100644 index 00000000..623bdfd2 --- /dev/null +++ b/src/doc/reference/glossary/worker.md @@ -0,0 +1,6 @@ +--- +title: Worker +tags: worker +--- +A {% defn "thread" %} that, together with other workers, +implements the OpenCilk runtime system's {% defn "work stealing" %} scheduler. \ No newline at end of file diff --git a/src/doc/reference/opencilk-language-specification.md b/src/doc/reference/opencilk-language-specification.md deleted file mode 100644 index 8b076f6e..00000000 --- a/src/doc/reference/opencilk-language-specification.md +++ /dev/null @@ -1,861 +0,0 @@ ---- -layout: layouts/page.njk -stylesheet: language-specification.css -title: OpenCilk language specification -date: 2022-07-14T21:37:03.433Z -eleventyNavigation: - key: Language specification ---- - -

OpenCilk Language Extension Specification
- Version 1.0 (2021-02-01)

-

Copyright © 2020, 2021 Massachusetts Institute of Technology. All rights reserved.

-

More information about OpenCilk can be found at - opencilk.org

-

Feedback on this specification is encouraged and welcome; please send to - contact@opencilk.org

- -# Introduction - -

This document is one of a set of technical - specifications describing the OpenCilk language - and the run-time support for the language. Together, these documents provide the - detail needed to implement a compliant compiler. At this time the following specifications are available:

-
    -
  • The OpenCilk Language Specification, this document
  • -
  • The OpenCilk Application Binary Interface
  • (XXX Add link) -
-

This document defines the OpenCilk extension to C and C++. The - language extension is supported by a run-time user-mode work-stealing task scheduler - which is not directly exposed to the application programmer. However, some of the - semantics of the language and some of the guarantees provided require specific behavior - of the task scheduler. The programmer visible parts of the language include the - following constructs:

-
    -
  1. Four keywords (cilk_scope, cilk_for, cilk_spawn and cilk_sync) - to express tasking
  2. -
  3. Hyperobjects, which provide local views to shared objects
  4. -
-

An implementation of the language may take advantage of all parallelism resources - available in the hardware. On a typical CPU, these include at least multiple cores - and vector units. Some of the language constructs, e.g. cilk_spawn, - utilize only core parallelism; some, e.g. SIMD loops, utilize only vector parallelism, - and some, e.g. SIMD-enabled functions, utilize both. The defined behavior of every - deterministic Cilk program is the same as the behavior of a similar C or C++ program - known as the “serialization.” While execution of a C or C++ program - may be considered as a linear sequence of statements, execution of a tasking program - is in general a directed acyclic graph. Parallel control flow may yield a new kind - of undefined behavior, a “data race,” whereby parts of the program - that may execute in parallel access the same memory location in an indeterminate - order, with at least one of the accesses being a write access. In addition, throwing - if an exception may result in is thrown, code - being may still be executed that would not have been executed - in a serial execution.

-

The word “shall” is used in this specification to express a diagnosable - constraint on a Cilk Plus program.

- -# Related documents - -
    -
  1. The OpenCilk Application Binary Interface
  2. -
  3. ISO/IEC 9899:2011, Information Technology – Programming languages – - C
  4. -
  5. ISO/IEC 14882:2011, Information Technology – Programming languages – - C++
  6. -
  7. OpenMP Application Program Interface, Version 4.0 - July 2013
  8. -
- -# Keywords for Tasking - -

OpenCilk adds the following new keywords:

-
    -
  • cilk_scope
  • -
  • cilk_for
  • -
  • cilk_sync
  • -
  • cilk_spawn
  • - -
-

A program that uses these keywords other than as defined in the grammar extension - below is ill-formed.

- -## Grammar - -

The three keywords are used in the following new productions:

-
-
jump-statement:
-
cilk_sync ;
-
-

The call production of the grammar is modified to permit the keyword cilk_spawn - before the expression denoting the function to be called:

-
-
postfix-expression:
-
cilk_spawnopt postfix-expression ( - expression-listopt )
-
-

Consecutive cilk_spawn tokens are not permitted. The postfix-expression - following cilk_spawn is called a spawned function. The - spawned function may be a normal function call, a member-function call, or the function-call - (parentheses) operator of a function object (functor) or a call to a lambda expression. - Overloaded operators other than the parentheses operator may be spawned only by - using the function-call notation (e.g., operator+(arg1,arg2)). There - shall be no more than one cilk_spawn within a full expression. A function - that contains a spawn statement is called a spawning function.

-

Note: The spawned function call may be a normal function - call, a member-function call, the function-call (parentheses) operator of a function - object (functor), or a call to a lambda expression.

-

A program is considered ill formed if the cilk_spawn form - of this expression appears other than in one of the following contexts:

-
    -
  • as the entire body full-expression of an expression statement,
  • -
  • as the entire right hand side of an assignment expression that is the entire - body full-expression of an expression statement, or
  • -
  • as the entire initializer-clause in a simple declaration for an - object with automatic storage duration.
  • -
-

(A cilk_spawn expression may be permitted in more contexts in - the future.) The rank of a spawned function call shall be zero. (See The section expression.)

-

A statement with a cilk_spawn on the right hand side of an assignment - or declaration is called an assignment spawn - or initializer spawn, respectively and the object assigned or initialized - by the spawn is called the receiver.

-

The iteration-statement is extended by adding another form of for loop:

-
-
grainsize-pragma:
-
# pragma cilk grainsize = expression new-line
-
-
-
iteration-statement:
-
grainsize-pragmaopt cilk_for ( expression - ; expression ; expression ) - statement
-
grainsize-pragmaopt cilk_for ( declaration - expression ; expression ) statement
-
-

The three items inside parentheses in the grammar, separated by semicolons, - are called the initialization, condition, and increment, - respectively. (A semicolon is included in the grammar of declaration.)

- -## Semantics - -### Tasking Execution Model - -

A strand is a serially-executed sequence of instructions that does not - contain a spawn point or sync point (as defined below). At a spawn point, one strand - (the initial strand) ends and two strands (the new strands) begin. The initial strand - runs in series with is sequenced before each of the new strands - but the new strands are unsequenced with respect to one another (i.e. they - may run in parallel with each other). At a sync point, one or more strands - (the initial strands) end and one strand (the new strand) begins. The initial strands - may run in parallel with one another are unsequenced with respect to - one another but each of the initial strands runs in series with - is sequenced before the new strand. A single strand can be subdivided - into a sequence of shorter strands in any manner that is convenient for modeling - the computation. A maximal strand is one that cannot be included in a - longer strand.

-

The strands in an execution of a program form a directed acyclic graph (DAG) in which - spawn points and sync points comprise the vertices and the strands comprise the - directed edges, with time defining the direction of each edge. (In an alternative - DAG representation, sometimes seen in the literature, the strands comprise the vertices - and the dependencies between the strands comprise the edges.)

- -### Serialization rule - -

The behavior of a deterministic OpenCilk program is defined - in terms of its serialization, as defined in this section. If the serialization - has undefined behavior, the OpenCilk program also has undefined - behavior.

-

The strands in an execution of a program are ordered according to the order of execution - of the equivalent code in the program's serialization. Given two strands, the earlier - strand is defined as the strand that would execute first in the serial execution - of the same program with the same inputs, even though the two strands may execute - in either order or concurrently in the actual parallel execution. Similarly, the - terms “earliest,” “latest,” and “later” - are used to designate strands according to their serial ordering. The terms “left,” - “leftmost,” “right,” and “rightmost” are - equivalent to “earlier,” “earliest,” “later,” - and “latest,” respectively.

-

The serialization of a pure C or C++ program is itself.

-

If a C or C++ program has defined behavior and does not use the tasking keywords - or library functions, it is an OpenCilk with the same defined behavior.

-

The serializations of cilk_spawn and cilk_sync - are empty.

-

If an OpenCilk program has defined deterministic behavior, then that behavior is - the same as the behavior of the C or C++ program derived from the original by removing - all instances of the keywords cilk_spawn, and cilk_sync.

-

The serialization of cilk_for is for.

-

If an OpenCilk program has defined deterministic behavior, then that behavior is - the same as the behavior of the C or C++ program derived from the original by replacing - each instance of the cilk_for keyword with for.

- -## Spawning Task blocks - -

A spawning task block is a region of the program subject to - special rules. Task blocks may be nested. The body of a nested task block is not - part of the outer task block. Task blocks never partially overlap. The body of a - spawning function is a task block. A cilk_for statement is a task - block and the body of the cilk_for loop is a (nested) task block.

-

Every spawning task block includes an implicit cilk_sync - executed on exit from the block, including abnormal exit due to an exception. Destructors - for automatic objects with scope ending at the end of the task block are invoked - before the implicit cilk_sync. The receiver is assigned or initialized - to the return value before executing the implicit cilk_sync at the - end of a function. An implicit or explicit cilk_sync within a nested - task block will synchronize with cilk_spawn statements only within - that task block, and not with cilk_spawn statements in the surrounding - task block.

- -

The scope of a label defined in a spawning block is limited to that spawning block.

-

Programmer note: Therefore, goto may not be used to enter - or exit a spawning block.

-
- -## cilk_for Loops - - -

The constraints and semantics of a cilk_for loop are the same as those - of its serialization, unless specified otherwise.

-

Each iteration of a cilk_for loop is a separate strand; they need not - be executed serially.

-
-

Within each iteration of the loop body, the control variable is considered a - unique variable whose address is no longer valid when the iteration completes. - the name of the control variable refers to a local object, as if the name were - declared as an object within the body of the loop, with automatic storage duration - and with the type of the original object. If the control variable is declared - before the loop initialization, then the address of the variable at the end of - the loop is the same as the address of the variable before the loop initialization - and the final value of the control variable is the same as - for the serialization of the program.

- -### Syntactic constraints - -

To simplify the grammar, some restrictions on cilk_for loops are stated - here in text form. The three items inside parentheses in the grammar, separated - by semicolons, are the initialization, condition, and increment. - Where a constraint on an expression is expressed grammatically, parentheses around - a required expression or sub-expression are allowed.

- -

A program that contains a return, break, or goto - statement that would transfer control into or out of a cilk_for loop - is ill-formed.

-
-

The initialization shall declare or initialize a single variable, called the - control variable. In C only, the control variable may be previously declared, - but if so shall be reinitialized, i.e., assigned, in the initialization clause. - In C++, the control variable shall be declared and initialized within the initialization - clause of the cilk_for loop. The variable shall have automatic storage - duration. No storage class may be specified for the variable within the initialization - clause. The variable shall have integral, pointer, or class type. The variable may - not be const or volatile. The variable shall - be initialized. Initialization may be explicit, using assignment or constructor - syntax, or implicit via a nontrivial default constructor. Within each iteration - of the loop body, the control variable is considered a unique variable whose address - is no longer valid when the iteration completes. If the control variable is declared - before the loop initialization, then the address of the variable at the end of the - loop is the same as the address of the variable before the loop initialization and - the value of the control variable is the same as for the serialization of the program.

- -

The condition shall have one of the following two forms:

-
-
var OP shift-expression
-
shift-expression OP var
-
-

where var is the control variable, optionally enclosed in parentheses. - The operator denoted OP shall be one of !=, <=, - <, >=, or >. The shift-expression - that is not the control variable is called the loop limit.

-
-

The condition shall have one of the following forms:

-
-
expression < expression
-
expression > expression
-
expression <= expression
-
expression >= expression
-
expression != expression
-
-

Exactly one of the operands of the comparison operator shall be just the name of - the loop's control variable. The operand that is not the control variable is called - the limit expression. Any implicit conversion applied to that operand - is not considered part of the limit expression.

-
-

The loop increment shall have one of the following forms: where var - is the loop control variable, optionally enclosed in parentheses, and incr - is a conditional-expression with integral or enum type. The table indicates - the stride corresponding to the syntactic form.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SyntaxStride
++var+1
var+++1
--var-1
var---1
var += incrincr
var -= incr-(incr)
-

The notion of stride exists for exposition only and does not need to be computed. - In particular, for the case of var -= incr, a - program may be well formed even if incr is unsigned.

-
-
-
++ identifier
-
identifier ++
-
-- identifier
-
identifier --
-
identifier += expression
-
identifier -= expression
-
-

The variable modified by the increment shall be the control variable.

-
-

A program that contains a return, break, goto - or switch statement that would transfer control into or - out of a cilk_for loop is ill-formed.

- -### Requirements on types and operators - -

The type of var shall be copy constructible. (For the purpose of - specification, all C types are considered copy constructible.) The control - variable shall have unqualified integral, pointer, or copy-constructible class type.

-

The initialization, condition, and increment parts of a cilk_for shall - be defined such that the total number of iterations (loop count) can be determined - before beginning the loop execution. Specifically, the parts of the cilk_for - loop shall meet all of the semantic requirements of the corresponding - serial for statement. In addition, depending on the syntactic form - of the condition, a cilk_for adds the following requirements on the - types of var the control variable, limit - the limit expression, and stride the stride. - (and by extension incr), and

-

The loop count is computed as follows, evaluated in infinite integer precision - when the control variable and limit both have integral or pointer type. ( - In the following table, first is the value of var immediately - after initialization, var” stands for an expression - with the type and value of the control variable, “limit” - stands for an expression with the type and value of the limit expression, and “stride” - stands for an expression with the type and value of the stride expression. The loop - count is computed after the loop initialization is performed, and before the control - variable is modified by the loop. The loop count expression shall be well-formed, - and shall have integral type. When a stride expression is present, if the divisor - of the division is not greater than zero, the behavior is undefined. - )

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Condition syntaxRequirementsLoop count
-
var < limit
-limit > var
-
(limit) - (first) shall be well-formed and shall - yield an integral difference_type;
- stride shall be > 0
-
(( limit ) - ( first )) / stride
-
-
var > limit
-limit < var
-
(first) - (limit) shall be well-formed and shall - yield an integral difference_type;
- stride shall be < 0
-
(( first ) - ( limit )) / -stride
-
-
var <= limit
-limit >= var
-
(limit) - (first) shall be well-formed and shall - yield an integral difference_type;
- stride shall be > 0
-
(( limit ) - ( first ) + 1) / stride
-
-
var >= limit
-limit <= var
-
(first) - (limit) shall be well-formed and shall - yield an integral difference_type;
- stride shall be < 0
-
(( first ) - ( limit ) + 1) / -stride
-
-
var != limit
-limit != var
-
(limit) - (first) and (first) - - (limit) shall be well-formed and yield the same integral difference_type;
- stride shall be != 0
-
if stride is positive
-then ((limit) - (first)) / stride
-else ((first) - (limit)) / -stride
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Loop count expression - and value
Form of - conditionForm of - increment
var++
- ++var
var--
- --var
var += stridevar -= stride
-
var < limit
-limit > var
-
-
((limit)-(var))
-
n/a -
((limit)-(var)-1)/(stride)+1
-
-
((limit)-(var)-1)/-(stride)+1
-
-
var > limit
-limit < var
-
n/a -
((var)-(limit))
-
-
((var)-(limit)-1)/-(stride)+1
-
-
((var)-(limit)-1)/(stride)+1
-
-
var <= limit
-limit >= var
-
-
((limit)-(var))+1
-
n/a -
((limit)-(var))/(stride)+1
-
-
((limit)-(var))/-(stride)+1
-
-
var >= limit
-limit <= var
-
n/a -
((var)-(limit))+1
-
-
((var)-(limit))/-(stride)+1
-
-
((var)-(limit))/(stride)+1
-
-
var != limit
-limit != var
-
-
((limit)-(var))
-
-
((var)-(limit))
-
-
((stride)<0) ?
-((var)-(limit)-1)/-(stride)+1 :
-((limit)-(var)-1)/(stride)+1
-
-
((stride)<0) ?
-((limit)-(var)-1)/-(stride)+1 :
-((var)-(limit)-1)/(stride)+1
-
-
-

The incr expression shall have integral or enumeration type. - The type of the difference between the limit expression and the control variable - is the subtraction type, which shall be integral. When the condition - operation is !=, (limit)-(var) and - (var)-(limit) shall have the same type. The stride - shall be convertible to the subtraction type.

-

For some expression X with the same type as the subtraction type, if the - loop increment uses operator ++ or +=, the expression:

-
var += (difference_type)(incr) X
-

shall be well-formed; if the loop increment it uses operator - -- or -=, the expression

-
var -= (difference_type)(incr) X
-

shall be well-formed. The loop is a use an odr-use of the required - operator += or -= function.

- -### Dynamic constraints - - -

If the stride does not meet the requirements in the table above, the behavior is - undefined. If this condition can be determined statically, the compiler is encouraged - (but not required) to issue a warning. (Note that the incorrect loop might occur - in an unexecuted branch, e.g., of a function template, and thus should not cause - a compilation failure in all cases.)

-
-

If the control variable is modified other than as a side effect of evaluating the - loop increment expression, the behavior of the program is undefined.

-

If X and Y are values of var the - control variable that occur in consecutive evaluations of the loop condition - in the serialization, then the behavior is undefined if

-
((limit) - X) - ((limit) - Y)
-

evaluated in infinite integer precision, shall does not equal - the stride. If the condition expression is true on entry to the loop, then the - behavior is undefined if the computed loop count shall be non-negative - is not greater than zero. If the computed loop count is not representable - as a value of type unsigned long long, the behavior is undefined.

-

Programmer note: Unsigned wraparound is not allowed.

-

If the body of the loop is executed, the increment and limit expressions - may be evaluated fewer a different number of times than in - the serialization. If different evaluations of the same expression yield different - values, the behavior of the program is undefined.

-

The copy constructor for the control variable may be executed more times than in - the serialization.

-

If evaluation of the increment or limit expression, or a required operator+= - or operator-= throws an exception, the behavior of the program is undefined.

-

If the loop body throws an exception that is not caught within the same iteration - of the loop, it is unspecified which other loop iterations execute, but no other - iteration is terminated early. If multiple loop iterations throw exceptions - that are not caught in the loop body, the cilk_for statement throws - the exception that would have occurred first in the serialization of the program.

- -### Grainsize pragma - -

A cilk_for iteration-statement may optionally be preceded - by a grainsize-pragma. The grainsize pragma shall immediately precede - a cilk_for loop and may not appear anywhere else in a program, except - that other pragmas that appertain to the cilk_for loop may appear - between the grainsize-pragma and the cilk_for loop. The - expression in the grainsize pragma shall evaluate to a type convertible to long.

-

The presence of the pragma provides a hint to the runtime specifying the number of - serial iterations desired in each chunk of the parallel loop. The grainsize expression - is evaluated at runtime. The grainsize expression need not be evaluated. - If it is evaluated, that evaluation is sequenced after the execution of the statement - preceding the loop, is sequenced before any execution of the loop body, and is unsequenced - with respect to the loop initialization and the evaluation of the limit and stride - expressions. If there is no grainsize pragma, or if the grainsize - evaluates to 0, then the runtime will pick a grainsize using its own internal heuristics. - If the grainsize evaluates to a negative value, the behavior is unspecified. (The - meaning of negative grainsizes is reserved for future extensions.) The grainsize - pragma applies only to the cilk_for statement that immediately follows - it – the grain sizes for other cilk_for statements are not - affected.

- -## Spawn - -

The cilk_spawn keyword suggests to the implementation that an executed - statement or part of a statement may be run in parallel with following statements. - A consequence of this parallelism is that the program may exhibit undefined behavior - not present in the serialization. Execution of a cilk_spawn keyword - is called a spawn. Execution of a cilk_sync statement is - called a sync. A statement An expression statement or declaration - statement that contains a spawn is called a spawning statement. - In a declaration containing a cilk_spawn keyword, the initialization - of each object declared is treated as a separate statement.

-

The following sync of a cilk_spawn - refers to the next cilk_sync executed (dynamically, not lexically) - in the same task block. Which spawn the sync follows is implied from context. The - following sync may be the implicit cilk_sync at the end of a task - block.

-

A spawn point is a C sequence point at which - a control flow fork is considered to have taken place. Any operations within the - spawning expression that are not required by the C/C++ standards to be sequenced - after the spawn point shall be executed are sequenced before - the spawn point. The strand that begins at the statement immediately following the - spawning statement (in execution order) is called the continuation of - the spawn. The sequence of operations within the spawning statement that are sequenced - after the spawn point comprise the child of the spawn. The scheduler - may execute the child and the continuation in parallel. Informally, the parent - is the task block containing the initial strand, the spawning statements, and their - continuations but excluding the children of all of the spawns. The children of the - spawns within a single task block are siblings of one another.

-

The spawn points associated with different spawning statements are as follows: -

-
    -
  • The body of a cilk_for loop is a spawning statement with spawn point - at the end of the loop condition test.
  • -
  • An expression statement containing a single cilk_spawn has a spawn - point at the sequence point at the call to the spawned function. Any unnamed temporary - variables created prior to the spawn point are not destroyed until after the spawn - point (i.e., the destructors are invoked in the child).
  • -
  • A declaration statement in which an identifier is initialized or assigned with a - result of a function call that is being spawned has a spawn point at the sequence - point at the call to the spawned function. A declaration statement may consist of - multiple comma separated declarations. Each of them may or may not have a spawn, - and there can be at most one spawn per expression. The conversion of the function - return value, if necessary, and the assignment or initialization of the receiver - takes place after the spawn point (i.e., in the child). Any unnamed temporary variables - created prior to the spawn point are not destroyed until after the spawn point (i.e., - their destructors are invoked in the child).
  • -
-

For example, in the following two statements:

-
x[g()] = cilk_spawn f(a + b);
-a++;
-

The call to function f is the spawn point and the statement a++; - is the continuation. The expression a + b and the initialization of - the temporary variable holding that value, and the evaluation of x\[g()] - take place before the spawn point. The execution of f, the assignment - to x\[g()], and the destruction of the temporary variable holding - a + b take place in the child.

-

If a statement is followed by an implicit sync, that sync is the spawn continuation.

-

Programmer note: The sequencing may be more clear if

-
x[g()] = cilk_spawn f(a + b);
-

is considered to mean

-
{
-	// Evaluate arguments and receiver address before spawn point
-	T tmp = a + b; // T is the type of a + b
-	U &r = x[g()]; // U is the type of x[0]
-	cilk_spawn { r = f(tmp); tmp.~T(); }
-}
-

A setjmp/longjmp call pair within the same task block has - undefined behavior if a spawn or sync is executed between the setjmp - and the longjmp. A setjmp/longjmp call pair - that crosses a task block boundary has undefined behavior. A goto statement - is not permitted to enter or exit a task block.

- -## Sync - -

A sync statement indicates that all children of the current task block must finish - executing before execution may continue within the task block. The new strand coming - out of the cilk_sync is not running in parallel with any child strands, - but may still be running in parallel with parent and sibling strands (other children - of the calling function).

-

There is an implicit sync at the end of every task block. If a spawning statement - appears within a try block, a sync is implicitly executed at the end of - on exit from that try block, as if the body of the try were a task block. - If a task block has no children at the time of a sync, then the sync has no observable - effect. (The compiler may elide an explicit or implicit sync if it can statically - determine that the sync will have no observable effect.)

-

Programmer note: Because implicit syncs follow destructors, writing - cilk_sync at the end of a function may produce a different effect - than the implicit sync. In particular, if an assignment spawn or initializer spawn - is used to modify a local variable, the function will generally need an explicit - cilk_sync to avoid a race between assignment to the local variable - by the spawned function and destruction of the local variable by the parent function.

- -## Exceptions - -

There is an implicit cilk_sync before a throw, after - the exception object has been constructed. try-block.

-

If a spawned function terminates with an exception, the exception propagates from - the point of the corresponding sync.

-

When several exceptions are pending and not yet caught, later exception objects (in - the serial execution order of the program) are destructed in an unspecified order - before the earliest exception is caught.

- -# Hyperobjects - -## Description - -

Cilk Plus defines a category of objects called “hyperobjects”. - Hyperobjects allow thread-safe access to shared objects by giving each parallel - strand running in parallel a separate instance of the object.

-

Parallel code uses a hyperobject by performing a hyperobject lookup operation. - The hyperobject lookup returns a reference to an object, called a view, - that is guaranteed not to be shared with any other active strands in the program. - The sequencing of a hyperobject lookup within an expression is not specified. The - runtime system creates a view when needed, using callback functions provided by - the hyperobject type. When strands synchronize, the hyperobject views are merged - into a single view, using another callback function provided by the hyperobject - type.

-

The view of a hyperobject visible to a program may change at any spawn or sync (including - the implicit spawns and syncs within a cilk_for loop). The identity - (address) of the view does not change within a single strand. The view of a given - hyperobject visible within a given strand is said to be associated with - that view. A hyperobject has the same view before the first spawn within a task - block as after a sync within the same task block, even though the thread ID may - not be the same (i.e., hyperobject views are not tied to threads). A hyperobject - has the same view upon entering and leaving a cilk_for loop and within - the first iteration (at least) of the cilk_for loop. A special view - is associated with a hyperobject when the hyperobject is initially created. This - special view is called the leftmost view or earliest view - because it is always visible to the leftmost (earliest) descendent in the depth-first, - left-to-right traversal of the program's spawn tree. The leftmost view is given - an initial value when the hyperobject is created.

-

Programmer note: If two expressions compute the same address for a view, - then they have not been scheduled in parallel. This property yields one of the simplest - ways by which a program can observe the runtime behavior of the scheduler.

-

Implementation note: An implementation can optimize hyperobject lookups - by performing them only when a view has (or might have) changed. This optimization - can be facilitated by attaching implementation-specific attributes to the hyperobject - creation, lookup, and/or destruction operations.

- -## Reducers - -

The vast majority of hyperobjects belong to a category known as “reducers.” - Each reducer type provides a reduce callback operation that merges - two views in a manner specific to the reducer. For a pair of views V1 - and V2, the result of calling reduce(V1, - V2) is notated as V1⊗V2. - Each reducer also provides an identity callback operation that initializes - a new view.

-

The reduce callback for a “classical” reducer implements - an operation ⊗ such that (a⊗b)⊗c==a⊗(b⊗c) - (i.e., ⊗ is associative). The view-initialization callback for such a reducer - sets the view to an identity value I such that I⊗v==v - and v⊗I==v for any value v of value_type. - Given an associative ⊗ and an identity I, the triplet (value_type, - ⊗, I) describes a mathematical monoid. For example, - (int, +, 0) is a monoid, as is (list, - concatenate, empty). If each individual view, R, - of a classical reducer is modified using only expressions that are equivalent to - R = R ⊗ v (where v is of - value_type), then the reducer computes the same value in the parallel - program as would be computed in the serialization of the program. (In actuality, - the “⊗” in the expression “R = R ⊗ v” - can represent a set of mutually-associative operations. For example, += - and -= are mutually associative.) For example, a spawned function or - cilk_for body can append items onto the view of a list reducer with - monoid (list, concatenate, empty). At the end - of the parallel section of code, the reducer's view contains the same list items - in the same order as would be generated in a serial execution of the same code.

-

Given a set of strands entering a sync, S1,S2,S3,…Sn, - associated with views V1,V2,V3,…Vn, - respectively such that Si is earlier in the serial ordering - than Si+1, a single view, W, emerges from the sync - with value W←V1⊗V2⊗V3⊗…⊗Vn, - such that the left-to-right order is maintained but the grouping (associativity) - of the operations is unspecified. The timing of this “reduction” is - unspecified – in particular, subsequences typically will be computed asynchronously - as child tasks complete. Every view except the one emerging from the sync is destroyed - after the merge. If any of the strands does not have an associated view, then the - invocation of the reduce callback function can be elided (i.e., the - missing view is treated as an identity).

-

A strand is never associated with more than one view for a given reducer, but multiple - strands can be associated with the same view if those strands are not scheduled - in parallel (at run time). Specifically, for a given reducer, the association of - a strand to a view of the reducer obeys the following rules:

-
    -
  1. The strand that initializes the reducer is associated with the leftmost view.
  2. -
  3. If two strands execute in series (i.e., both strands are part of a larger strand), - then both are associated with the same view.
  4. -
  5. The child strand of a spawn is associated with the same view as the strand that - entered the spawn.
  6. -
  7. If the continuation strand of a spawn is scheduled in parallel with the child, then - the continuation strand is associated with a new view, initialized using identity. - The implementation may create the new view at any time up until the first hyperobject - lookup following the spawn. If the continuation strand does not perform a hyperobject - lookup, then the implementation is not required to create a view for that strand.
  8. -
  9. If the continuation strand of a spawn is not scheduled in parallel with the child - strand (i.e., the child and the continuation execute in series), then the continuation - strand is associated with the same view as the child strand.
  10. -
  11. The strand that emerges from a sync is associated with the same view as the leftmost - strand entering the sync.
  12. -
-

Even before the final reduction, the leftmost view of a reducer will contain the - same value as in the serial execution. Other views, however, will contain partial - values that are different from the serial execution.

-

If ⊗ is not associative or if identity does not yield a true - identity value then the result of a set of reductions will be non-deterministic - (i.e., it will vary based on runtime scheduling). Such “non-classical” - reducers are nevertheless occasionally useful. Note that, for a classical reducer, - the ⊗ operator needs to be associative, but does not need to be commutative.

- - -# Disclaimer and other legal information - -

Copyright (c) 2020 Massachusetts Institute of Technology

- -

Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the "Software"), -to deal with the Software without restriction, including without limitation -the rights to use, copy, modify, merge, publish, distribute, sublicense, -and/or sell copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following conditions:

- -

The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software.

- -

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE.

- -# Acknowledgements - -

We gratefully acknowledge Intel Corporation for generously allowing this OpenCilk document to incorporate -material from the following Intel document number 324396-003USr, which may be available here: - https://www.cilkplus.org/sites/default/files/open_specifications/Intel_Cilk_plus_lang_spec_1.2.htm

- -

The OpenCilk project was sponsored in part by the United States Air Force Research Laboratory and was accomplished under Cooperative Agreement Number FA8750-19-2-1000. The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the United States Air Force or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for Government purposes notwithstanding any copyright notation herein.

diff --git a/src/doc/reference/reducers.md b/src/doc/reference/reducers.md new file mode 100644 index 00000000..7ffbfdf0 --- /dev/null +++ b/src/doc/reference/reducers.md @@ -0,0 +1,260 @@ +--- +title: Reducers +author: John F. Carr +--- + +# Reducers + +_Reducers_ are a new data type to help programmers avoid _{% defn +"data race", "data races" %}_. Data races happen when one thread +modifies an object while a second thread is using it. According to +the C and C++ language standards a race is undefined behavior. A +program can give incorrect results, crash, or worse. A counter may +not increment reliably or a linked list may become corrupt. + +A reducer is a special case of a more general type known as a +_hyperobject_. Different types of hyperobjects are used depending on +the desired semantics. + +Reducers are used when the final value in a variable is built up from +a series of independent modifications, such as adding a number to an +accumulator or appending an item to a list. As long as the operation +is _associative_ (`A ⊕ (B ⊕ C) = (A ⊕ B) ⊕ C`) the final result will +be correct. + +Formally, a reducer is a mathematical object called a _{% defn +"monoid" %}_, meaning it has the following components: +* a type (e.g `double`), +* an _identity_ value (`0.0`), and +* an associative binary operation (`+`). + +The operation does not need to be commutative. A reducer can hold a +list with the binary operation being concatenation. Associativity is +essential. + +## Reducers and views + +OpenCilk ensures that every reference to a reducer uses a private +copy, called a _view_. The address of the current view can change at +any spawn or sync, including the implicit spawns and syncs associated +with `cilk_for` and `cilk_scope`. The address operator `&` returns +the address of the current view, so the address of a reducer can +change when the address of a normal variable would be constant over +its lifetime. Be careful about saving the address of a reducer. 
The +race detector (Cilk sanitizer) can be used to check for improper +retention of a pointer to a view. + +Views are created and merged using programmer-provided callback +functions commonly named `identity` and `reduce`. The identity +callback takes a pointer to the value to be initialized (cast to +`void` `*`). The reduce callback takes two pointer arguments +pointing to the two values to be combined. The value pointed to by +the second argument should be merged into the value pointed to by the +first argument, and storage held by the second argument should be +freed. Even if the operation is commutative, the result should be +stored in the first argument. + +There is a total order on views, the order in which they would have +been created in a {% defn "serial projection", "serial" %} program. +The older of any pair of views is conventionally called the _left_ +view and the younger of the pair is called the _right_ view. The left +view is the first argument to the reduce callback. The variable +declared by the programmer is the _leftmost_ view. The programmer +needs to initialize or construct the variable just like any other. +See `` for an example where the leftmost view +does not get the identity value. + +## Declaring a reducer + +A reducer is declared with the `cilk_reducer` keyword, with the +identity and reduce functions as arguments. + +For example, to declare a reducer holding sums of `double`s +one can write + +```c +void zero_double(void *view) { *(double *)view = 0.0; } +void add_double(void *left, void *right) + { *(double *)left += *(double *)right; } +double cilk_reducer(zero_double, add_double) sum; +``` + +When necessary the runtime calls the identity callback (constructor) +to create a new view. All views created by the runtime will +eventually be combined with an older view using the reduction +operation. Any information that needs to be saved should be merged +into the left view. This may be as simple as adding two numbers. 
+Arbitrarily complicated data manipulation is possible. (When the +right view is discarded without saving its contents the hyperobject is +called a _holder_. Holders act as a form of thread-local storage that +does not remain valid across a spawn or sync.) + +The memory occupied by the view itself is allocated by and owned by +the runtime. In C++ `operator new` is not called. If the type has a +C++ constructor, use placement `new` in the identity function. If it +has a destructor, call the destructor explicitly instead of using +`delete`: + +```cpp +void identity(void *view) +{ + new (view) Type(); +} +void reduce(void *left, void *right) +{ + // Here data moves from the right view to the left view. + static_cast<Type *>(left)->reduce(static_cast<Type *>(right)); + static_cast<Type *>(right)->~Type(); + // The right view will be freed on return from this function. +} +Type cilk_reducer(identity, reduce) var; // var is a reducer +``` + +If the data type requires a custom allocator a level of indirection +can be added by using a pointer type: + +```cpp +void identity(void *view) +{ + // Type::operator new will be used, if defined. + *static_cast<Type **>(view) = new Type(); +} +void reduce(void *left, void *right) +{ + (*static_cast<Type **>(left))->reduce(*static_cast<Type **>(right)); + delete *static_cast<Type **>(right); +} +Type *cilk_reducer(identity, reduce) var; +``` + +Formally, the `cilk_reducer` keyword is part of the type of the +variable rather than an attribute of the variable itself. It binds +much like `*`. In particular, + +```c +Type cilk_reducer(identity, reduce) a, b; +``` + +declares a reducer and a non-reducer variable, like + +```c +Type *a, b; +``` + +declares a pointer and a non-pointer. A `typedef` can be used +for more pleasing declarations: + +```c +typedef Type cilk_reducer(identity, reduce) TypeReducer; +TypeReducer a, b; +``` + +Modifications to a reducer should be consistent with the binary +operator. 
For example, if the reduction adds two views then all +modifications of the reducer should use `+=`. At least, the total of +all modifications between a `cilk_spawn` and the next `cilk_sync` +should be equivalent to `+=` (or whatever the `reduce` function does). +This is because the value of a reducer is unpredictable in parallel +code. It may become the identity at any `cilk_spawn` or change +abruptly at any `cilk_sync`. The runtime ensures that the sum (for +example) is always correct at the end, but not in the middle. + +Declaring a variable to be a reducer does not change its size. In the +current implementation all views allocated by the runtime are aligned +to the size of a cache line (64 bytes on supported platforms). This +alignment avoids {% defn "false sharing" %} on reducer accesses. If +greater alignment is required a level of indirection must be added. + +Because reducers are types, pointers to reducers are possible. Use +`__builtin_addressof` to get a pointer to a reducer treated as a +reducer instead of a view. This pointer can be passed to +reducer-aware code. + +```c +extern long f(int index); +// The argument is a pointer to a reducer. +void compute_sum(long cilk_reducer(zero, add) *reducer) +{ + cilk_for (int i = 0; i < 10000000; ++i) + *reducer += f(i); // dereferenced pointer converts to current view +} +long provide_reducer() +{ + long cilk_reducer(zero, add) sum = 0L; // must be initialized + compute_sum(__builtin_addressof(sum)); + return sum; +} +``` + +## Limitations + +In OpenCilk 2.0 a reducer must be a variable. Reducers may not be +dynamically allocated and may not be members of structures or arrays. +This limitation is planned to be removed in a future version of OpenCilk. + +Reducers may not contain reducers. + +Callback functions should not spawn. + +Callback functions should be passed by name to `cilk_reducer`. Two +reducers have the same type if they have the same data type and +equivalent callback functions. 
If the callback functions are +expressions other than the names of functions the compiler does not +know whether they are equivalent and may give spurious errors about +type incompatibility. Proving expression equivalence is an unsolvable +problem in the general case. + +In C++, reducers are not implicitly converted to views when binding +references. This limitation is planned to be removed in a future +version of OpenCilk. As a workaround, take the address of the +reducer, yielding a pointer to the current view, and dereference the +pointer. + +```cpp +extern void f(int &, int _Hyperobject &); +void g(int _Hyperobject *p) +{ + f(*&*p, *p); // ideally you could write f(*p, *p); +} +``` + +## Porting from Cilk Plus + +The macros used by Intel Cilk Plus are no longer required. +The example from former `` + +```c +CILK_C_DECLARE_REDUCER(int) my_add_int_reducer = + CILK_C_INIT_REDUCER(int, + add_int_reduce, + add_int_identity, + 0, + 0); +``` + +becomes + +```c +int cilk_reducer(add_int_identity, add_int_reduce) my_add_int_reducer; +``` + +Where Cilk Plus allowed up to five callback functions, OpenCilk has +only two and they have different signatures. + +* The identity and reduce functions lose their first argument, +which was a pointer to the hyperobject itself. + +* The destructor is no longer a separate function. The right operand +to reduce is always destroyed immediately after reduction and no +functionality is added by having a separate destructor. Cilk Plus +reduce functions may need to have a destructor call added to work as +OpenCilk reduce functions. + +* Custom memory allocation functions are not supported by OpenCilk. +Memory for the view is provided by the runtime. Reducers may allocate +their own additional storage. + +As noted above, heap-allocated reducers are not supported in +OpenCilk 2.0. 
+ + diff --git a/src/doc/reference/reducers.md~ b/src/doc/reference/reducers.md~ new file mode 100644 index 00000000..6c8de1d6 --- /dev/null +++ b/src/doc/reference/reducers.md~ @@ -0,0 +1,257 @@ +--- +title: Reducers +author: John F. Carr +--- + +# Reducers + +_Reducers_ are a new data type to help programmers avoid _{% defn +"data race", "data races" %}_. Data races happen when one thread +modifies an object while a second thread is using it. According to +the C and C++ language standards a race is undefined behavior. A +program can give incorrect results, crash, or worse. A counter may +not increment reliably or a linked list may become corrupt. + +A reducer is a special case of a more general type known as a +_hyperobject_. Different types of hyperobjects are used depending on +the desired semantics. + +Reducers are used when the final value in a variable is built up from +a series of independent modifications. The modifications should all +be of the same kind, such as by adding a number to an accumulator or +appending an item to a list. As long as the operation is +_associative_ (`A ⊕ (B ⊕ C) = (A ⊕ B) ⊕ C`) the final result will be +correct. + +Formally, a reducer is a mathematical object called a _{% defn +"monoid" %}_. A reducer has a type (e.g., `double`), an _identity_ +value (`0.0`), and an associative binary operation (`+`). The +operation does not need to be commutative. A reducer can hold a list +with the binary operation being concatenation. + +## Reducers and views + +OpenCilk ensures that every reference to a reducer uses a private +copy, called a _view_. The address of the current view can change at +any spawn or sync, including the implicit spawns and syncs associated +with `cilk_for` and `cilk_scope`. The address operator `&` returns +the address of the current view, so the address of a reducer can +change when the address of a normal variable would be constant over +its lifetime. Be careful about saving the address of a reducer. 
The +race detector (Cilk sanitizer) can be used to check for improper +retention of a pointer to a view. + +Views are created and merged using programmer-provided callback +functions commonly named `identity` and `reduce`. The identity +callback takes a pointer to the value to be initialized (cast to +`void` `*`). The reduce callback takes two pointer arguments +pointing to the two values to be combined. The value pointed to by +the second argument should be merged into the value pointed to by the +first argument, and storage held by the second argument should be +freed. Even if the operation is commutative, the result should be +stored in the first argument. + +There is a total order on views, the order in which they would have +been created in a {% defn "serial projection", "serial" %} program. +The older of any pair of views is conventionally called the _left_ +view and the younger of the pair is called the _right_ view. The left +view is the first argument to the reduce callback. The variable +declared by the programmer is the _leftmost_ view. The programmer +needs to initialize or construct the variable just like any other. +See `` for an example where the leftmost view +does not get the identity value. + +## Declaring a reducer + +A reducer is declared with the `cilk_reducer` keyword, with the +identity and reduce functions as arguments. + +For example, to declare a reducer holding sums of `double`s +one can write + +```c + void zero_double(void *view) { *(double *)view = 0.0; } + void add_double(void *left, void *right) + { *(double *)left += *(double *)right; } + double cilk_reducer(zero_double, add_double) sum; +``` + +When necessary the runtime calls the identity callback (constructor) +to create a new view. All views created by the runtime will +eventually be combined with an older view using the reduction +operation. Any information that needs to be saved should be merged +into the left view. This may be as simple as adding two numbers. 
+Arbitrarily complicated data manipulation is possible. (When the +right view is discarded without saving its contents the hyperobject is +called a _holder_. Holders act as a form of thread-local storage that +does not remain valid across a spawn or sync.) + +The memory occupied by the view itself is allocated by and owned by +the runtime. In C++ `operator new` is not called. If the type has a +C++ constructor, use placement `new` in the identity function. If it +has a destructor, call the destructor explicitly instead of using +`delete`: + +```cpp + void identity(void *view) + { + new (view) Type(); + } + void reduce(void *left, void *right) + { + // Here data moves from the right view to the left view. + static_cast<Type *>(left)->reduce(static_cast<Type *>(right)); + static_cast<Type *>(right)->~Type(); + // The right view will be freed on return from this function. + } + Type cilk_reducer(identity, reduce) var; // var is a reducer +``` + +If the data type requires a custom allocator a level of indirection +can be added by using a pointer type: + +```cpp + void identity(void *view) + { + // Type::operator new will be used, if defined. + *static_cast<Type **>(view) = new Type(); + } + void reduce(void *left, void *right) + { + (*static_cast<Type **>(left))->reduce(*static_cast<Type **>(right)); + delete *static_cast<Type **>(right); + } + Type *cilk_reducer(identity, reduce) var; +``` + +Formally, the `cilk_reducer` keyword is part of the type of the +variable rather than an attribute of the variable itself. It binds +much like `*`. In particular, + +```c + Type cilk_reducer a, b; +``` + +declares a reducer and a non-reducer variable, like + +```c + Type *a, b; +``` + +declares a pointer and a non-pointer. A `typedef` can be used +for more pleasing declarations: + +```c + typedef Type cilk_reducer TypeReducer; + TypeReducer a, b; +``` + +Modifications to a reducer should be consistent with the binary +operator. For example, if the reduction adds two views then all +modifications of the reducer should use `+=`. 
At least, the total of +all modifications between a `cilk_spawn` and the next `cilk_sync` +should be equivalent to `+=` (or whatever the `reduce` function does). +This is because the value of a reducer is unpredictable in parallel +code. It may become the identity at any `cilk_spawn` or change +abruptly at any `cilk_sync`. The runtime ensures that the sum (for +example) is always correct at the end, but not in the middle. + +Declaring a variable to be a reducer does not change its size. In the +current implementation all views allocated by the runtime are aligned +to the size of a cache line (64 bytes on supported platforms). This +alignment avoids {% defn "false sharing" %} on reducer accesses. If +greater alignment is required a level of indirection must be added. + +Because reducers are types, pointers to reducers are possible. Use +`__builtin_addressof` to get a pointer to a reducer treated as a +reducer instead of a view. This pointer can be passed to +reducer-aware code. + +```c + extern long f(int index); + // The argument is a pointer to a reducer. + void compute_sum(long cilk_reducer(zero, add) *sum) + { + cilk_for (int i = 0; i < 10000000; ++i) + *sum += f(i); // dereferenced pointer converts to current view + } + long provide_reducer() + { + long cilk_reducer(zero, add) sum = 0L; // must be initialized + compute_sum(__builtin_addressof(sum)); + return sum; + } +``` + +## Limitations + +In OpenCilk 2.0 a reducer must be a variable. Reducers may not be +dynamically allocated and may not be members of structures or arrays. +This limitation is planned to be removed in a future version of OpenCilk. + +Reducers may not contain reducers. + +Callback functions should not spawn. + +Callback functions should be passed by name to `cilk_reducer`. Two +reducers have the same type if they have the same data type and +equivalent callback functions. 
If the callback functions are +expressions other than the names of functions the compiler does not +know whether they are equivalent and may give spurious errors about +type incompatibility. Proving expression equivalence is an unsolvable +problem in the general case. + +In C++, reducers are not implicitly converted to views when binding +references. This limitation is planned to be removed in a future +version of OpenCilk. As a workaround, take the address of the +reducer, yielding a pointer to the current view, and dereference the +pointer. + +```cpp +extern void f(int &, int _Hyperobject &); +void g(int _Hyperobject *p) +{ + f(*&*p, *p); // ideally you could write f(*p, *p); +} +``` + +## Porting from Cilk Plus + +The macros used by Intel Cilk Plus are no longer required. +The example from former `` + +```c + CILK_C_DECLARE_REDUCER(int) my_add_int_reducer = + CILK_C_INIT_REDUCER(int, + add_int_reduce, + add_int_identity, + 0, + 0); +``` + +becomes + +```c + int cilk_reducer(add_int_identity, add_int_reduce) my_add_int_reducer; +``` + +Where Cilk Plus allowed up to five callback functions, OpenCilk has +only two and they have different signatures. + +* The identity and reduce functions lose their first argument, +which was a pointer to the hyperobject itself. + +* The destructor is no longer a separate function. The right operand +to reduce is always destroyed immediately after reduction and no +functionality is added by having a separate destructor. Cilk Plus +reduce functions may need to have a destructor call added to work as +OpenCilk reduce functions. + +* Custom memory allocation functions are not supported by OpenCilk. +Memory for the view is provided by the runtime. Reducers may allocate +their own additional storage. + +As noted above, heap-allocated reducers are not supported in +OpenCilk 2.0. 
+ + diff --git a/src/doc/reference/reference.json b/src/doc/reference/reference.json index 356142e7..baa87fba 100644 --- a/src/doc/reference/reference.json +++ b/src/doc/reference/reference.json @@ -1,5 +1,6 @@ { "layout": "layouts/page.njk", + "stylesheet": "users-guide.css", "background": "bg-white", "permalink": "/doc/reference/{{ page.fileSlug | slugify }}/", "eleventyNavigation": { diff --git a/src/doc/users-guide/cilkscale.md b/src/doc/users-guide/cilkscale.md new file mode 100644 index 00000000..fa1d009e --- /dev/null +++ b/src/doc/users-guide/cilkscale.md @@ -0,0 +1,458 @@ +--- +title: Cilkscale — scalability analysis & benchmarking tool +image: /img/qsort-cilkscale-scalability-plots-sample-qsort-only.png +tags: + - cilkscale + - tools +attribution: false +author: Alexandros-Stavros Iliopoulos +date: 2022-08-31 +eleventyNavigation: + order: 0 +--- + +Cilkscale can help you reason about the parallel performance and scalability of +your Cilk program. Cilkscale enables you to: + +- Collect statistics of parallel performance for your application. +- Measure the {% defn "work" %}, {% defn "span" %}, and {% defn "parallelism" + %} of your (sub-)computations and predict how their performance will scale on + multiple processors. +- Automatically benchmark your program on different numbers of processors. +- Produce tables and graphical plots with the above performance and scalability + measurements. + +This guide will walk you through the basic steps of profiling the parallel +performance and scalability of your Cilk application with Cilkscale. By the +end of this guide, you will know how to generate performance and scalability +tables and plots like the ones shown below and have a basic understanding of +how to use them to diagnose parallel performance limitations of your +application. For details on the Cilkscale components, user options, and output +information, see the [Cilkscale reference page](/doc/reference/cilkscale). 
+ +{% img "/img/qsort-cilkscale-scalability-plots-sample-qsort-only.png", "100%" %} + +{% alert "info" %} + +_**Note:**_ This guide assumes that OpenCilk is installed within +`/opt/opencilk/` and that the OpenCilk C++ compiler can be invoked from the +terminal as `/opt/opencilk/bin/clang++`, as shown in [this +example](/doc/users-guide/install/#example). + +{% endalert %} + +{% alert "info" %} + +_**System setup for reported performance measurements:**_ All timings reported +in this page are measured on a laptop with an 8-core Intel Core i7-10875H CPU, +using OpenCilk 2.0.1 on Ubuntu 20.04 (via the Windows Subsystem for Linux v2 on +Windows 10). + +{% endalert %} + + +## Example application + +We shall illustrate how to use the various components of Cilkscale with a +Cilk/C++ application that implements a parallel divide-and-conquer +[quicksort](https://en.wikipedia.org/wiki/Quicksort). The source code for our +simple program, `qsort.cpp`, is shown below. + +```cilkcpp# +{% include 'code/qsort.cpp' %} +``` + +The `qsort.cpp` program simply generates a vector of pseudorandom numbers, +sorts it in parallel with the `sample_qsort()` function, and verifies the +result. We can compile and run it as follows. + +```shell-session +$ /opt/opencilk/bin/clang++ qsort.cpp -fopencilk -O3 -o qsort +$ ./qsort 100000000 +Sorting 100000000 random integers +Sort succeeded +``` + + +## Benchmarking and work/span analysis + +Cilkscale instruments your Cilk program to collect performance measurements +during its execution. Cilkscale instrumentation operates in one of two modes: + +- _Benchmarking_ mode: Cilkscale measures the wall-clock execution time of your + program. +- _Work/span analysis_ mode: Cilkscale measures the {% defn "work" %}, {% defn + "span" %}, and {% defn "parallelism" %} of your program. + +In either mode, you can use Cilkscale with two simple steps: + +1. 
Pass a [Cilkscale instrumentation + flag](/doc/reference/cilkscale/#compiler-options-for-cilkscale-instrumentation) + to the OpenCilk compiler when you compile and link your program. The result + is a Cilkscale-instrumented binary. +2. Run the instrumented binary. Cilkscale collects performance measurements + and prints them to the standard output. (To output the report to a file, + set the + [`CILKSCALE_OUT`](/doc/reference/cilkscale/#cilkscale-report-output-file) + environment variable.) Your program otherwise runs as it normally would. + +By default, Cilkscale only reports performance results for whole-program +execution. We will see how to perform fine-grained analyses of specific +sub-computations in the next section, after we show how to use Cilkscale in +benchmarking and work/span analysis mode. + +### Benchmarking instrumentation + +To benchmark your application with Cilkscale, pass the +`-fcilktool=cilkscale-benchmark` flag to the OpenCilk compiler: + +```shell-session +$ /opt/opencilk/bin/clang++ qsort.cpp -fopencilk -fcilktool=cilkscale-benchmark -O3 -o qsort_cs_bench +``` + +Running the instrumented binary now produces the program output as before, +followed by two lines with timing results in [CSV +format](https://en.wikipedia.org/wiki/Comma-separated_values): + +```shell-session +$ ./qsort_cs_bench 100000000 +[...] +tag,time (seconds) +,2.29345 +``` + +The report table above contains a single, untagged row with the execution time +for the entire program. We will see shortly that if we use the Cilkscale API +for [fine-grained analysis](#fine-grained-analysis), then the report table will +contain additional rows. 
+ +### Work/span analysis instrumentation + +To analyze the parallel scalability of your application with Cilkscale, pass +the `-fcilktool=cilkscale` flag to the OpenCilk compiler: + +```shell-session +$ /opt/opencilk/bin/clang++ qsort.cpp -fopencilk -fcilktool=cilkscale -O3 -o qsort_cs +``` + +When you run the instrumented binary, the program output is followed by the +Cilkscale work/span analysis report in CSV format: + +```shell-session +$ ./qsort_cs 100000000 +[...] +tag,work (seconds),span (seconds),parallelism,burdened_span (seconds),burdened_parallelism +,26.9397,2.29954,11.7153,2.29986,11.7136 +``` + +The work, span, and parallelism measurements in the report depend on your +program's input and {% defn "logical parallelism" %} but not on the number of +processors on which it is run. The Cilkscale reference page describes the +[specific quantities reported by +Cilkscale](/doc/reference/cilkscale/#workspan-analysis-measurements-reported-by-cilkscale). + +As before, the reported measurements above are untagged and refer to +whole-program execution. + +{% alert "info" %} + +_**Note:**_ The Cilkscale-instrumented binary in work/span analysis mode is +slower than its non-instrumented counterpart. The slowdown is generally no +larger than 10x and typically less than 2x. In the examples above, `qsort` and +`qsort_cs_bench` took 2.3s while `qsort_cs` took 3.4s (slowdown = 1.5x). + +{% endalert %} + + +## Fine-grained analysis + +Cilkscale provides a [C/C++ +API](/doc/reference/cilkscale/#cc++-api-for-fine-grained-workspan-analysis) for +benchmarking or analyzing specific regions in a program. The Cilkscale API +allows you to focus on and distinguish between specific parallel regions of +your computation when measuring its parallel performance and scalability. +Using the Cilkscale API is similar to using common C/C++ APIs for timing +regions of interest (such as the C++ `std::chrono` library or the POSIX +`clock_gettime()` function). 
+ +Let's see how we can use the Cilkscale API to analyze the execution of +`sample_qsort()` function in our example quicksort application. That is, we +want to exclude the computations for initializing a random vector of integers +or verifying the sort correctness, which are all executed serially anyway. To +achieve this, make the following three changes to the code. + +1. Include the Cilkscale API header file. E.g., after line 4 in `qsort.cpp`: + + ```cpp + #include + ``` + +2. Create work-span snapshots using calls to `wsp_getworkspan()` around the + region we want to analyze. E.g., around the call to `sample_qsort()` in + line 35 in `qsort.cpp`: + + ```cpp + wsp_t start = wsp_getworkspan(); + sample_qsort(a.data(), a.data() + a.size()); + wsp_t end = wsp_getworkspan(); + ``` + +3. Evaluate the work and span between the relevant snapshots and print the + analysis results with a descriptive tag. E.g., just before the program + terminates in line 39 in `qsort.cpp`: + + ```cpp + wsp_t elapsed = wsp_sub(end, start); + wsp_dump(elapsed, "qsort_sample"); + ``` + +Then, save the edited program (here, we save it as `qsort_wsp.cpp`), compile it +with Cilkscale instrumentation as before, and run it: + +```shell-session +$ /opt/opencilk/bin/clang++ qsort_wsp.cpp -fopencilk -fcilktool=cilkscale -O3 -o qsort_wsp_cs +$ ./qsort_wsp_cs 100000000 +[...] +tag,work (seconds),span (seconds),parallelism,burdened_span (seconds),burdened_parallelism +sample_qsort,26.1502,1.08122,24.1859,1.08153,24.1788 +,27.3133,2.24433,12.1699,2.24465,12.1682 +``` + +Notice that the Cilkscale report above now contains an additional row tagged +`sample_qsort`, which was output by the corresponding call to `wsp_dump()`: + +```shell-session +sample_qsort,26.1502,1.08122,24.1859,1.08153,24.1788 +``` + +The last row in the Cilkscale report is always untagged and corresponds to +the execution of the whole program. 
+ +{% alert "info" %} + +_**Note:**_ If you compile your code without a Cilkscale instrumentation flag, +calls to the Cilkscale API are effectively ignored with zero overhead. + +{% endalert %} + +For more detailed information on the Cilkscale C/C++ API, as well as an example +of how to aggregate work/span analysis measurements from disjoint code regions, +see the relevant section of the [Cilkscale reference +page](/doc/reference/cilkscale/#cc++-api-for-fine-grained-workspan-analysis). + + +## Automatic scalability benchmarks and visualization + +Cilkscale includes a Python script which automates the process of benchmarking +and analyzing the scalability of your Cilk program. Specifically, the +Cilkscale Python script helps you do the following: + +1. Collect work/span analysis measurements for your program. +2. Benchmark your program on different numbers of processors and collect + empirical scalability measurements. +3. Store the combined analysis and benchmarking results in a CSV table. +4. Visualize the analysis and benchmarking results with informative execution + time and speedup plots. + +The Cilkscale Python script is found at `share/Cilkscale_vis/cilkscale.py` +within the OpenCilk installation directory. + +{% alert "warning" %} + +_**Prerequisites:**_ To use the Cilkscale Python script, you need: + +- [Python](https://www.python.org/downloads/) 3.8 or later. +- (Optional) [matplotlib](https://pypi.org/project/matplotlib/) 3.5.0 or later; + only required if producing graphical plots. + +{% endalert %} + +### How to run + +To use the `cilkscale.py` script, you must pass it two Cilkscale-instrumented +binaries of your program — one with `-fcilktool=cilkscale-benchmark` and one with +`-fcilktool=cilkscale` — along with a number of optional arguments. +For a description of the `cilkscale.py` script's arguments, see the [Cilkscale reference page](/doc/reference/cilkscale/). 
+ +Let's now see an example of using the `cilkscale.py` script to analyze and +benchmark our `qsort_wsp.cpp` program, which uses the Cilkscale API to profile +the `sample_qsort()` function. First, we build the two Cilkscale-instrumented +binaries: + +```shell-session +$ /opt/opencilk/bin/clang++ qsort_wsp.cpp -fopencilk -fcilktool=cilkscale-benchmark -O3 -o qsort_cs_bench +$ /opt/opencilk/bin/clang++ qsort_wsp.cpp -fopencilk -fcilktool=cilkscale -O3 -o qsort_cs +``` + +Then, we run `cilkscale.py` with our instrumented binaries on a sequence of +100,000,000 random integers, and specify the output paths for the resulting +CSV table and PDF document of visualization plots: + +```shell-session +$ python3 /opt/opencilk/share/Cilkscale_vis/cilkscale.py \ + -c ./qsort_cs -b ./qsort_cs_bench \ + -ocsv cstable_qsort.csv -oplot csplots_qsort.pdf \ + --args 100000000 +``` + +### Terminal output + +The `cilkscale.py` script first echoes the values for all of its parameters, +including unspecified parameters with default values: + +```shell-session +Namespace(args=['100000000'], cilkscale='./qsort_cs', cilkscale_benchmark='./qsort_cs_bench', cpu_counts=None, output_csv='cstable_qsort.csv', output_plot='csplots_qsort.pdf', rows_to_plot='all') +``` + +Then, it runs the instrumented binary for work/span analysis on all available +cores and prints its standard output and standard error streams. You should +make sure that the program output is as expected. + +```shell-session +>> STDOUT (./qsort_cs 100000000) +Sorting 100000000 random integers +Sort succeeded +<< END STDOUT + +>> STDERR (./qsort_cs 100000000) +<< END STDERR +``` + +Once the work/span analysis pass is done, `cilkscale.py` runs the instrumented +binary for benchmarking on different numbers of processors. The number of +benchmarking runs and corresponding numbers of processors are determined by the +`-cpus` argument to `cilkscale.py`. 
(If this argument is not specified, the +program will run on $1, 2, \ldots, P$ processors, where $P$ is the number of +available physical cores in the system.) + +```shell-session +INFO:runner:Generating scalability data for 8 cpus. +INFO:runner:CILK_NWORKERS=1 taskset -c 0 ./qsort_cs_bench 100000000 +INFO:runner:CILK_NWORKERS=2 taskset -c 0,2 ./qsort_cs_bench 100000000 +INFO:runner:CILK_NWORKERS=3 taskset -c 0,2,4 ./qsort_cs_bench 100000000 +INFO:runner:CILK_NWORKERS=4 taskset -c 0,2,4,6 ./qsort_cs_bench 100000000 +INFO:runner:CILK_NWORKERS=5 taskset -c 0,2,4,6,8 ./qsort_cs_bench 100000000 +INFO:runner:CILK_NWORKERS=6 taskset -c 0,2,4,6,8,10 ./qsort_cs_bench 100000000 +INFO:runner:CILK_NWORKERS=7 taskset -c 0,2,4,6,8,10,12 ./qsort_cs_bench 100000000 +INFO:runner:CILK_NWORKERS=8 taskset -c 0,2,4,6,8,10,12,14 ./qsort_cs_bench 100000000 +``` + +In this example, the program is benchmarked on up to 8 CPU cores with IDs 0, 2, +4, …. This is because `cilkscale.py` only uses distinct *physical* cores by +default. In the computer used for this example, core IDs 1, 3, 5, … correspond +to *logical* cores used in [simultaneous +multithreading](https://en.wikipedia.org/wiki/Simultaneous_multithreading) or +"hyper-threading". + +Finally, `cilkscale.py` processes the collected benchmarking and work/span +analysis measurements and generates runtime and speedup plots for each analyzed +region (and the entire program). + +```shell-session +INFO:plotter:Generating plot (2 subplots) +``` + +The Cilkscale benchmarking and scalability analysis reports are returned in +tabular and graphical form. + +### Tabular output + +The raw measurements are output as a CSV table in the file pointed to by the +`-ocsv` argument to `cilkscale.py`. The CSV table contains, for each analyzed +region, the work/span analysis results and benchmark times for all numbers of +processors. 
+ +For example, the above run produced the following table: + +```shell-session +$ cat cstable_qsort.csv +tag,work (seconds),span (seconds),parallelism,burdened_span (seconds),burdened_parallelism,1c time (seconds),2c time (seconds),3c time (seconds),4c time (seconds),5c time (seconds),6c time (seconds),7c time (seconds),8c time (seconds) +sample_qsort,26.5126,0.986602,26.8726,0.986927,26.8638,8.67705,4.6205,3.3648,2.75881,2.43091,2.1171,1.93193,1.7941 +,27.6918,2.16583,12.7858,2.16616,12.7839,9.68071,5.52596,4.26341,3.65358,3.32762,3.02633,2.82155,2.67563 +``` + +To see the table contents more clearly, you can import `cstable_qsort.csv` into +a spreadsheet (e.g., with [LibreOffice](https://www.libreoffice.org/)) or +[pretty-print it with command-line +tools](https://chrisjean.com/view-csv-data-from-the-command-line/): + +```shell-session +$ cat cstable_qsort.csv | sed -e 's/^,/ ,/g' | column -s, -t | less -#5 -N -S +1 tag work (seconds) span (seconds) parallelism burdened_span (seconds) burdened_parallelism 1c time (seconds) . . . +2 sample_qsort 26.5126 0.986602 26.8726 0.986927 26.8638 8.67705 . . . +3 27.6918 2.16583 12.7858 2.16616 12.7839 9.68071 . . . +``` + +### Scalability plots + +Cilkscale produces a set of scalability plots from the raw measurements in its +reported table. These plots are stored in the PDF file pointed to by the `-oplot` +argument to `cilkscale.py`. Specifically, Cilkscale produces two figures for +each analyzed region (i.e., row in the CSV table): one which plots execution +time and one which plots parallel speedup. For a more detailed description of +these plots' contents, see the [Cilkscale reference +page](/doc/reference/cilkscale/#performance-and-scalability-analysis-plots). 
+ +Here are the plots in `csplots_qsort.pdf` for the above example: + +{% img "/img/qsort-cilkscale-scalability-plots.png", "100%" %} + + +## Discussion: diagnosing performance limitations + +We have seen how to measure and explore the parallel performance and +scalability of a Cilk program. So... what next? How can we translate the +Cilkscale results into actionable insights on how to _improve_ performance? As +with serial-program profiling, the answer varies somewhat depending on the +program at hand. We will return to this question with forthcoming +documentation and blog posts. Please [let us know](/contribute/contact/) if +you'd like to be notified about important updates to OpenCilk and its +documentation. + +In the meantime, we offer a brief discussion regarding the parallel scalability +of our `qsort.cpp` example, specifically the `sample_qsort()` function. + +We observe the following: + +- Our program shows sub-linear scalability. With 8 processor cores, the + parallel speedup is only about 4.9x. +- The observed performance roughly follows the burdened-dag bound and falls + short of it as the number of cores increases. +- The parallelism of `sample_qsort()` is about 27, which is just over three + times as large as the amount of cores on the laptop where the experiments + were run. + +A main issue with our parallel `sample_qsort()` is that it does not exhibit +sufficient parallelism. The parallelism of a computation upper-bounds the +number of processors that may productively work in parallel. Moreover, +computations with insufficient {% defn "parallel slackness" %} are typically +impacted adversely by scheduling and migration overheads. As a rule of thumb, +the parallelism of a computation is deemed sufficient if it is about 10x larger +(or more) than the number of available processors. 
On the other hand, if the +parallelism is too high — say, several orders of magnitude higher than the +number of processors — then the overhead for spawning tasks that are too +fine-grained may become substantial. In our case, the parallelism is low and +exhibits sufficient slackness for only 2–3 cores. + +An additional issue is that the memory bandwidth of the laptop that was used in +these experiments becomes insufficient as more processing cores are used. This +is often the case for computations with low {% defn "arithmetic intensity" %} +when the observed parallel speedup falls below the burdened-dag speedup bound. +(Another possible cause for speedup below the burdened-dag bound is {% defn +"contention" %} of parallel resources.) The memory bandwidth ceiling was +measured at about 24 GB/s with the +[STREAM](https://www.cs.virginia.edu/stream/) "copy" benchmark in either serial +or parallel mode. + +If we want to improve the parallel performance of `sample_qsort()`, it appears +that our efforts, at least initially, are best spent increasing its +parallelism. One way to do that might be to undo the {% defn "coarsening" %} +of the base case (e.g., setting `BASE_CASE_LENGTH = 1`) but that makes the +recursion too fine-grained and introduces unnecessary spawning overhead — that +is, we may get better parallel speedup but slower execution overall. The one +remaining option then is to parallelize `std::partition()`, which is currently +serial and whose cost is linear with respect to the size of the input array. + +We will not cover parallel partition algorithms for quicksort here, but warn +that designing and implementing efficient parallel partitions is an interesting +and nontrivial exercise! 
diff --git a/src/doc/users-guide/getting-started.md b/src/doc/users-guide/getting-started.md index d98c5706..28af1d3a 100644 --- a/src/doc/users-guide/getting-started.md +++ b/src/doc/users-guide/getting-started.md @@ -102,9 +102,9 @@ Time(fib) = 1.459649400 sec ## Using Cilksan -Use the OpenCilk [Cilksan race detector](#) to verify that your +Use the OpenCilk Cilksan race detector to verify that your parallel Cilk program is deterministic. Cilksan instruments a program to -detect [determinacy race bugs](#) at runtime. Cilksan is guaranteed to +detect {% defn "determinacy race" %} bugs at runtime. Cilksan is guaranteed to find any and all determinacy races that arise in a given program execution. If there are no races, Cilksan will report that the execution was race-free. @@ -143,11 +143,11 @@ Race detected on location 7f515c3f34f6 Spawn 4995b3 nqueens /home/user/opencilk/tutorial/nqueens.c:67:29 [...output truncated...] -1.137000 +Time(nqueens) = 2.325475944 sec Total number of solutions : 14200 Cilksan detected 1 distinct races. -Cilksan suppressed 781409 duplicate race reports. +Cilksan suppressed 3479367 duplicate race reports. ``` Programs instrumented with Cilksan are always run serially, regardless of the @@ -168,11 +168,9 @@ $ xcrun /opt/opencilk/bin/clang -fopencilk -fsanitize=cilk -Og -g -D_FORTIFY_SOU ## Using Cilkscale -Use the OpenCilk [Cilkscale scalability analyzer](#) script to measure -the [work, span, and -parallelism](../../../posts/2022-05-20-what-the-is-parallelism-anyhow/ "What -the \$#@! is parallelism, anyhow?") of your Cilk program, and to benchmark -parallel its speedup on different numbers of cores. +Use the OpenCilk [Cilkscale scalability analyzer](/doc/users-guide/cilkscale) script to measure +the work, span, and parallelism of your Cilk program, and to benchmark +its parallel speedup on different numbers of cores. 
To measure work and span with Cilkscale, add the `-fcilktool=cilkscale` flag during compilation and linking: @@ -189,8 +187,9 @@ example: $ ./qsort 10000000 Sorting 10000000 integers All sorts succeeded +Time(sample_qsort) = 0.721748768 sec tag,work (seconds),span (seconds),parallelism,burdened_span (seconds),burdened_parallelism -,11.9831,0.167725,71.4447,0.168013,71.3225 +,7.32019,0.168512,43.4402,0.168877,43.3462 ``` To output the Cilkscale measurements to a file, set the `CILKSCALE_OUT` @@ -200,16 +199,17 @@ environment variable: $ CILKSCALE_OUT=qsort_workspan.csv ./qsort 10000000 Sorting 10000000 integers All sorts succeeded +Time(sample_qsort) = 0.711326910 sec $ cat qsort_workspan.csv tag,work (seconds),span (seconds),parallelism,burdened_span (seconds),burdened_parallelism -,12.3098,0.166994,73.7141,0.167288,73.5847 +,7.15883,0.168538,42.4761,0.168909,42.3828 ``` {% alert "info" %} ***Work-span analysis of specific program regions:*** By default, Cilkscale will only analyze whole-program execution. To analyze specific regions of your -Cilk program, use the [Cilkscale work-span API](#). +Cilk program, use the [Cilkscale work-span API](/doc/reference/cilkscale/#cc++-api-for-fine-grained-analysis).
{% alert "primary" %} @@ -249,6 +249,7 @@ cpu_counts=None, output_csv='out.csv', output_plot='plot.pdf', rows_to_plot='all >> STDOUT (./qsort_wsp 10000000) Sorting 10000000 integers All sorts succeeded +Time(sample_qsort) = 0.713108289 sec << END STDOUT >> STDERR (./qsort_wsp 10000000) @@ -276,4 +277,4 @@ Running the `cilkscale.py` script as above does the following: as plots in a PDF document (`plot.pdf`). For more information on the Cilkscale scalability analysis and visualization -script, see the [Cilkscale documentation page](#). +script, see the [Cilkscale documentation page](/doc/users-guide/cilkscale). \ No newline at end of file diff --git a/src/doc/users-guide/install.md b/src/doc/users-guide/install.md index 6ece1825..5c22e96e 100644 --- a/src/doc/users-guide/install.md +++ b/src/doc/users-guide/install.md @@ -103,11 +103,11 @@ Optionally, you can configure your system so that `clang` and `clang++` point to compilers (e.g., by setting your `PATH` environment variable or installing system-wide symbolic links). {% alert "primary" %} -### Example -The following example shows the process on Ubuntu 20.04 to install OpenCilk into `/opt/opencilk` -without adding a version-specific subdirectory. -The installation and setup process is analogous for macOS and other Linux systems. +_**Example:**_ The following example shows the +process on Ubuntu 20.04 to install OpenCilk into `/opt/opencilk` without adding +a version-specific subdirectory. The installation and setup process is +analogous for macOS and other Linux systems. 
- Download the precompiled [OpenCilk shell archive](/doc/users-guide/install/#installing-using-a-shell-archive) for your diff --git a/src/img/John Owens Headshot.png b/src/img/John Owens Headshot.png new file mode 100644 index 00000000..e3b2f161 Binary files /dev/null and b/src/img/John Owens Headshot.png differ diff --git a/src/img/qsort-cilkscale-scalability-plots-sample-qsort-only.png b/src/img/qsort-cilkscale-scalability-plots-sample-qsort-only.png new file mode 100644 index 00000000..9fe648aa Binary files /dev/null and b/src/img/qsort-cilkscale-scalability-plots-sample-qsort-only.png differ diff --git a/src/img/qsort-cilkscale-scalability-plots.png b/src/img/qsort-cilkscale-scalability-plots.png new file mode 100644 index 00000000..2d4c7f2d Binary files /dev/null and b/src/img/qsort-cilkscale-scalability-plots.png differ diff --git a/src/posts/2022-07-15-opencilk-2-0-released.md b/src/posts/2022-07-15-opencilk-2-0-released.md index 683c5b0c..ae6c4189 100644 --- a/src/posts/2022-07-15-opencilk-2-0-released.md +++ b/src/posts/2022-07-15-opencilk-2-0-released.md @@ -16,7 +16,7 @@ OpenCilk 2.0 is now available. See the [Install](/doc/users-guide/install) page ## Major changes OpenCilk 2.0 features the following major changes from OpenCilk 1.1: -- **[Beta]** Cilk reducer hyperobjects (a.k.a., reducers) are now supported through a new language syntax and implementation. A local or global variable in C or C++ can be made into a reducer by adding `cilk_reducer(I,R)` to its type, where `I` and `R` designate the identity and reduce functions for the reducer. Documentation on the new reducer syntax and implementation is forthcoming, but as a simple example, here is how a simple integer-summation reducer can be implemented using the new reducer syntax: +- **[Beta feature]** Cilk reducer hyperobjects (a.k.a., reducers) are now supported through a new language syntax and implementation. 
A local or global variable in C or C++ can be made into a reducer by adding `cilk_reducer(I,R)` to its type, where `I` and `R` designate the identity and reduce functions for the reducer. You can find documentation on the new reducer syntax [here](/doc/reference/reducers). As a simple example, here is how a simple integer-summation reducer can be implemented using the new reducer syntax: ```c #include @@ -36,8 +36,8 @@ int foo(int *A, int n) { } ``` - The compiler has been upgraded to be based on LLVM 14.0.6. -- Support has been improved and optimized for pedigrees and built-in deterministic parallel random-number generation. In particular, pedigrees are now correctly updated at both spawns and syncs. -- Support for pedigrees has been streamlined. To enable pedigree support, simply link the Cilk program with the pedigree library, `-lopencilk-pedigrees`. +- Support has been improved and optimized for deterministic parallel random-number generators (DPRNGs). Pedigrees — which can be used to implement custom DPRNGs — are now correctly updated at both spawns and syncs. The runtime system also supports a fast built-in DPRNG. +- Support for DPRNGs (and pedigrees) has been streamlined. To enable this support, simply link the Cilk program with the library, `-lopencilk-pedigrees`. - Many bug fixes and performance improvements have been included compared to the previous version. ## Known limitations @@ -47,7 +47,7 @@ int foo(int *A, int n) { - The default setting of floating-point contraction is now `-ffp-contract=on`. As a result, floating-point computation may behave differently with this version of OpenCilk. You can opt back into the old floating-point-contraction behavior by passing the compiler flag `-ffp-contract=off`. See [here](https://releases.llvm.org/14.0.0/tools/clang/docs/ReleaseNotes.html#floating-point-support-in-clang) for more details. - There are some standard library functions and LLVM intrinsic functions that Cilksan does not recognize. 
When Cilksan fails to recognize such a function, it may produce a link-time error of the form, `undefined reference to '__csan_FUNC'` for some function name `__csan_FUNC`. - Please report these missing functions as issues on the [OpenCilk issue tracker](https://github.com/OpenCilk/opencilk-project/issues). - - **[Beta]** You can work around this issue by passing the additional flag ``-mllvm -cilksan-bc-path=`find /path/to/opencilk/ -name "libcilksan.bc"` `` when compiling the Cilk program. + - **[Beta feature]** You can work around this issue by passing the additional flag ``-mllvm -cilksan-bc-path=`find /path/to/opencilk/ -name "libcilksan*.bc"` `` when compiling the Cilk program. (Note that `/path/to/opencilk/` should *not* be the path to the `clang` executable, but is instead the path to the whole OpenCilk installation, such as `/opt/opencilk/`.) ## Acknowledgments diff --git a/src/posts/2022-09-07-my-experience-teaching-performance-engineering.md b/src/posts/2022-09-07-my-experience-teaching-performance-engineering.md new file mode 100644 index 00000000..15e0b82b --- /dev/null +++ b/src/posts/2022-09-07-my-experience-teaching-performance-engineering.md @@ -0,0 +1,120 @@ +--- +layout: layouts/post.njk +title: My experience teaching software performance engineering +tagline: Lessons I learned creating a new course at my university +author: John Owens +date: 2022-09-07T20:08:01.475Z +attribution: false +--- +{% imgLeft "/img/John Owens Headshot.png", "130px" %} + +In fall 2021, I taught a graduate course in performance engineering at UC Davis, based on [MIT's 6.106](/community/teach-performance/#performance-engineering-of-software-systems). +I had an absolutely fantastic time learning and teaching this material, and was fortunate to have such a rich set of teaching material from which to begin. +I hope this account inspires you to try your own hand teaching performance engineering. 
+ +My friends in industry tell me they see so many job candidates who can program in JavaScript and Python but simply don't have the background to dive into performance at the level needed at their companies. +What I internalized from my teaching experience is that performance engineering is something that can be taught, and taught well, and can deliver real and rare skills to our students. +So much of our teaching is small and deep slices of technical material, +but it's at least as important to understand the big picture of techniques and approaches that cross layers of the computing stack. +Finally, besides its importance, I found this material provided a rich intellectual experience for me, and a lot of fun besides. + +If you have any questions or want to discuss teaching performance engineering, please [let me know](https://www.ece.ucdavis.edu/~jowens/). + +## Course context: students, prerequisites, and academic calendars + +My course (EEC 289Q) had a number of structural differences from MIT's offering: + +- MIT's course targeted undergraduates, while ours was an introductory graduate course with a significant fraction of undergraduates (over 20%). In the future, I hope my course will be offered at a "mezzanine" level for advanced undergraduates and beginning graduate students. +- MIT had recitation sections alongside lectures, but we did not. +- MIT students have a more uniform background, and so the instructors can count on particular prerequisites. Our graduate students come from many universities and vary significantly in their backgrounds. +- Our students were mostly from my home department, electrical and computer engineering, and hence generally had a solid computer engineering background but less computer science. +- Our course had 53 students; MIT's is larger, with about 140. + +Fortunately, this course material can be taught without an enormous amount of prerequisites, or at least this was my experience. 
For example, unlike their peers at MIT, many of my students had quite modest theory backgrounds, but I feel that I was able to successfully communicate the theory required in the course. The essential prerequisite, in my opinion, was programming experience in C, but even students who lacked that were still able to succeed (with effort). + +I am thankful that my lecture style is fairly similar to Professor Leiserson's in terms of pace and the kind of material we put on our slides. Perhaps the most significant difference is that his strength in theory allows him to dive deep into theory details, whereas my more modest background forced me to try to acquire a more intuitive, less math-y theory understanding (which is how I presented it to my students). + +The other important difference in my course was the length. UC Davis has a quarter system and MIT is on semesters, and beyond that, the MIT course appears to be 1.5x the number of units of a normal MIT semester course. Thus MIT students in 6.106 are doing more than double the work compared to my course offering. + +## Course structure: homeworks and projects + +MIT had more homeworks and more projects than UC Davis's offering. We had two homeworks: + +- A "getting started" assignment: "This assignment introduces the environment and tools you will be using to complete your future project assignments. It includes a quick C primer. You should use this assignment to familiarize yourself with the tools you will be using throughout the course." +- A Cilk tools assignment: "In this homework, you will experiment with parallelizing in Cilk. You will learn how to use Cilksan to detect and solve determinacy races in your multithreaded code, and how to measure a program’s parallelism using the Cilkscale scalability analyzer." 
+ +And we had three projects: + +- Rotating a large bitmapped image (we used MIT's assignment with virtually no changes): "This project provides you with an opportunity to learn how to improve the performance of programs using the `perf` tool and to experiment with the abstraction of a computer word as a vector of bits on which bitwise arithmetic and logical operations can be performed." +- Parallel Graphical Screensaver with OpenCilk (also MIT's project with virtually no changes): "Optimize a graphical screensaver program for multicore processors using the OpenCilk parallel programming platform." +- Find a valid route on a 2D grid with global and local constraints (requiring a parallel search). I wrote this assignment from scratch. It involved solving the logic puzzle [Masyu](https://en.wikipedia.org/wiki/Masyu). (The students did not know this.) + +The MIT homeworks overall were written well, with copious detail, and while I would have enjoyed giving them all if we had had time in the quarter, we did not. I thus focused the homework on tools and Cilk preparation. + +I personally have not yet written even a single line of Cilk, nor did I do any of the assignments except for writing the serial version of the third project. I simply did not have the time to do this work and thus sadly embody the adage, "Those who can, do, and those who can't, teach." + +MIT has a large capstone project ("Leiserchess") that I chose not to offer; I think it is simply too large for a quarter-length course. It certainly encapsulates many of the principles from the course, but it also is fairly chess-specific and benefits from the extensive knowledge on that particular topic from the MIT course staff (experience that I lack). + +Projects are quite time-consuming to create and a library of projects available to instructors would ease a significant concern among potential instructors of this course. 
I chose a search-based project for the one I created because I wanted to replace the (largely) search-based Leiserchess project from 6.106. I would do well to better understand the wide range of Cilk application domains, however. + +## Course infrastructure: reliable performance measurements + +The course itself emphasizes challenges with getting reliable computer performance measurements. These are significant challenges, especially when a course offering does not fully control its own computing infrastructure. Thus MIT's offering of this course goes to considerable effort to configure AWS machines to give the most reliable measurements possible. + +We adapted MIT's configuration of AWS for our own course. MIT TAs were quite helpful in getting the scripts running. However, it was a significant amount of time for us to maintain this infrastructure; we had one 10-hour-a-week TA who basically did nothing but keep the servers running, with the infrastructure often failing in a way that required manual TA intervention. The difficulty of setting up AWS infrastructure (and billing) and especially keeping the infrastructure running without intervention is a significant hurdle to exporting this class. Rock-solid scripts that can be easily ported to other universities would be highly desirable. Without more core infrastructure development, I anticipate that other faculty will face similar difficulties as we did (per-university ad hoc scripts written or modified by non-experts that require significant manual intervention). + +We spent $155 per enrolled student on compute time, which is in line with my and MIT's expectations. My department funded this directly in fall 2021. We face a hurdle in finding long-term funding for this course, however. 
+ +## TA allocation + +We had two TAs: + +- The 20-hours-a-week TA was largely responsible for grading and evaluation, which included a fair amount of TA-developed auto-grading infrastructure (students submitted code and the TA's code would automatically evaluate it for correctness and performance). It may be there is existing infrastructure for some of this (possibly open-source software?) but I am at this time unaware of such infrastructure. Some of what was developed can be reused in future years, although the projects will likely change. This TA also graded student writeups associated with student assignment submissions. In general this part of TA responsibilities will scale with the number of students (twice as many students means twice as much effort), although hopefully the auto-grading would not (it costs the same amount of time whether we have one or 100 students). +- The 10-hour-a-week TA spent all of his time with AWS: how to use it and how to keep it running. This part of TA responsibilities does not have to scale (it should take the same amount of effort no matter how many students we have). + +MIT has regular recitations led by TAs as well as industry professionals reviewing code. I would have loved to have had either, but did not have the budget (or bandwidth) to do so. + +## Lectures + +MIT did not fully populate its (presumably) 30 lecture slots with regular lectures (instead using some guest speakers and code walkthroughs), so I was able to present most course material that was covered by the MIT course in the 20 lectures I had available. 
Those 20 lectures included: + +- Introduction and Matrix-Matrix Multiply +- The Bentley Rules +- Bit Hacks +- Computer Architecture +- C2Assembly +- What Compilers Can and Cannot Do +- Multicore Programming +- Races and Parallelism +- Parallel Algorithms (2 lectures) +- Measurement and Timing +- Cheetah Cilk Runtime +- Storage Allocation +- Parallel Storage Allocation +- Cache Efficient Algorithms +- Cache-Oblivious Algorithms +- Nondeterministic Parallel Programming +- Synchronization without Locks +- Potpourri +- Speculative Parallelism + +The only brand-new lecture from me was Potpourri, where I presented 5 vignettes: +- Brendan Gregg's [Flame Graphs](https://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html), with hopes students would feel comfortable writing their own simple visualization tools to make sense of performance data. +- An ARM spinlock, [explained in great detail](https://blog.regehr.org/archives/2173). +- Timsort, how it works, and what it targets (not "sort a random array" but instead real arrays that typically have some structure). See [here](https://bugs.python.org/file4451/timsort.txt), [here](https://hackernoon.com/timsort-the-fastest-sorting-algorithm-youve-never-heard-of-36b28417f399), and [here](https://medium.com/@rylanbauermeister/understanding-timsort-191c758a42f3). +- [Fast case conversion](https://github.com/apankrat/notes/tree/master/fast-case-conversion), a simple problem but one with good compute-vs.-space tradeoffs and good engineering. +- "[How I Cut GTA Online Loading Times by 70%](https://nee.lv/2021/02/28/How-I-cut-GTA-Online-loading-times-by-70/)", a wonderful story of debugging and performance analysis with a lovely ending. + +The MIT teaching staff (Charles Leiserson and Jonathan Ragan-Kelley) made both their lecture slides (PPT) and their lecture recordings available to me. I watched every lecture. 
(Having played them all at 150%, I was a bit shocked when I finally talked with Professor Leiserson and discovered he spoke at a normal rate.) I also converted all lecture slides from PowerPoint to Keynote and made notes on what I changed/added; for me, converting slides is time well spent because I get a chance to go over every slide. + +Having access to slides and recordings was absolutely critical for me to learn the (significant amount of) material I didn't know, and to begin with high-quality slides, freeing me from creating my own. I made detailed notes after each lecture on what went well, what didn't go well, what changes I made before the lecture, and what I'd change for the next time. Professor Leiserson and TB Schardl were generous with their time in answering my questions and receiving my feedback. + +## The Future + +I would like to survey industry colleagues to understand the topics and technologies they would find useful. One lecture I would like to develop is vectorization, including the underlying hardware instruction sets; how they are accessed from C; and what compilers can and cannot do with respect to auto-vectorization. + +I have communicated the following to the MIT course staff, which has been understanding: the course is rather MIT-focused, with much of the course material building from research done at MIT. This is of course Absolutely Appropriate for an MIT-taught course, but exporting 6.106 to other campuses allows a fresh perspective. The lecture on speculative parallelism was highly chess-focused and was the most difficult to give simply because I did not have the deep background required to give it; I would leave it out next time. + +The course also has a strong focus on Cilk. First, this is absolutely appropriate for this course, and the dual theoretical and practical treatment of Cilk in the course was really a treat to learn and teach. 
That being said, Cilk is not a panacea for parallel computing, and moreover, due to its not-quite-mainstream status in modern toolchains, it may not even be available to all potential users. The course should not present Cilk as the culmination of parallel computing programming models/environments. A deeper dive into alternate models and programming environments for parallelism (e.g., modern-C++ std::thread) would strengthen this class. Parallel data structures (e.g., parallel STL) might also be a good topic. + +Cilk builds on more than two decades of research and it was, more than once, a challenge to understand some of the underlying motivations for / principles of Cilk. I could find some of them in 20+ year old papers but there is no modern description anywhere of Cilk — its design goals, decisions, and principles. (It was a significant challenge to dig through the Cilk literature to get answers to some of my questions.) It would benefit from a from-scratch paper that explained it as if it were a new system. diff --git a/src/posts/2023-07-07-summary-of-educational-outreach.md b/src/posts/2023-07-07-summary-of-educational-outreach.md new file mode 100644 index 00000000..84d80cfd --- /dev/null +++ b/src/posts/2023-07-07-summary-of-educational-outreach.md @@ -0,0 +1,244 @@ +--- +layout: layouts/post.njk +title: Summary of educational outreach +tagline: What we learned from interviews with faculty worldwide +author: John Owens +date: 2023-07-07T00:00:00.000Z +attribution: false +--- +Over the past year, the OpenCilk Academic Board has conducted focused interviews with more than a dozen faculty worldwide. We believe that a key part of our effort to create a community around performance engineering is teaching students. We hope that we can produce students ready to meet the significant need of the computing community by developing a high-quality curriculum in performance engineering, and ensure that curriculum can be easily adopted by faculty at both research- and teaching-focused schools. 
+ +We are fortunate to be able to begin a curriculum with the outstanding MIT course led by Charles Leiserson over the past twenty years. Many faculty with whom we spoke were interested in the educational side of performance engineering specifically because of his class (and that's how I found this community as well). + +In our outreach, we focused on the following questions: + +- What constitutes a performance engineering curriculum? +- What are the obstacles to faculty offering performance engineering course material? +- What materials can we provide to the community that help faculty teach performance engineering? +- In what directions should we take a performance engineering curriculum? + +The majority of our outreach was to North American faculty at R1 (research-focused) universities, but we were also able to talk to faculty outside of North America and to faculty at teaching-focused universities. Because we talked to faculty who had already expressed interest in the topic, and/or had been recommended as having an interest by other faculty, we naturally had a positive reception. + +---- + +# Curriculum + +In general, faculty agreed that the syllabus for the MIT course was broad and appropriate. Faculty appreciate that this course spans multiple levels of the stack (architecture, programming systems, theory), in contrast to much of the computer science and engineering curriculum that focuses on one level only. (Faculty feel that both single-layer and multi-layer courses in the curriculum are relevant and appropriate, but multi-layer courses are less common.) 
+ +Some (but not all) faculty would expect to teach only a subset of the MIT course, for several (totally reasonable) reasons: + +- The material they would leave out would have been well-covered in previous courses +- They don't have a background in the material they would leave out +- They prefer to focus more deeply on a subset of the MIT course material +- They would include other material and thus could not cover all the material from the MIT course + +There was a significant hope for a "modular" performance engineering curriculum where different pieces of material could be easily integrated into an existing course. + +## What should we add to the curriculum? + +The primary topics mentioned were distributed computing (multi-socket and certainly multi-node, including frameworks like Spark), GPUs, and performance engineering of networks and I/O. We would like to provide a roadmap for faculty who approach this material from a theory perspective and from a parallel-programming perspective; both perspectives have many faculty who teach that course and if we provided a smooth on-ramp to add performance-engineering material, that would be well-received. + +### Specific topics mentioned beyond the scope of the MIT course + +- Vectorization, although I understand MIT has recently added this topic to their course. I believe the important outcome of this material is for students to understand what the compiler can do (automatically) with respect to vectorization and what it cannot. I have looked at [Google Highway](https://github.com/google/highway) as an interesting abstraction. The Google team noted LLVM's [Vectors and Extended Vectors](https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors). +- C++ performance issues, e.g., in-place vs. insert, constructors, pass-by-value vs. pass-by-reference. 
+- Better intuition on how to think about programming in Cilk +- Test suites and test-driven development, which MIT covers in the software-engineering prerequisite to its performance-engineering course. +- Performance issues with managed languages, with Rust mentioned most often. Garbage collection. + - JavaScript has ~little uptake in universities but is highly optimized and highly important in industry because of its use in web browsers. The kind of virtual-machine environment used in JavaScript would be useful for students to know. +- A focus on traditional (parallel) computing primitives (e.g., scatter, gather, map, reduce, sort) and then how those are implemented in different parallel programming models +- Benchmarks: different levels of benchmarks, how they're useful, what are the characteristics of good ones +- Performance counters. +- How security interacts with performance engineering +- A lab on performance debugging. (This could be a walk-through in a complex project: how can we start with something that's not performing well where we don't know why and dig down until we do?) +- Industry feedback on what should be included in a compiler lecture: Icache alignment, which is hard for C compilers to optimize; data layout; calling conventions; SOA vs. AOS. +- Entire courses on: + - Virtual machine design, where students build a compiler that supports VM, including an object model and a garbage collector ([Mario Wolczko's fall 2015 UC Berkeley course](http://www.wolczko.com/CS294/index.html)) + - GPU performance engineering + - Case studies, using skills from the intro course to address real-world / complex problems (from industry) + +### Specific tools mentioned that are useful + +Nearly every faculty member and industry expert noted "perf", the Unix profiler, was a critical tool to teach. 
Other recommended tools: + +- Cilk internal tools (Cilkscale, Cilk*) +- PAPI (hardware counters) +- Cachegrind (cache behavior) +- vTune (Intel) +- Google pprof +- [uops.info](https://uops.info/uiCA.html) +- Google's [Perfetto](https://perfetto.dev/) ("System profiling, app tracing and trace analysis") +- Emery Berger uses and teaches [Coz](https://arxiv.org/abs/1608.03676) in his class + +### Profiling vs. instrumenting + +The MIT course focuses on using performance tools (e.g., profilers) to measure performance, identify bottlenecks, and address them. Such tools (broadly) don't modify source code but instead measure. + +One of our faculty contacts instead described his philosophy of _instrumenting_ code rather than _profiling_ it. He inserts measurement code (instrumentation) into his source code to understand how it is running. + +Both approaches are valuable and it was an interesting discussion within the OpenCilk group on the need to also teach instrumentation. This is certainly an interesting topic for an additional lecture. + +## Sharing course material + +Many (but not all) faculty, because of time pressure, will eagerly begin their course preparation with course material developed by other faculty. (I never would have considered teaching this course without such material.) In general faculty are theoretically interested in sharing their course material but nervous about making it available to the whole world; the faculty concern with the latter is having some degree of control over their material. Limited sharing with a known community seems to be a good middle ground. + +Thus we are addressing this by asking faculty to share their material in a private share accessible to other faculty within the performance engineering community. We attempt to set a good example by sharing our material at the outset. Faculty have been positively responsive to this plan. 
+ +## Long-term curriculum goal + +The goal of a modular performance engineering curriculum is laudable. The MIT course is already fairly modular (in my opinion); the list of topics is clear and distinct and most lectures are self-contained. The ideal curriculum, though, has the following characteristics: + +- **A broader set of material**. The current course focuses primarily on single-socket CPUs. Many of the interested faculty come from a supercomputing or distributed-computing background and teach parallel programming or theory. The current course does not address network or I/O performance. And most faculty mentioned performance engineering of GPUs as a necessary component. +- **Material at different levels of detail**. Any topic in the current course could likely be turned into an entire course, and probably has. In an ideal curriculum, any particular topic would have material at multiple levels: (for instance) 20 minutes of lecture material, one lecture, three lectures. Another word for this goal is a _hierarchical_ curriculum. +- **A dependency tree of material**. For both the existing course and in adding future material, we should carefully understand the knowledge that must be covered before presenting any particular piece of material. What we need is a dependency tree: if you want to present X, that's dependent on Y and Z, and Z is dependent on A. Such a tree will hopefully allow instructors to mix and match material with confidence. +- **Faculty who "own" or manage particular lectures or materials**. On some topics, faculty are experts, and readily and gladly update their own course material year after year. We face a tension of existing instructors wanting to entirely "own" their own courses but also wanting to incorporate up-to-date and thoughtfully written material from others. One possible direction we could go is to allow faculty to take charge of particular aspects of the curriculum and commit to update materials and answer questions about them. 
Our hope is that for an expert faculty member, this is not burdensome but instead a willing task, and that the collective efforts of many expert faculty members will create a greater good for all. + +## Relevant projects we heard about during our conversations + +- https://csinparallel.org/index.html + - [Patternlets](https://csinparallel.org/csinparallel/modules/patternlets.html) were the best received outcome of CSinParallel +- https://eduwrench.ics.hawaii.edu/pedagogic_modules/multi_core_computing/ +- https://accidentallyquadratic.tumblr.com/ +- https://lemire.me/blog/ +- http://cs-materials.herokuapp.com/ +- https://bridgesuncc.github.io/newassignments.html +- [NERSC training](https://www.nersc.gov/users/training/events/) + +# Faculty obstacles to teaching + +At R1 schools, the primary obstacle to adding a new course or new course material is faculty time. Unanimously faculty indicated their time to develop new courses or material is quite limited, and additionally not rewarded well by their universities. Many faculty have little ability to choose their teaching assignments (they are needed to teach existing classes), and computer science and engineering departments today often have large enrollments so faculty are often unable to add new courses when their current courses are in high demand. + +That being said, faculty were enthusiastic about both the need for the material and the existing (MIT) course that teaches it. They were also enthusiastic about the topic and that the scope of the topic was (at least) the size of an entire course. Many faculty expressed optimism that the combination of a clear benefit/need for this curriculum and their interest in teaching it would allow them to regularly teach such a course in the future (though not necessarily immediately). 
+ +In general faculty felt this course was best suited as an upper-division undergraduate course (although at my university I am advocating it as a "mezzanine course" for new grad students and advanced undergrads, and I first taught it as a graduate course). One faculty member noted that the largest body of degree-holders entering industry holds BS degrees, and thus we should focus on that cohort of students. Other faculty noted the growth of their (online) MS programs and that this material would be useful there. (Finally, some departments also offer industry-focused courses and this material potentially fits that need.) + +The OpenCilk team has discussed targeting newly hired faculty at R1 schools, as these faculty often have the ability to introduce a new course as they begin their teaching careers, and would likely be receptive to a high-quality course with ample available course material. + +## What is different about teaching schools? + +- Teaching schools have limited computing resources---R1 schools likely have large compute resources on campus / from the labs of faculty, but teaching schools lack these. +- Departments also have very few resources (e.g., to fund cloud compute time), although many teaching schools require lab fees. +- Teaching schools may have more flexibility to offer new courses if they fill needs in the curriculum. However, given teaching-school constraints, it may be more successful to target how to integrate our material into existing courses rather than introduce new courses. +- Their major challenge is that faculty are stretched thin in terms of their existing teaching assignments. One professor noted that in fact, at teaching schools, faculty are often asked to teach material that is far from their experience or teaching interest. 
+- Faculty are more invested in sharing their teaching experience in an academic setting such as SigCSE + +## Training + +Our team settled early on training professors to teach performance engineering as a key part of our effort. We had lengthy discussions about this, and I present my views: + +- Faculty are extremely busy and any training is a significant commitment on their part. Any effort we make must be scheduled well in advance and attract a critical mass of faculty to be worth the time that all instructors and attendees would put into the event. The disaster scenario is a well-planned training where only three people attend. +- For faculty to commit, we must present to them an experience that is clearly worth their time. Attractive elements include: + - (Likely) a summer timeframe, because it is too hard to schedule a good time for a large fraction of faculty during the academic year + - Location. Two attractive alternatives: + - Just before or after a popular conference + - A location where faculty would happily visit. Hawaii was mentioned more than once. + - A group of faculty who would be happy to spend time together. I'm confident that the faculty with whom we spoke over the past year would be fast friends with many common interests, and if a significant number of them committed to going, other faculty within and without the group we spoke to would find spending a few days with that group highly attractive. + - A timeframe that delivered a lot of value per hour spent. We have not internally discussed the right timeframe; I tend to think one or two days would be a good length of time. And it would take a significant amount of effort on our part to make our training valuable. + - Travel costs covered at least, with possibly a stipend. +- The contents of such a training are challenging given that nearly all faculty involved in our discussions have never attended nor led teacher training. 
The default training would be "explain the material, go over the syllabus" but that is not nearly as helpful in my opinion as more focused material on helping faculty teach it better / use their time more wisely. (For instance, it is straightforward to ask faculty to read the syllabus and peruse the slides ahead of time.) More valuable is guidance that the trainers can convey that is best suited for an in-person setting: here are the sticky points of teaching this material, here is the string of student questions you'll get, here's where you want to guide students with this material. The goal of training should not be "familiarize faculty with the material" (that's nice, but that's not most important); instead perhaps lessons learned from teaching the material, and feedback from students and how that should influence future curriculum development. +- Teaching-university faculty have fairly different goals and interests from R1 faculty. Teaching-university faculty are rewarded by their institutions for training and for attending training, and have had significant success in organizing training. In addition their funding model is more geared to attending such training (R1 faculty grants or department/college resources would rarely cover attending training). Successful previous teaching-university workshops have been: + - (Workshop series 1): Regional (short travel needs) and paying for food/housing + - (Workshop series 2): Stipends for both attending AND (additionally) coming back the next year and presenting adopted material. + +Beyond the training efforts above, few faculty (and very few without an MIT lineage) have any Cilk experience at all. (I did not when I taught my course. I think it's possible to teach this material without practical experience, but it is not ideal.) Online materials for faculty to learn what _they_ need to know about Cilk would be ideal. 
One faculty member specifically noted he wanted a "NOT gentle introduction to Cilk, one that is hands-on". He does not want "hello world" in Cilk because then if that's all he knows, he can't help students beyond that. + +### TA Training + +One faculty member noted his concern that his _TAs_ did not have any experience with some of the technologies in this course, notably Cilk. He was (reasonably!) concerned with introducing these technologies into his course and his TAs not being able to support them. The OpenCilk group should absolutely have online training materials for TAs that tell TAs what they need to know to supervise OpenCilk instruction and projects. + +Another faculty member listed three hopes he has for his TAs: + +- Guide students when they are solving homework problems / projects +- Grading homeworks / projects and timing them +- Providing useful comments and suggestions on student code + + +## Textbook + +Several faculty expressed a strong desire for a textbook. Several others were clear they don't care if there's a textbook. And several others felt a textbook would be useful but is not required. Everyone agreed there was no appropriate current textbook. + +## Grading + +Grading, and the use of TAs to make detailed grading viable, is a significant concern. Infrastructure for auto-grading assignments (measuring their correctness and performance) would be appreciated. + +## OpenCilk and long-term viability + +Many faculty were enthusiastic about OpenCilk as a powerful way to program multicore machines. However, they were understandably concerned about teaching a technology where they felt students would have little ability to use that technology once they reach industry. The largest computing companies today simply don't use OpenCilk. 
The reasons that OpenCilk is not in LLVM (or other widely used technologies that are used in these companies) and thus unavailable to programmers at those companies are long, complex, and well beyond the scope of this document. But, it is clear to me that the long-term impact of OpenCilk is gated by its ability to live within standard toolchains and thus be available in generic industry computing environments (e.g., using a standard compiler toolchain). + +Professors mentioned that students' intuition on Cilk could be improved, and that the community would benefit from a single (new) paper that summarizes Cilk as a programming model and implementation and discusses design decisions. If we built Cilk from scratch tomorrow for the first time, how would we write it up? + +## OpenCilk as a pinnacle + +The MIT course treats OpenCilk as the pinnacle of parallel-programming environments. OpenCilk is beautiful, both theoretically and in practice, and honestly is a joy to teach. But, I found myself lacking a broader context of the scope of multicore parallel programming environments. I would like to see more of a taxonomy of the type of parallel computing problems (e.g., OpenCilk excels at divide-and-conquer, but how about structured-grid computation? sparse matrix operations?) and different programming models and how they approach points in this taxonomy, and why certain models are a good fit for certain kinds of problems. This is especially important for students who may (soon!) work in companies where OpenCilk is not available, and who may need to choose other environments. + +# Materials + +Faculty had a broad variety of materials they would find useful in preparing a performance engineering course. Some were obvious, some less so. + +(Personally, when I taught this class for the first time, I benefited greatly from PowerPoint lecture slides, from recordings of the course that used those slides, and from the assignments that were used in one instance of the course.
The MIT teaching staff also shared their AWS setup, which we adapted for our course's use.) + +## Lecture slides and videos + +The MIT course's lecture slides have been developed over many years and are superb. The combination of lecture slides and videos of those lectures being delivered are the single most important item that we can provide. MIT uses PowerPoint and I converted them to Keynote but PowerPoint is, generally, the most portable. + +## Assignments + +MIT divided student work into two categories: assignments, which are more about learning the tools and the software environment, and projects, which are larger and more open-ended. I adapted existing MIT assignments and projects into the work I gave students, and also wrote one project. + +A library of existing projects that are interesting and educational for students is highly desirable. The worst-case scenario for faculty is for a non-expert faculty member to attempt to develop a new project, to spend significant time on it, and to eventually decide it is not a good project. The faculty member can't get that time back and is often up against a deadline in the first place. Existing projects both save time and eliminate that worst-case scenario. The challenge is even greater when developing Cilk projects because almost no faculty members are expert enough at Cilk to know at the outset if a project is good or not. Timely feedback from the Cilk team would take little time from that team but would greatly help faculty avoid developing non-viable projects. + +MIT's philosophy in writing projects is to give students working code that fulfills the project description but is unoptimized and/or slow and/or unparallelized. Students are then responsible to improve project performance.
This is a good philosophy (in my opinion) but it does require identifying projects where the performance gap between initial code and final code is both large and interesting from a course-materials perspective (e.g., parallelization using Cilk is an interesting and appropriate parallelization). + +## Case studies + +One topic that came up during our discussions with the Google Chrome team is their need to optimize existing code. In contrast, most course assignments involve writing new code, not analyzing and testing existing code. Through our discussions, we collectively believe it would be highly valuable to the performance-engineering community to have a set of _case studies_: existing code that is slow, for some reason, where students must profile and/or instrument this code to find where it is inefficient and fix it. For the Chrome team, and likely by extension for many industry scenarios where performance is vital, this is a significant part of their job. University courses do not teach this. Now, identifying and writing good case studies is an enormous challenge; it seems imposing to be able to write one from scratch, but the alternative is identifying such a performance-improvement opportunity in the wild, and that is equally imposing. Faculty also noted that once such an assignment is released, it is no longer able to be reused. All that being said, business schools rely heavily on case studies in their instruction; we should consider why and how they are used; and more effort on our part to identify interesting case studies and see how they can be used in the curriculum is time well spent. + +Georgia Tech's [Scientific Software Engineering Center](https://ssecenter.cc.gatech.edu/) might be a source for case studies. + +## Computing resources + +Teaching performance engineering requires accurately measuring performance. The MIT course stresses reproducible, consistent measurements by configuring servers to allow this ("quiescing"). 
We believe quiesced servers are an important aspect of performance-engineering course assignments, but it means that a performance engineering class can almost certainly not use generic department-supplied computing environments without effort. (For instance, the typical department computing resource is a pool of Unix workstations that allow local and remote logins; it is very difficult to get good performance measurements when many students can be logged into, and running jobs on, the same machine.) + +We began our outreach with two tentative paths toward providing course computing resources that would allow accurate and repeatable performance measurements. The first is using a cloud provider (e.g., AWS). The second is internal university computing resources. Both, of course, have pros and cons. + +### Using a cloud provider + +The benefit of a cloud provider is that it "only" costs money but does not require internal department resources, either computers or sysadmin time. That being said, setup is not trivial, but we can provide our instructions for others. + +We expect computing demand to vary significantly during an academic term (with peaks near assignment deadlines). Cloud providers are able to provide elastic service and easily handle this variance. + +When I taught my course at UC Davis, we used AWS for student assignments. UC Davis has an AWS relationship already so billing and student account creation were more straightforward than if we did not have this relationship. MIT also has used AWS. Our cost was $155 per enrolled student over an academic quarter (ten weeks). + +This option is perhaps best for departments that have a little more cash than time or computing resources. It is also probably more straightforward for departments that want to offer an experimental (one-time) course before committing to adding it to their curriculum. 
+ +The OpenCilk team also had informal discussions with a cloud computing provider with the hopes that such a provider could supply a customized service so that complex configuration was not necessary. While this has not been successful to date, it remains a future option. + +### Local machine resources + +The other alternative is configuring local machines within a department. Because of quiescing requirements, this machine or machines would likely be a server with managed access. UC Davis has turned to this alternative for our next offering. We will configure a single server to be used only for this class for the entire academic term, and control access to this server via a job submission interface (Slurm). + +This option is perhaps best for departments that have a server or servers that can be dedicated to the course for the entire academic term. + +### A third option: NSF ACCESS + +One of our faculty contacts noted his success with the [NSF ACCESS](https://access-ci.org/) program for student assignments ([request here](https://allocations.access-ci.org/prepare-requests-overview)). He also noted that part of his responsibility as an instructor when using ACCESS was making his educational materials available to the ACCESS program and presumably the computing community. (This is an excellent tradeoff!) + +We have not yet investigated the suitability of ACCESS for quiesced servers or for a typical student computing load in a performance engineering course. + +### Cost sharing + +The OpenCilk team also discussed pursuing grant funding with which we could subsidize computing time for newly-introduced performance engineering courses at other universities. We hope this would help faculty convince their departments of both external interest in the course material as well as a vested interest into making it easier for the department to offer the course. 
Our idea of a subsidy (rather than fully covering the cost) was to ensure that the department also had a (financial) interest in the course, although of course many models are possible here. + +# Industry feedback + +- "New candidates (university grads) can’t use perf, don’t know assembly." "If you've never done this before, it takes a long time to learn. ... takes 6--12 months to get really effective." +- "The most common problems we saw with new engineers were (1) lack of performance skills, and (2) a complete and abject fear of parallelism." +- Industry teams use many tools including homegrown ones, but "perf" is ubiquitous. Many homegrown tools are built simply to solve a particular problem and can be thrown away afterwards---"build the simplest tool that does the job". +- The skills needed to be a good performance engineer overlap substantially with the skills needed to be a good security expert. +- New college graduates should: + - "Have an appetite to dig beyond the 'obvious thing'." + - Work in a VM space on some industry product (e.g., JVM) + - Have built something successfully, improved something successfully. Signal that they are capable on the technical level. Not so many candidates work in the VM space. + - “Performance work”. Similar skills: security people who can exploit a VM/compiler. + - Understand lowest level of C++, e.g., pointers, performance tools. +- Many many candidates have never seen C++ and don't understand memory allocation. Much C++ code looks like it was written by a Java or Python programmer (this is undesirable). 
diff --git a/src/webdev/defns.md b/src/webdev/defns.md new file mode 100644 index 00000000..91466ebe --- /dev/null +++ b/src/webdev/defns.md @@ -0,0 +1,43 @@ +--- +title: Defns +--- +{{ collections.defnTerms | log }} +{% for key, value in collections.defnTerms %} +{{ key }} : {{ value }} +{% endfor %} + +```cilkc +void qsort(int* begin, int* end) { + if (begin < end) { + int last = *(end - 1); // get last element + int * middle = partition(begin, end - 1, last); // partition and return ptr to first elem >= last + swap((end - 1), middle); // move pivot to middle + + qsort(middle+1, end); // sort upper partition + qsort(begin, middle); // sort lower partition (excluding pivot) + + } +} +``` + +```cilkc +void m_mult(m_t A, m_t B, m_t C) { + for (int i = 0; i < A.rows; ++i) { + for (int j = 0; j < B.cols; ++j) { + for (int k = 0; k < A.cols; ++k) + C[i][j] += A[i][k] * B[k][j]; + } + } +} +``` + +```cilkc +void p_m_mult(m_t A, m_t B, m_t C) { + cilk_for (int i = 0; i < A.rows; ++i) { + cilk_for (int j = 0; j < B.cols; ++j) { + for (int k = 0; k < A.cols; ++k) + C[i][j] += A[i][k] * B[k][j]; + } + } +} +``` \ No newline at end of file diff --git a/src/webdev/webdev.11tydata.js b/src/webdev/webdev.11tydata.js new file mode 100644 index 00000000..9edba868 --- /dev/null +++ b/src/webdev/webdev.11tydata.js @@ -0,0 +1,26 @@ +require('dotenv').config(); + +const isDevEnv = process.env.ELEVENTY_ENV === 'development'; + +module.exports = function() { + return { + eleventyComputed: { + eleventyExcludeFromCollections: function(data) { + if(isDevEnv) { + return data.eleventyExcludeFromCollections; + } + else { + return true; + } + }, + permalink: function(data) { + if(!isDevEnv) { + return false; + } + else { + return data.page.filePathStem.replace('webdev/', '/') + '/'; + } + } + } + } +} \ No newline at end of file diff --git a/src/webdev/webdev.json b/src/webdev/webdev.json new file mode 100644 index 00000000..c6de2f19 --- /dev/null +++ b/src/webdev/webdev.json @@ -0,0 +1,7 @@
+{ + "layout": "layouts/page.njk", + "stylesheet": "users-guide.css", + "sidebar": "toc", + "background": "text-white bg-info", + "tags": "webdev" +} \ No newline at end of file diff --git a/src/webdev/webdev.md b/src/webdev/webdev.md new file mode 100644 index 00000000..bd9c476f --- /dev/null +++ b/src/webdev/webdev.md @@ -0,0 +1,16 @@ +--- +title: Webdev +--- + +## Web development + +Files in the `/src/webdev/` folder will be ignored in production and compiled only in development environments. +You can set your environment as development with the file `/.env`: + +```bash +ELEVENTY_ENV=development +``` + +You can also put `draft: true` into the front matter of any article in `/src/doc/`, and it will be ignored in production and compiled only in development environments. + +The alternative color scheme seen here indicates that the page is `draft` and does not appear in production environments.