From 7f0ee87e933665d02de5ef28d665bdca03a2ed72 Mon Sep 17 00:00:00 2001 From: Nara Date: Wed, 20 Nov 2024 17:45:47 +0100 Subject: [PATCH] Develop stream 2024-10-28 (#574) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci: set up sccache * ci: Add crush statistical tests * ci: Add statistical tests for distributions * ci: Use sccache with statistical tests * build(docs): update and cleanup dependencies * ci: Do not create packages with static libraries Binaries of tests and benchmarks are large with static linking, packing takes about 12 minutes (though these packages are not used). * remove TBB dependency The host-side generators are only really provided for cuRAND compatibility. Their performance is not really of interest, and removing the TBB dependency eases packaging and maintenance. * Resolve "Add small examples to API documentation" * fix(benchmark/tuning): fix one definition rule violation * Apply suggestions from code review docs(api-reference/cpp-api): improve grammar and punctuation Co-authored-by: Jeffrey Novotny * docs(api-reference/cpp-api): reword sections in c host api * chore: bump version and fix changelog --------- Co-authored-by: Robin Voetter Co-authored-by: Anton Gorenko Co-authored-by: Borys Petrov Co-authored-by: Mátyás Aradi Co-authored-by: Jeffrey Novotny --- .gitlab-ci.yml | 113 ++++++++++++++++++++++++++- CHANGELOG.md | 11 +-- CMakeLists.txt | 2 +- benchmark/custom_csv_formater.hpp | 6 +- docs/.gitignore | 1 + docs/api-reference/cpp-api.rst | 124 ++++++++++++++++++++++++++++++ docs/sphinx/requirements.txt | 15 ++++ library/CMakeLists.txt | 17 ---- 8 files changed, 261 insertions(+), 28 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b96c07e9d..93e3a4fc8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -34,6 +34,7 @@ include: - /deps-rocm.yaml - /deps-nvcc.yaml - /deps-windows.yaml + - /deps-compiler-acceleration.yaml - /gpus-rocm.yaml - /gpus-nvcc.yaml - /rules.yaml @@ -68,9 
+69,11 @@ copyright-date: extends: - .deps:rocm - .deps:cmake-minimum + - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] + - !reference [".deps:compiler-acceleration", before_script] .rocm:cmake-latest: variables: @@ -78,9 +81,11 @@ copyright-date: extends: - .deps:rocm - .deps:cmake-latest + - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-latest", before_script] + - !reference [".deps:compiler-acceleration", before_script] .rocm-hipcc:cmake-minimum: variables: @@ -88,9 +93,11 @@ copyright-date: extends: - .deps:rocm - .deps:cmake-minimum + - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] + - !reference [".deps:compiler-acceleration", before_script] .nvcc:cmake-minimum: variables: @@ -98,9 +105,11 @@ copyright-date: extends: - .deps:nvcc - .deps:cmake-minimum + - .deps:compiler-acceleration before_script: - !reference [".deps:nvcc", before_script] - !reference [".deps:cmake-minimum", before_script] + - !reference [".deps:compiler-acceleration", before_script] .nvcc:cmake-latest: variables: @@ -108,9 +117,11 @@ copyright-date: extends: - .deps:nvcc - .deps:cmake-latest + - .deps:compiler-acceleration before_script: - !reference [".deps:nvcc", before_script] - !reference [".deps:cmake-latest", before_script] + - !reference [".deps:compiler-acceleration", before_script] .nvcc-clang:cmake-minimum: variables: @@ -118,9 +129,11 @@ copyright-date: extends: - .deps:nvcc - .deps:cmake-latest + - .deps:compiler-acceleration before_script: - !reference [".deps:nvcc", before_script] - !reference [".deps:cmake-latest", before_script] + - !reference [".deps:compiler-acceleration", before_script] .rocm:build: variables: @@ -148,8 +161,10 @@ copyright-date: -D BUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -D AMDGPU_TARGETS=${GPU_TARGETS} -D 
DISABLE_WERROR=OFF + -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c + -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx - cmake --build $CI_PROJECT_DIR/build - - cmake --build $CI_PROJECT_DIR/build --target package + - if [[ "${BUILD_SHARED_LIBS}" = "ON" ]]; then cmake --build $CI_PROJECT_DIR/build --target package; fi .nvcc:build: stage: build @@ -171,8 +186,11 @@ copyright-date: -D BUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF -D BUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -D NVGPU_TARGETS=${GPU_TARGETS} + -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c + -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx + -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda - cmake --build $CI_PROJECT_DIR/build - - cmake --build $CI_PROJECT_DIR/build --target package + - if [[ "${BUILD_SHARED_LIBS}" = "ON" ]]; then cmake --build $CI_PROJECT_DIR/build --target package; fi # Build output is too big to upload in the case of the static library builds .save-artifacts: @@ -304,6 +322,8 @@ benchmark:benchmark-tuning: -D BENCHMARK_TUNING_MIN_GRID_SIZE=${BENCHMARK_TUNING_MIN_GRID_SIZE} -D BENCHMARK_TUNING_THREAD_OPTIONS="${BENCHMARK_TUNING_THREAD_OPTIONS}" -D BENCHMARK_TUNING_BLOCK_OPTIONS="${BENCHMARK_TUNING_BLOCK_OPTIONS}" + -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c + -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx - cmake --build $CI_PROJECT_DIR/build --target benchmark_rocrand_tuning - $CI_PROJECT_DIR/build/benchmark/tuning/benchmark_rocrand_tuning --benchmark_out_format=json --benchmark_out=$CI_PROJECT_DIR/build/rocrand_config_tuning_${GPU_TARGET}_${CI_JOB_ID}.json artifacts: @@ -618,3 +638,92 @@ test:windows: - cmake -E copy "$CI_PROJECT_DIR/build/install/bin/rocRAND.dll" "$CI_PROJECT_DIR/build_install_test" *>&1 # Run package test - ctest --test-dir "$CI_PROJECT_DIR/build_install_test" -C $CMAKE_BUILD_TYPE --output-on-failure *>&1 + +.statistical-test: + stage: test + needs: + - build:rocm-cmake-minimum + extends: + - .rocm:cmake-minimum + variables: + ROCRAND_STAT_TESTS_GIT_BRANCH: "develop_stream" + 
ROCRAND_STAT_TESTS_DIR: ${CI_PROJECT_DIR}/rocrand-statistical-tests + LOGS_DIR: ${CI_PROJECT_DIR}/logs + script: + - $SUDO_CMD apt-get update -qq + - $SUDO_CMD apt-get install -y -qq python3 + - cd $CI_PROJECT_DIR/build + - $SUDO_CMD dpkg -i rocrand_*.deb rocrand-dev*.deb + - cd $CI_PROJECT_DIR + - git clone -b ${ROCRAND_STAT_TESTS_GIT_BRANCH} https://gitlab-ci-token:${CI_JOB_TOKEN}@${ROCRAND_STAT_TESTS_GIT_URL} + - cmake + -S ${ROCRAND_STAT_TESTS_DIR} + -B ${ROCRAND_STAT_TESTS_DIR}/build + -G Ninja + -D CMAKE_CXX_COMPILER=${COMPILER} + -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c + -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx + - cmake --build ${ROCRAND_STAT_TESTS_DIR}/build + - mkdir ${LOGS_DIR} + - cd ${ROCRAND_STAT_TESTS_DIR}/build + - nproc + artifacts: + paths: + - ${LOGS_DIR}/* + expire_in: never + +# TestU01 SmallCrush, 10 tests, 15 statistics, takes about 5 seconds +statistical-test:crush-small: + tags: + - rocm + extends: + - .statistical-test + - .rules:manual + script: + - !reference [".statistical-test", script] + - python3 -u run_crush.py --logs_dir ${LOGS_DIR} --battery small |& tee ${LOGS_DIR}/summary-crush-small.log + +# TestU01 Crush, 96 tests, 144 statistics, takes about 30 minutes +statistical-test:crush-medium: + tags: + - rocm + - big-parallel + extends: + - .statistical-test + - .rules:manual + script: + - !reference [".statistical-test", script] + - python3 -u run_crush.py --logs_dir ${LOGS_DIR} --battery medium |& tee ${LOGS_DIR}/summary-crush-medium.log + +# TestU01 BigCrush, 116 tests, 160 statistics, takes about 3 hours +statistical-test:crush-big: + tags: + - rocm + - big-parallel + extends: + - .statistical-test + - .rules:manual + timeout: 8h + script: + - !reference [".statistical-test", script] + - python3 -u run_crush.py --logs_dir ${LOGS_DIR} --battery big |& tee ${LOGS_DIR}/summary-crush-big.log + +# Dieharder, 114 tests, takes about 25 minutes +statistical-test:dieharder: + extends: + - .statistical-test + - .rules:manual + 
script: + - !reference [".statistical-test", script] + - $SUDO_CMD apt-get install -y -qq dieharder + - python3 -u run_dieharder.py --logs_dir ${LOGS_DIR} |& tee ${LOGS_DIR}/summary-dieharder.log + +# Distribution tests: Pearson, Anderson-Darling, Kolmogorov-Smirnov +statistical-test:distributions: + extends: + - .statistical-test + - .rules:test + - .gpus:rocm + script: + - !reference [".statistical-test", script] + - ./test_distributions_generate --engine all --dis all --runs 100 |& tee ${LOGS_DIR}/summary-distributions.log diff --git a/CHANGELOG.md b/CHANGELOG.md index eb014a668..94ead2369 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,10 @@ Documentation for rocRAND is available at [https://rocm.docs.amd.com/projects/rocRAND/en/latest/](https://rocm.docs.amd.com/projects/rocRAND/en/latest/) - -## (Unreleased) rocRAND 3.2.0 for ROCm 6.4 +## (Unreleased) rocRAND 3.3.0 for ROCm 6.4 ### Added + * Added extended tests to `rtest.py`. These tests are extra tests that did not fit the criteria of smoke and regression tests. These tests will take much longer to run relative to smoke and regression tests. Use `python rtest.py [--emulation|-e|--test|-t]=extended` to run these tests. * Added regression tests to `rtest.py`. These tests recreate scenarios that have caused hardware problems in past emulation environments. Use `python rtest.py [--emulation|-e|--test|-t]=regression` to run these tests. * Added smoke test options, which runs a subset of the unit tests and ensures that less than 2gb of VRAM will be used. Use `python rtest.py [--emulation|-e|--test|-t]=smoke` to run these tests. @@ -15,6 +15,7 @@ Documentation for rocRAND is available at ### Changed * `--test|-t` is no longer a required flag for `rtest.py`. Instead, the user can use either `--emulation|-e` or `--test|-t`, but not both. +* Removed TBB dependency for multi-core processing of host-side generation. 
## rocRAND 3.2.0 for ROCm 6.3.0 @@ -34,7 +35,7 @@ Documentation for rocRAND is available at ## Fixes -* Fixed " unknown extension ?>" issue in scripts/config-tuning/select_best_config.py +* Fixed " unknown extension ?>" issue in scripts/config-tuning/select_best_config.py when using python version thats older than 3.11 * Fixed low random sequence quality of `ROCRAND_RNG_PSEUDO_THREEFRY2_64_20` and `ROCRAND_RNG_PSEUDO_THREEFRY4_64_20`. @@ -60,7 +61,7 @@ Documentation for rocRAND is available at * If TBB is not found when configuring rocRAND, the configuration is still successful, and the host generators are executed on a single CPU thread. * Added the option to create a host generator to the Python wrapper * Added the option to create a host generator to the Fortran wrapper -* Added dynamic ordering. This ordering is free to rearrange the produced numbers, +* Added dynamic ordering. This ordering is free to rearrange the produced numbers, which can be specific to devices and distributions. It is implemented for: * XORWOW, MRG32K3A, MTGP32, Philox 4x32-10, MRG31K3P, LFSR113, and ThreeFry * For the NVIDIA platform compilation using clang as the host compiler is now supported. @@ -79,7 +80,7 @@ Documentation for rocRAND is available at ### Changes -* For device-side generators, you can now wrap calls to rocrand_generate_* inside of a hipGraph. There are a few +* For device-side generators, you can now wrap calls to rocrand_generate_* inside of a hipGraph. There are a few things to be aware of: - Generator creation (rocrand_create_generator), initialization (rocrand_initialize_generator), and destruction (rocrand_destroy_generator) must still happen outside the hipGraph. - After the generator is created, you may call API functions to set its seed, offset, and order. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f7344f99..b667a43ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,7 +156,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32) endif() # Set version variables -rocm_setup_version( VERSION "3.2.0" ) +rocm_setup_version( VERSION "3.3.0" ) set ( rocrand_VERSION ${rocRAND_VERSION} ) # Old-style version number used within the library's API. rocrand_get_version should be modified. math(EXPR rocrand_VERSION_NUMBER "${rocRAND_VERSION_MAJOR} * 100000 + ${rocRAND_VERSION_MINOR} * 100 + ${rocRAND_VERSION_PATCH}") diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp index 5f89053c9..75d05fa6d 100644 --- a/benchmark/custom_csv_formater.hpp +++ b/benchmark/custom_csv_formater.hpp @@ -86,13 +86,13 @@ class customCSVReporter : public BenchmarkReporter "error_message"}; }; -bool customCSVReporter::ReportContext(const Context& context) +inline bool customCSVReporter::ReportContext(const Context& context) { PrintBasicContext(&GetErrorStream(), context); return true; } -void customCSVReporter::ReportRuns(const std::vector& reports) +inline void customCSVReporter::ReportRuns(const std::vector& reports) { std::ostream& Out = GetOutputStream(); @@ -149,7 +149,7 @@ void customCSVReporter::ReportRuns(const std::vector& reports) } } -void customCSVReporter::PrintRunData(const Run& run) +inline void customCSVReporter::PrintRunData(const Run& run) { std::ostream& Out = GetOutputStream(); std::ostream& Err = GetErrorStream(); diff --git a/docs/.gitignore b/docs/.gitignore index d9fe2f7ba..5d6668423 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -4,3 +4,4 @@ /doxygen/xml/ /doxygen/*.tag /sphinx/_toc.yml +fortran-api-reference.md diff --git a/docs/api-reference/cpp-api.rst b/docs/api-reference/cpp-api.rst index 9474b6420..6db52ae88 100644 --- a/docs/api-reference/cpp-api.rst +++ b/docs/api-reference/cpp-api.rst @@ -17,12 +17,136 @@ To search an API, refer to the API :ref:`genindex`. 
Device functions ================ + +To use the device API, include the file ``rocrand_kernel.h`` in files that define kernels that use rocRAND device functions. The typical usage of device functions consists of the following operations in the device kernel definition: + +1. Create a new generator state object of the desired generator type. + +2. Initialize the generator state parameters using ``rocrand_init``. + +3. Generate random numbers by calling the generation function on the generator state. + +4. Use the results. + +Since the rocRAND device functions are invoked from inside the user kernel, the generated numbers can be used right away in the kernel without the need to copy them to the host memory. + +The following example generates random numbers using the XORWOW generator. + +.. code-block:: cpp + + #include + #include + + __global__ + void test() + { + uint tid = blockDim.x * blockIdx.x + threadIdx.x; + rocrand_state_xorwow state; + rocrand_init(123, tid, 0, &state); + + for(int i = 0; i < 3; ++i) + { + const auto value = rocrand(&state); + printf("thread %d, index %u: %u\n", tid, i, value); + } + } + + int main() + { + test<<>>(); + hipDeviceSynchronize(); + } + .. doxygengroup:: rocranddevice C host API ========== + +The C host API allows encapsulation of the internal generator state. Random numbers may be produced either on the host or device, depending on the created generator object. The typical sequence of operations for device generation consists of the following steps: + +1. Allocate memory on the device with ``hipMalloc``. + +2. Create a new generator of the desired type with ``rocrand_create_generator``. + +3. Set the generator options; for example, use ``rocrand_set_seed`` to set the seed. + +4. Generate random numbers with ``rocrand_generate`` or another generation function. + +5. Use the results. + +6. Clean up with ``rocrand_destroy_generator`` and ``hipFree``. 
+ +To generate random numbers on the host, the memory allocation in step one should be made using a host memory allocation call. In step two, call ``rocrand_create_generator_host`` instead. In the last step, destroy the generator with ``rocrand_destroy_generator`` and release the host memory with the matching host deallocation call. All other calls work identically whether you are generating random numbers on the device or on the host CPU. + +In the example below, the C host API is used to generate 10 random floats using GPU capabilities. + +.. code-block:: c + + #include + #include + #include + + int main() + { + size_t n = 10; + + rocrand_generator gen; + float * d_rand, *h_rand; + + h_rand = (float*)malloc(sizeof(float) * n); + hipMalloc((void**)&d_rand, n * sizeof(float)); + + rocrand_create_generator(&gen, ROCRAND_RNG_PSEUDO_DEFAULT); + rocrand_set_seed(gen, 123); + rocrand_generate_uniform(gen, d_rand, n); + + hipMemcpy(h_rand, d_rand, n * sizeof(float), hipMemcpyDeviceToHost); + + for(int i = 0; i < n; i++) + { + printf("%f\n", h_rand[i]); + } + + rocrand_destroy_generator(gen); + hipFree(d_rand); + + return 0; + } + .. doxygengroup:: rocrandhost C++ host API wrapper ==================== + +The C++ host API wrapper provides resource management and an object-oriented interface for random number generation facilities. + +In the example below, the C++ host API wrapper is used to produce a random number using the default generation parameters. + +.. code-block:: cpp + + #include + #include + + #include + + int main() + { + float* d_rand; + float h_rand; + hipMalloc((void**)&d_rand, sizeof(float)); + + rocrand_cpp::xorwow gen; + rocrand_cpp::normal_distribution<> dist; + + dist(gen, d_rand, 1); + + hipMemcpy(&h_rand, d_rand, sizeof(float), hipMemcpyDeviceToHost); + + std::cout << h_rand << std::endl; + + hipFree(d_rand); + + return 0; + } + .. 
doxygengroup:: rocrandhostcpp diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index d5266bba3..38a36a8af 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -36,6 +36,8 @@ docutils==0.21.2 # myst-parser # pydata-sphinx-theme # sphinx +doxysphinx==3.3.7 + # via rocm-docs-core fastjsonschema==2.19.1 # via rocm-docs-core gitdb==4.0.11 @@ -50,6 +52,10 @@ jinja2==3.1.4 # via # myst-parser # sphinx +libsass==0.22.0 + # via doxysphinx +lxml==4.9.4 + # via doxysphinx markdown-it-py==3.0.0 # via # mdit-py-plugins @@ -60,6 +66,8 @@ mdit-py-plugins==0.4.1 # via myst-parser mdurl==0.1.2 # via markdown-it-py +mpire==2.10.2 + # via doxysphinx myst-parser==3.0.1 # via rocm-docs-core numpy==2.1.3 @@ -79,12 +87,17 @@ pygithub==2.3.0 pygments==2.18.0 # via # accessible-pygments + # mpire # pydata-sphinx-theme # sphinx +pyjson5==1.6.6 + # via doxysphinx pyjwt[crypto]==2.8.0 # via pygithub pynacl==1.5.0 # via pygithub +pyparsing==3.1.2 + # via doxysphinx pyyaml==6.0.1 # via # myst-parser @@ -137,6 +150,8 @@ sphinxcontrib-serializinghtml==1.1.10 # via sphinx tomli==2.0.1 # via sphinx +tqdm==4.66.5 + # via mpire typing-extensions==4.12.2 # via # pydata-sphinx-theme diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index fba119333..ff67232cb 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -123,23 +123,6 @@ else() target_link_libraries(rocrand PUBLIC hip::host) endif() -option(ROCRAND_USE_TBB "Use TBB for host-side generators if available." ON) -if(NOT WIN32 AND ROCRAND_USE_TBB) - find_package(TBB QUIET) - if(NOT TARGET TBB::tbb) - message(WARNING "TBB is not found. 
Building without parallel STL support") - else() - target_link_libraries(rocrand PRIVATE TBB::tbb) - rocm_package_add_deb_dependencies(DEPENDS "libtbb-dev") - set(CPACK_DEBIAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}" PARENT_SCOPE) - rocm_package_add_rpm_dependencies(DEPENDS "(tbb-devel or tbb)") - set(CPACK_RPM_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}" PARENT_SCOPE) - - # Older libstdc++ headers require TBB to be installed to be able to #include - target_compile_definitions(rocrand PRIVATE ROCRAND_PARALLEL_STL) - endif() -endif() - rocm_set_soversion(rocrand ${rocrand_SOVERSION}) set_target_properties(rocrand PROPERTIES