From 7c80053fb509253ab1d75cd97c434b2afc2b4b86 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Sun, 13 May 2018 14:37:20 -0700
Subject: [PATCH] example of linking RAJA with CUDA

---
 raja-cuda-build/main.cpp          | 200 ++++++++++++++++++++++++++++++
 raja-cuda-build/makefile          |  51 ++++++++
 raja-cuda-build/memoryManager.hpp |  56 +++++++++
 3 files changed, 307 insertions(+)
 create mode 100644 raja-cuda-build/main.cpp
 create mode 100644 raja-cuda-build/makefile
 create mode 100644 raja-cuda-build/memoryManager.hpp

diff --git a/raja-cuda-build/main.cpp b/raja-cuda-build/main.cpp
new file mode 100644
index 0000000..f5f4d78
--- /dev/null
+++ b/raja-cuda-build/main.cpp
@@ -0,0 +1,200 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-18, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+#include "memoryManager.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+/*
+ *  Vector Addition Example
+ *
+ *  Computes c = a + b, where a, b, c are vectors of ints.
+ *  It illustrates similarities between a C-style for-loop and a RAJA
+ *  forall loop.
+ *
+ *  RAJA features shown:
+ *    - `forall` loop iteration template method
+ *    - Index range segment
+ *    - Execution policies
+ *
+ *  If CUDA is enabled, CUDA unified memory is used.
+ */ + +/* + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +const int CUDA_BLOCK_SIZE = 256; +#endif + +// +// Functions for checking and printing results +// +void checkResult(int* res, int len); +void printResult(int* res, int len); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA vector addition example...\n"; + +// +// Define vector length +// + const int N = 1000000; + +// +// Allocate and initialize vector data +// + int *a = memoryManager::allocate(N); + int *b = memoryManager::allocate(N); + int *c = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + a[i] = -i; + b[i] = i; + } + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version of vector addition...\n"; + + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// +// RAJA::seq_exec policy enforces strictly sequential execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential vector addition...\n"; + + RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// +// RAJA::simd_exec policy should force the compiler to generate SIMD +// vectorization optimizations.... 
+//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA SIMD vector addition...\n"; + + RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// +// RAJA::loop_exec policy means that the compiler is allowed to generate +// optimizations (e.g., SIMD) if it thinks it is safe to do so... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA loop-exec vector addition...\n"; + + RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP vector addition...\n"; + + RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); +//printResult(c, N); +#endif + + +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA CUDA vector addition...\n"; + + RAJA::forall>(RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); +//printResult(c, N); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + memoryManager::deallocate(a); + memoryManager::deallocate(b); + memoryManager::deallocate(c); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. 
+//
+void checkResult(int* res, int len)
+{
+  bool correct = true;
+  for (int i = 0; i < len; i++) {
+    if ( res[i] != 0 ) { correct = false; }
+  }
+  if ( correct ) {
+    std::cout << "\n\t result -- PASS\n";
+  } else {
+    std::cout << "\n\t result -- FAIL\n";
+  }
+}
+
+//
+// Function to print result.
+//
+void printResult(int* res, int len)
+{
+  std::cout << std::endl;
+  for (int i = 0; i < len; i++) {
+    std::cout << "result[" << i << "] = " << res[i] << std::endl;
+  }
+  std::cout << std::endl;
+}
+
diff --git a/raja-cuda-build/makefile b/raja-cuda-build/makefile
new file mode 100644
index 0000000..f7951ec
--- /dev/null
+++ b/raja-cuda-build/makefile
@@ -0,0 +1,51 @@
+#-----[Build-type]------
+Build-type = CUDA
+#Build-type = CPU
+
+
+#-----[RAJA and CUDA directories]----
+RAJA_DIR ?= /home/arturo/git-repo/RAJA/develop/build
+CUDA_DIR ?= /usr/local/cuda-9.0
+
+rajaInc = -I$(RAJA_DIR)/include
+rajaLib = $(RAJA_DIR)/lib/libRAJA.a
+cudaLib = -Wl,-rpath -Wl,$(CUDA_DIR)/lib64 -L$(CUDA_DIR)/lib64 -lcuda -lcudart -lcudadevrt -lnvToolsExt
+#===================================
+
+#---[Host compiler]-----
+host-compiler = g++-6
+host-compilerFlags = '-O3 -g -std=c++11 -m64 -fopenmp'
+compilerFlags = -O3 -g -std=c++11 -m64 -fopenmp
+paths = -I ./$(iPath)
+paths += $(rajaInc)
+linker = $(host-compiler)
+#======================
+
+#----[device compiler]----
+device-compiler=nvcc
+device-flags = -g -std=c++11 -Xptxas=-v -lineinfo --expt-extended-lambda --restrict
+device-flags += -ccbin=$(linker) -Xcompiler $(host-compilerFlags) -x=cu -arch=sm_50
+#======================
+
+#----[Cuda - Compilation]---------
+ifeq ($(Build-type),CUDA)
+main: main.cpp
+	@echo Compiling for CUDA - start
+	$(device-compiler) $(device-flags) $(paths) -g -c -o main.o main.cpp
+	$(linker) -o main main.o $(cudaLib) -fopenmp $(rajaLib)
+	@echo Compiling for CUDA - end
+else
+#----[CPU - Compilation]---------
+main: main.cpp
+	@echo Compiling for CPU - start
+	$(host-compiler) $(compilerFlags) $(paths) -g -o main main.cpp
+	@echo Compiling for CPU - end
+endif
+#======================
+
+
+
+#-----[Clean up]-------
+clean:
+	rm -f main
+	rm -rf main main.o
diff --git a/raja-cuda-build/memoryManager.hpp b/raja-cuda-build/memoryManager.hpp
new file mode 100644
index 0000000..efb211d
--- /dev/null
+++ b/raja-cuda-build/memoryManager.hpp
@@ -0,0 +1,56 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-18, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#ifndef EXAMPLES_MEMORYMANAGER_HPP
+#define EXAMPLES_MEMORYMANAGER_HPP
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/util/defines.hpp"
+
+/*
+  As RAJA does not manage memory we include a general purpose memory
+  manager which may be used to perform c++ style allocation/deallocation
+  or allocate/deallocate CUDA unified memory. The type of memory allocated
+  is dependent on how RAJA was configured.
+*/
+namespace memoryManager{
+
+  template <typename T>
+  T *allocate(RAJA::Index_type size)
+  {
+    T *ptr;
+#if defined(RAJA_ENABLE_CUDA)
+    cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal);
+#else
+    ptr = new T[size];
+#endif
+    return ptr;
+  }
+
+  template <typename T>
+  void deallocate(T *&ptr)
+  {
+    if (ptr) {
+#if defined(RAJA_ENABLE_CUDA)
+      cudaFree(ptr);
+#else
+      delete[] ptr;
+#endif
+      ptr = nullptr;
+    }
+  }
+
+};
+#endif