From 84a41d0d3ed850c0cbdd13cd65d883b52dafb3df Mon Sep 17 00:00:00 2001 From: Yan Xu Date: Fri, 18 Sep 2020 12:46:46 +0800 Subject: [PATCH] Port CUDAStream to Go (#332) * port CUDA apis * update * update * fix ut * update * update * update * more unittest * update * update * update Co-authored-by: Yi Wang --- cgotorch/Makefile | 3 +- cgotorch/build.sh | 10 ++++-- cgotorch/cgotorch.h | 16 +++++++-- cgotorch/cuda.cc | 88 +++++++++++++++++++++++++++++++++++++++++++++ cgotorch/device.cc | 4 --- cgotorch/tensor.cc | 14 ++++++++ cuda.go | 44 +++++++++++++++++++++++ cuda_test.go | 55 ++++++++++++++++++++++++++++ tensor.go | 12 +++++++ tensor_test.go | 21 +++++++++++ 10 files changed, 256 insertions(+), 11 deletions(-) create mode 100644 cgotorch/cuda.cc create mode 100644 cuda.go create mode 100644 cuda_test.go diff --git a/cgotorch/Makefile b/cgotorch/Makefile index 82b2dc44..c3bdc21c 100644 --- a/cgotorch/Makefile +++ b/cgotorch/Makefile @@ -18,7 +18,8 @@ libcgotorch.so : $(srcs) $(hdrs) ${LIBTORCH_DIR} -Wl,-rpath,libtorch/lib \ -Wl,-${LOAD} libtorch/lib/libc10.${LIB_SUFFIX} \ -lc10 -ltorch -ltorch_cpu \ - -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI} + -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI} \ + ${CUDA_FLAGS} clean: rm -rf *.so diff --git a/cgotorch/build.sh b/cgotorch/build.sh index b83d8ba0..d560a7fe 100755 --- a/cgotorch/build.sh +++ b/cgotorch/build.sh @@ -12,6 +12,7 @@ GLIBCXX_USE_CXX11_ABI="1" LOAD="force_load" LIB_SUFFIX="so" INSTALL_NAME="" +CUDA_FLAGS="" function build_linux_no_cuda() { CXX="clang++" @@ -26,7 +27,7 @@ function build_linux_no_cuda() { if [[ "$OS" == "linux" ]]; then if [[ "$ARCH" =~ arm* ]]; then echo "Building for Raspbian ..."; - CXX="g++" + CXX="g++" LIBTORCH_DIR="rpi/libtorch" if [[ ! -d "$DIR/$LIBTORCH_DIR" ]]; then curl -LsO 'https://github.com/ljk53/pytorch-rpi/raw/master/libtorch-rpi-cxx11-abi-shared-1.6.0.zip'; @@ -34,8 +35,9 @@ if [[ "$OS" == "linux" ]]; then fi elif $(whereis cuda | cut -f 2 -d ' ')/bin/nvcc --version > /dev/null; then CXX="clang++" - NVCC=$(whereis cuda | cut -f 2 -d ' ')/bin/nvcc + NVCC=$(whereis cuda | cut -f 2 -d ' ')/bin/nvcc CUDA_VERSION=$("$NVCC" --version | grep release | grep -Eo "[0-9]+.[0-9]+" | head -1) + CUDA_FLAGS="$CUDA_FLAGS -DWITH_CUDA -I /usr/local/cuda/include" if [[ "$CUDA_VERSION" == "10.1" ]]; then echo "Building for Linux with CUDA 10.1"; LIBTORCH_DIR="linux-cuda101/libtorch" @@ -77,6 +79,8 @@ make CXX="$CXX" \ INSTALL_NAME="$INSTALL_NAME" \ LIBTORCH_DIR="$LIBTORCH_DIR" \ GLIBCXX_USE_CXX11_ABI="$GLIBCXX_USE_CXX11_ABI" \ - LOAD="$LOAD" + LOAD="$LOAD" \ + CUDA_FLAGS="$CUDA_FLAGS" \ + -f Makefile; popd diff --git a/cgotorch/cgotorch.h b/cgotorch/cgotorch.h index 40d69b1b..729dd9d8 100644 --- a/cgotorch/cgotorch.h +++ b/cgotorch/cgotorch.h @@ -22,6 +22,7 @@ typedef void *Normalize; typedef void *Device; typedef void *ByteBuffer; #endif +typedef void *CUDAStream; //////////////////////////////////////////////////////////////////////////////// // Helper functions @@ -226,14 +227,23 @@ void Optimizer_Close(Optimizer opt); //////////////////////////////////////////////////////////////////////////////// const char *Torch_Device(const char *device_type, Device *device); -bool IsCUDAAvailable(); -bool IsCUDNNAvailable(); - const char *Tensor_To(Tensor input, Device device, int8_t dtype, Tensor *output); const char *Tensor_CastTo(Tensor input, int8_t dtype, Tensor *output); const char *Tensor_CopyTo(Tensor input, Device device, Tensor *output); const char *Tensor_PinMemory(Tensor input, Tensor *output); +const char *Tensor_CUDA(Tensor input, Device device, int8_t non_blocking, + Tensor *output); +//////////////////////////////////////////////////////////////////////////////// +// Nvidia CUDA +//////////////////////////////////////////////////////////////////////////////// +bool IsCUDAAvailable(); +bool IsCUDNNAvailable(); +const char *CUDA_GetCurrentCUDAStream(CUDAStream *stream, Device *device); +const char *CUDA_SetCurrentCUDAStream(CUDAStream stream); +const char *CUDA_GetCUDAStreamFromPool(CUDAStream *stream, Device *device); +const char *CUDA_Synchronize(CUDAStream stream); +const char *CUDA_Query(CUDAStream stream, int8_t *result); #ifdef __cplusplus } diff --git a/cgotorch/cuda.cc b/cgotorch/cuda.cc new file mode 100644 index 00000000..ed3c1ebd --- /dev/null +++ b/cgotorch/cuda.cc @@ -0,0 +1,88 @@ +// Copyright 2020, GoTorch Authors +#include +#include +#include +#include + +#include "torch/script.h" +#include "torch/torch.h" + +#ifdef WITH_CUDA +#include "c10/cuda/CUDAStream.h" +#endif + +// FIXME(shendiaomo): including cgotorch.h before torch/torch.h will fail +#include "cgotorch/cgotorch.h" + +bool IsCUDAAvailable() { return torch::cuda::is_available(); } + +bool IsCUDNNAvailable() { return torch::cuda::cudnn_is_available(); } + +const char *CUDA_GetCurrentCUDAStream(CUDAStream *stream, Device *device) { +#ifdef WITH_CUDA + try { + *stream = static_cast(new at::cuda::CUDAStream( + at::cuda::getCurrentCUDAStream((*device)->index()))); + return nullptr; + } catch (const std::exception &e) { + return exception_str(e.what()); + } +#else + return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so"); +#endif +} + +const char *CUDA_GetCUDAStreamFromPool(CUDAStream *stream, Device *device) { +#ifdef WITH_CUDA + try { + *stream = static_cast( + new at::cuda::CUDAStream(at::cuda::getStreamFromPool( + false /**isHighPriority**/, (*device)->index()))); + return nullptr; + } catch (const std::exception &e) { + return exception_str(e.what()); + } +#else + return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so"); +#endif +} + +const char *CUDA_SetCurrentCUDAStream(CUDAStream stream) { +#ifdef WITH_CUDA + try { + at::cuda::setCurrentCUDAStream( + *static_cast(stream)); + return nullptr; + } catch (const std::exception &e) { + return exception_str(e.what()); + } +#else + return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so"); +#endif +} + +const char *CUDA_Synchronize(CUDAStream stream) { +#ifdef WITH_CUDA + try { + static_cast(stream)->synchronize(); + return nullptr; + } catch (const std::exception &e) { + return exception_str(e.what()); + } +#else + return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so"); +#endif +} + +const char *CUDA_Query(CUDAStream stream, int8_t *result) { +#ifdef WITH_CUDA + try { + *result = static_cast(stream)->query() ? 1 : 0; + return nullptr; + } catch (const std::exception &e) { + return exception_str(e.what()); + } +#else + return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so"); +#endif +} diff --git a/cgotorch/device.cc b/cgotorch/device.cc index 709aad13..52412d79 100644 --- a/cgotorch/device.cc +++ b/cgotorch/device.cc @@ -24,7 +24,3 @@ const char *Torch_Device(const char *device_type, Device *device) { } return nullptr; } - -bool IsCUDAAvailable() { return torch::cuda::is_available(); } - -bool IsCUDNNAvailable() { return torch::cuda::cudnn_is_available(); } diff --git a/cgotorch/tensor.cc b/cgotorch/tensor.cc index e86295bc..ed66d5cc 100644 --- a/cgotorch/tensor.cc +++ b/cgotorch/tensor.cc @@ -128,6 +128,20 @@ const char *Tensor_To(Tensor input, Device device, int8_t dtype, } } +const char *Tensor_CUDA(Tensor input, Device device, int8_t non_blocking, + Tensor *output) { + try { + if (!device->is_cuda()) { + return exception_str("the device should be CUDA device"); + } + auto result = input->to(*device, static_cast(non_blocking)); + *output = new at::Tensor(result); + return nullptr; + } catch (const std::exception &e) { + return exception_str(e.what()); + } +} + const char *Tensor_CastTo(Tensor input, int8_t dtype, Tensor *output) { try { auto result = input->to(static_cast(dtype)); diff --git a/cuda.go b/cuda.go new file mode 100644 index 00000000..cc85cfa2 --- /dev/null +++ b/cuda.go @@ -0,0 +1,44 @@ +package gotorch + +// #cgo CFLAGS: -I ${SRCDIR}/cgotorch -I ${SRCDIR}/cgotorch/libtorch/include +// #cgo LDFLAGS: -L ${SRCDIR}/cgotorch -Wl,-rpath ${SRCDIR}/cgotorch -lcgotorch +// #cgo LDFLAGS: -L ${SRCDIR}/cgotorch/libtorch/lib -Wl,-rpath ${SRCDIR}/cgotorch/libtorch/lib -lc10 -ltorch -ltorch_cpu +// #include "cgotorch.h" +import "C" +import "unsafe" + +// CUDAStream struct wrapped Nvidia CUDA Stream +type CUDAStream struct { + P C.CUDAStream +} + +// Query returns true if all tasks completed on this CUDA stream +func (s CUDAStream) Query() bool { + var b int8 + MustNil(unsafe.Pointer(C.CUDA_Query(s.P, (*C.int8_t)(&b)))) + return b != 0 +} + +// Synchronize wait until all tasks completed on this CUDA stream +func (s CUDAStream) Synchronize() { + MustNil(unsafe.Pointer(C.CUDA_Synchronize(s.P))) +} + +// GetCurrentCUDAStream returns the current stream on device +func GetCurrentCUDAStream(device Device) CUDAStream { + var stream C.CUDAStream + MustNil(unsafe.Pointer(C.CUDA_GetCurrentCUDAStream(&stream, &device.T))) + return CUDAStream{stream} +} + +// SetCurrentCUDAStream set stream as the current CUDA stream +func SetCurrentCUDAStream(stream CUDAStream) { + MustNil(unsafe.Pointer(C.CUDA_SetCurrentCUDAStream(stream.P))) +} + +// NewCUDAStream returns a new CUDA stream from the pool +func NewCUDAStream(device Device) CUDAStream { + var stream C.CUDAStream + MustNil(unsafe.Pointer(C.CUDA_GetCUDAStreamFromPool(&stream, &device.T))) + return CUDAStream{stream} +} diff --git a/cuda_test.go b/cuda_test.go new file mode 100644 index 00000000..0176a265 --- /dev/null +++ b/cuda_test.go @@ -0,0 +1,55 @@ +package gotorch_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" + torch "github.com/wangkuiyi/gotorch" +) + +func getDefaultDevice() torch.Device { + var device torch.Device + if torch.IsCUDAAvailable() { + device = torch.NewDevice("cuda") + } else { + device = torch.NewDevice("cpu") + } + return device +} +func TestCUDAStreamPanics(t *testing.T) { + a := assert.New(t) + device := getDefaultDevice() + if torch.IsCUDAAvailable() { + a.NotPanics(func() { + torch.GetCurrentCUDAStream(device) + }) + } else { + a.Panics(func() { + torch.GetCurrentCUDAStream(device) + }) + a.Panics(func() { + torch.NewCUDAStream(device) + }) + } +} + +func TestMultiCUDAStream(t *testing.T) { + if !torch.IsCUDAAvailable() { + t.Skip("skip TestMultiCUDAStream which only run on CUDA device") + } + a := assert.New(t) + device := getDefaultDevice() + currStream := torch.GetCurrentCUDAStream(device) + defer torch.SetCurrentCUDAStream(currStream) + // create a new CUDA stream + stream := torch.NewCUDAStream(device) + // switch to the new CUDA stream + torch.SetCurrentCUDAStream(stream) + // copy Tensor from host to device async + input := torch.RandN([]int64{100, 200}, true).PinMemory() + input.CUDA(device, true /**nonBlocking=true**/) + // wait until all tasks completed + stream.Synchronize() + // make sure all tasks completed + a.True(stream.Query()) +} diff --git a/tensor.go b/tensor.go index 89c4ee05..7d35ff47 100644 --- a/tensor.go +++ b/tensor.go @@ -112,6 +112,18 @@ func (a Tensor) To(device Device, dtype ...int8) Tensor { return Tensor{(*unsafe.Pointer)(&t)} } +// CUDA returns a Tensor on CUDA device +func (a Tensor) CUDA(device Device, nonBlocking bool) Tensor { + var t C.Tensor + n := int8(0) + if nonBlocking { + n = 1 + } + MustNil(unsafe.Pointer(C.Tensor_CUDA(C.Tensor(*a.T), device.T, C.int8_t(n), &t))) + SetTensorFinalizer((*unsafe.Pointer)(&t)) + return Tensor{(*unsafe.Pointer)(&t)} +} + // CastTo cast tensor dtype func (a Tensor) CastTo(dtype int8) Tensor { var t C.Tensor diff --git a/tensor_test.go b/tensor_test.go index 8d128e2f..0a3a8250 100644 --- a/tensor_test.go +++ b/tensor_test.go @@ -59,7 +59,28 @@ func TestCastTo(t *testing.T) { b = a.To(torch.NewDevice("cpu"), torch.Float) assert.Equal(t, torch.Float, b.Dtype()) } +func TestCUDA(t *testing.T) { + a := assert.New(t) + device := getDefaultDevice() + input := torch.NewTensor([][]float32{{1, 2}, {3, 4}}) + if !torch.IsCUDAAvailable() { + // CUDA should panics on CPU device + a.Panics(func() { + input.CUDA(device, false) + }) + a.Panics(func() { + input.CUDA(device, true) + }) + return + } + + b := input.CUDA(device, false) + a.Equal(" 1 2\n 3 4\n[ CUDAFloatType{2,2} ]", b.String()) + c := input.CUDA(device, true) + torch.GetCurrentCUDAStream(device).Synchronize() + a.Equal(" 1 2\n 3 4\n[ CUDAFloatType{2,2} ]", c.String()) +} func TestCopyTo(t *testing.T) { device := torch.NewDevice("cpu") a := torch.NewTensor([]int64{1, 2})