Port CUDAStream to Go (#332)
* port CUDA apis

* update

* update

* fix ut

* update

* update

* update

* more unittest

* update

* update

* update

Co-authored-by: Yi Wang <[email protected]>
Yancey1989 and wangkuiyi authored Sep 18, 2020
1 parent 5a7e4a1 commit 84a41d0
Showing 10 changed files with 256 additions and 11 deletions.
3 changes: 2 additions & 1 deletion cgotorch/Makefile
@@ -18,7 +18,8 @@ libcgotorch.so : $(srcs) $(hdrs) ${LIBTORCH_DIR}
-Wl,-rpath,libtorch/lib \
-Wl,-${LOAD} libtorch/lib/libc10.${LIB_SUFFIX} \
-lc10 -ltorch -ltorch_cpu \
--D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}
+-D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI} \
+${CUDA_FLAGS}

clean:
rm -rf *.so
10 changes: 7 additions & 3 deletions cgotorch/build.sh
@@ -12,6 +12,7 @@ GLIBCXX_USE_CXX11_ABI="1"
LOAD="force_load"
LIB_SUFFIX="so"
INSTALL_NAME=""
CUDA_FLAGS=""

function build_linux_no_cuda() {
CXX="clang++"
@@ -26,16 +27,17 @@ function build_linux_no_cuda() {
if [[ "$OS" == "linux" ]]; then
if [[ "$ARCH" =~ arm* ]]; then
echo "Building for Raspbian ...";
CXX="g++"
CXX="g++"
LIBTORCH_DIR="rpi/libtorch"
if [[ ! -d "$DIR/$LIBTORCH_DIR" ]]; then
curl -LsO 'https://github.com/ljk53/pytorch-rpi/raw/master/libtorch-rpi-cxx11-abi-shared-1.6.0.zip';
unzip -qq -o libtorch-rpi-cxx11-abi-shared-1.6.0.zip -d rpi
fi
elif $(whereis cuda | cut -f 2 -d ' ')/bin/nvcc --version > /dev/null; then
CXX="clang++"
NVCC=$(whereis cuda | cut -f 2 -d ' ')/bin/nvcc
CUDA_VERSION=$("$NVCC" --version | grep release | grep -Eo "[0-9]+.[0-9]+" | head -1)
CUDA_FLAGS="$CUDA_FLAGS -DWITH_CUDA -I /usr/local/cuda/include"
if [[ "$CUDA_VERSION" == "10.1" ]]; then
echo "Building for Linux with CUDA 10.1";
LIBTORCH_DIR="linux-cuda101/libtorch"
@@ -77,6 +79,8 @@ make CXX="$CXX" \
INSTALL_NAME="$INSTALL_NAME" \
LIBTORCH_DIR="$LIBTORCH_DIR" \
GLIBCXX_USE_CXX11_ABI="$GLIBCXX_USE_CXX11_ABI" \
LOAD="$LOAD"
LOAD="$LOAD" \
CUDA_FLAGS="$CUDA_FLAGS" \
-f Makefile;

popd
16 changes: 13 additions & 3 deletions cgotorch/cgotorch.h
@@ -22,6 +22,7 @@ typedef void *Normalize;
typedef void *Device;
typedef void *ByteBuffer;
#endif
+typedef void *CUDAStream;

////////////////////////////////////////////////////////////////////////////////
// Helper functions
@@ -226,14 +227,23 @@ void Optimizer_Close(Optimizer opt);
////////////////////////////////////////////////////////////////////////////////

const char *Torch_Device(const char *device_type, Device *device);
-bool IsCUDAAvailable();
-bool IsCUDNNAvailable();

const char *Tensor_To(Tensor input, Device device, int8_t dtype,
Tensor *output);
const char *Tensor_CastTo(Tensor input, int8_t dtype, Tensor *output);
const char *Tensor_CopyTo(Tensor input, Device device, Tensor *output);
const char *Tensor_PinMemory(Tensor input, Tensor *output);
+const char *Tensor_CUDA(Tensor input, Device device, int8_t non_blocking,
+Tensor *output);
+////////////////////////////////////////////////////////////////////////////////
+// Nvidia CUDA
+////////////////////////////////////////////////////////////////////////////////
+bool IsCUDAAvailable();
+bool IsCUDNNAvailable();
+const char *CUDA_GetCurrentCUDAStream(CUDAStream *stream, Device *device);
+const char *CUDA_SetCurrentCUDAStream(CUDAStream stream);
+const char *CUDA_GetCUDAStreamFromPool(CUDAStream *stream, Device *device);
+const char *CUDA_Synchronize(CUDAStream stream);
+const char *CUDA_Query(CUDAStream stream, int8_t *result);

#ifdef __cplusplus
}
88 changes: 88 additions & 0 deletions cgotorch/cuda.cc
@@ -0,0 +1,88 @@
// Copyright 2020, GoTorch Authors
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "torch/script.h"
#include "torch/torch.h"

#ifdef WITH_CUDA
#include "c10/cuda/CUDAStream.h"
#endif

// FIXME(shendiaomo): including cgotorch.h before torch/torch.h will fail
#include "cgotorch/cgotorch.h"

bool IsCUDAAvailable() { return torch::cuda::is_available(); }

bool IsCUDNNAvailable() { return torch::cuda::cudnn_is_available(); }

const char *CUDA_GetCurrentCUDAStream(CUDAStream *stream, Device *device) {
#ifdef WITH_CUDA
try {
*stream = static_cast<void *>(new at::cuda::CUDAStream(
at::cuda::getCurrentCUDAStream((*device)->index())));
return nullptr;
} catch (const std::exception &e) {
return exception_str(e.what());
}
#else
return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
#endif
}

const char *CUDA_GetCUDAStreamFromPool(CUDAStream *stream, Device *device) {
#ifdef WITH_CUDA
try {
*stream = static_cast<void *>(
new at::cuda::CUDAStream(at::cuda::getStreamFromPool(
false /*isHighPriority*/, (*device)->index())));
return nullptr;
} catch (const std::exception &e) {
return exception_str(e.what());
}
#else
return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
#endif
}

const char *CUDA_SetCurrentCUDAStream(CUDAStream stream) {
#ifdef WITH_CUDA
try {
at::cuda::setCurrentCUDAStream(
*static_cast<at::cuda::CUDAStream *>(stream));
return nullptr;
} catch (const std::exception &e) {
return exception_str(e.what());
}
#else
return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
#endif
}

const char *CUDA_Synchronize(CUDAStream stream) {
#ifdef WITH_CUDA
try {
static_cast<at::cuda::CUDAStream *>(stream)->synchronize();
return nullptr;
} catch (const std::exception &e) {
return exception_str(e.what());
}
#else
return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
#endif
}

const char *CUDA_Query(CUDAStream stream, int8_t *result) {
#ifdef WITH_CUDA
try {
*result = static_cast<at::cuda::CUDAStream *>(stream)->query() ? 1 : 0;
return nullptr;
} catch (const std::exception &e) {
return exception_str(e.what());
}
#else
return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
#endif
}
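Each wrapper above follows the same convention: it returns nullptr on success and a C string carrying the exception message on failure. On the Go side, MustNil turns a non-nil result into a panic. A minimal sketch of what such a helper could look like, assuming the error string is heap-allocated by exception_str (the real MustNil in gotorch may differ):

package gotorch

// #include <stdlib.h>
import "C"
import "unsafe"

// MustNil panics with the C++ exception message when a cgo wrapper
// returns a non-nil error string; this is a sketch, not the real helper.
func MustNil(err unsafe.Pointer) {
	if err == nil {
		return
	}
	msg := C.GoString((*C.char)(err))
	C.free(err) // assumption: the message was allocated on the C heap
	panic(msg)
}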
4 changes: 0 additions & 4 deletions cgotorch/device.cc
@@ -24,7 +24,3 @@ const char *Torch_Device(const char *device_type, Device *device) {
}
return nullptr;
}

-bool IsCUDAAvailable() { return torch::cuda::is_available(); }
-
-bool IsCUDNNAvailable() { return torch::cuda::cudnn_is_available(); }
14 changes: 14 additions & 0 deletions cgotorch/tensor.cc
@@ -128,6 +128,20 @@ const char *Tensor_To(Tensor input, Device device, int8_t dtype,
}
}

+const char *Tensor_CUDA(Tensor input, Device device, int8_t non_blocking,
+Tensor *output) {
+try {
+if (!device->is_cuda()) {
+return exception_str("the device should be a CUDA device");
+}
+auto result = input->to(*device, static_cast<bool>(non_blocking));
+*output = new at::Tensor(result);
+return nullptr;
+} catch (const std::exception &e) {
+return exception_str(e.what());
+}
+}

const char *Tensor_CastTo(Tensor input, int8_t dtype, Tensor *output) {
try {
auto result = input->to(static_cast<at::ScalarType>(dtype));
44 changes: 44 additions & 0 deletions cuda.go
@@ -0,0 +1,44 @@
package gotorch

// #cgo CFLAGS: -I ${SRCDIR}/cgotorch -I ${SRCDIR}/cgotorch/libtorch/include
// #cgo LDFLAGS: -L ${SRCDIR}/cgotorch -Wl,-rpath ${SRCDIR}/cgotorch -lcgotorch
// #cgo LDFLAGS: -L ${SRCDIR}/cgotorch/libtorch/lib -Wl,-rpath ${SRCDIR}/cgotorch/libtorch/lib -lc10 -ltorch -ltorch_cpu
// #include "cgotorch.h"
import "C"
import "unsafe"

// CUDAStream wraps an Nvidia CUDA stream
type CUDAStream struct {
P C.CUDAStream
}

// Query returns true if all tasks on this CUDA stream have completed
func (s CUDAStream) Query() bool {
var b int8
MustNil(unsafe.Pointer(C.CUDA_Query(s.P, (*C.int8_t)(&b))))
return b != 0
}

// Synchronize waits until all tasks on this CUDA stream have completed
func (s CUDAStream) Synchronize() {
MustNil(unsafe.Pointer(C.CUDA_Synchronize(s.P)))
}

// GetCurrentCUDAStream returns the current stream on the given device
func GetCurrentCUDAStream(device Device) CUDAStream {
var stream C.CUDAStream
MustNil(unsafe.Pointer(C.CUDA_GetCurrentCUDAStream(&stream, &device.T)))
return CUDAStream{stream}
}

// SetCurrentCUDAStream sets the given stream as the current CUDA stream
func SetCurrentCUDAStream(stream CUDAStream) {
MustNil(unsafe.Pointer(C.CUDA_SetCurrentCUDAStream(stream.P)))
}

// NewCUDAStream returns a new CUDA stream from the pool
func NewCUDAStream(device Device) CUDAStream {
var stream C.CUDAStream
MustNil(unsafe.Pointer(C.CUDA_GetCUDAStreamFromPool(&stream, &device.T)))
return CUDAStream{stream}
}
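Taken together, the new API supports the usual CUDA pattern of enqueueing work on a side stream while the host keeps going, then synchronizing. A minimal usage sketch composed only of functions that this commit introduces or exercises (it assumes a CUDA-capable machine; the stream calls panic otherwise):

package main

import torch "github.com/wangkuiyi/gotorch"

func main() {
	if !torch.IsCUDAAvailable() {
		return // GetCurrentCUDAStream and NewCUDAStream panic without CUDA
	}
	device := torch.NewDevice("cuda")

	// Remember the current stream and restore it on exit.
	prev := torch.GetCurrentCUDAStream(device)
	defer torch.SetCurrentCUDAStream(prev)

	// Enqueue an asynchronous host-to-device copy on a fresh stream.
	stream := torch.NewCUDAStream(device)
	torch.SetCurrentCUDAStream(stream)
	x := torch.RandN([]int64{100, 200}, true).PinMemory()
	y := x.CUDA(device, true) // non-blocking copy on the current stream

	stream.Synchronize()     // block until the copy has finished
	_, _ = y, stream.Query() // Query now reports true: no pending work
}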
55 changes: 55 additions & 0 deletions cuda_test.go
@@ -0,0 +1,55 @@
package gotorch_test

import (
"testing"

"github.com/stretchr/testify/assert"
torch "github.com/wangkuiyi/gotorch"
)

func getDefaultDevice() torch.Device {
var device torch.Device
if torch.IsCUDAAvailable() {
device = torch.NewDevice("cuda")
} else {
device = torch.NewDevice("cpu")
}
return device
}

func TestCUDAStreamPanics(t *testing.T) {
a := assert.New(t)
device := getDefaultDevice()
if torch.IsCUDAAvailable() {
a.NotPanics(func() {
torch.GetCurrentCUDAStream(device)
})
} else {
a.Panics(func() {
torch.GetCurrentCUDAStream(device)
})
a.Panics(func() {
torch.NewCUDAStream(device)
})
}
}

func TestMultiCUDAStream(t *testing.T) {
if !torch.IsCUDAAvailable() {
t.Skip("skip TestMultiCUDAStream which only run on CUDA device")
}
a := assert.New(t)
device := getDefaultDevice()
currStream := torch.GetCurrentCUDAStream(device)
defer torch.SetCurrentCUDAStream(currStream)
// create a new CUDA stream
stream := torch.NewCUDAStream(device)
// switch to the new CUDA stream
torch.SetCurrentCUDAStream(stream)
// copy the Tensor from host to device asynchronously
input := torch.RandN([]int64{100, 200}, true).PinMemory()
input.CUDA(device, true /*nonBlocking*/)
// wait until all tasks on the stream have completed
stream.Synchronize()
// verify that the stream has no pending work left
a.True(stream.Query())
}
12 changes: 12 additions & 0 deletions tensor.go
@@ -112,6 +112,18 @@ func (a Tensor) To(device Device, dtype ...int8) Tensor {
return Tensor{(*unsafe.Pointer)(&t)}
}

+// CUDA returns a new Tensor on the given CUDA device
+func (a Tensor) CUDA(device Device, nonBlocking bool) Tensor {
+var t C.Tensor
+n := int8(0)
+if nonBlocking {
+n = 1
+}
+MustNil(unsafe.Pointer(C.Tensor_CUDA(C.Tensor(*a.T), device.T, C.int8_t(n), &t)))
+SetTensorFinalizer((*unsafe.Pointer)(&t))
+return Tensor{(*unsafe.Pointer)(&t)}
+}

// CastTo casts the tensor to the given dtype
func (a Tensor) CastTo(dtype int8) Tensor {
var t C.Tensor
21 changes: 21 additions & 0 deletions tensor_test.go
@@ -59,7 +59,28 @@ func TestCastTo(t *testing.T) {
b = a.To(torch.NewDevice("cpu"), torch.Float)
assert.Equal(t, torch.Float, b.Dtype())
}
+func TestCUDA(t *testing.T) {
+a := assert.New(t)
+device := getDefaultDevice()
+input := torch.NewTensor([][]float32{{1, 2}, {3, 4}})
+if !torch.IsCUDAAvailable() {
+// CUDA should panic on a CPU device
+a.Panics(func() {
+input.CUDA(device, false)
+})
+a.Panics(func() {
+input.CUDA(device, true)
+})
+return
+}
+
+b := input.CUDA(device, false)
+a.Equal(" 1 2\n 3 4\n[ CUDAFloatType{2,2} ]", b.String())
+
+c := input.CUDA(device, true)
+torch.GetCurrentCUDAStream(device).Synchronize()
+a.Equal(" 1 2\n 3 4\n[ CUDAFloatType{2,2} ]", c.String())
+}
func TestCopyTo(t *testing.T) {
device := torch.NewDevice("cpu")
a := torch.NewTensor([]int64{1, 2})
