diff --git a/cgotorch/Makefile b/cgotorch/Makefile
index 82b2dc44..c3bdc21c 100644
--- a/cgotorch/Makefile
+++ b/cgotorch/Makefile
@@ -18,7 +18,8 @@ libcgotorch.so : $(srcs) $(hdrs) ${LIBTORCH_DIR}
 	-Wl,-rpath,libtorch/lib \
 	-Wl,-${LOAD} libtorch/lib/libc10.${LIB_SUFFIX} \
 	-lc10 -ltorch -ltorch_cpu \
-	-D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}
+	-D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI} \
+	${CUDA_FLAGS}
 
 clean:
 	rm -rf *.so
diff --git a/cgotorch/build.sh b/cgotorch/build.sh
index b83d8ba0..d560a7fe 100755
--- a/cgotorch/build.sh
+++ b/cgotorch/build.sh
@@ -12,6 +12,7 @@ GLIBCXX_USE_CXX11_ABI="1"
 LOAD="force_load"
 LIB_SUFFIX="so"
 INSTALL_NAME=""
+CUDA_FLAGS=""
 
 function build_linux_no_cuda() {
 	CXX="clang++"
@@ -26,7 +27,7 @@ function build_linux_no_cuda() {
 if [[ "$OS" == "linux" ]]; then
     if [[ "$ARCH" =~ arm* ]]; then
         echo "Building for Raspbian ...";
-	CXX="g++"
+        CXX="g++"
         LIBTORCH_DIR="rpi/libtorch"
         if [[ ! -d "$DIR/$LIBTORCH_DIR" ]]; then
             curl -LsO 'https://github.com/ljk53/pytorch-rpi/raw/master/libtorch-rpi-cxx11-abi-shared-1.6.0.zip';
@@ -34,8 +35,9 @@ if [[ "$OS" == "linux" ]]; then
         fi
     elif $(whereis cuda | cut -f 2 -d ' ')/bin/nvcc --version > /dev/null; then
         CXX="clang++"
-	NVCC=$(whereis cuda | cut -f 2 -d ' ')/bin/nvcc
+        NVCC=$(whereis cuda | cut -f 2 -d ' ')/bin/nvcc
         CUDA_VERSION=$("$NVCC" --version | grep release | grep -Eo "[0-9]+.[0-9]+" | head -1)
+        CUDA_FLAGS="$CUDA_FLAGS -DWITH_CUDA -I /usr/local/cuda/include"
         if [[ "$CUDA_VERSION" == "10.1" ]]; then
             echo "Building for Linux with CUDA 10.1";
             LIBTORCH_DIR="linux-cuda101/libtorch"
@@ -77,6 +79,8 @@ make CXX="$CXX" \
      INSTALL_NAME="$INSTALL_NAME" \
      LIBTORCH_DIR="$LIBTORCH_DIR" \
      GLIBCXX_USE_CXX11_ABI="$GLIBCXX_USE_CXX11_ABI" \
-     LOAD="$LOAD"
+     LOAD="$LOAD" \
+     CUDA_FLAGS="$CUDA_FLAGS" \
+     -f Makefile;
 
 popd
diff --git a/cgotorch/cgotorch.h b/cgotorch/cgotorch.h
index 40d69b1b..729dd9d8 100644
--- a/cgotorch/cgotorch.h
+++ b/cgotorch/cgotorch.h
@@ -22,6 +22,7 @@ typedef void *Normalize;
 typedef void *Device;
 typedef void *ByteBuffer;
 #endif
+typedef void *CUDAStream;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Helper functions
@@ -226,14 +227,23 @@ void Optimizer_Close(Optimizer opt);
 ////////////////////////////////////////////////////////////////////////////////
 
 const char *Torch_Device(const char *device_type, Device *device);
-bool IsCUDAAvailable();
-bool IsCUDNNAvailable();
-
 const char *Tensor_To(Tensor input, Device device, int8_t dtype,
                       Tensor *output);
 const char *Tensor_CastTo(Tensor input, int8_t dtype, Tensor *output);
 const char *Tensor_CopyTo(Tensor input, Device device, Tensor *output);
 const char *Tensor_PinMemory(Tensor input, Tensor *output);
+const char *Tensor_CUDA(Tensor input, Device device, int8_t non_blocking,
+                        Tensor *output);
+////////////////////////////////////////////////////////////////////////////////
+// Nvidia CUDA
+////////////////////////////////////////////////////////////////////////////////
+bool IsCUDAAvailable();
+bool IsCUDNNAvailable();
+const char *CUDA_GetCurrentCUDAStream(CUDAStream *stream, Device *device);
+const char *CUDA_SetCurrentCUDAStream(CUDAStream stream);
+const char *CUDA_GetCUDAStreamFromPool(CUDAStream *stream, Device *device);
+const char *CUDA_Synchronize(CUDAStream stream);
+const char *CUDA_Query(CUDAStream stream, int8_t *result);
 
 #ifdef __cplusplus
 }
diff --git a/cgotorch/cuda.cc b/cgotorch/cuda.cc
new file mode 100644
index 00000000..ed3c1ebd
--- /dev/null
+++ b/cgotorch/cuda.cc
@@ -0,0 +1,88 @@
+// Copyright 2020, GoTorch Authors
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "torch/script.h"
+#include "torch/torch.h"
+
+#ifdef WITH_CUDA
+#include "c10/cuda/CUDAStream.h"
+#endif
+
+// FIXME(shendiaomo): including cgotorch.h before torch/torch.h will fail
+#include "cgotorch/cgotorch.h"
+
+bool IsCUDAAvailable() { return torch::cuda::is_available(); }
+
+bool IsCUDNNAvailable() { return torch::cuda::cudnn_is_available(); }
+
+const char *CUDA_GetCurrentCUDAStream(CUDAStream *stream, Device *device) {
+#ifdef WITH_CUDA
+  try {
+    *stream = static_cast<void *>(new at::cuda::CUDAStream(
+        at::cuda::getCurrentCUDAStream((*device)->index())));
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+#else
+  return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
+#endif
+}
+
+const char *CUDA_GetCUDAStreamFromPool(CUDAStream *stream, Device *device) {
+#ifdef WITH_CUDA
+  try {
+    *stream = static_cast<void *>(
+        new at::cuda::CUDAStream(at::cuda::getStreamFromPool(
+            false /**isHighPriority**/, (*device)->index())));
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+#else
+  return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
+#endif
+}
+
+const char *CUDA_SetCurrentCUDAStream(CUDAStream stream) {
+#ifdef WITH_CUDA
+  try {
+    at::cuda::setCurrentCUDAStream(
+        *static_cast<at::cuda::CUDAStream *>(stream));
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+#else
+  return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
+#endif
+}
+
+const char *CUDA_Synchronize(CUDAStream stream) {
+#ifdef WITH_CUDA
+  try {
+    static_cast<at::cuda::CUDAStream *>(stream)->synchronize();
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+#else
+  return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
+#endif
+}
+
+const char *CUDA_Query(CUDAStream stream, int8_t *result) {
+#ifdef WITH_CUDA
+  try {
+    *result = static_cast<at::cuda::CUDAStream *>(stream)->query() ? 1 : 0;
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+#else
+  return exception_str("CUDA API needs -DWITH_CUDA on building libcgotorch.so");
+#endif
+}
diff --git a/cgotorch/device.cc b/cgotorch/device.cc
index 709aad13..52412d79 100644
--- a/cgotorch/device.cc
+++ b/cgotorch/device.cc
@@ -24,7 +24,3 @@ const char *Torch_Device(const char *device_type, Device *device) {
   }
   return nullptr;
 }
-
-bool IsCUDAAvailable() { return torch::cuda::is_available(); }
-
-bool IsCUDNNAvailable() { return torch::cuda::cudnn_is_available(); }
diff --git a/cgotorch/tensor.cc b/cgotorch/tensor.cc
index e86295bc..ed66d5cc 100644
--- a/cgotorch/tensor.cc
+++ b/cgotorch/tensor.cc
@@ -128,6 +128,20 @@ const char *Tensor_To(Tensor input, Device device, int8_t dtype,
   }
 }
 
+const char *Tensor_CUDA(Tensor input, Device device, int8_t non_blocking,
+                        Tensor *output) {
+  try {
+    if (!device->is_cuda()) {
+      return exception_str("the device should be CUDA device");
+    }
+    auto result = input->to(*device, static_cast<bool>(non_blocking));
+    *output = new at::Tensor(result);
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+}
+
 const char *Tensor_CastTo(Tensor input, int8_t dtype, Tensor *output) {
   try {
     auto result = input->to(static_cast<at::ScalarType>(dtype));
diff --git a/cuda.go b/cuda.go
new file mode 100644
index 00000000..cc85cfa2
--- /dev/null
+++ b/cuda.go
@@ -0,0 +1,44 @@
+package gotorch
+
+// #cgo CFLAGS: -I ${SRCDIR}/cgotorch -I ${SRCDIR}/cgotorch/libtorch/include
+// #cgo LDFLAGS: -L ${SRCDIR}/cgotorch -Wl,-rpath ${SRCDIR}/cgotorch -lcgotorch
+// #cgo LDFLAGS: -L ${SRCDIR}/cgotorch/libtorch/lib -Wl,-rpath ${SRCDIR}/cgotorch/libtorch/lib -lc10 -ltorch -ltorch_cpu
+// #include "cgotorch.h"
+import "C"
+import "unsafe"
+
+// CUDAStream struct wrapped Nvidia CUDA Stream
+type CUDAStream struct {
+	P C.CUDAStream
+}
+
+// Query returns true if all tasks completed on this CUDA stream
+func (s CUDAStream) Query() bool {
+	var b int8
+	MustNil(unsafe.Pointer(C.CUDA_Query(s.P, (*C.int8_t)(&b))))
+	return b != 0
+}
+
+// Synchronize wait until all tasks completed on this CUDA stream
+func (s CUDAStream) Synchronize() {
+	MustNil(unsafe.Pointer(C.CUDA_Synchronize(s.P)))
+}
+
+// GetCurrentCUDAStream returns the current stream on device
+func GetCurrentCUDAStream(device Device) CUDAStream {
+	var stream C.CUDAStream
+	MustNil(unsafe.Pointer(C.CUDA_GetCurrentCUDAStream(&stream, &device.T)))
+	return CUDAStream{stream}
+}
+
+// SetCurrentCUDAStream set stream as the current CUDA stream
+func SetCurrentCUDAStream(stream CUDAStream) {
+	MustNil(unsafe.Pointer(C.CUDA_SetCurrentCUDAStream(stream.P)))
+}
+
+// NewCUDAStream returns a new CUDA stream from the pool
+func NewCUDAStream(device Device) CUDAStream {
+	var stream C.CUDAStream
+	MustNil(unsafe.Pointer(C.CUDA_GetCUDAStreamFromPool(&stream, &device.T)))
+	return CUDAStream{stream}
+}
diff --git a/cuda_test.go b/cuda_test.go
new file mode 100644
index 00000000..0176a265
--- /dev/null
+++ b/cuda_test.go
@@ -0,0 +1,55 @@
+package gotorch_test
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	torch "github.com/wangkuiyi/gotorch"
+)
+
+func getDefaultDevice() torch.Device {
+	var device torch.Device
+	if torch.IsCUDAAvailable() {
+		device = torch.NewDevice("cuda")
+	} else {
+		device = torch.NewDevice("cpu")
+	}
+	return device
+}
+func TestCUDAStreamPanics(t *testing.T) {
+	a := assert.New(t)
+	device := getDefaultDevice()
+	if torch.IsCUDAAvailable() {
+		a.NotPanics(func() {
+			torch.GetCurrentCUDAStream(device)
+		})
+	} else {
+		a.Panics(func() {
+			torch.GetCurrentCUDAStream(device)
+		})
+		a.Panics(func() {
+			torch.NewCUDAStream(device)
+		})
+	}
+}
+
+func TestMultiCUDAStream(t *testing.T) {
+	if !torch.IsCUDAAvailable() {
+		t.Skip("skip TestMultiCUDAStream which only run on CUDA device")
+	}
+	a := assert.New(t)
+	device := getDefaultDevice()
+	currStream := torch.GetCurrentCUDAStream(device)
+	defer torch.SetCurrentCUDAStream(currStream)
+	// create a new CUDA stream
+	stream := torch.NewCUDAStream(device)
+	// switch to the new CUDA stream
+	torch.SetCurrentCUDAStream(stream)
+	// copy Tensor from host to device async
+	input := torch.RandN([]int64{100, 200}, true).PinMemory()
+	input.CUDA(device, true /**nonBlocking=true**/)
+	// wait until all tasks completed
+	stream.Synchronize()
+	// make sure all tasks completed
+	a.True(stream.Query())
+}
diff --git a/tensor.go b/tensor.go
index 89c4ee05..7d35ff47 100644
--- a/tensor.go
+++ b/tensor.go
@@ -112,6 +112,18 @@ func (a Tensor) To(device Device, dtype ...int8) Tensor {
 	return Tensor{(*unsafe.Pointer)(&t)}
 }
 
+// CUDA returns a Tensor on CUDA device
+func (a Tensor) CUDA(device Device, nonBlocking bool) Tensor {
+	var t C.Tensor
+	n := int8(0)
+	if nonBlocking {
+		n = 1
+	}
+	MustNil(unsafe.Pointer(C.Tensor_CUDA(C.Tensor(*a.T), device.T, C.int8_t(n), &t)))
+	SetTensorFinalizer((*unsafe.Pointer)(&t))
+	return Tensor{(*unsafe.Pointer)(&t)}
+}
+
 // CastTo cast tensor dtype
 func (a Tensor) CastTo(dtype int8) Tensor {
 	var t C.Tensor
diff --git a/tensor_test.go b/tensor_test.go
index 8d128e2f..0a3a8250 100644
--- a/tensor_test.go
+++ b/tensor_test.go
@@ -59,7 +59,28 @@ func TestCastTo(t *testing.T) {
 	b = a.To(torch.NewDevice("cpu"), torch.Float)
 	assert.Equal(t, torch.Float, b.Dtype())
 }
+func TestCUDA(t *testing.T) {
+	a := assert.New(t)
+	device := getDefaultDevice()
+	input := torch.NewTensor([][]float32{{1, 2}, {3, 4}})
+	if !torch.IsCUDAAvailable() {
+		// CUDA should panics on CPU device
+		a.Panics(func() {
+			input.CUDA(device, false)
+		})
+		a.Panics(func() {
+			input.CUDA(device, true)
+		})
+		return
+	}
+
+	b := input.CUDA(device, false)
+	a.Equal(" 1  2\n 3  4\n[ CUDAFloatType{2,2} ]", b.String())
 
+	c := input.CUDA(device, true)
+	torch.GetCurrentCUDAStream(device).Synchronize()
+	a.Equal(" 1  2\n 3  4\n[ CUDAFloatType{2,2} ]", c.String())
+}
 func TestCopyTo(t *testing.T) {
 	device := torch.NewDevice("cpu")
 	a := torch.NewTensor([]int64{1, 2})