Merge pull request #13 from saipraveenb25/main
Support `half` types when compiling CUDA generated from Slang
saipraveenb25 authored Jul 10, 2024
2 parents 3e528b4 + 2375b88 commit 17b5d37
Showing 3 changed files with 41 additions and 3 deletions.
14 changes: 11 additions & 3 deletions slangtorch/slangtorch.py
@@ -15,6 +15,12 @@
 packageDir = pkg_resources.resource_filename(__name__, '')
 versionCode = my_version = pkg_resources.get_distribution('slangtorch').version
 
+DEFAULT_CUDA_CFLAGS = ["-U__CUDA_NO_HALF_OPERATORS__",
+                       "-U__CUDA_NO_HALF_CONVERSIONS__",
+                       "-U__CUDA_NO_HALF2_OPERATORS__",
+                       "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
+                       "-DSLANG_CUDA_ENABLE_HALF=1",]
+
 if sys.platform == "win32":
     # Windows
     executable_extension = ".exe"
@@ -446,8 +452,8 @@ def _compileAndLoadModule(metadata, sources, moduleName, buildDir, slangSourceDi
         # make sure to add cl.exe to PATH on windows so ninja can find it.
         _add_msvc_to_env_var()
 
-    extra_cflags = None
-    extra_cuda_cflags = None
+    extra_cflags = []
+    extra_cuda_cflags = []
     # If windows, add /std:c++17 to extra_cflags
     if sys.platform == "win32":
         extra_cflags = ["/std:c++17"]
@@ -463,11 +469,13 @@ def _compileAndLoadModule(metadata, sources, moduleName, buildDir, slangSourceDi
     else:
         extra_include_paths = None
 
+    extra_cuda_cflags = extra_cuda_cflags + DEFAULT_CUDA_CFLAGS
+
     return jit_compile(
         moduleName,
         sources,
         extra_cflags=extra_cflags,
-        extra_cuda_cflags=extra_cuda_cflags,
+        extra_cuda_cflags=extra_cuda_cflags if extra_cuda_cflags else None,
         extra_ldflags=None,
         extra_include_paths=extra_include_paths,
         build_directory=os.path.realpath(buildDir),
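Context for the flags above: PyTorch's JIT extension builder passes -D__CUDA_NO_HALF_OPERATORS__ (and the related __CUDA_NO_* defines) to nvcc by default, which strips the __half operator overloads out of cuda_fp16.h; the -U flags cancel those defines, and -DSLANG_CUDA_ENABLE_HALF=1 presumably gates the half-precision paths in Slang's generated CUDA prelude. A minimal sketch of passing the same flags by hand through torch.utils.cpp_extension (the module name and .cu source file are hypothetical):

from torch.utils import cpp_extension

ext = cpp_extension.load(
    name="half_kernels",          # hypothetical module name
    sources=["half_kernels.cu"],  # hypothetical CUDA source file
    extra_cuda_cflags=[
        "-U__CUDA_NO_HALF_OPERATORS__",        # restore __half arithmetic operators
        "-U__CUDA_NO_HALF_CONVERSIONS__",      # restore __half <-> float conversions
        "-U__CUDA_NO_HALF2_OPERATORS__",       # restore __half2 operators
        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",  # restore bfloat16 conversions
        "-DSLANG_CUDA_ENABLE_HALF=1",          # enable half support in Slang's CUDA prelude
    ],
)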
13 changes: 13 additions & 0 deletions tests/autobind-square-half.slang
@@ -0,0 +1,13 @@
+[AutoPyBindCUDA]
+[CUDAKernel]
+void square(TensorView<half> input, TensorView<half> output)
+{
+    // Get the 'global' index of this thread.
+    uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
+
+    // If the thread index is beyond the input size, exit early.
+    if (dispatchIdx.x >= input.size(0))
+        return;
+
+    output[dispatchIdx.x] = input[dispatchIdx.x] * input[dispatchIdx.x];
+}
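The kernel writes one output element per thread, so a launch must cover at least input.size(0) threads; the test below gets away with a single 32-thread block only because its input has 4 elements. A minimal sketch (tensor size and launch parameters chosen for illustration) of sizing the grid for an arbitrary-length tensor, following the same loadModule/launchRaw pattern as the test:

import torch
import slangtorch

module = slangtorch.loadModule('autobind-square-half.slang')

x = torch.rand(1000).cuda().half()
y = torch.zeros_like(x)

block = 32
grid = (x.numel() + block - 1) // block  # ceil-divide: enough blocks to cover every element

module.square(input=x, output=y).launchRaw(
    blockSize=(block, 1, 1), gridSize=(grid, 1, 1))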
17 changes: 17 additions & 0 deletions tests/test.py
@@ -640,3 +640,20 @@ def test_empty_tensor(self):
 
         # Should not crash.
 
+class TestHalfDType(unittest.TestCase):
+    def setUp(self) -> None:
+        test_dir = os.path.dirname(os.path.abspath(__file__))
+        slangModuleSourceFile = os.path.join(test_dir, 'autobind-square-half.slang')
+
+        module = slangtorch.loadModule(slangModuleSourceFile)
+        self.module = module
+
+    def test_half_multiply(self):
+        X = torch.tensor([1., 2., 3., 4.]).cuda().half()
+        Z = torch.zeros_like(X).cuda().half()
+
+        self.module.square(input=X, output=Z).launchRaw(blockSize=(32, 1, 1), gridSize=(1, 1, 1))
+
+        expected = torch.tensor([1., 4., 9., 16.]).cuda().half()
+
+        assert(torch.all(torch.eq(Z, expected)))
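One aside on the test's exact-equality check: it is safe here only because 1, 4, 9, and 16 are exactly representable in float16. A test over arbitrary values would want a tolerance instead; a hedged variant (not part of this change):

assert torch.allclose(Z.float(), expected.float(), atol=1e-3)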
