diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang index f8012af1d5..85d5302541 100644 --- a/source/slang/core.meta.slang +++ b/source/slang/core.meta.slang @@ -891,26 +891,31 @@ __magic_type(StringType) __intrinsic_type($(kIROp_StringType)) struct String { - __target_intrinsic(cpp) + [require(cpp)] __intrinsic_op($(kIROp_MakeString)) __init(int val); - __target_intrinsic(cpp) + + [require(cpp)] __intrinsic_op($(kIROp_MakeString)) __init(uint val); - __target_intrinsic(cpp) + + [require(cpp)] __intrinsic_op($(kIROp_MakeString)) __init(int64_t val); - __target_intrinsic(cpp) + + [require(cpp)] __intrinsic_op($(kIROp_MakeString)) __init(uint64_t val); - __target_intrinsic(cpp) + + [require(cpp)] __intrinsic_op($(kIROp_MakeString)) __init(float val); - __target_intrinsic(cpp) + + [require(cpp)] __intrinsic_op($(kIROp_MakeString)) __init(double val); - __target_intrinsic(cpp) + [require(cpp)] int64_t getLength(); property int length @@ -925,11 +930,23 @@ __magic_type(NativeStringType) __intrinsic_type($(kIROp_NativeStringType)) struct NativeString { - __target_intrinsic(cpp, "int(strlen($0))") - int getLength(); + [require(cpp)] + int getLength() + { + __target_switch + { + case cpp: __intrinsic_asm "int(strlen($0))"; + } + } - __target_intrinsic(cpp, "(void*)((const char*)($0))") - Ptr getBuffer(); + [require(cpp)] + Ptr getBuffer() + { + __target_switch + { + case cpp: __intrinsic_asm "(void*)((const char*)($0))"; + } + } property int length { [__unsafeForceInlineEarly] get{return getLength();} } @@ -1968,45 +1985,71 @@ int getStringHash(String string); /// Use will produce a syntax error in downstream compiler /// Useful for testing diagnostics around compilation errors of downstream compiler /// It 'returns' an int so can be used in expressions without the front end complaining. -__target_intrinsic(hlsl, " @ ") -__target_intrinsic(glsl, " @ ") -__target_intrinsic(cuda, " @ ") -__target_intrinsic(cpp, " @ ") -int __SyntaxError(); +[require(cpp_cuda_glsl_hlsl)] +int __SyntaxError() +{ + __target_switch + { + case cpp: __intrinsic_asm " @ "; + case cuda: __intrinsic_asm " @ "; + case glsl: __intrinsic_asm " @ "; + case hlsl: __intrinsic_asm " @ "; + } +} /// For downstream compilers that allow sizeof/alignof/offsetof /// Can't be called in the C/C++ style. Need to use __size_of() as opposed to sizeof(some_type). __generic -__target_intrinsic(cuda, "sizeof($G0)") __target_intrinsic(cpp, "sizeof($G0)") +__target_intrinsic(cuda, "sizeof($G0)") [__readNone] +[require(cpp_cuda)] int __sizeOf(); __generic -__target_intrinsic(cuda, "sizeof($T0)") -__target_intrinsic(cpp, "sizeof($T0)") [__readNone] -int __sizeOf(T v); +[require(cpp_cuda)] +int __sizeOf(T v) +{ + __target_switch + { + case cpp: __intrinsic_asm "sizeof($T0)"; + case cuda: __intrinsic_asm "sizeof($T0)"; + } +} __generic __target_intrinsic(cuda, "SLANG_ALIGN_OF($G0)") __target_intrinsic(cpp, "SLANG_ALIGN_OF($G0)") [__readNone] +[require(cpp_cuda)] int __alignOf(); __generic -__target_intrinsic(cuda, "SLANG_ALIGN_OF($T0)") -__target_intrinsic(cpp, "SLANG_ALIGN_OF($T0)") [__readNone] -int __alignOf(T v); +[require(cpp_cuda)] +int __alignOf(T v) +{ + __target_switch + { + case cpp: __intrinsic_asm "SLANG_ALIGN_OF($T0)"; + case cuda: __intrinsic_asm "SLANG_ALIGN_OF($T0)"; + } +} // It would be nice to have offsetof equivalent, but it's not clear how that would work in terms of the Slang language. // Here we allow calculating the offset of a field in bytes from an *instance* of the type. 
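As a rough usage sketch (the `Particle` type, its fields, and the helper name are hypothetical, not part of the patch), `__offsetOf` is applied to an instance and one of its fields, and on the C++/CUDA targets it lowers to the pointer arithmetic shown in the intrinsic expansions that follow:

    struct Particle { float3 position; float mass; };

    int massOffsetInBytes(Particle p)
    {
        // Lowers to roughly: int(((char*)&(p.mass)) - ((char*)&(p)))
        return __offsetOf(p, p.mass);
    }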
__generic -__target_intrinsic(cuda, "int(((char*)&($1)) - ((char*)&($0)))") -__target_intrinsic(cpp, "int(((char*)&($1)) - ((char*)&($0))") [__readNone] -int __offsetOf(in T t, in F field); +[require(cpp_cuda)] +int __offsetOf(in T t, in F field) +{ + __target_switch + { + case cpp: __intrinsic_asm "int(((char*)&($1)) - ((char*)&($0))"; + case cuda: __intrinsic_asm "int(((char*)&($1)) - ((char*)&($0)))"; + } +} /// Mark beginning of "interlocked" operations in a fragment shader. __glsl_extension(GL_ARB_fragment_shader_interlock) @@ -2639,4 +2682,4 @@ __attributeTarget(FuncDecl) attribute_syntax [DerivativeGroupLinear] : DerivativeGroupLinearAttribute; __attributeTarget(FuncDecl) -attribute_syntax [noRefInline] : NoRefInlineAttribute; \ No newline at end of file +attribute_syntax [noRefInline] : NoRefInlineAttribute; diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang index 0026a76f90..769630d50d 100644 --- a/source/slang/diff.meta.slang +++ b/source/slang/diff.meta.slang @@ -57,15 +57,18 @@ struct TensorView { __target_intrinsic(cuda, "$0.data_ptr<$G0>()") [__NoSideEffect] + [require(cuda)] Ptr data_ptr(); __target_intrinsic(cuda, "$0.data_ptr_at<$G0>($1)") [__NoSideEffect] + [require(cuda)] Ptr data_ptr_at(uint index); __generic __target_intrinsic(cuda, "$0.data_ptr_at<$G0>($1)") [__NoSideEffect] + [require(cuda)] Ptr data_ptr_at(vector index); __implicit_conversion($(kConversionCost_ImplicitDereference)) @@ -74,58 +77,108 @@ struct TensorView __target_intrinsic(cuda, "$0.load<$G0>($1)") [__NoSideEffect] + [require(cuda)] T load(uint x); + __target_intrinsic(cuda, "$0.load<$G0>($1, $2)") [__NoSideEffect] + [require(cuda)] T load(uint x, uint y); + __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3)") [__NoSideEffect] + [require(cuda)] T load(uint x, uint y, uint z); + __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3, $4)") [__NoSideEffect] + [require(cuda)] T load(uint x, uint y, uint z, uint w); + __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3, $4, $5)") [__NoSideEffect] + [require(cuda)] T load(uint i0, uint i1, uint i2, uint i3, uint i4); - [__NoSideEffect] __generic __target_intrinsic(cuda, "$0.load<$TR>($1)") + [__NoSideEffect] + [require(cuda)] T load(vector index); __target_intrinsic(cuda, "$0.store<$G0>($1, $2)") + [require(cuda)] void store(uint x, T val); + __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3)") + [require(cuda)] void store(uint x, uint y, T val); + __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4)") + [require(cuda)] void store(uint x, uint y, uint z, T val); + __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4, $5)") + [require(cuda)] void store(uint x, uint y, uint z, uint w, T val); + __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4, $5, $6)") + [require(cuda)] void store(uint i0, uint i1, uint i2, uint i3, uint i4, T val); __generic __target_intrinsic(cuda, "$0.store<$T2>($1, $2)") + [require(cuda)] void store(vector index, T val); - __target_intrinsic(cuda, "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)") - void InterlockedAdd(uint index, T val, out T oldVal); + [require(cuda)] + void InterlockedAdd(uint index, T val, out T oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)"; + } + } __generic - __target_intrinsic(cuda, "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)") - void InterlockedAdd(vector index, T val, out T oldVal); + [require(cuda)] + void InterlockedAdd(vector index, T val, out T oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm 
"*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)"; + } + } - __target_intrinsic(cuda, "$0.dimensionCount") [__readNone] - uint dims(); + [require(cuda)] + uint dims() + { + __target_switch + { + case cuda: __intrinsic_asm "$0.dimensionCount"; + } + } - __target_intrinsic(cuda, "$0.sizes[$1]") [__readNone] - uint size(uint i); + [require(cuda)] + uint size(uint i) + { + __target_switch + { + case cuda: __intrinsic_asm "$0.sizes[$1]"; + } + } - __target_intrinsic(cuda, "$0.strides[$1]") [__readNone] - uint stride(uint i); + [require(cuda)] + uint stride(uint i) + { + __target_switch + { + case cuda: __intrinsic_asm "$0.strides[$1]"; + } + } __subscript(uint index) -> T { @@ -202,54 +255,138 @@ extension TensorView<$(atomicIntegerTypeName)> { typealias __Element = $(atomicIntegerTypeName); - __target_intrinsic(cuda, "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)") - void InterlockedMin(uint index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedMin(uint index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)"; + } + } __generic - __target_intrinsic(cuda, "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)") - void InterlockedMin(vector index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedMin(vector index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)"; + } + } - __target_intrinsic(cuda, "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)") - void InterlockedMax(uint index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedMax(uint index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)"; + } + } __generic - __target_intrinsic(cuda, "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)") - void InterlockedMax(vector index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedMax(vector index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)"; + } + } - __target_intrinsic(cuda, "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)") - void InterlockedAnd(uint index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedAnd(uint index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)"; + } + } __generic - __target_intrinsic(cuda, "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)") - void InterlockedAnd(vector index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedAnd(vector index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)"; + } + } - __target_intrinsic(cuda, "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)") - void InterlockedOr(uint index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedOr(uint index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)"; + } + } __generic - __target_intrinsic(cuda, "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)") - void InterlockedOr(vector index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedOr(vector index, __Element val, out __Element oldVal) + { + 
__target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)"; + } + } - __target_intrinsic(cuda, "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)") - void InterlockedXor(uint index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedXor(uint index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)"; + } + } __generic - __target_intrinsic(cuda, "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)") - void InterlockedXor(vector index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedXor(vector index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)"; + } + } - __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)") - void InterlockedExchange(uint index, __Element va, out __Element oldVall); + [require(cuda)] + void InterlockedExchange(uint index, __Element va, out __Element oldVall) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)"; + } + } __generic - __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)") - void InterlockedExchange(vector index, __Element val, out __Element oldVal); + [require(cuda)] + void InterlockedExchange(vector index, __Element val, out __Element oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)"; + } + } - __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)") - void InterlockedCompareExchange(uint index, __Element compare, __Element val); + [require(cuda)] + void InterlockedCompareExchange(uint index, __Element compare, __Element val) + { + __target_switch + { + case cuda: __intrinsic_asm "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)"; + } + } __generic - __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)") - void InterlockedCompareExchange(vector index, __Element compare, __Element val); + [require(cuda)] + void InterlockedCompareExchange(vector index, __Element compare, __Element val) + { + __target_switch + { + case cuda: __intrinsic_asm "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)"; + } + } } ${{{{ @@ -258,19 +395,43 @@ ${{{{ extension TensorView { - __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at($1), $2)") - float InterlockedExchange(uint index, float val, out float oldVal); + [require(cuda)] + float InterlockedExchange(uint index, float val, out float oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicExch($0.data_ptr_at($1), $2)"; + } + } __generic - __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at($1), $2)") - float InterlockedExchange(vector index, float val, out float oldVal); + [require(cuda)] + float InterlockedExchange(vector index, float val, out float oldVal) + { + __target_switch + { + case cuda: __intrinsic_asm "*($3) = atomicExch($0.data_ptr_at($1), $2)"; + } + } - __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at($1), slang_bit_cast($2), slang_bit_cast($3))") - void InterlockedCompareExchange(uint index, float compare, float val); + [require(cuda)] + void InterlockedCompareExchange(uint index, float compare, float val) + { + __target_switch + { + case cuda: __intrinsic_asm "atomicCAS($0.data_ptr_at($1), slang_bit_cast($2), slang_bit_cast($3))"; + } + } __generic - __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at($1), slang_bit_cast($2), 
slang_bit_cast($3))") - void InterlockedCompareExchange(vector index, float compare, float val); + [require(cuda)] + void InterlockedCompareExchange(vector index, float compare, float val) + { + __target_switch + { + case cuda: __intrinsic_asm "atomicCAS($0.data_ptr_at($1), slang_bit_cast($2), slang_bit_cast($3))"; + } + } } interface IDiffTensorWrapper @@ -685,28 +846,47 @@ struct TorchTensor [CudaHost] TensorView getView(); - __target_intrinsic(cuda, "$0.dims()") - __target_intrinsic(cpp, "$0.dims()") [__readNone] [CudaHost] - uint dims(); + [require(cpp_cuda)] + uint dims() + { + __target_switch + { + case cpp: __intrinsic_asm "$0.dims()"; + case cuda: __intrinsic_asm "$0.dims()"; + } + } - __target_intrinsic(cuda, "$0.size($1)") - __target_intrinsic(cpp, "$0.size($1)") [__readNone] [CudaHost] - uint size(uint i); + [require(cpp_cuda)] + uint size(uint i) + { + __target_switch + { + case cpp: __intrinsic_asm "$0.size($1)"; + case cuda: __intrinsic_asm "$0.size($1)"; + } + } - __target_intrinsic(cuda, "$0.stride($1)") - __target_intrinsic(cpp, "$0.stride($1)") [__readNone] [CudaHost] - uint stride(uint i); + [require(cpp_cuda)] + uint stride(uint i) + { + __target_switch + { + case cpp: __intrinsic_asm "$0.stride($1)"; + case cuda: __intrinsic_asm "$0.stride($1)"; + } + } - __target_intrinsic(cuda, "$0.data_ptr<$G0>()") __target_intrinsic(cpp, "$0.data_ptr<$G0>()") + __target_intrinsic(cuda, "$0.data_ptr<$G0>()") [__readNone] [CudaHost] + [require(cpp_cuda)] Ptr data_ptr(); __intrinsic_op($(kIROp_AllocateTorchTensor)) @@ -733,13 +913,25 @@ struct TorchTensor [CudaHost] static TorchTensor emptyLike(TorchTensor other); - __target_intrinsic(cpp, "$0.zero_()") [CudaHost] - void fillZero(); + [require(cpp)] + void fillZero() + { + __target_switch + { + case cpp: __intrinsic_asm "$0.zero_()"; + } + } - __target_intrinsic(cpp, "$0.fill_($1)") [CudaHost] - void fillValue(T val); + [require(cpp)] + void fillValue(T val) + { + __target_switch + { + case cpp: __intrinsic_asm "$0.fill_($1)"; + } + } [CudaHost] static TorchTensor zerosLike(TorchTensor other) @@ -751,8 +943,14 @@ struct TorchTensor } -__target_intrinsic(cpp, "AT_CUDA_CHECK(cudaStreamSynchronize(at::cuda::getCurrentCUDAStream()))") -void syncTorchCudaStream(); +[require(cpp)] +void syncTorchCudaStream() +{ + __target_switch + { + case cpp: __intrinsic_asm "AT_CUDA_CHECK(cudaStreamSynchronize(at::cuda::getCurrentCUDAStream()))"; + } +} /// Constructs a `DifferentialPair` value from a primal value and a differential value. 
__generic diff --git a/source/slang/glsl.meta.slang b/source/slang/glsl.meta.slang index 508156b8c2..bafd0b9470 100644 --- a/source/slang/glsl.meta.slang +++ b/source/slang/glsl.meta.slang @@ -322,14 +322,18 @@ public vector atan(vector y, vector x) } __generic -__target_intrinsic(cuda, "$P_asinh($0)") -__target_intrinsic(cpp, "$P_asinh($0)") [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, GLSL_130)] public T asinh(T x) { - return log(x + sqrt(x * x + T(1))); + __target_switch + { + case cpp: __intrinsic_asm "$P_asinh($0)"; + case cuda: __intrinsic_asm "$P_asinh($0)"; + default: + return log(x + sqrt(x * x + T(1))); + } } __generic @@ -342,14 +346,18 @@ public vector asinh(vector x) } __generic -__target_intrinsic(cuda, "$P_acosh($0)") -__target_intrinsic(cpp, "$P_acosh($0)") [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, GLSL_130)] public T acosh(T x) { - return log(x + sqrt( x * x - T(1))); + __target_switch + { + case cpp: __intrinsic_asm "$P_acosh($0)"; + case cuda: __intrinsic_asm "$P_acosh($0)"; + default: + return log(x + sqrt( x * x - T(1))); + } } __generic @@ -362,14 +370,18 @@ public vector acosh(vector x) } __generic -__target_intrinsic(cuda, "$P_atanh($0)") -__target_intrinsic(cpp, "$P_atanh($0)") [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, GLSL_130)] public T atanh(T x) { - return T(0.5) * log((T(1) + x) / (T(1) - x)); + __target_switch + { + case cpp: __intrinsic_asm "$P_atanh($0)"; + case cuda: __intrinsic_asm "$P_atanh($0)"; + default: + return T(0.5) * log((T(1) + x) / (T(1) - x)); + } } __generic @@ -645,7 +657,6 @@ float unpackSnorm1x16(uint p) return clamp((float(p & wordMask) - 32767.0) / 32767.0, -1.0, 1.0); } -__target_intrinsic(glsl) [__readNone] [ForceInline] float unpackUnorm1x8(uint p) @@ -654,7 +665,6 @@ float unpackUnorm1x8(uint p) return float(p & byteMask) / 255.0; } -__target_intrinsic(glsl) [__readNone] [ForceInline] float unpackSnorm1x8(uint p) @@ -679,140 +689,192 @@ uint float2half(float f) return (s | e | m); } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public uint packUnorm2x16(vec2 v) { - return packUnorm1x16(v.x) | (packUnorm1x16(v.y) << uint(16)); + __target_switch + { + case glsl: __intrinsic_asm "packUnorm2x16"; + default: + return packUnorm1x16(v.x) | (packUnorm1x16(v.y) << uint(16)); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public uint packSnorm2x16(vec2 v) { - return packSnorm1x16(v.x) | (packSnorm1x16(v.y) << uint(16)); + __target_switch + { + case glsl: __intrinsic_asm "packSnorm2x16"; + default: + return packSnorm1x16(v.x) | (packSnorm1x16(v.y) << uint(16)); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public uint packUnorm4x8(vec4 v) { - return packUnorm1x8(v.x) | (packUnorm1x8(v.y) << uint(8)) | (packUnorm1x8(v.z) << uint(16)) | (packUnorm1x8(v.w) << uint(24)); + __target_switch + { + case glsl: __intrinsic_asm "packUnorm4x8"; + default: + return packUnorm1x8(v.x) | (packUnorm1x8(v.y) << uint(8)) | (packUnorm1x8(v.z) << uint(16)) | (packUnorm1x8(v.w) << uint(24)); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public uint packSnorm4x8(vec4 v) { - return packSnorm1x8(v.x) | (packSnorm1x8(v.y) << uint(8)) | (packSnorm1x8(v.z) << uint(16)) | (packSnorm1x8(v.w) << uint(24)); + __target_switch + { + case glsl: __intrinsic_asm 
"packSnorm4x8"; + default: + return packSnorm1x8(v.x) | (packSnorm1x8(v.y) << uint(8)) | (packSnorm1x8(v.z) << uint(16)) | (packSnorm1x8(v.w) << uint(24)); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public vec2 unpackUnorm2x16(uint p) { - return vec2(unpackUnorm1x16(p & uint(0xffff)), unpackUnorm1x16(p >> uint(16))); + __target_switch + { + case glsl: __intrinsic_asm "unpackUnorm2x16"; + default: + return vec2(unpackUnorm1x16(p & uint(0xffff)), unpackUnorm1x16(p >> uint(16))); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public vec2 unpackSnorm2x16(uint p) { - return vec2(unpackSnorm1x16(p & uint(0xffff)), unpackSnorm1x16(p >> uint(16))); + __target_switch + { + case glsl: __intrinsic_asm "unpackSnorm2x16"; + default: + return vec2(unpackSnorm1x16(p & uint(0xffff)), unpackSnorm1x16(p >> uint(16))); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public vec4 unpackUnorm4x8(highp uint p) { - return vec4( - unpackUnorm1x8(p), - unpackUnorm1x8(p >> 8), - unpackUnorm1x8(p >> 16), - unpackUnorm1x8(p >> 24)); + __target_switch + { + case glsl: __intrinsic_asm "unpackUnorm4x8"; + default: + return vec4( + unpackUnorm1x8(p), + unpackUnorm1x8(p >> 8), + unpackUnorm1x8(p >> 16), + unpackUnorm1x8(p >> 24)); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public vec4 unpackSnorm4x8(highp uint p) { - return vec4( - unpackSnorm1x8(p), - unpackSnorm1x8(p >> 8), - unpackSnorm1x8(p >> 16), - unpackSnorm1x8(p >> 24)); + __target_switch + { + case glsl: __intrinsic_asm "unpackSnorm4x8"; + default: + return vec4( + unpackSnorm1x8(p), + unpackSnorm1x8(p >> 8), + unpackSnorm1x8(p >> 16), + unpackSnorm1x8(p >> 24)); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public uint packHalf2x16(vec2 v) { - return float2half(v.x) | (float2half(v.y) << uint(16)); + __target_switch + { + case glsl: __intrinsic_asm "packHalf2x16"; + default: + return float2half(v.x) | (float2half(v.y) << uint(16)); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public float half2float(uint h) { - uint s = ((h & uint(0x8000)) << uint(16)); - uint e = 0; - uint m = ((h & uint(0x03ff)) << uint(13)); - if (m != 0) + __target_switch { - e = (((h & uint(0x7c00)) + uint(0x1c000)) << uint(13)); + case glsl: __intrinsic_asm "half2float"; + default: + uint s = ((h & uint(0x8000)) << uint(16)); + uint e = 0; + uint m = ((h & uint(0x03ff)) << uint(13)); + if (m != 0) + { + e = (((h & uint(0x7c00)) + uint(0x1c000)) << uint(13)); + } + return uintBitsToFloat(s | e | m); } - return uintBitsToFloat(s | e | m); } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public vec2 unpackHalf2x16(uint p) { - return vec2(half2float(p & uint(0xffff)), half2float(p >> uint(16))); + __target_switch + { + case glsl: __intrinsic_asm "unpackHalf2x16"; + default: + return vec2(half2float(p & uint(0xffff)), half2float(p >> uint(16))); + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public double packDouble2x32(uvec2 v) { - // TODO: there is no "asdouble()" - //return asdouble(uint64_t(v.x) | (uint64_t(v.y) << 32)); - return 0.0; + __target_switch + { + case glsl: __intrinsic_asm 
"packDouble2x32"; + default: + // TODO: there is no "asdouble()" + //return asdouble(uint64_t(v.x) | (uint64_t(v.y) << 32)); + return 0.0; + } } -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] public uvec2 unpackDouble2x32(double v) { - // TODO: there is no "asuint64()" - uint64_t u = 0; // asuint64(v); - return uvec2(uint(u & 0xFFFFFFFF), uint(u >> 32)); + __target_switch + { + case glsl: __intrinsic_asm "unpackDouble2x32"; + default: + // TODO: there is no "asuint64()" + uint64_t u = 0; // asuint64(v); + return uvec2(uint(u & 0xFFFFFFFF), uint(u >> 32)); + } } // @@ -833,29 +895,39 @@ public T faceforward(T n, T i, T ng) // __generic -__target_intrinsic(glsl) [__readNone] [ForceInline] [OverloadRank(15)] +[require(cpp_cuda_glsl_hlsl_spirv, GLSL_400)] public matrix outerProduct(vector c, vector r) { - // Column major matrix in GLSL - matrix result; - for (int i = 0; i < C; ++i) + __target_switch { - for (int j = 0; j < R; ++j) + case glsl: __intrinsic_asm "outerProduct"; + default: + // Column major matrix in GLSL + matrix result; + for (int i = 0; i < C; ++i) { - result[i][j] = c[i] * r[j]; + for (int j = 0; j < R; ++j) + { + result[i][j] = c[i] * r[j]; + } } + return result; } - return result; } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) [require(cpp_cuda_glsl_hlsl_spirv, GLSL_400)] -matrix inverse(matrix m); +matrix inverse(matrix m) +{ + __target_switch + { + case glsl: __intrinsic_asm "inverse"; + case hlsl: __intrinsic_asm "inverse"; + } +} // // Section 8.8. Integer Functions diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 9197969430..284cb2abc5 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -86,26 +86,33 @@ __intrinsic_type($(kIROp_HLSLByteAddressBufferType)) struct ByteAddressBuffer { [__readNone] - __target_intrinsic(hlsl) - __target_intrinsic(cpp) - __target_intrinsic(cuda) [__unsafeForceInlineEarly] - void GetDimensions(out uint dim); - - [__unsafeForceInlineEarly] - __specialized_for_target(spirv) - __specialized_for_target(glsl) + [require(cpp_cuda_glsl_hlsl_metal_spirv, structuredbuffer)] void GetDimensions(out uint dim) { - dim = __structuredBufferGetDimensions(__getEquivalentStructuredBuffer(this)).x*4; + __target_switch + { + case cpp: __intrinsic_asm ".GetDimensions"; + case cuda: __intrinsic_asm ".GetDimensions"; + case hlsl: __intrinsic_asm ".GetDimensions"; + case glsl: + case metal: + case spirv: + dim = __structuredBufferGetDimensions(__getEquivalentStructuredBuffer(this)).x*4; + } } [__readNone] [ForceInline] - __target_intrinsic(hlsl) + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] uint Load(int location) { - return __byteAddressBufferLoad(this, location); + __target_switch + { + case hlsl: __intrinsic_asm ".Load"; + default: + return __byteAddressBufferLoad(this, location); + } } [__readNone] @@ -113,10 +120,15 @@ struct ByteAddressBuffer [__readNone] [ForceInline] - __target_intrinsic(hlsl) + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] uint2 Load2(int location) { - return __byteAddressBufferLoad(this, location); + __target_switch + { + case hlsl: __intrinsic_asm ".Load2"; + default: + return __byteAddressBufferLoad(this, location); + } } [__readNone] @@ -124,10 +136,15 @@ struct ByteAddressBuffer [__readNone] [ForceInline] - __target_intrinsic(hlsl) + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] uint3 Load3(int location) { - return __byteAddressBufferLoad(this, location); + 
__target_switch + { + case hlsl: __intrinsic_asm ".Load3"; + default: + return __byteAddressBufferLoad(this, location); + } } [__readNone] @@ -135,10 +152,15 @@ struct ByteAddressBuffer [__readNone] [ForceInline] - __target_intrinsic(hlsl) + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] uint4 Load4(int location) { - return __byteAddressBufferLoad(this, location); + __target_switch + { + case hlsl: __intrinsic_asm ".Load4"; + default: + return __byteAddressBufferLoad(this, location); + } } [__readNone] @@ -223,9 +245,14 @@ struct __TextureImpl(TSampler s, TCoord value); +float __glsl_texture(TSampler s, TCoord value) +{ + __target_switch + { + case glsl: __intrinsic_asm "texture($0, $1)"; + } +} __glsl_extension(GL_EXT_texture_shadow_lod) __target_intrinsic(glsl, "texture($0, $1)") @@ -267,9 +294,14 @@ __target_intrinsic(glsl, "textureLodOffset($0, $1, 0, $2)") [require(glsl, texture_shadowlod)] float __glsl_texture_offset_level_zero_1d_shadow(TSampler s, TCoord value, constexpr TOffset offset); -__target_intrinsic(glsl, "texture($p, $2)") [require(glsl, texture_sm_4_1)] -float __glsl_texture(TTexture t, SamplerComparisonState s, TCoord value); +float __glsl_texture(TTexture t, SamplerComparisonState s, TCoord value) +{ + __target_switch + { + case glsl: __intrinsic_asm "texture($p, $2)"; + } +} __glsl_extension(GL_EXT_texture_shadow_lod) __target_intrinsic(glsl, "texture($p, $2)") @@ -447,12 +479,16 @@ extension __TextureImpl [__readNone] [ForceInline] - __target_intrinsic(hlsl) [require(cpp_glsl_hlsl_spirv, texture_sm_4_1_fragment)] T Sample(vector location, vector offset, float clamp, out uint status) { - status = 0; - return Sample(location, offset, clamp); + __target_switch + { + case hlsl: __intrinsic_asm ".Sample"; + default: + status = 0; + return Sample(location, offset, clamp); + } } [__readNone] @@ -903,13 +939,18 @@ extension __TextureImpl } } - __target_intrinsic(hlsl) [__readNone] [ForceInline] + [require(cpp_glsl_hlsl_spirv, texture_sm_4_1_fragment)] T Sample(SamplerState s, vector location, constexpr vector offset, float clamp, out uint status) { - status = 0; - return Sample(s, location, offset, clamp); + __target_switch + { + case hlsl: __intrinsic_asm ".Sample"; + default: + status = 0; + return Sample(s, location, offset, clamp); + } } [__readNone] @@ -1648,11 +1689,16 @@ extension __TextureImpl location, constexpr vector offset, out uint status) { - status = 0; - return Load(location, offset); + __target_switch + { + case hlsl: __intrinsic_asm ".Load"; + default: + status = 0; + return Load(location, offset); + } } __subscript(vector location) -> T @@ -1779,11 +1825,16 @@ extension __TextureImpl location, int sampleIndex, constexpr vector offset, out uint status) { - status = 0; - return Load(location, sampleIndex, offset); + __target_switch + { + case hlsl: __intrinsic_asm ".Load"; + default: + status = 0; + return Load(location, sampleIndex, offset); + } } __subscript(vector location, int sampleIndex) -> T @@ -2194,15 +2245,25 @@ half2 __atomicAdd(__ref half2 value, half2 amount) } // Helper for hlsl, using NVAPI -__target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)") [__requiresNVAPI] [require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2); +uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2) +{ + __target_switch + { + case hlsl: __intrinsic_asm "NvInterlockedAddUint64($0, $1, $2)"; + } +} // atomic add for hlsl using SM6.6 -__target_intrinsic(hlsl, "$0.InterlockedAdd64($1, $2, $3)") 
[require(hlsl, atomic_hlsl_sm_6_6)] -void __atomicAdd(RWByteAddressBuffer buf, uint offset, int64_t value, out int64_t originalValue); +void __atomicAdd(RWByteAddressBuffer buf, uint offset, int64_t value, out int64_t originalValue) +{ + __target_switch + { + case hlsl: __intrinsic_asm "$0.InterlockedAdd64($1, $2, $3)"; + } +} __target_intrinsic(hlsl, "$0.InterlockedAdd64($1, $2, $3)") [require(hlsl, atomic_hlsl_sm_6_6)] void __atomicAdd(RWByteAddressBuffer buf, uint offset, uint64_t value, out uint64_t originalValue); @@ -2260,7 +2321,6 @@ int64_t __atomicAdd(__ref int64_t value, int64_t amount) } } -__target_intrinsic(glsl, "atomicAdd($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) [ForceInline] @@ -2283,15 +2343,25 @@ uint64_t __atomicAdd(__ref uint64_t value, uint64_t amount) // Helper for HLSL, using NVAPI -__target_intrinsic(hlsl, "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)") [__requiresNVAPI] [require(hlsl, atomic_hlsl_nvapi)] -uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value); +uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)"; + } +} // CAS using SM6.6 -__target_intrinsic(hlsl, "$0.InterlockedCompareExchange64($1, $2, $3, $4)") [require(hlsl, atomic_hlsl_sm_6_6)] -void __cas(RWByteAddressBuffer buf, uint offset, in int64_t compare_value, in int64_t value, out int64_t original_value); +void __cas(RWByteAddressBuffer buf, uint offset, in int64_t compare_value, in int64_t value, out int64_t original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "$0.InterlockedCompareExchange64($1, $2, $3, $4)"; + } +} __target_intrinsic(hlsl, "$0.InterlockedCompareExchange64($1, $2, $3, $4)") [require(hlsl, atomic_hlsl_sm_6_6)] void __cas(RWByteAddressBuffer buf, uint offset, in uint64_t compare_value, in uint64_t value, out uint64_t original_value); @@ -2334,10 +2404,15 @@ uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue) // Max -__target_intrinsic(hlsl, "NvInterlockedMaxUint64($0, $1, $2)") [__requiresNVAPI] [require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value); +uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "NvInterlockedMaxUint64($0, $1, $2)"; + } +} __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) @@ -2397,10 +2472,15 @@ half __atomicMax(__ref half ioValue, half value) // Min -__target_intrinsic(hlsl, "NvInterlockedMinUint64($0, $1, $2)") [__requiresNVAPI] [require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value); +uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "NvInterlockedMinUint64($0, $1, $2)"; + } +} __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) @@ -2460,10 +2540,15 @@ half __atomicMin(__ref half ioValue, half value) // And -__target_intrinsic(hlsl, "NvInterlockedAndUint64($0, $1, $2)") [__requiresNVAPI] [require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value); +uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "NvInterlockedAndUint64($0, $1, $2)"; + } +} __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) @@ -2485,10 +2570,15 @@ 
uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value) // Or -__target_intrinsic(hlsl, "NvInterlockedOrUint64($0, $1, $2)") [__requiresNVAPI] [require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value); +uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "NvInterlockedOrUint64($0, $1, $2)"; + } +} __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) @@ -2510,10 +2600,15 @@ uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value) // Xor -__target_intrinsic(hlsl, "NvInterlockedXorUint64($0, $1, $2)") [__requiresNVAPI] [require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, uint2 value); +uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, uint2 value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "NvInterlockedXorUint64($0, $1, $2)"; + } +} __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) @@ -2535,10 +2630,15 @@ uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value) // Exchange -__target_intrinsic(hlsl, "NvInterlockedExchangeUint64($0, $1, $2)") [__requiresNVAPI] [require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicExchange(RWByteAddressBuffer buf, uint offset, uint2 value); +uint2 __atomicExchange(RWByteAddressBuffer buf, uint offset, uint2 value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "NvInterlockedExchangeUint64($0, $1, $2)"; + } +} __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) @@ -2611,8 +2711,6 @@ struct StructuredBuffer } __intrinsic_op($(kIROp_StructuredBufferLoad)) - __target_intrinsic(glsl, "$0._data[$1]") - __target_intrinsic(spirv, "%addr = OpAccessChain resultType*StorageBuffer resultId _0 const(int, 0) _1; OpLoad resultType resultId %addr;") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, structuredbuffer)] T Load(int location); @@ -2687,61 +2785,76 @@ struct $(item.name) // Note(tfoley): supports all operations from `ByteAddressBuffer` // TODO(tfoley): can this be made a sub-type? 
- __target_intrinsic(hlsl) - __target_intrinsic(cpp) - __target_intrinsic(cuda) [__unsafeForceInlineEarly] [require(cpp_cuda_glsl_hlsl_spirv, structuredbuffer_rw)] - void GetDimensions(out uint dim); - - [__unsafeForceInlineEarly] - __specialized_for_target(spirv) - __specialized_for_target(glsl) - [require(cpp_cuda_glsl_hlsl_spirv, structuredbuffer_rw)] void GetDimensions(out uint dim) { - dim = __structuredBufferGetDimensions(__getEquivalentStructuredBuffer(this)).x*4; + __target_switch + { + case cpp: __intrinsic_asm ".GetDimensions"; + case cuda: __intrinsic_asm ".GetDimensions"; + case hlsl: __intrinsic_asm ".GetDimensions"; + case glsl: + case spirv: + dim = __structuredBufferGetDimensions(__getEquivalentStructuredBuffer(this)).x*4; + } } - __target_intrinsic(hlsl) [__NoSideEffect] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] uint Load(int location) { - return __byteAddressBufferLoad(this, location); + __target_switch + { + case hlsl: __intrinsic_asm ".Load"; + default: + return __byteAddressBufferLoad(this, location); + } } [__NoSideEffect] uint Load(int location, out uint status); - __target_intrinsic(hlsl) [__NoSideEffect] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] uint2 Load2(int location) { - return __byteAddressBufferLoad(this, location); + __target_switch + { + case hlsl: __intrinsic_asm ".Load2"; + default: + return __byteAddressBufferLoad(this, location); + } } [__NoSideEffect] uint2 Load2(int location, out uint status); - __target_intrinsic(hlsl) [__NoSideEffect] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] uint3 Load3(int location) { - return __byteAddressBufferLoad(this, location); + __target_switch + { + case hlsl: __intrinsic_asm ".Load3"; + default: + return __byteAddressBufferLoad(this, location); + } } [__NoSideEffect] uint3 Load3(int location, out uint status); - __target_intrinsic(hlsl) [__NoSideEffect] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] uint4 Load4(int location) { - return __byteAddressBufferLoad(this, location); + __target_switch + { + case hlsl: __intrinsic_asm ".Load4"; + default: + return __byteAddressBufferLoad(this, location); + } } [__NoSideEffect] @@ -2887,66 +3000,54 @@ ${{{{ // Without returning original value __cuda_sm_version(6.0) - __target_intrinsic(cuda, "atomicAdd($0._getPtrAt($1), $2)") [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda6_int64)] - void InterlockedAddI64(uint byteAddress, int64_t valueToAdd); - - __specialized_for_target(hlsl) - [ForceInline] void InterlockedAddI64(uint byteAddress, int64_t valueToAdd) { - __atomicAdd(this, byteAddress, __asuint2(valueToAdd)); - } - - __specialized_for_target(glsl) - __specialized_for_target(spirv) - [ForceInline] - void InterlockedAddI64(uint byteAddress, int64_t valueToAdd) - { - let buf = __getEquivalentStructuredBuffer(this); - __atomicAdd(buf[byteAddress / 8], valueToAdd); + __target_switch + { + case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt($1), $2)"; + case hlsl: + __atomicAdd(this, byteAddress, __asuint2(valueToAdd)); + case glsl: + case spirv: + let buf = __getEquivalentStructuredBuffer(this); + __atomicAdd(buf[byteAddress / 8], valueToAdd); + } } // Cas uint64_t - __target_intrinsic(cuda, "(*$4 = atomicCAS($0._getPtrAt($1), $2, $3))") [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] - void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue); - - __specialized_for_target(hlsl) - [ForceInline] - void InterlockedCompareExchangeU64(uint 
byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) - { - outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value))); - } - - __specialized_for_target(glsl) - __specialized_for_target(spirv) - [ForceInline] void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) { - let buf = __getEquivalentStructuredBuffer(this); - outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); + __target_switch + { + case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt($1), $2, $3))"; + case hlsl: + outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value))); + case glsl: + case spirv: + let buf = __getEquivalentStructuredBuffer(this); + outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); + } } // Max __cuda_sm_version(5.0) - __target_intrinsic(cuda, "atomicMax($0._getPtrAt($1), $2)") [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda5_int64)] - uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value); - - __specialized_for_target(hlsl) - [ForceInline] - uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMax(this, byteAddress, __asuint2(value))); } - - __specialized_for_target(glsl) - __specialized_for_target(spirv) - [ForceInline] uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) { - let buf = __getEquivalentStructuredBuffer(this); - return __atomicMax(buf[byteAddress / 8], value); + __target_switch + { + case cuda: __intrinsic_asm "atomicMax($0._getPtrAt($1), $2)"; + case hlsl: + return __asuint64(__atomicMax(this, byteAddress, __asuint2(value))); + case glsl: + case spirv: + let buf = __getEquivalentStructuredBuffer(this); + return __atomicMax(buf[byteAddress / 8], value); + } } [ForceInline] @@ -2992,21 +3093,19 @@ ${{{{ // Min __cuda_sm_version(5.0) - __target_intrinsic(cuda, "atomicMin($0._getPtrAt($1), $2)") [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda5_int64)] - uint64_t InterlockedMinU64(uint byteAddress, uint64_t value); - - __specialized_for_target(hlsl) - [ForceInline] - uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMin(this, byteAddress, __asuint2(value))); } - - __specialized_for_target(glsl) - __specialized_for_target(spirv) - [ForceInline] uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) { - let buf = __getEquivalentStructuredBuffer(this); - return __atomicMin(buf[byteAddress / 8], value); + __target_switch + { + case cuda: __intrinsic_asm "atomicMin($0._getPtrAt($1), $2)"; + case hlsl: + return __asuint64(__atomicMin(this, byteAddress, __asuint2(value))); + case glsl: + case spirv: + let buf = __getEquivalentStructuredBuffer(this); + return __atomicMin(buf[byteAddress / 8], value); + } } [ForceInline] @@ -3052,21 +3151,19 @@ ${{{{ // And __cuda_sm_version(5.0) - __target_intrinsic(cuda, "atomicAnd($0._getPtrAt($1), $2)") [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda5_int64)] - uint64_t InterlockedAndU64(uint byteAddress, uint64_t value); - - __specialized_for_target(hlsl) - [ForceInline] - uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value))); } - - __specialized_for_target(glsl) - __specialized_for_target(spirv) - [ForceInline] uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) { - let buf = __getEquivalentStructuredBuffer(this); - return __atomicAnd(buf[byteAddress / 8], 
value); + __target_switch + { + case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt($1), $2)"; + case hlsl: + return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value))); + case glsl: + case spirv: + let buf = __getEquivalentStructuredBuffer(this); + return __atomicAnd(buf[byteAddress / 8], value); + } } [ForceInline] @@ -3092,21 +3189,19 @@ ${{{{ // Or __cuda_sm_version(5.0) - __target_intrinsic(cuda, "atomicOr($0._getPtrAt($1), $2)") [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda5_int64)] - uint64_t InterlockedOrU64(uint byteAddress, uint64_t value); - - __specialized_for_target(hlsl) - [ForceInline] - uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicOr(this, byteAddress, __asuint2(value))); } - - __specialized_for_target(glsl) - __specialized_for_target(spirv) - [ForceInline] uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) { - let buf = __getEquivalentStructuredBuffer(this); - return __atomicOr(buf[byteAddress / 8], value); + __target_switch + { + case cuda: __intrinsic_asm "atomicOr($0._getPtrAt($1), $2)"; + case hlsl: + return __asuint64(__atomicOr(this, byteAddress, __asuint2(value))); + case glsl: + case spirv: + let buf = __getEquivalentStructuredBuffer(this); + return __atomicOr(buf[byteAddress / 8], value); + } } [ForceInline] @@ -3132,21 +3227,19 @@ ${{{{ // Xor __cuda_sm_version(5.0) - __target_intrinsic(cuda, "atomicXor($0._getPtrAt($1), $2)") [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda5_int64)] - uint64_t InterlockedXorU64(uint byteAddress, uint64_t value); - - __specialized_for_target(hlsl) - [ForceInline] - uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicXor(this, byteAddress, __asuint2(value))); } - - __specialized_for_target(glsl) - __specialized_for_target(spirv) - [ForceInline] uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) { - let buf = __getEquivalentStructuredBuffer(this); - return __atomicXor(buf[byteAddress / 8], value); + __target_switch + { + case cuda: __intrinsic_asm "atomicXor($0._getPtrAt($1), $2)"; + case hlsl: + return __asuint64(__atomicXor(this, byteAddress, __asuint2(value))); + case glsl: + case spirv: + let buf = __getEquivalentStructuredBuffer(this); + return __atomicXor(buf[byteAddress / 8], value); + } } [ForceInline] @@ -3171,21 +3264,19 @@ ${{{{ // Exchange - __target_intrinsic(cuda, "atomicExch($0._getPtrAt($1), $2)") [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] - uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value); - - __specialized_for_target(hlsl) - [ForceInline] - uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicExchange(this, byteAddress, __asuint2(value))); } - - __specialized_for_target(glsl) - __specialized_for_target(spirv) - [ForceInline] uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value) { - let buf = __getEquivalentStructuredBuffer(this); - return __atomicExchange(buf[byteAddress / 8], value); + __target_switch + { + case cuda: __intrinsic_asm "atomicExch($0._getPtrAt($1), $2)"; + case hlsl: + return __asuint64(__atomicExchange(this, byteAddress, __asuint2(value))); + case glsl: + case spirv: + let buf = __getEquivalentStructuredBuffer(this); + return __atomicExchange(buf[byteAddress / 8], value); + } } [ForceInline] @@ -3619,40 +3710,60 @@ ${{{{ } } - __target_intrinsic(hlsl) [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] void Store( uint address, uint value) { - 
__byteAddressBufferStore(this, address, value); + __target_switch + { + case hlsl: __intrinsic_asm ".Store"; + default: + __byteAddressBufferStore(this, address, value); + } } - __target_intrinsic(hlsl) [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] void Store2(uint address, uint2 value) { - __byteAddressBufferStore(this, address, value); + __target_switch + { + case hlsl: __intrinsic_asm ".Store2"; + default: + __byteAddressBufferStore(this, address, value); + } } - __target_intrinsic(hlsl) [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] void Store3( uint address, uint3 value) { - __byteAddressBufferStore(this, address, value); + __target_switch + { + case hlsl: __intrinsic_asm ".Store3"; + default: + __byteAddressBufferStore(this, address, value); + } } - __target_intrinsic(hlsl) [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] void Store4( uint address, uint4 value) { - __byteAddressBufferStore(this, address, value); - } - + __target_switch + { + case hlsl: __intrinsic_asm ".Store4"; + default: + __byteAddressBufferStore(this, address, value); + } + } + void Store(int offset, T value) { __byteAddressBufferStore(this, offset, value); @@ -3685,14 +3796,19 @@ struct $(item.name) [__readNone] [__unsafeForceInlineEarly] - __target_intrinsic(hlsl) + [require(cpp_cuda_glsl_hlsl_metal_spirv, structuredbuffer_rw)] void GetDimensions( out uint numStructs, out uint stride) { - let rs = __structuredBufferGetDimensions(this); - numStructs = rs.x; - stride = rs.y; + __target_switch + { + case hlsl: __intrinsic_asm ".GetDimensions"; + default: + let rs = __structuredBufferGetDimensions(this); + numStructs = rs.x; + stride = rs.y; + } } uint IncrementCounter(); @@ -3894,33 +4010,50 @@ matrix abs(matrix x) // Inverse cosine (HLSL SM 1.0) __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_acos($0)") -__target_intrinsic(cpp, "$P_acos($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Acos _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T acos(T x); +T acos(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_acos($0)"; + case cuda: __intrinsic_asm "$P_acos($0)"; + case glsl: __intrinsic_asm "acos"; + case hlsl: __intrinsic_asm "acos"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Acos $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Acos _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector acos(vector x) { - VECTOR_MAP_UNARY(T, N, acos, x); + __target_switch + { + case glsl: __intrinsic_asm "acos"; + case hlsl: __intrinsic_asm "acos"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Acos $x + }; + default: + VECTOR_MAP_UNARY(T, N, acos, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix acos(matrix x) { - MATRIX_MAP_UNARY(T, N, M, acos, x); + __target_switch + { + case hlsl: __intrinsic_asm "acos"; + default: + MATRIX_MAP_UNARY(T, N, M, acos, x); + } } // Test if all components are non-zero (HLSL SM 1.0) @@ -3996,14 +4129,19 @@ bool all(vector x) } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv)] bool all(matrix x) { - bool result = true; - for(int i = 0; i < N; ++i) - result = result && all(x[i]); - return result; + __target_switch + { + case hlsl: __intrinsic_asm "all"; + 
default: + bool result = true; + for(int i = 0; i < N; ++i) + result = result && all(x[i]); + return result; + } } // Barrier for writes to all memory spaces (HLSL SM 5.0) @@ -4118,14 +4256,19 @@ bool any(vector x) } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv)] bool any(matrix x) { - bool result = false; - for(int i = 0; i < N; ++i) - result = result || any(x[i]); - return result; + __target_switch + { + case hlsl: __intrinsic_asm "any"; + default: + bool result = false; + for(int i = 0; i < N; ++i) + result = result || any(x[i]); + return result; + } } @@ -4143,62 +4286,96 @@ double asdouble(uint lowbits, uint highbits); // Reinterpret bits as a float (HLSL SM 4.0) -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "intBitsToFloat") -__target_intrinsic(cpp, "$P_asfloat($0)") -__target_intrinsic(cuda, "$P_asfloat($0)") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -float asfloat(int x); +float asfloat(int x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_asfloat($0)"; + case cuda: __intrinsic_asm "$P_asfloat($0)"; + case glsl: __intrinsic_asm "intBitsToFloat"; + case hlsl: __intrinsic_asm "asfloat"; + case spirv: return spirv_asm { + OpBitcast $$float result $x + }; + } +} -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "uintBitsToFloat") -__target_intrinsic(cpp, "$P_asfloat($0)") -__target_intrinsic(cuda, "$P_asfloat($0)") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -float asfloat(uint x); +float asfloat(uint x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_asfloat($0)"; + case cuda: __intrinsic_asm "$P_asfloat($0)"; + case glsl: __intrinsic_asm "uintBitsToFloat"; + case hlsl: __intrinsic_asm "asfloat"; + case spirv: return spirv_asm { + OpBitcast $$float result $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "intBitsToFloat") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] vector asfloat(vector< int, N> x) { - VECTOR_MAP_UNARY(float, N, asfloat, x); + __target_switch + { + case glsl: __intrinsic_asm "intBitsToFloat"; + case hlsl: __intrinsic_asm "asfloat"; + case spirv: return spirv_asm { + OpBitcast $$vector result $x + }; + default: + VECTOR_MAP_UNARY(float, N, asfloat, x); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "uintBitsToFloat") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] vector asfloat(vector x) { - VECTOR_MAP_UNARY(float, N, asfloat, x); + __target_switch + { + case glsl: __intrinsic_asm "uintBitsToFloat"; + case hlsl: __intrinsic_asm "asfloat"; + case spirv: return spirv_asm { + OpBitcast $$vector result $x + }; + default: + VECTOR_MAP_UNARY(float, N, asfloat, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] matrix asfloat(matrix< int,N,M> x) { - MATRIX_MAP_UNARY(float, N, M, asfloat, x); + __target_switch + { + case hlsl: __intrinsic_asm "asfloat"; + default: + MATRIX_MAP_UNARY(float, N, M, asfloat, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] matrix asfloat(matrix x) { - MATRIX_MAP_UNARY(float, N, M, asfloat, x); + __target_switch + { + case hlsl: __intrinsic_asm "asfloat"; + default: + 
MATRIX_MAP_UNARY(float, N, M, asfloat, x); + } } // No op @@ -4224,93 +4401,144 @@ matrix asfloat(matrix x) // Inverse sine (HLSL SM 1.0) __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_asin($0)") -__target_intrinsic(cpp, "$P_asin($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Asin _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T asin(T x); +T asin(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_asin($0)"; + case cuda: __intrinsic_asm "$P_asin($0)"; + case glsl: __intrinsic_asm "asin"; + case hlsl: __intrinsic_asm "asin"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Asin $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Asin _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector asin(vector x) { - VECTOR_MAP_UNARY(T,N,asin,x); + __target_switch + { + case glsl: __intrinsic_asm "asin"; + case hlsl: __intrinsic_asm "asin"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Asin $x + }; + default: + VECTOR_MAP_UNARY(T,N,asin,x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix asin(matrix x) { - MATRIX_MAP_UNARY(T,N,M,asin,x); + __target_switch + { + case hlsl: __intrinsic_asm "asin"; + default: + MATRIX_MAP_UNARY(T,N,M,asin,x); + } } // Reinterpret bits as an int (HLSL SM 4.0) -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "floatBitsToInt") -__target_intrinsic(cpp, "$P_asint($0)") -__target_intrinsic(cuda, "$P_asint($0)") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -int asint(float x); +int asint(float x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_asint($0)"; + case cuda: __intrinsic_asm "$P_asint($0)"; + case glsl: __intrinsic_asm "floatBitsToInt"; + case hlsl: __intrinsic_asm "asint"; + case spirv: return spirv_asm { + OpBitcast $$int result $x + }; + } +} -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "int($0)") -__target_intrinsic(cpp, "$P_asint($0)") -__target_intrinsic(cuda, "$P_asint($0)") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -int asint(uint x); +int asint(uint x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_asint($0)"; + case cuda: __intrinsic_asm "$P_asint($0)"; + case glsl: __intrinsic_asm "int($0)"; + case hlsl: __intrinsic_asm "asint"; + case spirv: return spirv_asm { + OpBitcast $$int result $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "floatBitsToInt") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] vector asint(vector x) { - VECTOR_MAP_UNARY(int, N, asint, x); + __target_switch + { + case glsl: __intrinsic_asm "floatBitsToInt"; + case hlsl: __intrinsic_asm "asint"; + case spirv: return spirv_asm { + OpBitcast $$vector result $x + }; + default: + VECTOR_MAP_UNARY(int, N, asint, x); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "ivec$N0($0)") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] vector asint(vector x) { - VECTOR_MAP_UNARY(int, N, asint, x); + __target_switch + { + case glsl: __intrinsic_asm "ivec$N0($0)"; + case hlsl: 
__intrinsic_asm "asint"; + case spirv: return spirv_asm { + OpBitcast $$vector result $x + }; + default: + VECTOR_MAP_UNARY(int, N, asint, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] matrix asint(matrix x) { - MATRIX_MAP_UNARY(int, N, M, asint, x); + __target_switch + { + case hlsl: __intrinsic_asm "asint"; + default: + MATRIX_MAP_UNARY(int, N, M, asint, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] matrix asint(matrix x) { - MATRIX_MAP_UNARY(int, N, M, asint, x); + __target_switch + { + case hlsl: __intrinsic_asm "asint"; + default: + MATRIX_MAP_UNARY(int, N, M, asint, x); + } } // No op @@ -4361,62 +4589,96 @@ void asuint(double value, out uint lowbits, out uint highbits) // Reinterpret bits as a uint (HLSL SM 4.0) -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "floatBitsToUint") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") -__target_intrinsic(cpp, "$P_asuint($0)") -__target_intrinsic(cuda, "$P_asuint($0)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -uint asuint(float x); +uint asuint(float x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_asuint($0)"; + case cuda: __intrinsic_asm "$P_asuint($0)"; + case glsl: __intrinsic_asm "floatBitsToUint"; + case hlsl: __intrinsic_asm "asuint"; + case spirv: return spirv_asm { + OpBitcast $$uint result $x + }; + } +} -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "uint($0)") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") -__target_intrinsic(cpp, "$P_asuint($0)") -__target_intrinsic(cuda, "$P_asuint($0)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -uint asuint(int x); +uint asuint(int x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_asuint($0)"; + case cuda: __intrinsic_asm "$P_asuint($0)"; + case glsl: __intrinsic_asm "uint($0)"; + case hlsl: __intrinsic_asm "asuint"; + case spirv: return spirv_asm { + OpBitcast $$uint result $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "floatBitsToUint") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] vector asuint(vector x) { - VECTOR_MAP_UNARY(uint, N, asuint, x); + __target_switch + { + case glsl: __intrinsic_asm "floatBitsToUint"; + case hlsl: __intrinsic_asm "asuint"; + case spirv: return spirv_asm { + OpBitcast $$vector result $x + }; + default: + VECTOR_MAP_UNARY(uint, N, asuint, x); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "uvec$N0($0)") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] vector asuint(vector x) { - VECTOR_MAP_UNARY(uint, N, asuint, x); + __target_switch + { + case glsl: __intrinsic_asm "uvec$N0($0)"; + case hlsl: __intrinsic_asm "asuint"; + case spirv: return spirv_asm { + OpBitcast $$vector result $x + }; + default: + VECTOR_MAP_UNARY(uint, N, asuint, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] matrix asuint(matrix x) { - MATRIX_MAP_UNARY(uint, N, M, asuint, x); + __target_switch + { + case hlsl: __intrinsic_asm "asuint"; + default: + MATRIX_MAP_UNARY(uint, N, M, asuint, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] matrix asuint(matrix x) { - MATRIX_MAP_UNARY(uint, N, M, asuint, x); + __target_switch + { + case 
hlsl: __intrinsic_asm "asuint"; + default: + MATRIX_MAP_UNARY(uint, N, M, asuint, x); + } } [__unsafeForceInlineEarly] @@ -4469,13 +4731,20 @@ matrix asuint(matrix x) // Float->unsigned cases: -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "uint16_t(packHalf2x16(vec2($0, 0.0)))") -__target_intrinsic(cuda, "__half_as_ushort") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -uint16_t asuint16(float16_t value); +uint16_t asuint16(float16_t value) +{ + __target_switch + { + case cuda: __intrinsic_asm "__half_as_ushort"; + case glsl: __intrinsic_asm "uint16_t(packHalf2x16(vec2($0, 0.0)))"; + case hlsl: __intrinsic_asm "asuint16"; + case spirv: return spirv_asm { + OpBitcast $$uint16_t result $value + }; + } +} [__readNone] [require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] @@ -4489,13 +4758,20 @@ matrix asuint16(matrix va // Unsigned->float cases: -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "float16_t(unpackHalf2x16($0).x)") -__target_intrinsic(cuda, "__ushort_as_half") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -float16_t asfloat16(uint16_t value); +float16_t asfloat16(uint16_t value) +{ + __target_switch + { + case cuda: __intrinsic_asm "__ushort_as_half"; + case glsl: __intrinsic_asm "float16_t(unpackHalf2x16($0).x)"; + case hlsl: __intrinsic_asm "asfloat16"; + case spirv: return spirv_asm { + OpBitcast $$float16_t result $value + }; + } +} [__readNone] vector asfloat16(vector value) @@ -4507,135 +4783,227 @@ matrix asfloat16(matrix v // Float<->signed cases: -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "__half_as_short") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__unsafeForceInlineEarly] [__readNone] [require(cuda_hlsl_spirv, shader5_sm_5_0)] -int16_t asint16(float16_t value) { return asuint16(value); } +int16_t asint16(float16_t value) +{ + __target_switch + { + case cuda: __intrinsic_asm "__half_as_short"; + case hlsl: __intrinsic_asm "asint16"; + case spirv: return spirv_asm { + OpBitcast $$int16_t result $value + }; + default: return asuint16(value); + } +} -__target_intrinsic(hlsl) [__unsafeForceInlineEarly] [__readNone] [require(cuda_hlsl_spirv, shader5_sm_5_0)] -vector asint16(vector value) { return asuint16(value); } +vector asint16(vector value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "asint16"; + default: return asuint16(value); + } +} -__target_intrinsic(hlsl) [__unsafeForceInlineEarly] [__readNone] [require(cuda_hlsl_spirv, shader5_sm_5_0)] -matrix asint16(matrix value) { return asuint16(value); } +matrix asint16(matrix value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "asint16"; + default: return asuint16(value); + } +} -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "__short_as_half") -__target_intrinsic(spirv, "OpBitcast resultType resultId _0") [__readNone] [__unsafeForceInlineEarly] [require(cuda_hlsl_spirv, shader5_sm_5_0)] -float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); } +float16_t asfloat16(int16_t value) +{ + __target_switch + { + case cuda: __intrinsic_asm "__short_as_half"; + case hlsl: __intrinsic_asm "asfloat16"; + case spirv: return spirv_asm { + OpBitcast $$float16_t result $value + }; + default: return asfloat16(asuint16(value)); + } +} -__target_intrinsic(hlsl) [__unsafeForceInlineEarly] [__readNone] -vector asfloat16(vector value) { return asfloat16(asuint16(value)); } +[require(cuda_hlsl_spirv, shader5_sm_5_0)] 
+vector asfloat16(vector value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "asfloat16"; + default: return asfloat16(asuint16(value)); + } +} -__target_intrinsic(hlsl) [__unsafeForceInlineEarly] [__readNone] [require(cuda_hlsl_spirv, shader5_sm_5_0)] -matrix asfloat16(matrix value) { return asfloat16(asuint16(value)); } - -// Inverse tangent (HLSL SM 1.0) +matrix asfloat16(matrix value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "asfloat16"; + default: return asfloat16(asuint16(value)); + } +} + +// Inverse tangent (HLSL SM 1.0) __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_atan($0)") -__target_intrinsic(cpp, "$P_atan($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Atan _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T atan(T x); +T atan(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_atan($0)"; + case cuda: __intrinsic_asm "$P_atan($0)"; + case glsl: __intrinsic_asm "atan"; + case hlsl: __intrinsic_asm "atan"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Atan $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Atan _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector atan(vector x) { - VECTOR_MAP_UNARY(T, N, atan, x); + __target_switch + { + case glsl: __intrinsic_asm "atan"; + case hlsl: __intrinsic_asm "atan"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Atan $x + }; + default: + VECTOR_MAP_UNARY(T, N, atan, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix atan(matrix x) { - MATRIX_MAP_UNARY(T, N, M, atan, x); + __target_switch + { + case hlsl: __intrinsic_asm "atan"; + default: + MATRIX_MAP_UNARY(T, N, M, atan, x); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"atan($0,$1)") -__target_intrinsic(cuda, "$P_atan2($0, $1)") -__target_intrinsic(cpp, "$P_atan2($0, $1)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Atan2 _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T atan2(T y, T x); +T atan2(T y, T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_atan2($0, $1)"; + case cuda: __intrinsic_asm "$P_atan2($0, $1)"; + case glsl: __intrinsic_asm "atan($0,$1)"; + case hlsl: __intrinsic_asm "atan2"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Atan2 $y $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"atan($0,$1)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Atan2 _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector atan2(vector y, vector x) { - VECTOR_MAP_BINARY(T, N, atan2, y, x); + __target_switch + { + case glsl: __intrinsic_asm "atan($0,$1)"; + case hlsl: __intrinsic_asm "atan2"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Atan2 $y $x + }; + default: + VECTOR_MAP_BINARY(T, N, atan2, y, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix atan2(matrix y, matrix x) { - MATRIX_MAP_BINARY(T, N, M, atan2, y, x); + __target_switch + { + case hlsl: __intrinsic_asm "atan2"; + default: + MATRIX_MAP_BINARY(T, N, M, atan2, y, x); + } } // Ceiling (HLSL SM 1.0) __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_ceil($0)") -__target_intrinsic(cpp, 
"$P_ceil($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Ceil _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T ceil(T x); +T ceil(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_ceil($0)"; + case cuda: __intrinsic_asm "$P_ceil($0)"; + case glsl: __intrinsic_asm "ceil"; + case hlsl: __intrinsic_asm "ceil"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Ceil $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Ceil _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector ceil(vector x) { - VECTOR_MAP_UNARY(T, N, ceil, x); + __target_switch + { + case glsl: __intrinsic_asm "ceil"; + case hlsl: __intrinsic_asm "ceil"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Ceil $x + }; + default: + VECTOR_MAP_UNARY(T, N, ceil, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix ceil(matrix x) { - MATRIX_MAP_UNARY(T, N, M, ceil, x); + __target_switch + { + case hlsl: __intrinsic_asm "ceil"; + default: + MATRIX_MAP_UNARY(T, N, M, ceil, x); + } } @@ -4666,12 +5034,16 @@ vector clamp(vector x, vector minBound, vector maxBound) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix clamp(matrix x, matrix minBound, matrix maxBound) { - return min(max(x, minBound), maxBound); + __target_switch + { + case hlsl: __intrinsic_asm "clamp"; + default: + return min(max(x, minBound), maxBound); + } } __generic @@ -4697,94 +5069,149 @@ vector clamp(vector x, vector minBound, vector maxBound) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix clamp(matrix x, matrix minBound, matrix maxBound) { - return min(max(x, minBound), maxBound); + __target_switch + { + case hlsl: __intrinsic_asm "clamp"; + default: + return min(max(x, minBound), maxBound); + } } // Clip (discard) fragment conditionally __generic -__target_intrinsic(hlsl) [require(cpp_cuda_glsl_hlsl_spirv, fragment)] void clip(T x) { - if(x < T(0)) discard; + __target_switch + { + case hlsl: __intrinsic_asm "clip"; + default: + if(x < T(0)) discard; + } } __generic -__target_intrinsic(hlsl) [require(cpp_cuda_glsl_hlsl_spirv, fragment)] void clip(vector x) { - if(any(x < T(0))) discard; + __target_switch + { + case hlsl: __intrinsic_asm "clip"; + default: + if(any(x < T(0))) discard; + } } __generic -__target_intrinsic(hlsl) [require(cpp_cuda_glsl_hlsl_spirv, fragment)] void clip(matrix x) { - if(any(x < T(0))) discard; + __target_switch + { + case hlsl: __intrinsic_asm "clip"; + default: + if(any(x < T(0))) discard; + } } // Cosine __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_cos($0)") -__target_intrinsic(cpp, "$P_cos($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cos _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T cos(T x); +T cos(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_cos($0)"; + case cuda: __intrinsic_asm "$P_cos($0)"; + case glsl: __intrinsic_asm "cos"; + case hlsl: __intrinsic_asm "cos"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Cos $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cos _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, 
sm_2_0_GLSL_140)] vector cos(vector x) { - VECTOR_MAP_UNARY(T,N, cos, x); + __target_switch + { + case glsl: __intrinsic_asm "cos"; + case hlsl: __intrinsic_asm "cos"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Cos $x + }; + default: + VECTOR_MAP_UNARY(T,N, cos, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix cos(matrix x) { - MATRIX_MAP_UNARY(T, N, M, cos, x); + __target_switch + { + case hlsl: __intrinsic_asm "cos"; + default: + MATRIX_MAP_UNARY(T, N, M, cos, x); + } } // Hyperbolic cosine __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_cosh($0)") -__target_intrinsic(cpp, "$P_cosh($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cosh _0") [__readNone] -T cosh(T x); +[require(cpp_cuda_glsl_hlsl_spirv)] +T cosh(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_cosh($0)"; + case cuda: __intrinsic_asm "$P_cosh($0)"; + case glsl: __intrinsic_asm "cosh"; + case hlsl: __intrinsic_asm "cosh"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Cosh $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cosh _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv)] vector cosh(vector x) { - VECTOR_MAP_UNARY(T,N, cosh, x); + __target_switch + { + case glsl: __intrinsic_asm "cosh"; + case hlsl: __intrinsic_asm "cosh"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Cosh $x + }; + default: + VECTOR_MAP_UNARY(T,N, cosh, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv)] matrix cosh(matrix x) { - MATRIX_MAP_UNARY(T, N, M, cosh, x); + __target_switch + { + case hlsl: __intrinsic_asm "cosh"; + default: + MATRIX_MAP_UNARY(T, N, M, cosh, x); + } } // Population count @@ -4809,41 +5236,57 @@ uint countbits(uint value) // Cross product // TODO: SPIRV does not support integer vectors. 
__generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cross _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector cross(vector left, vector right) { - return vector( - left.y * right.z - left.z * right.y, - left.z * right.x - left.x * right.z, - left.x * right.y - left.y * right.x); + __target_switch + { + case glsl: __intrinsic_asm "cross"; + case hlsl: __intrinsic_asm "cross"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Cross $left $right + }; + default: + return vector( + left.y * right.z - left.z * right.y, + left.z * right.x - left.x * right.z, + left.x * right.y - left.y * right.x); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cross _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector cross(vector left, vector right) { - return vector( - left.y * right.z - left.z * right.y, - left.z * right.x - left.x * right.z, - left.x * right.y - left.y * right.x); + __target_switch + { + case glsl: __intrinsic_asm "cross"; + case hlsl: __intrinsic_asm "cross"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Cross $left $right + }; + default: + return vector( + left.y * right.z - left.z * right.y, + left.z * right.x - left.x * right.z, + left.x * right.y - left.y * right.x); + } } // Convert encoded color -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] int4 D3DCOLORtoUBYTE4(float4 color) { - let scaled = color.zyxw * 255.001999f; - return int4(scaled); + __target_switch + { + case hlsl: __intrinsic_asm "D3DCOLORtoUBYTE4"; + default: + let scaled = color.zyxw * 255.001999f; + return int4(scaled); + } } // Partial-difference derivatives @@ -4890,7 +5333,6 @@ vector dd$(xOrY)(vector x) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, fragmentprocessing)] matrix dd$(xOrY)(matrix x) @@ -5003,42 +5445,69 @@ ${{{{ // Radians to degrees __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Degrees _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv)] T degrees(T x) { - return x * (T(180) / T.getPi()); + __target_switch + { + case glsl: __intrinsic_asm "degrees"; + case hlsl: __intrinsic_asm "degrees"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Degrees $x + }; + default: + return x * (T(180) / T.getPi()); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Degrees _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv)] vector degrees(vector x) { - VECTOR_MAP_UNARY(T, N, degrees, x); + __target_switch + { + case glsl: __intrinsic_asm "degrees"; + case hlsl: __intrinsic_asm "degrees"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Degrees $x + }; + default: + VECTOR_MAP_UNARY(T, N, degrees, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv)] matrix degrees(matrix x) { - MATRIX_MAP_UNARY(T, N, M, degrees, x); + __target_switch + { + case hlsl: __intrinsic_asm "degrees"; + default: + MATRIX_MAP_UNARY(T, N, M, degrees, x); + } } // Matrix determinant __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Determinant _0") [__readNone] [PreferCheckpoint] -T 
determinant(matrix m); +[require(glsl_hlsl_spirv)] +T determinant(matrix m) +{ + __target_switch + { + case glsl: __intrinsic_asm "determinant"; + case hlsl: __intrinsic_asm "determinant"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Determinant $m + }; + } +} // Barrier for device memory __glsl_extension(GL_KHR_memory_scope_semantics) @@ -5076,14 +5545,20 @@ void DeviceMemoryBarrierWithGroupSync() // Vector distance __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Distance _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T distance(vector x, vector y) { - return length(x - y); + __target_switch + { + case glsl: __intrinsic_asm "distance"; + case hlsl: __intrinsic_asm "distance"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Distance $x $y + }; + default: + return length(x - y); + } } __generic @@ -5097,40 +5572,54 @@ T distance(T x, T y) // Vector dot product __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T dot(T x, T y) { - return x * y; + __target_switch + { + case glsl: __intrinsic_asm "dot"; + case hlsl: __intrinsic_asm "dot"; + default: + return x * y; + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpDot resultType resultId _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T dot(vector x, vector y) { - T result = T(0); - for(int i = 0; i < N; ++i) - result += x[i] * y[i]; - return result; + __target_switch + { + case glsl: __intrinsic_asm "dot"; + case hlsl: __intrinsic_asm "dot"; + case spirv: return spirv_asm { + OpDot $$T result $x $y + }; + default: + T result = T(0); + for(int i = 0; i < N; ++i) + result += x[i] * y[i]; + return result; + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T dot(vector x, vector y) { - T result = T(0); - for(int i = 0; i < N; ++i) - result += x[i] * y[i]; - return result; + __target_switch + { + case hlsl: __intrinsic_asm "dot"; + default: + T result = T(0); + for(int i = 0; i < N; ++i) + result += x[i] * y[i]; + return result; + } } @@ -5161,53 +5650,90 @@ RasterizerOrderedStructuredBuffer __getEquivalentStructuredBuffer(Rasteriz // TODO: SPIRV-direct does not support non-floating-point types. 
__generic -__target_intrinsic(glsl, interpolateAtCentroid) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0") [__readNone] [require(glsl_spirv, fragmentprocessing)] -T EvaluateAttributeAtCentroid(T x); +T EvaluateAttributeAtCentroid(T x) +{ + __target_switch + { + case glsl: __intrinsic_asm "interpolateAtCentroid"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 InterpolateAtCentroid $x + }; + } +} __generic -__target_intrinsic(glsl, interpolateAtCentroid) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0") [__readNone] [require(glsl_spirv, fragmentprocessing)] -vector EvaluateAttributeAtCentroid(vector x); +vector EvaluateAttributeAtCentroid(vector x) +{ + __target_switch + { + case glsl: __intrinsic_asm "interpolateAtCentroid"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 InterpolateAtCentroid $x + }; + } +} __generic -__target_intrinsic(glsl, interpolateAtCentroid) [__readNone] +[require(glsl_spirv, fragmentprocessing)] matrix EvaluateAttributeAtCentroid(matrix x) { - MATRIX_MAP_UNARY(T, N, M, EvaluateAttributeAtCentroid, x); + __target_switch + { + case glsl: __intrinsic_asm "interpolateAtCentroid"; + default: + MATRIX_MAP_UNARY(T, N, M, EvaluateAttributeAtCentroid, x); + } } __generic -__target_intrinsic(glsl, "interpolateAtSample($0, int($1))") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1") [__readNone] [require(glsl_spirv, fragmentprocessing)] -T EvaluateAttributeAtSample(T x, uint sampleindex); +T EvaluateAttributeAtSample(T x, uint sampleindex) +{ + __target_switch + { + case glsl: __intrinsic_asm "interpolateAtSample($0, int($1))"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 InterpolateAtSample $x $sampleindex + }; + } +} __generic -__target_intrinsic(glsl, "interpolateAtSample($0, int($1))") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1") [__readNone] [require(glsl_spirv, fragmentprocessing)] -vector EvaluateAttributeAtSample(vector x, uint sampleindex); +vector EvaluateAttributeAtSample(vector x, uint sampleindex) +{ + __target_switch + { + case glsl: __intrinsic_asm "interpolateAtSample($0, int($1))"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 InterpolateAtSample $x $sampleindex + }; + } +} __generic -__target_intrinsic(glsl, "interpolateAtSample($0, int($1))") [__readNone] [require(glsl_spirv, fragmentprocessing)] matrix EvaluateAttributeAtSample(matrix x, uint sampleindex) { - matrix result; - for(int i = 0; i < N; ++i) + __target_switch { - result[i] = EvaluateAttributeAtSample(x[i], sampleindex); + case glsl: __intrinsic_asm "interpolateAtSample($0, int($1))"; + default: + matrix result; + for(int i = 0; i < N; ++i) + { + result[i] = EvaluateAttributeAtSample(x[i], sampleindex); + } + return result; } - return result; } __generic @@ -5225,46 +5751,70 @@ __target_intrinsic(spirv, "%foffset = OpConvertSToF _type(float2) resultId _1; % vector EvaluateAttributeSnapped(vector x, int2 offset); __generic -__target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)") [__readNone] +[require(glsl_spirv, fragmentprocessing)] matrix EvaluateAttributeSnapped(matrix x, int2 offset) { - matrix result; - for(int i = 0; i < N; ++i) + __target_switch { - result[i] = EvaluateAttributeSnapped(x[i], offset); + case glsl: __intrinsic_asm "interpolateAtOffset($0, vec2($1) / 16.0f)"; + default: + matrix result; + for(int i = 0; i < 
N; ++i) + { + result[i] = EvaluateAttributeSnapped(x[i], offset); + } + return result; } - return result; } // Base-e exponent __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_exp($0)") -__target_intrinsic(cpp, "$P_exp($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T exp(T x); +T exp(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_exp($0)"; + case cuda: __intrinsic_asm "$P_exp($0)"; + case glsl: __intrinsic_asm "exp"; + case hlsl: __intrinsic_asm "exp"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Exp $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector exp(vector x) { - VECTOR_MAP_UNARY(T, N, exp, x); + __target_switch + { + case glsl: __intrinsic_asm "exp"; + case hlsl: __intrinsic_asm "exp"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Exp $x + }; + default: + VECTOR_MAP_UNARY(T, N, exp, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix exp(matrix x) { - MATRIX_MAP_UNARY(T, N, M, exp, x); + __target_switch + { + case hlsl: __intrinsic_asm "exp"; + default: + MATRIX_MAP_UNARY(T, N, M, exp, x); + } } // Base-2 exponent @@ -5303,20 +5853,32 @@ T exp2(T x) } __generic -__target_intrinsic(hlsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp2 _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector exp2(vector x) { - VECTOR_MAP_UNARY(T, N, exp2, x); + __target_switch + { + case hlsl: __intrinsic_asm "exp2"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Exp2 $x + }; + default: + VECTOR_MAP_UNARY(T, N, exp2, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix exp2(matrix x) { - MATRIX_MAP_UNARY(T, N, M, exp2, x); + __target_switch + { + case hlsl: __intrinsic_asm "exp2"; + default: + MATRIX_MAP_UNARY(T, N, M, exp2, x); + } } @@ -5345,11 +5907,16 @@ float f16tof32(uint value) } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector f16tof32(vector value) { - VECTOR_MAP_UNARY(float, N, f16tof32, value); + __target_switch + { + case hlsl: __intrinsic_asm "f16tof32"; + default: + VECTOR_MAP_UNARY(float, N, f16tof32, value); + } } @@ -5379,11 +5946,16 @@ uint f32tof16(float value) } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector f32tof16(vector value) { - VECTOR_MAP_UNARY(uint, N, f32tof16, value); + __target_switch + { + case hlsl: __intrinsic_asm "f32tof16"; + default: + VECTOR_MAP_UNARY(uint, N, f32tof16, value); + } } // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
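For readers skimming these hunks: every conversion in this patch follows the same shape, replacing the per-target `__target_intrinsic(...)` string attributes with a `[require(...)]` capability attribute and a `__target_switch` body containing one `__intrinsic_asm` or `spirv_asm` case per target. The sketch below is illustrative only and is not part of the patch; `myIntrinsic` and the glsl450 `Foo` opcode are hypothetical placeholders standing in for whichever builtin is being converted.

// Illustrative sketch only (not part of the patch): the general conversion pattern
// used throughout this diff. 'myIntrinsic' and the glsl450 'Foo' opcode are
// hypothetical placeholders.
__generic<T : __BuiltinFloatingPointType>
[__readNone]
[require(cpp_cuda_glsl_hlsl_spirv)]
T myIntrinsic(T x)
{
    __target_switch
    {
    case cpp:  __intrinsic_asm "$P_myIntrinsic($0)";
    case cuda: __intrinsic_asm "$P_myIntrinsic($0)";
    case glsl: __intrinsic_asm "myIntrinsic";
    case hlsl: __intrinsic_asm "myIntrinsic";
    case spirv: return spirv_asm {
            OpExtInst $$T result glsl450 Foo $x
        };
    }
}

The vector and matrix overloads in the hunks additionally keep the pre-existing portable implementation (the VECTOR_MAP_UNARY / MATRIX_MAP_UNARY expansions or the hand-written loops) under a `default:` case, so targets without a dedicated intrinsic still get a fallback body.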
@@ -5412,244 +5984,386 @@ float f16tof32(float16_t value) } __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "__half2float") -__target_intrinsic(spirv, "OpFConvert resultType resultId _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector f16tof32(vector value) { - VECTOR_MAP_UNARY(float, N, f16tof32, value); + __target_switch + { + case cuda: __intrinsic_asm "__half2float"; + case hlsl: __intrinsic_asm "f16tof32"; + case spirv: return spirv_asm { + OpFConvert $$vector result $value + }; + default: + VECTOR_MAP_UNARY(float, N, f16tof32, value); + } } // Convert to float16_t -__target_intrinsic(glsl, "packHalf2x16(vec2($0,0.0))") __glsl_version(420) -__target_intrinsic(cuda, "__float2half") -__target_intrinsic(spirv, "OpFConvert resultType resultId _0") [__readNone] [require(cuda_glsl_spirv, shader5_sm_5_0)] -float16_t f32tof16_(float value); +float16_t f32tof16_(float value) +{ + __target_switch + { + case cuda: __intrinsic_asm "__float2half"; + case glsl: __intrinsic_asm "packHalf2x16(vec2($0,0.0))"; + case spirv: return spirv_asm { + OpFConvert $$float16_t result $value + }; + } +} __generic -__target_intrinsic(cuda, "__float2half") -__target_intrinsic(spirv, "OpFConvert resultType resultId _0") [__readNone] +[require(cuda_glsl_spirv, shader5_sm_5_0)] vector f32tof16_(vector value) { - VECTOR_MAP_UNARY(float16_t, N, f32tof16_, value); + __target_switch + { + case cuda: __intrinsic_asm "__float2half"; + case spirv: return spirv_asm { + OpFConvert $$vector result $value + }; + default: + VECTOR_MAP_UNARY(float16_t, N, f32tof16_, value); + } } // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // Flip surface normal to face forward, if needed __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FaceForward _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_400)] vector faceforward(vector n, vector i, vector ng) { - return dot(ng, i) < T(0.0f) ? n : -n; + __target_switch + { + case glsl: __intrinsic_asm "faceforward"; + case hlsl: __intrinsic_asm "faceforward"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FaceForward $n $i $ng + }; + default: + return dot(ng, i) < T(0.0f) ? 
n : -n; + } } // Find first set bit starting at high bit and working down -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"findMSB") -__target_intrinsic(cuda, "$P_firstbithigh($0)") -__target_intrinsic(cpp, "$P_firstbithigh($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindSMsb _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -int firstbithigh(int value); +int firstbithigh(int value) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_firstbithigh($0)"; + case cuda: __intrinsic_asm "$P_firstbithigh($0)"; + case glsl: __intrinsic_asm "findMSB"; + case hlsl: __intrinsic_asm "firstbithigh"; + case spirv: return spirv_asm { + OpExtInst $$int result glsl450 FindSMsb $value + }; + } +} -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"findMSB") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindSMsb _0") __generic [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector firstbithigh(vector value) { - VECTOR_MAP_UNARY(int, N, firstbithigh, value); + __target_switch + { + case glsl: __intrinsic_asm "findMSB"; + case hlsl: __intrinsic_asm "firstbithigh"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FindSMsb $value + }; + default: + VECTOR_MAP_UNARY(int, N, firstbithigh, value); + } } -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"findMSB") -__target_intrinsic(cuda, "$P_firstbithigh($0)") -__target_intrinsic(cpp, "$P_firstbithigh($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindUMsb _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -uint firstbithigh(uint value); +uint firstbithigh(uint value) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_firstbithigh($0)"; + case cuda: __intrinsic_asm "$P_firstbithigh($0)"; + case glsl: __intrinsic_asm "findMSB"; + case hlsl: __intrinsic_asm "firstbithigh"; + case spirv: return spirv_asm { + OpExtInst $$uint result glsl450 FindUMsb $value + }; + } +} -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"findMSB") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindUMsb _0") __generic [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector firstbithigh(vector value) { - VECTOR_MAP_UNARY(uint, N, firstbithigh, value); + __target_switch + { + case glsl: __intrinsic_asm "findMSB"; + case hlsl: __intrinsic_asm "firstbithigh"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FindUMsb $value + }; + default: + VECTOR_MAP_UNARY(uint, N, firstbithigh, value); + } } // Find first set bit starting at low bit and working up -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"findLSB") -__target_intrinsic(cuda, "$P_firstbitlow($0)") -__target_intrinsic(cpp, "$P_firstbitlow($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindILsb _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -int firstbitlow(int value); +int firstbitlow(int value) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_firstbitlow($0)"; + case cuda: __intrinsic_asm "$P_firstbitlow($0)"; + case glsl: __intrinsic_asm "findLSB"; + case hlsl: __intrinsic_asm "firstbitlow"; + case spirv: return spirv_asm { + OpExtInst $$int result glsl450 FindILsb $value + }; + } +} -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"findLSB") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindILsb _0") __generic [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector firstbitlow(vector value) { - VECTOR_MAP_UNARY(int, N, 
firstbitlow, value); + __target_switch + { + case glsl: __intrinsic_asm "findLSB"; + case hlsl: __intrinsic_asm "firstbitlow"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FindILsb $value + }; + default: + VECTOR_MAP_UNARY(int, N, firstbitlow, value); + } } -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"findLSB") -__target_intrinsic(cuda, "$P_firstbitlow($0)") -__target_intrinsic(cpp, "$P_firstbitlow($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindILsb _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -uint firstbitlow(uint value); +uint firstbitlow(uint value) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_firstbitlow($0)"; + case cuda: __intrinsic_asm "$P_firstbitlow($0)"; + case glsl: __intrinsic_asm "findLSB"; + case hlsl: __intrinsic_asm "firstbitlow"; + case spirv: return spirv_asm { + OpExtInst $$uint result glsl450 FindILsb $value + }; + } +} -__target_intrinsic(hlsl) -__target_intrinsic(glsl,"findLSB") __generic -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindILsb _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector firstbitlow(vector value) { - VECTOR_MAP_UNARY(uint, N, firstbitlow, value); + __target_switch + { + case glsl: __intrinsic_asm "findLSB"; + case hlsl: __intrinsic_asm "firstbitlow"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FindILsb $value + }; + default: + VECTOR_MAP_UNARY(uint, N, firstbitlow, value); + } } // Floor (HLSL SM 1.0) __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_floor($0)") -__target_intrinsic(cpp, "$P_floor($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Floor _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T floor(T x); +T floor(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_floor($0)"; + case cuda: __intrinsic_asm "$P_floor($0)"; + case glsl: __intrinsic_asm "floor"; + case hlsl: __intrinsic_asm "floor"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Floor $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Floor _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector floor(vector x) { - VECTOR_MAP_UNARY(T, N, floor, x); + __target_switch + { + case glsl: __intrinsic_asm "floor"; + case hlsl: __intrinsic_asm "floor"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Floor $x + }; + default: + VECTOR_MAP_UNARY(T, N, floor, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix floor(matrix x) { - MATRIX_MAP_UNARY(T, N, M, floor, x); + __target_switch + { + case hlsl: __intrinsic_asm "floor"; + default: + MATRIX_MAP_UNARY(T, N, M, floor, x); + } } // Fused multiply-add __generic -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_fma($0, $1, $2)") -__target_intrinsic(cpp, "$P_fma($0, $1, $2)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] T fma(T a, T b, T c) { __target_switch { + case cpp: __intrinsic_asm "$P_fma($0, $1, $2)"; + case cuda: __intrinsic_asm "$P_fma($0, $1, $2)"; + case glsl: __intrinsic_asm "fma"; case hlsl: if (__isFloat() || __isHalf()) return mad(a, b, c); else __intrinsic_asm "fma($0, $1, $2)"; + case spirv: return spirv_asm { + OpExtInst $$T result 
glsl450 Fma $a $b $c + }; default: return a*b + c; } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector fma(vector a, vector b, vector c) { - VECTOR_MAP_TRINARY(T, N, fma, a, b, c); + __target_switch + { + case glsl: __intrinsic_asm "fma"; + case hlsl: __intrinsic_asm "fma"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Fma $a $b $c + }; + default: + VECTOR_MAP_TRINARY(T, N, fma, a, b, c); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] matrix fma(matrix a, matrix b, matrix c) { - MATRIX_MAP_TRINARY(T, N, M, fma, a, b, c); + __target_switch + { + case hlsl: __intrinsic_asm "fma"; + default: + MATRIX_MAP_TRINARY(T, N, M, fma, a, b, c); + } } // Floating point remainder of x/y __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "$P_fmod($0, $1)") -__target_intrinsic(cpp, "$P_fmod($0, $1)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T fmod(T x, T y) { - return x - y * trunc(x/y); + __target_switch + { + case cpp: __intrinsic_asm "$P_fmod($0, $1)"; + case cuda: __intrinsic_asm "$P_fmod($0, $1)"; + case hlsl: __intrinsic_asm "fmod"; + default: + return x - y * trunc(x/y); + } } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector fmod(vector x, vector y) { - VECTOR_MAP_BINARY(T, N, fmod, x, y); + __target_switch + { + case hlsl: __intrinsic_asm "fmod"; + default: + VECTOR_MAP_BINARY(T, N, fmod, x, y); + } } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix fmod(matrix x, matrix y) { - MATRIX_MAP_BINARY(T, N, M, fmod, x, y); + __target_switch + { + case hlsl: __intrinsic_asm "fmod"; + default: + MATRIX_MAP_BINARY(T, N, M, fmod, x, y); + } } // Fractional part __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, fract) -__target_intrinsic(cuda, "$P_frac($0)") -__target_intrinsic(cpp, "$P_frac($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fract _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T frac(T x); +T frac(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_frac($0)"; + case cuda: __intrinsic_asm "$P_frac($0)"; + case glsl: __intrinsic_asm "fract"; + case hlsl: __intrinsic_asm "frac"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Fract $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, fract) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fract _0") [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector frac(vector x) { - VECTOR_MAP_UNARY(T, N, frac, x); + __target_switch + { + case glsl: __intrinsic_asm "fract"; + case hlsl: __intrinsic_asm "frac"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Fract $x + }; + default: + VECTOR_MAP_UNARY(T, N, frac, x); + } } __generic @@ -5681,11 +6395,16 @@ vector frexp(vector x, out vector exp) } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix frexp(matrix x, out matrix exp) { - MATRIX_MAP_BINARY(T, N, M, frexp, x, exp); + __target_switch + { + case hlsl: __intrinsic_asm "frexp"; + default: + MATRIX_MAP_BINARY(T, N, M, frexp, x, exp); + } } // Texture filter width @@ -5730,7 +6449,6 @@ vector fwidth(vector x) } __generic 
-__target_intrinsic(hlsl) [__readNone] [require(glsl_hlsl_spirv, fragmentprocessing)] matrix fwidth(matrix x) @@ -6743,7 +7461,6 @@ void InterlockedXor(__ref uint64_t dest, uint64_t value, out uint64_t origina // Is floating-point value finite? __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] bool isfinite(T x) @@ -6773,12 +7490,16 @@ vector isfinite(vector x) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix isfinite(matrix x) { - MATRIX_MAP_UNARY(bool, N, M, isfinite, x); + __target_switch + { + case hlsl: __intrinsic_asm "isfinite"; + default: + MATRIX_MAP_UNARY(bool, N, M, isfinite, x); + } } // Is floating-point value infinite? @@ -6818,12 +7539,16 @@ vector isinf(vector x) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix isinf(matrix x) { - MATRIX_MAP_UNARY(bool, N, M, isinf, x); + __target_switch + { + case hlsl: __intrinsic_asm "isinf"; + default: + MATRIX_MAP_UNARY(bool, N, M, isinf, x); + } } // Is floating-point value not-a-number? @@ -6863,53 +7588,75 @@ vector isnan(vector x) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix isnan(matrix x) { - MATRIX_MAP_UNARY(bool, N, M, isnan, x); + __target_switch + { + case hlsl: __intrinsic_asm "isnan"; + default: + MATRIX_MAP_UNARY(bool, N, M, isnan, x); + } } // Construct float from mantissa and exponent __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T ldexp(T x, T exp) { - return x * exp2(exp); + __target_switch + { + case hlsl: __intrinsic_asm "ldexp"; + default: + return x * exp2(exp); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector ldexp(vector x, vector exp) { - return x * exp2(exp); + __target_switch + { + case hlsl: __intrinsic_asm "ldexp"; + default: + return x * exp2(exp); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix ldexp(matrix x, matrix exp) { - MATRIX_MAP_BINARY(T, N, M, ldexp, x, exp); + __target_switch + { + case hlsl: __intrinsic_asm "ldexp"; + default: + MATRIX_MAP_BINARY(T, N, M, ldexp, x, exp); + } } // Vector length __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Length _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T length(vector x) { - return sqrt(dot(x, x)); + __target_switch + { + case glsl: __intrinsic_asm "length"; + case hlsl: __intrinsic_asm "length"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Length $x + }; + default: + return sqrt(dot(x, x)); + } } // Scalar float length @@ -6922,77 +7669,114 @@ T length(T x) // Linear interpolation __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, mix) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FMix _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T lerp(T x, T y, T s) { - return x * (T(1.0f) - s) + y * s; + __target_switch + { + case glsl: __intrinsic_asm "mix"; + case hlsl: __intrinsic_asm "lerp"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 FMix $x $y $s + }; + default: + return x * (T(1.0f) - s) + y * s; + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, mix) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 
FMix _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector lerp(vector x, vector y, vector s) { - return x * (T(1.0f) - s) + y * s; + __target_switch + { + case glsl: __intrinsic_asm "mix"; + case hlsl: __intrinsic_asm "lerp"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FMix $x $y $s + }; + default: + return x * (T(1.0f) - s) + y * s; + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix lerp(matrix x, matrix y, matrix s) { - MATRIX_MAP_TRINARY(T, N, M, lerp, x, y, s); + __target_switch + { + case hlsl: __intrinsic_asm "lerp"; + default: + MATRIX_MAP_TRINARY(T, N, M, lerp, x, y, s); + } } // Legacy lighting function (obsolete) -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] float4 lit(float n_dot_l, float n_dot_h, float m) { - let ambient = 1.0f; - let diffuse = max(n_dot_l, 0.0f); - let specular = step(0.0f, n_dot_l) * max(pow(n_dot_h, m), 0.0f); - return float4(ambient, diffuse, specular, 1.0f); + __target_switch + { + case hlsl: __intrinsic_asm "lit"; + default: + let ambient = 1.0f; + let diffuse = max(n_dot_l, 0.0f); + let specular = step(0.0f, n_dot_l) * max(pow(n_dot_h, m), 0.0f); + return float4(ambient, diffuse, specular, 1.0f); + } } // Base-e logarithm __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_log($0)") -__target_intrinsic(cpp, "$P_log($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Log _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T log(T x); +T log(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_log($0)"; + case cuda: __intrinsic_asm "$P_log($0)"; + case glsl: __intrinsic_asm "log"; + case hlsl: __intrinsic_asm "log"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Log $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Log _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector log(vector x) { - VECTOR_MAP_UNARY(T, N, log, x); + __target_switch + { + case glsl: __intrinsic_asm "log"; + case hlsl: __intrinsic_asm "log"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Log $x + }; + default: + VECTOR_MAP_UNARY(T, N, log, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix log(matrix x) { - MATRIX_MAP_UNARY(T, N, M, log, x); + __target_switch + { + case hlsl: __intrinsic_asm "log"; + default: + MATRIX_MAP_UNARY(T, N, M, log, x); + } } // Base-10 logarithm @@ -7018,105 +7802,160 @@ vector log10(vector x) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix log10(matrix x) { - MATRIX_MAP_UNARY(T, N, M, log10, x); + __target_switch + { + case hlsl: __intrinsic_asm "log10"; + default: + MATRIX_MAP_UNARY(T, N, M, log10, x); + } } // Base-2 logarithm __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_log2($0)") -__target_intrinsic(cpp, "$P_log2($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Log2 _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T log2(T x); +T log2(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_log2($0)"; + case cuda: __intrinsic_asm "$P_log2($0)"; + case glsl: __intrinsic_asm "log2"; + case hlsl: __intrinsic_asm "log2"; + case 
spirv: return spirv_asm { + OpExtInst $$T result glsl450 Log2 $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Log2 _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector log2(vector x) { - VECTOR_MAP_UNARY(T, N, log2, x); + __target_switch + { + case glsl: __intrinsic_asm "log2"; + case hlsl: __intrinsic_asm "log2"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Log2 $x + }; + default: + VECTOR_MAP_UNARY(T, N, log2, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix log2(matrix x) { - MATRIX_MAP_UNARY(T, N, M, log2, x); + __target_switch + { + case hlsl: __intrinsic_asm "log2"; + default: + MATRIX_MAP_UNARY(T, N, M, log2, x); + } } // multiply-add __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, fma) -__target_intrinsic(cuda, "$P_fma($0, $1, $2)") -__target_intrinsic(cpp, "$P_fma($0, $1, $2)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -T mad(T mvalue, T avalue, T bvalue); +T mad(T mvalue, T avalue, T bvalue) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_fma($0, $1, $2)"; + case cuda: __intrinsic_asm "$P_fma($0, $1, $2)"; + case glsl: __intrinsic_asm "fma"; + case hlsl: __intrinsic_asm "mad"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Fma $mvalue $avalue $bvalue + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, fma) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector mad(vector mvalue, vector avalue, vector bvalue) { - VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue); + __target_switch + { + case glsl: __intrinsic_asm "fma"; + case hlsl: __intrinsic_asm "mad"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Fma $mvalue $avalue $bvalue + }; + default: + VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] matrix mad(matrix mvalue, matrix avalue, matrix bvalue) { - MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue); + __target_switch + { + case hlsl: __intrinsic_asm "mad"; + default: + MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, fma) -__target_intrinsic(cuda, "$P_fma($0, $1, $2)") -__target_intrinsic(cpp, "$P_fma($0, $1, $2)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -T mad(T mvalue, T avalue, T bvalue); +T mad(T mvalue, T avalue, T bvalue) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_fma($0, $1, $2)"; + case cuda: __intrinsic_asm "$P_fma($0, $1, $2)"; + case glsl: __intrinsic_asm "fma"; + case hlsl: __intrinsic_asm "mad"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Fma $mvalue $avalue $bvalue + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, fma) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] vector mad(vector mvalue, vector avalue, vector bvalue) { - VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue); + __target_switch + { + case glsl: 
__intrinsic_asm "fma"; + case hlsl: __intrinsic_asm "mad"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Fma $mvalue $avalue $bvalue + }; + default: + VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] matrix mad(matrix mvalue, matrix avalue, matrix bvalue) { - MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue); + __target_switch + { + case hlsl: __intrinsic_asm "mad"; + default: + MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue); + } } @@ -7147,12 +7986,16 @@ vector max(vector x, vector y) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix max(matrix x, matrix y) { - MATRIX_MAP_BINARY(T, N, M, max, x, y); + __target_switch + { + case hlsl: __intrinsic_asm "max"; + default: + MATRIX_MAP_BINARY(T, N, M, max, x, y); + } } __generic @@ -7177,12 +8020,16 @@ vector max(vector x, vector y) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix max(matrix x, matrix y) { - MATRIX_MAP_BINARY(T, N, M, max, x, y); + __target_switch + { + case hlsl: __intrinsic_asm "max"; + default: + MATRIX_MAP_BINARY(T, N, M, max, x, y); + } } // minimum @@ -7192,28 +8039,28 @@ __target_intrinsic(glsl) __target_intrinsic(cuda, "$P_min($0, $1)") __target_intrinsic(cpp, "$P_min($0, $1)") __target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1") -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T min(T x, T y); __generic __target_intrinsic(hlsl) __target_intrinsic(glsl) __target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1") -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector min(vector x, vector y) { VECTOR_MAP_BINARY(T, N, min, x, y); } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix min(matrix x, matrix y) { - MATRIX_MAP_BINARY(T, N, M, min, x, y); + __target_switch + { + case hlsl: __intrinsic_asm "min"; + default: + MATRIX_MAP_BINARY(T, N, M, min, x, y); + } } __generic @@ -7238,12 +8085,16 @@ vector min(vector x, vector y) } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix min(matrix x, matrix y) { - MATRIX_MAP_BINARY(T, N, M, min, x, y); + __target_switch + { + case hlsl: __intrinsic_asm "min"; + default: + MATRIX_MAP_BINARY(T, N, M, min, x, y); + } } // split into integer and fractional parts (both with same sign) @@ -7266,32 +8117,40 @@ vector modf(vector x, out vector ip) } __generic -__target_intrinsic(hlsl) [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix modf(matrix x, out matrix ip) { - MATRIX_MAP_BINARY(T, N, M, modf, x, ip); + __target_switch + { + case hlsl: __intrinsic_asm "modf"; + default: + MATRIX_MAP_BINARY(T, N, M, modf, x, ip); + } } // msad4 (whatever that is) -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] uint4 msad4(uint reference, uint2 source, uint4 accum) { - int4 bytesRef = (reference >> uint4(24, 16, 8, 0)) & 0xFF; - int4 bytesX = (source.x >> uint4(24, 16, 8, 0)) & 0xFF; - int4 bytesY = (source.y >> uint4(24, 16, 8, 0)) & 0xFF; - - uint4 mask = select(bytesRef == 0, 0, 0xFFFFFFFFu); - - uint4 result = accum; - result += mask.x & abs(bytesRef - int4(bytesX.x, bytesY.y, bytesY.z, 
bytesY.w)); - result += mask.y & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesY.z, bytesY.w)); - result += mask.z & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z, bytesY.w)); - result += mask.w & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z, bytesX.w)); - return result; + __target_switch + { + case hlsl: __intrinsic_asm "msad4"; + default: + int4 bytesRef = (reference >> uint4(24, 16, 8, 0)) & 0xFF; + int4 bytesX = (source.x >> uint4(24, 16, 8, 0)) & 0xFF; + int4 bytesY = (source.y >> uint4(24, 16, 8, 0)) & 0xFF; + + uint4 mask = select(bytesRef == 0, 0, 0xFFFFFFFFu); + + uint4 result = accum; + result += mask.x & abs(bytesRef - int4(bytesX.x, bytesY.y, bytesY.z, bytesY.w)); + result += mask.y & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesY.z, bytesY.w)); + result += mask.z & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z, bytesY.w)); + result += mask.w & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z, bytesX.w)); + return result; + } } // General inner products @@ -7331,204 +8190,254 @@ matrix mul(T x, matrix y); // vector-vector (dot product) __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "dot") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T mul(vector x, vector y) { - return dot(x, y); + __target_switch + { + case glsl: __intrinsic_asm "dot"; + case hlsl: __intrinsic_asm "mul"; + default: + return dot(x, y); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T mul(vector x, vector y) { - return dot(x, y); + __target_switch + { + case hlsl: __intrinsic_asm "mul"; + default: + return dot(x, y); + } } // vector-matrix __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "($1 * $0)") -__target_intrinsic(spirv, "OpMatrixTimesVector resultType resultId _1 _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector mul(vector left, matrix right) { - vector result; - for( int j = 0; j < M; ++j ) + __target_switch { - T sum = T(0); - for( int i = 0; i < N; ++i ) + case glsl: __intrinsic_asm "($1 * $0)"; + case hlsl: __intrinsic_asm "mul"; + case spirv: return spirv_asm { + OpMatrixTimesVector $$vector result $right $left + }; + default: + vector result; + for( int j = 0; j < M; ++j ) { - sum += left[i] * right[i][j]; + T sum = T(0); + for( int i = 0; i < N; ++i ) + { + sum += left[i] * right[i][j]; + } + result[j] = sum; } - result[j] = sum; + return result; } - return result; } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "($1 * $0)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector mul(vector left, matrix right) { - vector result; - for( int j = 0; j < M; ++j ) + __target_switch { - T sum = T(0); - for( int i = 0; i < N; ++i ) + case glsl: __intrinsic_asm "($1 * $0)"; + case hlsl: __intrinsic_asm "mul"; + default: + vector result; + for( int j = 0; j < M; ++j ) { - sum += left[i] * right[i][j]; + T sum = T(0); + for( int i = 0; i < N; ++i ) + { + sum += left[i] * right[i][j]; + } + result[j] = sum; } - result[j] = sum; + return result; } - return result; } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "($1 * $0)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector mul(vector left, matrix right) { - vector result; - for( int j = 0; j < M; ++j ) + __target_switch { - T sum = T(0); - for( int i = 0; i < N; ++i ) + case glsl: __intrinsic_asm "($1 * $0)"; + case hlsl: __intrinsic_asm "mul"; + default: + vector result; + for( int j = 0; j < M; ++j ) { - sum |= left[i] & right[i][j]; + T sum = 
T(0); + for( int i = 0; i < N; ++i ) + { + sum |= left[i] & right[i][j]; + } + result[j] = sum; } - result[j] = sum; + return result; } - return result; } // matrix-vector __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "($1 * $0)") -__target_intrinsic(spirv, "OpVectorTimesMatrix resultType resultId _1 _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector mul(matrix left, vector right) { - vector result; - for( int i = 0; i < N; ++i ) + __target_switch { - T sum = T(0); - for( int j = 0; j < M; ++j ) + case glsl: __intrinsic_asm "($1 * $0)"; + case hlsl: __intrinsic_asm "mul"; + case spirv: return spirv_asm { + OpVectorTimesMatrix $$vector result $right $left + }; + default: + vector result; + for( int i = 0; i < N; ++i ) { - sum += left[i][j] * right[j]; + T sum = T(0); + for( int j = 0; j < M; ++j ) + { + sum += left[i][j] * right[j]; + } + result[i] = sum; } - result[i] = sum; + return result; } - return result; } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "($1 * $0)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector mul(matrix left, vector right) { - vector result; - for( int i = 0; i < N; ++i ) + __target_switch { - T sum = T(0); - for( int j = 0; j < M; ++j ) + case glsl: __intrinsic_asm "($1 * $0)"; + case hlsl: __intrinsic_asm "mul"; + default: + vector result; + for( int i = 0; i < N; ++i ) { - sum += left[i][j] * right[j]; + T sum = T(0); + for( int j = 0; j < M; ++j ) + { + sum += left[i][j] * right[j]; + } + result[i] = sum; } - result[i] = sum; + return result; } - return result; } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "($1 * $0)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector mul(matrix left, vector right) { - vector result; - for( int i = 0; i < N; ++i ) + __target_switch { - T sum = T(0); - for( int j = 0; j < M; ++j ) + case glsl: __intrinsic_asm "($1 * $0)"; + case hlsl: __intrinsic_asm "mul"; + default: + vector result; + for( int i = 0; i < N; ++i ) { - sum |= left[i][j] & right[j]; + T sum = T(0); + for( int j = 0; j < M; ++j ) + { + sum |= left[i][j] & right[j]; + } + result[i] = sum; } - result[i] = sum; + return result; } - return result; } // matrix-matrix __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "($1 * $0)") -__target_intrinsic(spirv, "OpMatrixTimesMatrix resultType resultId _1 _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix mul(matrix left, matrix right) { - matrix result; - for( int r = 0; r < R; ++r) - for( int c = 0; c < C; ++c) + __target_switch { - T sum = T(0); - for( int i = 0; i < N; ++i ) + case glsl: __intrinsic_asm "($1 * $0)"; + case hlsl: __intrinsic_asm "mul"; + case spirv: return spirv_asm { + OpMatrixTimesMatrix $$matrix result $right $left + }; + default: + matrix result; + for( int r = 0; r < R; ++r) + for( int c = 0; c < C; ++c) { - sum += left[r][i] * right[i][c]; + T sum = T(0); + for( int i = 0; i < N; ++i ) + { + sum += left[r][i] * right[i][c]; + } + result[r][c] = sum; } - result[r][c] = sum; + return result; } - return result; } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "($1 * $0)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix mul(matrix left, matrix right) { - matrix result; - for( int r = 0; r < R; ++r) - for( int c = 0; c < C; ++c) + __target_switch { - T sum = T(0); - for( int i = 0; i < N; ++i ) + case glsl: __intrinsic_asm "($1 * $0)"; + case hlsl: __intrinsic_asm "mul"; + default: + matrix result; + for( 
int r = 0; r < R; ++r) + for( int c = 0; c < C; ++c) { - sum += left[r][i] * right[i][c]; + T sum = T(0); + for( int i = 0; i < N; ++i ) + { + sum += left[r][i] * right[i][c]; + } + result[r][c] = sum; } - result[r][c] = sum; + return result; } - return result; } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "($1 * $0)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix mul(matrix left, matrix right) { - matrix result; - for( int r = 0; r < R; ++r) - for( int c = 0; c < C; ++c) + __target_switch { - T sum = T(0); - for( int i = 0; i < N; ++i ) + case glsl: __intrinsic_asm "($1 * $0)"; + case hlsl: __intrinsic_asm "mul"; + default: + matrix result; + for( int r = 0; r < R; ++r) + for( int c = 0; c < C; ++c) { - sum |= left[r][i] & right[i][c]; + T sum = T(0); + for( int i = 0; i < N; ++i ) + { + sum |= left[r][i] & right[i][c]; + } + result[r][c] = sum; } - result[r][c] = sum; + return result; } - return result; } // noise (deprecated) @@ -7593,56 +8502,85 @@ T NonUniformResourceIndex(T value) { return value; } // Normalize a vector __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Normalize _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector normalize(vector x) { - return x / length(x); + __target_switch + { + case glsl: __intrinsic_asm "normalize"; + case hlsl: __intrinsic_asm "normalize"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Normalize $x + }; + default: + return x / length(x); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Normalize _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T normalize(T x) { - return x / length(x); + __target_switch + { + case glsl: __intrinsic_asm "normalize"; + case hlsl: __intrinsic_asm "normalize"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Normalize $x + }; + default: + return x / length(x); + } } // Raise to a power __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_pow($0, $1)") -__target_intrinsic(cpp, "$P_pow($0, $1)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Pow _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T pow(T x, T y); +T pow(T x, T y) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_pow($0, $1)"; + case cuda: __intrinsic_asm "$P_pow($0, $1)"; + case glsl: __intrinsic_asm "pow"; + case hlsl: __intrinsic_asm "pow"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Pow $x $y + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Pow _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector pow(vector x, vector y) { - VECTOR_MAP_BINARY(T, N, pow, x, y); + __target_switch + { + case glsl: __intrinsic_asm "pow"; + case hlsl: __intrinsic_asm "pow"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Pow $x $y + }; + default: + VECTOR_MAP_BINARY(T, N, pow, x, y); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix pow(matrix x, matrix y) { - MATRIX_MAP_BINARY(T, N, M, pow, x, y); + __target_switch + { + case hlsl: __intrinsic_asm "pow"; + default: + MATRIX_MAP_BINARY(T, N, M, pow, x, y); + } } // Output message @@ -7779,114 +8717,166 @@ void 
ProcessTriTessFactorsMin( // Degrees to radians __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Radians _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T radians(T x) { - return x * (T.getPi() / T(180.0f)); + __target_switch + { + case glsl: __intrinsic_asm "radians"; + case hlsl: __intrinsic_asm "radians"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Radians $x + }; + default: + return x * (T.getPi() / T(180.0f)); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Radians _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector radians(vector x) { - return x * (T.getPi() / T(180.0f)); + __target_switch + { + case glsl: __intrinsic_asm "radians"; + case hlsl: __intrinsic_asm "radians"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Radians $x + }; + default: + return x * (T.getPi() / T(180.0f)); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix radians(matrix x) { - return x * (T.getPi() / T(180.0f)); + __target_switch + { + case hlsl: __intrinsic_asm "radians"; + default: + return x * (T.getPi() / T(180.0f)); + } } // Approximate reciprocal __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T rcp(T x) { - return T(1.0) / x; + __target_switch + { + case hlsl: __intrinsic_asm "rcp"; + default: + return T(1.0) / x; + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector rcp(vector x) { - VECTOR_MAP_UNARY(T, N, rcp, x); + __target_switch + { + case hlsl: __intrinsic_asm "rcp"; + default: + VECTOR_MAP_UNARY(T, N, rcp, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix rcp(matrix x) { - MATRIX_MAP_UNARY(T, N, M, rcp, x); + __target_switch + { + case hlsl: __intrinsic_asm "rcp"; + default: + MATRIX_MAP_UNARY(T, N, M, rcp, x); + } } // Reflect incident vector across plane with given normal __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Reflect _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T reflect(T i, T n) { - return i - T(2) * dot(n,i) * n; + __target_switch + { + case glsl: __intrinsic_asm "reflect"; + case hlsl: __intrinsic_asm "reflect"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Reflect $i $n + }; + default: + return i - T(2) * dot(n,i) * n; + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Reflect _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector reflect(vector i, vector n) { - return i - T(2) * dot(n,i) * n; + __target_switch + { + case glsl: __intrinsic_asm "reflect"; + case hlsl: __intrinsic_asm "reflect"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Reflect $i $n + }; + default: + return i - T(2) * dot(n,i) * n; + } } // Refract incident vector given surface normal and index of refraction __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Refract _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector refract(vector i, vector n, T eta) 
{ - let dotNI = dot(n,i); - let k = T(1) - eta*eta*(T(1) - dotNI * dotNI); - if(k < T(0)) return vector(T(0)); - return eta * i - (eta * dotNI + sqrt(k)) * n; + __target_switch + { + case glsl: __intrinsic_asm "refract"; + case hlsl: __intrinsic_asm "refract"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Refract $i $n $eta + }; + default: + let dotNI = dot(n,i); + let k = T(1) - eta*eta*(T(1) - dotNI * dotNI); + if(k < T(0)) return vector(T(0)); + return eta * i - (eta * dotNI + sqrt(k)) * n; + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Refract _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T refract(T i, T n, T eta) { - let dotNI = dot(n,i); - let k = T(1) - eta*eta*(T(1) - dotNI * dotNI); - if(k < T(0)) return T(0); - return eta * i - (eta * dotNI + sqrt(k)) * n; + __target_switch + { + case glsl: __intrinsic_asm "refract"; + case hlsl: __intrinsic_asm "refract"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Refract $i $n $eta + }; + default: + let dotNI = dot(n,i); + let k = T(1) - eta*eta*(T(1) - dotNI * dotNI); + if(k < T(0)) return T(0); + return eta * i - (eta * dotNI + sqrt(k)) * n; + } } // Reverse order of bits @@ -7908,7 +8898,6 @@ uint reversebits(uint value) } } -__target_intrinsic(glsl, "bitfieldReverse") __generic [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] @@ -7927,98 +8916,143 @@ vector reversebits(vector value) // Round-to-nearest __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_round($0)") -__target_intrinsic(cpp, "$P_round($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Round _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T round(T x); +T round(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_round($0)"; + case cuda: __intrinsic_asm "$P_round($0)"; + case glsl: __intrinsic_asm "round"; + case hlsl: __intrinsic_asm "round"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Round $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Round _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector round(vector x) { - VECTOR_MAP_UNARY(T, N, round, x); + __target_switch + { + case glsl: __intrinsic_asm "round"; + case hlsl: __intrinsic_asm "round"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Round $x + }; + default: + VECTOR_MAP_UNARY(T, N, round, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix round(matrix x) { - MATRIX_MAP_UNARY(T, N, M, round, x); + __target_switch + { + case hlsl: __intrinsic_asm "round"; + default: + MATRIX_MAP_UNARY(T, N, M, round, x); + } } // Reciprocal of square root __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "inversesqrt($0)") -__target_intrinsic(cuda, "$P_rsqrt($0)") -__target_intrinsic(cpp, "$P_rsqrt($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InverseSqrt _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T rsqrt(T x) { - return T(1.0) / sqrt(x); + __target_switch + { + case cpp: __intrinsic_asm "$P_rsqrt($0)"; + case cuda: __intrinsic_asm "$P_rsqrt($0)"; + case glsl: __intrinsic_asm "inversesqrt($0)"; + case hlsl: __intrinsic_asm "rsqrt"; + case spirv: return spirv_asm { + OpExtInst 
$$T result glsl450 InverseSqrt $x + }; + default: + return T(1.0) / sqrt(x); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl, "inversesqrt($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InverseSqrt _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector rsqrt(vector x) { - VECTOR_MAP_UNARY(T, N, rsqrt, x); + __target_switch + { + case glsl: __intrinsic_asm "inversesqrt($0)"; + case hlsl: __intrinsic_asm "rsqrt"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 InverseSqrt $x + }; + default: + VECTOR_MAP_UNARY(T, N, rsqrt, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix rsqrt(matrix x) { - MATRIX_MAP_UNARY(T, N, M, rsqrt, x); + __target_switch + { + case hlsl: __intrinsic_asm "rsqrt"; + default: + MATRIX_MAP_UNARY(T, N, M, rsqrt, x); + } } // Clamp value to [0,1] range __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T saturate(T x) { - return clamp(x, T(0), T(1)); + __target_switch + { + case hlsl: __intrinsic_asm "saturate"; + default: + return clamp(x, T(0), T(1)); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector saturate(vector x) { - return clamp(x, - vector(T(0)), - vector(T(1))); + __target_switch + { + case hlsl: __intrinsic_asm "saturate"; + default: + return clamp(x, + vector(T(0)), + vector(T(1))); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix saturate(matrix x) { - MATRIX_MAP_UNARY(T, N, M, saturate, x); + __target_switch + { + case hlsl: __intrinsic_asm "saturate"; + default: + MATRIX_MAP_UNARY(T, N, M, saturate, x); + } } __generic @@ -8076,344 +9110,513 @@ vector sign(vector x) } __generic -__target_intrinsic(hlsl) [__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix sign(matrix x) { - MATRIX_MAP_UNARY(int, N, M, sign, x); + __target_switch + { + case hlsl: __intrinsic_asm "sign"; + default: + MATRIX_MAP_UNARY(int, N, M, sign, x); + } } // Sine __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_sin($0)") -__target_intrinsic(cpp, "$P_sin($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sin _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T sin(T x); +T sin(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_sin($0)"; + case cuda: __intrinsic_asm "$P_sin($0)"; + case glsl: __intrinsic_asm "sin"; + case hlsl: __intrinsic_asm "sin"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Sin $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sin _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector sin(vector x) { - VECTOR_MAP_UNARY(T, N, sin, x); + __target_switch + { + case glsl: __intrinsic_asm "sin"; + case hlsl: __intrinsic_asm "sin"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Sin $x + }; + default: + VECTOR_MAP_UNARY(T, N, sin, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix sin(matrix x) { - MATRIX_MAP_UNARY(T, N, M, sin, x); + __target_switch + { + case hlsl: __intrinsic_asm "sin"; + default: + MATRIX_MAP_UNARY(T, N, M, sin, x); + } } // Sine and cosine __generic -__target_intrinsic(hlsl) 
-__target_intrinsic(cuda, "$P_sincos($0, $1, $2)") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] void sincos(T x, out T s, out T c) { - s = sin(x); - c = cos(x); + __target_switch + { + case cuda: __intrinsic_asm "$P_sincos($0, $1, $2)"; + case hlsl: __intrinsic_asm "sincos"; + default: + s = sin(x); + c = cos(x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] void sincos(vector x, out vector s, out vector c) { - s = sin(x); - c = cos(x); + __target_switch + { + case hlsl: __intrinsic_asm "sincos"; + default: + s = sin(x); + c = cos(x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] void sincos(matrix x, out matrix s, out matrix c) { - s = sin(x); - c = cos(x); + __target_switch + { + case hlsl: __intrinsic_asm "sincos"; + default: + s = sin(x); + c = cos(x); + } } // Hyperbolic Sine __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_sinh($0)") -__target_intrinsic(cpp, "$P_sinh($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sinh _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T sinh(T x); +T sinh(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_sinh($0)"; + case cuda: __intrinsic_asm "$P_sinh($0)"; + case glsl: __intrinsic_asm "sinh"; + case hlsl: __intrinsic_asm "sinh"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Sinh $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sinh _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector sinh(vector x) { - VECTOR_MAP_UNARY(T, N, sinh, x); + __target_switch + { + case glsl: __intrinsic_asm "sinh"; + case hlsl: __intrinsic_asm "sinh"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Sinh $x + }; + default: + VECTOR_MAP_UNARY(T, N, sinh, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix sinh(matrix x) { - MATRIX_MAP_UNARY(T, N, M, sinh, x); + __target_switch + { + case hlsl: __intrinsic_asm "sinh"; + default: + MATRIX_MAP_UNARY(T, N, M, sinh, x); + } } // Smooth step (Hermite interpolation) __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T smoothstep(T min, T max, T x) { - let t = saturate((x - min) / (max - min)); - return t * t * (T(3.0f) - (t + t)); + __target_switch + { + case glsl: __intrinsic_asm "smoothstep"; + case hlsl: __intrinsic_asm "smoothstep"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 SmoothStep $min $max $x + }; + default: + let t = saturate((x - min) / (max - min)); + return t * t * (T(3.0f) - (t + t)); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector smoothstep(vector min, vector max, vector x) { - VECTOR_MAP_TRINARY(T, N, smoothstep, min, max, x); + __target_switch + { + case glsl: __intrinsic_asm "smoothstep"; + case hlsl: __intrinsic_asm "smoothstep"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 SmoothStep $min $max $x + }; + default: + VECTOR_MAP_TRINARY(T, N, smoothstep, min, max, 
x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix smoothstep(matrix min, matrix max, matrix x) { - MATRIX_MAP_TRINARY(T, N, M, smoothstep, min, max, x); + __target_switch + { + case hlsl: __intrinsic_asm "smoothstep"; + default: + MATRIX_MAP_TRINARY(T, N, M, smoothstep, min, max, x); + } } // Square root __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_sqrt($0)") -__target_intrinsic(cpp, "$P_sqrt($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sqrt _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T sqrt(T x); +T sqrt(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_sqrt($0)"; + case cuda: __intrinsic_asm "$P_sqrt($0)"; + case glsl: __intrinsic_asm "sqrt"; + case hlsl: __intrinsic_asm "sqrt"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Sqrt $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sqrt _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector sqrt(vector x) { - VECTOR_MAP_UNARY(T, N, sqrt, x); + __target_switch + { + case glsl: __intrinsic_asm "sqrt"; + case hlsl: __intrinsic_asm "sqrt"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Sqrt $x + }; + default: + VECTOR_MAP_UNARY(T, N, sqrt, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix sqrt(matrix x) { - MATRIX_MAP_UNARY(T, N, M, sqrt, x); + __target_switch + { + case hlsl: __intrinsic_asm "sqrt"; + default: + MATRIX_MAP_UNARY(T, N, M, sqrt, x); + } } // Step function __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Step _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] T step(T y, T x) { - return x < y ? T(0.0f) : T(1.0f); + __target_switch + { + case glsl: __intrinsic_asm "step"; + case hlsl: __intrinsic_asm "step"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Step $y $x + }; + default: + return x < y ? 
T(0.0f) : T(1.0f); + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Step _0 _1") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector step(vector y, vector x) { - VECTOR_MAP_BINARY(T, N, step, y, x); + __target_switch + { + case glsl: __intrinsic_asm "step"; + case hlsl: __intrinsic_asm "step"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Step $y $x + }; + default: + VECTOR_MAP_BINARY(T, N, step, y, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix step(matrix y, matrix x) { - MATRIX_MAP_BINARY(T, N, M, step, y, x); + __target_switch + { + case hlsl: __intrinsic_asm "step"; + default: + MATRIX_MAP_BINARY(T, N, M, step, y, x); + } } // Tangent __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_tan($0)") -__target_intrinsic(cpp, "$P_tan($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Tan _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T tan(T x); +T tan(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_tan($0)"; + case cuda: __intrinsic_asm "$P_tan($0)"; + case glsl: __intrinsic_asm "tan"; + case hlsl: __intrinsic_asm "tan"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Tan $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Tan _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector tan(vector x) { - VECTOR_MAP_UNARY(T, N, tan, x); + __target_switch + { + case glsl: __intrinsic_asm "tan"; + case hlsl: __intrinsic_asm "tan"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Tan $x + }; + default: + VECTOR_MAP_UNARY(T, N, tan, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix tan(matrix x) { - MATRIX_MAP_UNARY(T, N, M, tan, x); + __target_switch + { + case hlsl: __intrinsic_asm "tan"; + default: + MATRIX_MAP_UNARY(T, N, M, tan, x); + } } // Hyperbolic tangent __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_tanh($0)") -__target_intrinsic(cpp, "$P_tanh($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Tanh _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T tanh(T x); +T tanh(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_tanh($0)"; + case cuda: __intrinsic_asm "$P_tanh($0)"; + case glsl: __intrinsic_asm "tanh"; + case hlsl: __intrinsic_asm "tanh"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Tanh $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Tanh _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector tanh(vector x) { - VECTOR_MAP_UNARY(T, N, tanh, x); + __target_switch + { + case glsl: __intrinsic_asm "tanh"; + case hlsl: __intrinsic_asm "tanh"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Tanh $x + }; + default: + VECTOR_MAP_UNARY(T, N, tanh, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix tanh(matrix x) { - MATRIX_MAP_UNARY(T, N, M, tanh, x); + __target_switch + { + case hlsl: __intrinsic_asm "tanh"; + default: + MATRIX_MAP_UNARY(T, N, M, tanh, x); + } 
} // Matrix transpose __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpTranspose resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] [PreferRecompute] matrix transpose(matrix x) { - matrix result; - for(int r = 0; r < M; ++r) - for(int c = 0; c < N; ++c) - result[r][c] = x[c][r]; - return result; + __target_switch + { + case glsl: __intrinsic_asm "transpose"; + case hlsl: __intrinsic_asm "transpose"; + case spirv: return spirv_asm { + OpTranspose $$matrix result $x + }; + default: + matrix result; + for(int r = 0; r < M; ++r) + for(int c = 0; c < N; ++c) + result[r][c] = x[c][r]; + return result; + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpTranspose resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] [PreferRecompute] matrix transpose(matrix x) { - matrix result; - for (int r = 0; r < M; ++r) - for (int c = 0; c < N; ++c) - result[r][c] = x[c][r]; - return result; + __target_switch + { + case glsl: __intrinsic_asm "transpose"; + case hlsl: __intrinsic_asm "transpose"; + case spirv: return spirv_asm { + OpTranspose $$matrix result $x + }; + default: + matrix result; + for (int r = 0; r < M; ++r) + for (int c = 0; c < N; ++c) + result[r][c] = x[c][r]; + return result; + } } __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpTranspose resultType resultId _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] [PreferRecompute] [OverloadRank(-1)] matrix transpose(matrix x) { - matrix result; - for (int r = 0; r < M; ++r) - for (int c = 0; c < N; ++c) - result[r][c] = x[c][r]; - return result; + __target_switch + { + case glsl: __intrinsic_asm "transpose"; + case hlsl: __intrinsic_asm "transpose"; + case spirv: return spirv_asm { + OpTranspose $$matrix result $x + }; + default: + matrix result; + for (int r = 0; r < M; ++r) + for (int c = 0; c < N; ++c) + result[r][c] = x[c][r]; + return result; + } } // Truncate to integer __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(cuda, "$P_trunc($0)") -__target_intrinsic(cpp, "$P_trunc($0)") -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Trunc _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] -T trunc(T x); +T trunc(T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_trunc($0)"; + case cuda: __intrinsic_asm "$P_trunc($0)"; + case glsl: __intrinsic_asm "trunc"; + case hlsl: __intrinsic_asm "trunc"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Trunc $x + }; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(glsl) -__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Trunc _0") [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] vector trunc(vector x) { - VECTOR_MAP_UNARY(T, N, trunc, x); + __target_switch + { + case glsl: __intrinsic_asm "trunc"; + case hlsl: __intrinsic_asm "trunc"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Trunc $x + }; + default: + VECTOR_MAP_UNARY(T, N, trunc, x); + } } __generic -__target_intrinsic(hlsl) [__readNone] [require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] matrix trunc(matrix x) { - MATRIX_MAP_UNARY(T, N, M, trunc, x); + __target_switch + { + case hlsl: __intrinsic_asm "trunc"; + default: + MATRIX_MAP_UNARY(T, N, M, trunc, x); + } } // Slang Specific 'Mask' Wave Intrinsics @@ -8713,9 +9916,15 @@ vector 
WaveMaskBroadcastLaneAt(WaveMask mask, vector value, constexpr } } __generic -__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") -__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") -matrix WaveMaskBroadcastLaneAt(WaveMask mask, matrix value, constexpr int lane); +[require(cuda_hlsl, subgroup_ballot)] +matrix WaveMaskBroadcastLaneAt(WaveMask mask, matrix value, constexpr int lane) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveShuffleMultiple($0, $1, $2)"; + case hlsl: __intrinsic_asm "WaveReadLaneAt($1, $2)"; + } +} // TODO(JS): If it can be determines that the `laneId` is constExpr, then subgroupBroadcast // could be used on GLSL. For now we just use subgroupShuffle @@ -8758,9 +9967,15 @@ vector WaveMaskReadLaneAt(WaveMask mask, vector value, int lane) } } __generic -__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") -__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") -matrix WaveMaskReadLaneAt(WaveMask mask, matrix value, int lane); +[require(cuda_hlsl, subgroup_shuffle)] +matrix WaveMaskReadLaneAt(WaveMask mask, matrix value, int lane) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveShuffleMultiple($0, $1, $2)"; + case hlsl: __intrinsic_asm "WaveReadLaneAt($1, $2)"; + } +} // NOTE! WaveMaskShuffle is a NON STANDARD HLSL intrinsic! It will map to WaveReadLaneAt on HLSL // which means it will only work on hardware which allows arbitrary laneIds which is not true @@ -8844,9 +10059,15 @@ vector WaveMaskBitAnd(WaveMask mask, vector expr) } } __generic -__target_intrinsic(cuda, "_waveAndMultiple($0, $1)") -__target_intrinsic(hlsl, "WaveActiveBitAnd($1)") -matrix WaveMaskBitAnd(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskBitAnd(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveAndMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveActiveBitAnd($1)"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -8885,9 +10106,15 @@ vector WaveMaskBitOr(WaveMask mask, vector expr) } } __generic -__target_intrinsic(cuda, "_waveOrMultiple($0, $1)") -__target_intrinsic(hlsl, "WaveActiveBitOr($1)") -matrix WaveMaskBitOr(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskBitOr(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveOrMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveActiveBitOr($1)"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -8926,9 +10153,15 @@ vector WaveMaskBitXor(WaveMask mask, vector expr) } } __generic -__target_intrinsic(cuda, "_waveXorMultiple($0, $1)") -__target_intrinsic(hlsl, "WaveActiveBitXor($1)") -matrix WaveMaskBitXor(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskBitXor(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveXorMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveActiveBitXor($1)"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -8973,11 +10206,17 @@ vector WaveMaskMax(WaveMask mask, vector expr) } } -__generic -__target_intrinsic(cuda, "_waveMaxMultiple($0, $1)") -__target_intrinsic(hlsl, "WaveActiveMax($1)") -matrix WaveMaskMax(WaveMask mask, matrix expr); - +__generic +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskMax(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveMaxMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveActiveMax($1)"; + } +} + __generic 
__glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) @@ -9023,9 +10262,15 @@ vector WaveMaskMin(WaveMask mask, vector expr) } __generic -__target_intrinsic(cuda, "_waveMinMultiple($0, $1)") -__target_intrinsic(hlsl, "WaveActiveMin($1)") -matrix WaveMaskMin(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskMin(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveMinMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveActiveMin($1)"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -9080,9 +10325,15 @@ vector WaveMaskProduct(WaveMask mask, vector expr) } __generic -__target_intrinsic(cuda, "_waveProductMultiple($0, $1)") -__target_intrinsic(hlsl, "WaveActiveProduct($1)") -matrix WaveMaskProduct(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskProduct(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveProductMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveActiveProduct($1)"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -9139,9 +10390,15 @@ vector WaveMaskSum(WaveMask mask, vector expr) } } __generic -__target_intrinsic(cuda, "_waveSumMultiple($0, $1)") -__target_intrinsic(hlsl, "WaveActiveSum($1)") -matrix WaveMaskSum(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskSum(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveSumMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveActiveSum($1)"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_vote) @@ -9193,9 +10450,15 @@ bool WaveMaskAllEqual(WaveMask mask, vector value) } __generic __cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveAllEqualMultiple($0, $1)") -__target_intrinsic(hlsl, "WaveActiveAllEqual($1)") -bool WaveMaskAllEqual(WaveMask mask, matrix value); +[require(cuda_hlsl, subgroup_vote)] +bool WaveMaskAllEqual(WaveMask mask, matrix value) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveAllEqualMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveActiveAllEqual($1)"; + } +} // Prefix @@ -9254,9 +10517,15 @@ vector WaveMaskPrefixProduct(WaveMask mask, vector expr) } } __generic -__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, $1)") -__target_intrinsic(hlsl, "WavePrefixProduct($1)") -matrix WaveMaskPrefixProduct(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskPrefixProduct(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixProductMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WavePrefixProduct($1)"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -9314,9 +10583,15 @@ vector WaveMaskPrefixSum(WaveMask mask, vector expr) } } __generic -__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, $1)") -__target_intrinsic(hlsl, "WavePrefixSum($1)") -matrix WaveMaskPrefixSum(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskPrefixSum(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixSumMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WavePrefixSum($1)"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_ballot) @@ -9350,8 +10625,14 @@ vector WaveMaskReadLaneFirst(WaveMask mask, vector expr) } __generic -__target_intrinsic(cuda, "_waveReadFirstMultiple($0, $1)") -matrix WaveMaskReadLaneFirst(WaveMask mask, matrix expr); 
+[require(cuda, subgroup_ballot)] +matrix WaveMaskReadLaneFirst(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveReadFirstMultiple($0, $1)"; + } +} // WaveMask SM6.5 like intrinsics @@ -9401,13 +10682,19 @@ WaveMask WaveMaskMatch(WaveMask mask, vector value) } __generic -__target_intrinsic(hlsl, "WaveMatch($1).x") __glsl_extension(GL_NV_shader_subgroup_partitioned) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupPartitionNV($1).x") __cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveMatchMultiple($0, $1)") -WaveMask WaveMaskMatch(WaveMask mask, matrix value); +[require(cuda_glsl_hlsl, subgroup_partitioned)] +WaveMask WaveMaskMatch(WaveMask mask, matrix value) +{ + __target_switch + { + case cuda: __intrinsic_asm "_waveMatchMultiple($0, $1)"; + case glsl: __intrinsic_asm "subgroupPartitionNV($1).x"; + case hlsl: __intrinsic_asm "WaveMatch($1).x"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -9442,9 +10729,15 @@ vector WaveMaskPrefixBitAnd(WaveMask mask, vector expr) } __generic -__target_intrinsic(hlsl, "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))") -__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask($0, $1)") -matrix WaveMaskPrefixBitAnd(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskPrefixBitAnd(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixAndMultiple(_getMultiPrefixMask($0, $1)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -9479,9 +10772,15 @@ vector WaveMaskPrefixBitOr(WaveMask mask, vector expr) } __generic -__target_intrinsic(hlsl, "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))") -__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, $1)") -matrix WaveMaskPrefixBitOr(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskPrefixBitOr(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixOrMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))"; + } +} __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -9516,9 +10815,15 @@ vector WaveMaskPrefixBitXor(WaveMask mask, vector expr) } __generic -__target_intrinsic(hlsl, "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))") -__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, $1)") -matrix WaveMaskPrefixBitXor(WaveMask mask, matrix expr); +[require(cuda_hlsl, subgroup_arithmetic)] +matrix WaveMaskPrefixBitXor(WaveMask mask, matrix expr) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixXorMultiple($0, $1)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))"; + } +} // Shader model 6.0 stuff @@ -9736,10 +11041,15 @@ vector WaveActive$(opName.hlslName)(vector expr) } __generic -__target_intrinsic(hlsl) +[require(cuda_hlsl, subgroup_arithmetic)] matrix WaveActive$(opName.hlslName)(matrix expr) { - return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr); + __target_switch + { + case hlsl: __intrinsic_asm "WaveActive$(opName.hlslName)"; + default: + return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr); + } } ${{{{ } // WaveActiveBitAnd, WaveActiveBitOr, WaveActiveBitXor @@ -9796,10 +11106,15 @@ vector WaveActive$(opName)(vector expr) } __generic -__target_intrinsic(hlsl) +[require(cuda_hlsl, subgroup_arithmetic)] matrix WaveActive$(opName)(matrix expr) { - return 
WaveMask$(opName)(WaveGetActiveMask(), expr); + __target_switch + { + case hlsl: __intrinsic_asm "WaveActive$(opName)"; + default: + return WaveMask$(opName)(WaveGetActiveMask(), expr); + } } ${{{{ @@ -9848,7 +11163,6 @@ T WaveActive$(opName.hlslName)(T expr) __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(hlsl) [require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] vector WaveActive$(opName.hlslName)(vector expr) { @@ -9879,10 +11193,15 @@ vector WaveActive$(opName.hlslName)(vector expr) } __generic -__target_intrinsic(hlsl) +[require(cuda_hlsl, subgroup_arithmetic)] matrix WaveActive$(opName.hlslName)(matrix expr) { - return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr); + __target_switch + { + case hlsl: __intrinsic_asm "WaveActive$(opName.hlslName)"; + default: + return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr); + } } ${{{{ } // WaveActiveProduct/WaveActiveProductSum. @@ -9935,10 +11254,15 @@ bool WaveActiveAllEqual(vector value) } __generic -__target_intrinsic(hlsl) +[require(cuda_hlsl, subgroup_vote)] bool WaveActiveAllEqual(matrix value) { - return WaveMaskAllEqual(WaveGetActiveMask(), value); + __target_switch + { + case hlsl: __intrinsic_asm "WaveActiveAllEqual"; + default: + return WaveMaskAllEqual(WaveGetActiveMask(), value); + } } __glsl_extension(GL_KHR_shader_subgroup_vote) @@ -10008,10 +11332,15 @@ uint4 WaveActiveBallot(bool condition) } } -__target_intrinsic(hlsl) +[require(cuda_glsl_hlsl_spirv, subgroup_ballot)] uint WaveActiveCountBits(bool value) { - return WaveMaskCountBits(WaveGetActiveMask(), value); + __target_switch + { + case hlsl: __intrinsic_asm "WaveActiveCountBits"; + default: + return WaveMaskCountBits(WaveGetActiveMask(), value); + } } __glsl_extension(GL_KHR_shader_subgroup_basic) @@ -10171,11 +11500,15 @@ vector WavePrefixProduct(vector expr) } __generic -__target_intrinsic(hlsl) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[require(cuda_hlsl, subgroup_arithmetic)] matrix WavePrefixProduct(matrix expr) { - return WaveMaskPrefixProduct(WaveGetActiveMask(), expr); + __target_switch + { + case hlsl: __intrinsic_asm "WavePrefixProduct"; + default: + return WaveMaskPrefixProduct(WaveGetActiveMask(), expr); + } } __generic @@ -10237,10 +11570,15 @@ vector WavePrefixSum(vector expr) } __generic -__target_intrinsic(hlsl) +[require(cuda_hlsl, subgroup_arithmetic)] matrix WavePrefixSum(matrix expr) { - return WaveMaskPrefixSum(WaveGetActiveMask(), expr); + __target_switch + { + case hlsl: __intrinsic_asm "WavePrefixSum"; + default: + return WaveMaskPrefixSum(WaveGetActiveMask(), expr); + } } __generic @@ -10282,10 +11620,15 @@ vector WaveReadLaneFirst(vector expr) } __generic -__target_intrinsic(hlsl) +[require(cuda_hlsl, subgroup_ballot)] matrix WaveReadLaneFirst(matrix expr) { - return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr); + __target_switch + { + case hlsl: __intrinsic_asm "WaveReadLaneFirst"; + default: + return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr); + } } // NOTE! 
WaveBroadcastLaneAt is *NOT* standard HLSL @@ -10335,11 +11678,16 @@ vector WaveBroadcastLaneAt(vector value, constexpr int lane) } __generic -__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") -__target_intrinsic(hlsl, "WaveReadLaneAt") +[require(cuda_hlsl, subgroup_ballot)] matrix WaveBroadcastLaneAt(matrix value, constexpr int lane) { - return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane); + __target_switch + { + case cuda: __intrinsic_asm "_waveShuffleMultiple(_getActiveMask(), $0, $1)"; + case hlsl: __intrinsic_asm "WaveReadLaneAt"; + default: + return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane); + } } // TODO(JS): If it can be determines that the `laneId` is constExpr, then subgroupBroadcast @@ -10385,11 +11733,16 @@ vector WaveReadLaneAt(vector value, int lane) } __generic -__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") -__target_intrinsic(hlsl) +[require(cuda_hlsl, subgroup_shuffle)] matrix WaveReadLaneAt(matrix value, int lane) { - return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane); + __target_switch + { + case cuda: __intrinsic_asm "_waveShuffleMultiple(_getActiveMask(), $0, $1)"; + case hlsl: __intrinsic_asm "WaveReadLaneAt"; + default: + return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane); + } } // NOTE! WaveShuffle is a NON STANDARD HLSL intrinsic! It will map to WaveReadLaneAt on HLSL @@ -10436,10 +11789,15 @@ vector WaveShuffle(vector value, int lane) } __generic -__target_intrinsic(hlsl, "WaveReadLaneAt") +[require(cuda_hlsl, subgroup_shuffle)] matrix WaveShuffle(matrix value, int lane) { - return WaveMaskShuffle(WaveGetActiveMask(), value, lane); + __target_switch + { + case hlsl: __intrinsic_asm "WaveReadLaneAt"; + default: + return WaveMaskShuffle(WaveGetActiveMask(), value, lane); + } } __glsl_extension(GL_KHR_shader_subgroup_ballot) @@ -10495,138 +11853,233 @@ uint4 WaveGetActiveMulti() // https://github.com/microsoft/DirectX-Specs/blob/master/d3d/HLSL_ShaderModel6_5.md __generic -__target_intrinsic(hlsl) +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] uint4 WaveMatch(T value) { - return WaveMaskMatch(WaveGetActiveMask(), value); + __target_switch + { + case hlsl: __intrinsic_asm "WaveMatch"; + default: + return WaveMaskMatch(WaveGetActiveMask(), value); + } } __generic -__target_intrinsic(hlsl) +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] uint4 WaveMatch(vector value) { - return WaveMaskMatch(WaveGetActiveMask(), value); + __target_switch + { + case hlsl: __intrinsic_asm "WaveMatch"; + default: + return WaveMaskMatch(WaveGetActiveMask(), value); + } } __generic -__target_intrinsic(hlsl) +[require(cuda_glsl_hlsl, subgroup_partitioned)] uint4 WaveMatch(matrix value) { - return WaveMaskMatch(WaveGetActiveMask(), value); + __target_switch + { + case hlsl: __intrinsic_asm "WaveMatch"; + default: + return WaveMaskMatch(WaveGetActiveMask(), value); + } } -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_popc(__ballot_sync(($1).x, $0) & _getLaneLtMask())") [require(cuda_hlsl, waveprefix)] -uint WaveMultiPrefixCountBits(bool value, uint4 mask); +uint WaveMultiPrefixCountBits(bool value, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_popc(__ballot_sync(($1).x, $0) & _getLaneLtMask())"; + case hlsl: __intrinsic_asm "WaveMultiPrefixCountBits"; + } +} __generic -__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupExclusiveAnd($0)") -__target_intrinsic(cuda, 
"_wavePrefixAnd(_getMultiPrefixMask(($1).x), $0)") [require(cuda_glsl_hlsl, waveprefix)] -T WaveMultiPrefixBitAnd(T expr, uint4 mask); +T WaveMultiPrefixBitAnd(T expr, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixAnd(_getMultiPrefixMask(($1).x), $0)"; + case glsl: __intrinsic_asm "subgroupExclusiveAnd($0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd"; + } +} -__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupExclusiveAnd($0)") -__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)") __generic [require(cuda_glsl_hlsl, waveprefix)] -vector WaveMultiPrefixBitAnd(vector expr, uint4 mask); +vector WaveMultiPrefixBitAnd(vector expr, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)"; + case glsl: __intrinsic_asm "subgroupExclusiveAnd($0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd"; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)") [require(cuda_hlsl, waveprefix)] -matrix WaveMultiPrefixBitAnd(matrix expr, uint4 mask); +matrix WaveMultiPrefixBitAnd(matrix expr, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd"; + } +} __generic -__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupExclusiveOr($0)") -__target_intrinsic(cuda, "_wavePrefixOr(, _getMultiPrefixMask(($1).x), $0)") [require(cuda_glsl_hlsl, waveprefix)] -T WaveMultiPrefixBitOr(T expr, uint4 mask); +T WaveMultiPrefixBitOr(T expr, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixOr(, _getMultiPrefixMask(($1).x), $0)"; + case glsl: __intrinsic_asm "subgroupExclusiveOr($0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr"; + } +} __generic -__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupExclusiveOr($0)") -__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)") [require(cuda_glsl_hlsl, waveprefix)] -vector WaveMultiPrefixBitOr(vector expr, uint4 mask); +vector WaveMultiPrefixBitOr(vector expr, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)"; + case glsl: __intrinsic_asm "subgroupExclusiveOr($0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr"; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)") [require(cuda_hlsl, waveprefix)] -matrix WaveMultiPrefixBitOr(matrix expr, uint4 mask); +matrix WaveMultiPrefixBitOr(matrix expr, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr"; + } +} __generic -__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupExclusiveXor($0)") -__target_intrinsic(cuda, "_wavePrefixXor(_getMultiPrefixMask(($1).x), $0)") [require(cuda_glsl_hlsl, waveprefix)] -T WaveMultiPrefixBitXor(T expr, uint4 mask); +T WaveMultiPrefixBitXor(T expr, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm 
"_wavePrefixXor(_getMultiPrefixMask(($1).x), $0)"; + case glsl: __intrinsic_asm "subgroupExclusiveXor($0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor"; + } +} __generic -__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupExclusiveXor($0)") -__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)") [require(cuda_glsl_hlsl, waveprefix)] -vector WaveMultiPrefixBitXor(vector expr, uint4 mask); +vector WaveMultiPrefixBitXor(vector expr, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)"; + case glsl: __intrinsic_asm "subgroupExclusiveXor($0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor"; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)") [require(cuda_hlsl, waveprefix)] -matrix WaveMultiPrefixBitXor(matrix expr, uint4 mask); +matrix WaveMultiPrefixBitXor(matrix expr, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor"; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixProduct(_getMultiPrefixMask(($1).x), $0)") [require(cuda_hlsl, waveprefix)] -T WaveMultiPrefixProduct(T value, uint4 mask); +T WaveMultiPrefixProduct(T value, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixProduct(_getMultiPrefixMask(($1).x), $0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixProduct"; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)") [require(cuda_hlsl, waveprefix)] -vector WaveMultiPrefixProduct(vector value, uint4 mask); +vector WaveMultiPrefixProduct(vector value, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixProduct"; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)") [require(cuda_hlsl, waveprefix)] -matrix WaveMultiPrefixProduct(matrix value, uint4 mask); +matrix WaveMultiPrefixProduct(matrix value, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixProduct"; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixSum(_getMultiPrefixMask(($1).x), $0)") [require(cuda_hlsl, waveprefix)] -T WaveMultiPrefixSum(T value, uint4 mask); +T WaveMultiPrefixSum(T value, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixSum(_getMultiPrefixMask(($1).x), $0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixSum"; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0 )") [require(cuda_hlsl, waveprefix)] -vector WaveMultiPrefixSum(vector value, uint4 mask); +vector WaveMultiPrefixSum(vector value, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0 )"; + case hlsl: __intrinsic_asm "WaveMultiPrefixSum"; + } +} __generic -__target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0)") [require(cuda_hlsl, waveprefix)] -matrix 
WaveMultiPrefixSum(matrix value, uint4 mask); +matrix WaveMultiPrefixSum(matrix value, uint4 mask) +{ + __target_switch + { + case cuda: __intrinsic_asm "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0)"; + case hlsl: __intrinsic_asm "WaveMultiPrefixSum"; + } +} // `typedef`s to help with the fact that HLSL has been sorta-kinda case insensitive at various points typedef Texture2D texture2D; @@ -10821,9 +12274,14 @@ struct BuiltInTriangleIntersectionAttributes // `executeCallableNV` is the GLSL intrinsic that will be used to implement // `CallShader()` for GLSL-based targets. // -__target_intrinsic(glsl, "executeCallableEXT") [require(glsl, raytracing_raygen_closesthit_miss_callable)] -void __executeCallable(uint shaderIndex, int payloadLocation); +void __executeCallable(uint shaderIndex, int payloadLocation) +{ + __target_switch + { + case glsl: __intrinsic_asm "executeCallableEXT"; + } +} // Next is the custom intrinsic that will compute the payload location // for a type being used in a `CallShader()` call for GLSL-based targets. @@ -10878,7 +12336,6 @@ __generic __intrinsic_op($(kIROp_ForceVarIntoStructTemporarily)) Ref __forceVarIntoStructTemporarily(inout T maybeStruct); -__target_intrinsic(hlsl, "TraceRay") __generic [require(hlsl, raytracing)] void __traceRayHLSL( @@ -10889,9 +12346,14 @@ void __traceRayHLSL( uint MultiplierForGeometryContributionToHitGroupIndex, uint MissShaderIndex, RayDesc Ray, - inout payload_t Payload); + inout payload_t Payload) +{ + __target_switch + { + case hlsl: __intrinsic_asm "TraceRay"; + } +} -__target_intrinsic(glsl, "traceRayEXT") [require(glsl, raytracing_raygen_closesthit_miss)] void __traceRay( RaytracingAccelerationStructure AccelerationStructure, @@ -10904,7 +12366,13 @@ void __traceRay( float TMin, float3 Direction, float TMax, - int PayloadLocation); + int PayloadLocation) +{ + __target_switch + { + case glsl: __intrinsic_asm "traceRayEXT"; + } +} // TODO: Slang's parsing logic currently puts modifiers on // the `GenericDecl` rather than the inner decl when @@ -11000,7 +12468,6 @@ void TraceRay( // // https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GLSL_NV_ray_tracing_motion_blur.txt -__target_intrinsic(hlsl, "TraceMotionRay") __generic [require(hlsl, raytracing_motionblur)] void __traceMotionRayHLSL( @@ -11012,10 +12479,15 @@ void __traceMotionRayHLSL( uint MissShaderIndex, RayDesc Ray, float CurrentTime, - inout payload_t Payload); + inout payload_t Payload) +{ + __target_switch + { + case hlsl: __intrinsic_asm "TraceMotionRay"; + } +} __glsl_extension(GL_NV_ray_tracing_motion_blur) -__target_intrinsic(glsl, "traceRayMotionNV") [require(glsl, raytracing_motionblur_raygen_closesthit_miss)] void __traceMotionRay( RaytracingAccelerationStructure AccelerationStructure, @@ -11029,7 +12501,13 @@ void __traceMotionRay( float3 Direction, float TMax, float CurrentTime, - int PayloadLocation); + int PayloadLocation) +{ + __target_switch + { + case glsl: __intrinsic_asm "traceRayMotionNV"; + } +} [ForceInline] [require(glsl_hlsl_spirv, raytracing_motionblur_raygen_closesthit_miss)] @@ -11637,37 +13115,79 @@ extension __TextureImpl(Texture2D tex, SamplerState samp, float2 location, float clamp); + [require(cpp_hlsl)] + void WriteSamplerFeedback(Texture2D tex, SamplerState samp, float2 location, float clamp) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedback($1, $2, $3, $4)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedback($1, $2, $3, $4)"; + } + } - __target_intrinsic(hlsl, 
"($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)") - void WriteSamplerFeedbackBias(Texture2D tex, SamplerState samp, float2 location, float bias, float clamp); + [require(cpp_hlsl)] + void WriteSamplerFeedbackBias(Texture2D tex, SamplerState samp, float2 location, float bias, float clamp) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)"; + } + } - __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)") - void WriteSamplerFeedbackGrad(Texture2D tex, SamplerState samp, float2 location, float2 ddx, float2 ddy, float clamp); + [require(cpp_hlsl)] + void WriteSamplerFeedbackGrad(Texture2D tex, SamplerState samp, float2 location, float2 ddx, float2 ddy, float clamp) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)"; + } + } // Level - __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)") - void WriteSamplerFeedbackLevel(Texture2D tex, SamplerState samp, float2 location, float lod); + [require(cpp_hlsl)] + void WriteSamplerFeedbackLevel(Texture2D tex, SamplerState samp, float2 location, float lod) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)"; + } + } // Without Clamp - __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3)") - void WriteSamplerFeedback(Texture2D tex, SamplerState samp, float2 location); + [require(cpp_hlsl)] + void WriteSamplerFeedback(Texture2D tex, SamplerState samp, float2 location) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedback($1, $2, $3)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedback($1, $2, $3)"; + } + } - __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)") - void WriteSamplerFeedbackBias(Texture2D tex, SamplerState samp, float2 location, float bias); + [require(cpp_hlsl)] + void WriteSamplerFeedbackBias(Texture2D tex, SamplerState samp, float2 location, float bias) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)"; + } + } - __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)") - void WriteSamplerFeedbackGrad(Texture2D tex, SamplerState samp, float2 location, float2 ddx, float2 ddy); + [require(cpp_hlsl)] + void WriteSamplerFeedbackGrad(Texture2D tex, SamplerState samp, float2 location, float2 ddx, float2 ddy) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)"; + } + } }; __generic @@ -11675,37 +13195,79 @@ extension __TextureImpl(Texture2DArray texArray, SamplerState samp, float3 location, float 
clamp); + [require(cpp_hlsl)] + void WriteSamplerFeedback(Texture2DArray texArray, SamplerState samp, float3 location, float clamp) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedback($1, $2, $3, $4)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedback($1, $2, $3, $4)"; + } + } - __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)") - void WriteSamplerFeedbackBias(Texture2DArray texArray, SamplerState samp, float3 location, float bias, float clamp); + [require(cpp_hlsl)] + void WriteSamplerFeedbackBias(Texture2DArray texArray, SamplerState samp, float3 location, float bias, float clamp) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)"; + } + } - __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)") - void WriteSamplerFeedbackGrad(Texture2DArray texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy, float clamp); + [require(cpp_hlsl)] + void WriteSamplerFeedbackGrad(Texture2DArray texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy, float clamp) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)"; + } + } // Level - __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)") - void WriteSamplerFeedbackLevel(Texture2DArray texArray, SamplerState samp, float3 location, float lod); + [require(cpp_hlsl)] + void WriteSamplerFeedbackLevel(Texture2DArray texArray, SamplerState samp, float3 location, float lod) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)"; + } + } // Without Clamp - __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3)") - void WriteSamplerFeedback(Texture2DArray texArray, SamplerState samp, float3 location); + [require(cpp_hlsl)] + void WriteSamplerFeedback(Texture2DArray texArray, SamplerState samp, float3 location) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedback($1, $2, $3)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedback($1, $2, $3)"; + } + } - __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)") - void WriteSamplerFeedbackBias(Texture2DArray texArray, SamplerState samp, float3 location, float bias); + [require(cpp_hlsl)] + void WriteSamplerFeedbackBias(Texture2DArray texArray, SamplerState samp, float3 location, float bias) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)"; + } + } - __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)") - __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)") - void WriteSamplerFeedbackGrad(Texture2DArray texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy); + 
[require(cpp_hlsl)] + void WriteSamplerFeedbackGrad(Texture2DArray texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy) + { + __target_switch + { + case cpp: __intrinsic_asm "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)"; + case hlsl: __intrinsic_asm "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)"; + } + } }; // @@ -11829,7 +13391,6 @@ struct RayQuery __init(); - __target_intrinsic(glsl, "rayQueryInitializeEXT($0, $1, $2, $3, $4, $5, $6, $7)") __glsl_extension(GL_EXT_ray_query) [require(glsl_spirv, rayquery)] [mutating] @@ -13396,13 +14957,18 @@ struct HitObject } } - __target_intrinsic(hlsl, "NvInvokeHitObject") [require(hlsl, ser)] __generic static void __InvokeHLSL( RaytracingAccelerationStructure AccelerationStructure, HitObject HitOrMiss, - inout payload_t Payload); + inout payload_t Payload) + { + __target_switch + { + case hlsl: __intrinsic_asm "NvInvokeHitObject"; + } + } /// Invokes closesthit or miss shading for the specified hit object. In case of a NOP HitObject, no /// shader is invoked. @@ -13819,21 +15385,30 @@ struct HitObject } /// Loads a root constant from the local root table referenced by the hit object. Valid if the hit object /// represents a hit or a miss. RootConstantOffsetInBytes must be a multiple of 4. - __target_intrinsic(hlsl) [__requiresNVAPI] [require(hlsl, ser)] - uint LoadLocalRootTableConstant(uint RootConstantOffsetInBytes); + uint LoadLocalRootTableConstant(uint RootConstantOffsetInBytes) + { + __target_switch + { + case hlsl: __intrinsic_asm ".LoadLocalRootTableConstant"; + } + } /// /// !!!! Internal NVAPI HLSL impl. Not part of interface! !!!!!!!!!!!! /// - __target_intrinsic(hlsl, "NvGetAttributesFromHitObject($0, $1)") [__requiresNVAPI] [require(hlsl, ser_raygen_closesthit_miss)] - void __hlslGetAttributesFromHitObject(out T t); + void __hlslGetAttributesFromHitObject(out T t) + { + __target_switch + { + case hlsl: __intrinsic_asm "NvGetAttributesFromHitObject($0, $1)"; + } + } - __target_intrinsic(hlsl, "NvMakeHitWithRecordIndex") [__requiresNVAPI] [require(hlsl, ser_raygen_closesthit_miss)] static void __hlslMakeHitWithRecordIndex( @@ -13845,9 +15420,14 @@ struct HitObject uint HitKind, RayDesc Ray, attr_t attributes, - out HitObject hitObj); + out HitObject hitObj) + { + __target_switch + { + case hlsl: __intrinsic_asm "NvMakeHitWithRecordIndex"; + } + } - __target_intrinsic(hlsl, "NvMakeHit") [__requiresNVAPI] [require(hlsl, ser_raygen_closesthit_miss)] static void __hlslMakeHit(RaytracingAccelerationStructure AccelerationStructure, @@ -13859,9 +15439,14 @@ struct HitObject uint MultiplierForGeometryContributionToHitGroupIndex, RayDesc Ray, attr_t attributes, - out HitObject hitObj); + out HitObject hitObj) + { + __target_switch + { + case hlsl: __intrinsic_asm "NvMakeHit"; + } + } - __target_intrinsic(hlsl, "NvTraceRayHitObject") [__requiresNVAPI] [require(hlsl, ser_raygen_closesthit_miss)] static void __hlslTraceRay( @@ -13873,7 +15458,13 @@ struct HitObject uint MissShaderIndex, RayDesc Ray, inout payload_t Payload, - out HitObject hitObj); + out HitObject hitObj) + { + __target_switch + { + case hlsl: __intrinsic_asm "NvTraceRayHitObject"; + } + } /// /// !!!! Internal GLSL GL_NV_shader_invocation_reorder impl. Not part of interface! !!!!!!!!!!!! 
@@ -13881,7 +15472,6 @@ struct HitObject __glsl_extension(GL_NV_shader_invocation_reorder) __glsl_extension(GL_EXT_ray_tracing) - __target_intrinsic(glsl, "hitObjectRecordMissNV") [require(glsl, ser_raygen_closesthit_miss)] static void __glslMakeMiss( out HitObject hitObj, @@ -13889,13 +15479,18 @@ struct HitObject float3 Origin, float TMin, float3 Direction, - float TMax); + float TMax) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectRecordMissNV"; + } + } // "void hitObjectRecordMissNV(hitObjectNV, uint, vec3, float, vec3, float);" __glsl_extension(GL_NV_shader_invocation_reorder) __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_ray_tracing_motion_blur) - __target_intrinsic(glsl, "hitObjectRecordMissMotionNV") [require(glsl, ser_motion_raygen_closesthit_miss)] static void __glslMakeMotionMiss( out HitObject hitObj, @@ -13904,48 +15499,83 @@ struct HitObject float TMin, float3 Direction, float TMax, - float CurrentTime); + float CurrentTime) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectRecordMissMotionNV"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectRecordEmptyNV") [require(glsl, ser_raygen_closesthit_miss)] - static void __glslMakeNop(out HitObject hitObj); + static void __glslMakeNop(out HitObject hitObj) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectRecordEmptyNV"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectGetObjectRayDirectionNV($0)") [require(glsl, ser_raygen_closesthit_miss)] - float3 __glslGetRayDirection(); + float3 __glslGetRayDirection() + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectGetObjectRayDirectionNV($0)"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectGetWorldRayDirectionNV($0)") [require(glsl, ser_raygen_closesthit_miss)] - float3 __glslGetRayWorldDirection(); + float3 __glslGetRayWorldDirection() + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectGetWorldRayDirectionNV($0)"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectGetWorldRayOriginNV($0)") [require(glsl, ser_raygen_closesthit_miss)] - float3 __glslGetRayWorldOrigin(); + float3 __glslGetRayWorldOrigin() + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectGetWorldRayOriginNV($0)"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectGetRayTMaxNV($0)") [require(glsl, ser_raygen_closesthit_miss)] - float __glslGetTMax(); + float __glslGetTMax() + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectGetRayTMaxNV($0)"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectGetRayTMinNV($0)") [require(glsl, ser_raygen_closesthit_miss)] - float __glslGetTMin(); + float __glslGetTMin() + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectGetRayTMinNV($0)"; + } + } // "void hitObjectRecordHitWithIndexNV(hitObjectNV, accelerationStructureEXT,int,int,int,uint,uint,vec3,float,vec3,float,int);" __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectRecordHitWithIndexNV") [require(glsl, ser_raygen_closesthit_miss)] static void 
__glslMakeHitWithIndex( out HitObject hitObj, @@ -13959,13 +15589,18 @@ struct HitObject float Tmin, float3 direction, float Tmax, - int attributeLocation); + int attributeLocation) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectRecordHitWithIndexNV"; + } + } // "void hitObjectRecordHitWithIndexMotionNV(hitObjectNV, accelerationStructureEXT,int,int,int,uint,uint,vec3,float,vec3,float,float,int);" __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) __glsl_extension(GL_NV_ray_tracing_motion_blur) - __target_intrinsic(glsl, "hitObjectRecordHitWithIndexMotionNV") [require(glsl, ser_motion_raygen_closesthit_miss)] static void __glslMakeMotionHitWithIndex( out HitObject hitObj, @@ -13980,12 +15615,17 @@ struct HitObject float3 direction, float Tmax, float CurrentTime, - int attributeLocation); + int attributeLocation) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectRecordHitWithIndexMotionNV"; + } + } // "void hitObjectRecordHitNV(hitObjectNV,accelerationStructureEXT,int,int,int,uint,uint,uint,vec3,float,vec3,float,int);" __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectRecordHitNV") [require(glsl, ser_raygen_closesthit_miss)] static void __glslMakeHit( out HitObject hitObj, @@ -14000,13 +15640,18 @@ struct HitObject float Tmin, float3 direction, float Tmax, - int attributeLocation); + int attributeLocation) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectRecordHitNV"; + } + } // "void hitObjectRecordHitMotionNV(hitObjectNV,accelerationStructureEXT,int,int,int,uint,uint,uint,vec3,float,vec3,float,float,int);" __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) __glsl_extension(GL_NV_ray_tracing_motion_blur) - __target_intrinsic(glsl, "hitObjectRecordHitMotionNV") [require(glsl, ser_motion_raygen_closesthit_miss)] static void __glslMakeMotionHit( out HitObject hitObj, @@ -14022,18 +15667,28 @@ struct HitObject float3 direction, float Tmax, float CurrentTime, - int attributeLocation); + int attributeLocation) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectRecordHitMotionNV"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectGetAttributesNV($0, $1)") [require(glsl, ser_raygen_closesthit_miss)] - void __glslGetAttributes(int attributeLocation); + void __glslGetAttributes(int attributeLocation) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectGetAttributesNV($0, $1)"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectTraceRayNV") [require(glsl, ser_raygen_closesthit_miss)] static void __glslTraceRay( out HitObject hitObject, @@ -14047,12 +15702,17 @@ struct HitObject float Tmin, float3 direction, float Tmax, - int payload); + int payload) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectTraceRayNV"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) __glsl_extension(GL_NV_ray_tracing_motion_blur) - __target_intrinsic(glsl, "hitObjectTraceRayMotionNV") [require(glsl, ser_motion_raygen_closesthit_miss)] static void __glslTraceMotionRay( out HitObject hitObject, @@ -14067,15 +15727,26 @@ struct HitObject float3 direction, float Tmax, float currentTime, - int payload); + int payload) + { + __target_switch + { + case glsl: __intrinsic_asm 
"hitObjectTraceRayMotionNV"; + } + } __glsl_extension(GL_EXT_ray_tracing) __glsl_extension(GL_NV_shader_invocation_reorder) - __target_intrinsic(glsl, "hitObjectExecuteShaderNV") [require(glsl, ser_raygen_closesthit_miss)] static void __glslInvoke( HitObject hitObj, - int payload); + int payload) + { + __target_switch + { + case glsl: __intrinsic_asm "hitObjectExecuteShaderNV"; + } + } }; /// Reorders threads based on a coherence hint value. NumCoherenceHintBits indicates how many of @@ -14223,11 +15894,16 @@ uint getRealtimeClockLow() } } -__target_intrinsic(cpp, "std::chrono::high_resolution_clock::now().time_since_epoch().count()") -__target_intrinsic(cuda, "clock64") [NonUniformReturn] [require(cpp_cuda, shaderclock)] -int64_t __cudaCppGetRealtimeClock(); +int64_t __cudaCppGetRealtimeClock() +{ + __target_switch + { + case cpp: __intrinsic_asm "std::chrono::high_resolution_clock::now().time_since_epoch().count()"; + case cuda: __intrinsic_asm "clock64"; + } +} [__requiresNVAPI] __glsl_extension(GL_EXT_shader_realtime_clock) @@ -14259,23 +15935,38 @@ uint2 getRealtimeClock() // CUDA specific // -__target_intrinsic(cuda, "(threadIdx)") [__readNone] [NonUniformReturn] [require(cuda)] -uint3 cudaThreadIdx(); +uint3 cudaThreadIdx() +{ + __target_switch + { + case cuda: __intrinsic_asm "(threadIdx)"; + } +} -__target_intrinsic(cuda, "(blockIdx)") [__readNone] [NonUniformReturn] [require(cuda)] -uint3 cudaBlockIdx(); +uint3 cudaBlockIdx() +{ + __target_switch + { + case cuda: __intrinsic_asm "(blockIdx)"; + } +} -__target_intrinsic(cuda, "(blockDim)") [__readNone] [NonUniformReturn] [require(cuda)] -uint3 cudaBlockDim(); +uint3 cudaBlockDim() +{ + __target_switch + { + case cuda: __intrinsic_asm "(blockDim)"; + } +} // // Workgroup cooperation @@ -14879,8 +16570,6 @@ for(auto levelChoice : kLevelChoices) [__NoSideEffect] [__requiresNVAPI] - __target_intrinsic(hlsl, - "NvFootprint$(CoarseOrFine)($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)") [require(hlsl, texturefootprint)] static __FootprintData __queryFootprint$(CoarseOrFine)NVAPI( int nd, @@ -14890,12 +16579,16 @@ for(auto levelChoice : kLevelChoices) uint samplerIndex, float3 coords, FootprintGranularity granularity, - out uint isSingleLod); + out uint isSingleLod) + { + __target_switch + { + case hlsl: __intrinsic_asm "NvFootprint$(CoarseOrFine)($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)"; + } + } [__NoSideEffect] [__requiresNVAPI] - __target_intrinsic(hlsl, - "NvFootprint$(CoarseOrFine)Bias($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)") [require(hlsl, texturefootprint)] static __FootprintData __queryFootprint$(CoarseOrFine)BiasNVAPI( int nd, @@ -14906,12 +16599,16 @@ for(auto levelChoice : kLevelChoices) float3 coords, FootprintGranularity granularity, float lodBias, - out uint isSingleLod); + out uint isSingleLod) + { + __target_switch + { + case hlsl: __intrinsic_asm "NvFootprint$(CoarseOrFine)Bias($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)"; + } + } [__NoSideEffect] [__requiresNVAPI] - __target_intrinsic(hlsl, - "NvFootprint$(CoarseOrFine)Level($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)") [require(hlsl, texturefootprint)] static __FootprintData __queryFootprint$(CoarseOrFine)LevelNVAPI( int nd, @@ -14922,12 +16619,16 @@ for(auto levelChoice : kLevelChoices) float3 coords, FootprintGranularity granularity, float lod, - out uint isSingleLod); + out uint isSingleLod) + { + __target_switch + { + case hlsl: __intrinsic_asm "NvFootprint$(CoarseOrFine)Level($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)"; + } + } [__NoSideEffect] [__requiresNVAPI] 
- __target_intrinsic(hlsl, - "NvFootprint$(CoarseOrFine)Grad($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)") [require(hlsl, texturefootprint)] static __FootprintData __queryFootprint$(CoarseOrFine)GradNVAPI( int nd, @@ -14939,7 +16640,13 @@ for(auto levelChoice : kLevelChoices) FootprintGranularity granularity, float3 dx, float3 dy, - out uint isSingleLod); + out uint isSingleLod) + { + __target_switch + { + case hlsl: __intrinsic_asm "NvFootprint$(CoarseOrFine)Grad($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)"; + } + } ${ // We now define the portable operations that will be officially