diff --git a/Src/ILGPU.Algorithms/IL/ILFunctions.cs b/Src/ILGPU.Algorithms/IL/ILFunctions.cs deleted file mode 100644 index aa19cda85..000000000 --- a/Src/ILGPU.Algorithms/IL/ILFunctions.cs +++ /dev/null @@ -1,204 +0,0 @@ -// --------------------------------------------------------------------------------------- -// ILGPU Algorithms -// Copyright (c) 2021 ILGPU Project -// www.ilgpu.net -// -// File: ILFunctions.cs -// -// This file is part of ILGPU and is distributed under the University of Illinois Open -// Source License. See LICENSE.txt for details. -// --------------------------------------------------------------------------------------- - -using ILGPU.Algorithms.ScanReduceOperations; -using System; -using System.Diagnostics; -using System.Runtime.CompilerServices; - -namespace ILGPU.Algorithms.IL -{ - /// - /// Custom IL-specific implementations. - /// - static class ILFunctions - { - #region Nested Types - - public interface IILFunctionImplementation - { - /// - /// The maximum number of supported thread per context on the CPU accelerator - /// for the implementation of specific algorithms. - /// - int MaxNumThreads { get; } - - /// - /// Returns true if the current thread is the first thread. - /// - bool IsFirstThread { get; } - - /// - /// Returns the current linear thread index. - /// - int ThreadIndex { get; } - - /// - /// Returns the linear thread dimension. - /// - int ThreadDimension { get; } - - /// - /// The number of segments for reduce operations. - /// - int ReduceSegments { get; } - - /// - /// The reduction segment index to write to and read from. - /// - int ReduceSegmentIndex { get; } - - /// - /// Executes a barrier in the current context. - /// - void Barrier(); - } - - #endregion - - #region Reduce - - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static T Reduce(T value) - where T : unmanaged - where TReduction : IScanReduceOperation - where TImpl : struct, IILFunctionImplementation => - AllReduce(value); - - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static T AllReduce(T value) - where T : unmanaged - where TReduction : IScanReduceOperation - where TImpl : struct, IILFunctionImplementation - { - TImpl impl = default; - var sharedMemory = SharedMemory.Allocate(impl.ReduceSegments); - - TReduction reduction = default; - if (impl.IsFirstThread) - sharedMemory[impl.ReduceSegmentIndex] = reduction.Identity; - impl.Barrier(); - - reduction.AtomicApply(ref sharedMemory[impl.ReduceSegmentIndex], value); - - impl.Barrier(); - return sharedMemory[impl.ReduceSegmentIndex]; - } - - #endregion - - #region Scan - - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static T ExclusiveScan(T value) - where T : unmanaged - where TScanOperation : struct, IScanReduceOperation - where TImpl : struct, IILFunctionImplementation => - ExclusiveScanWithBoundaries(value, out var _); - - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static T InclusiveScan(T value) - where T : unmanaged - where TScanOperation : struct, IScanReduceOperation - where TImpl : struct, IILFunctionImplementation => - InclusiveScanWithBoundaries(value, out var _); - - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static T ExclusiveScanWithBoundaries( - T value, - out ScanBoundaries boundaries) - where T : unmanaged - where TScanOperation : struct, IScanReduceOperation - where TImpl : struct, IILFunctionImplementation - { - TImpl impl = default; - var sharedMemory = InclusiveScanImplementation( - value); - boundaries = new ScanBoundaries( - sharedMemory[0], - sharedMemory[Math.Max(0, impl.ThreadDimension - 2)]); - return impl.IsFirstThread - ? default(TScanOperation).Identity - : sharedMemory[impl.ThreadIndex - 1]; - } - - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static T InclusiveScanWithBoundaries( - T value, - out ScanBoundaries boundaries) - where T : unmanaged - where TScanOperation : struct, IScanReduceOperation - where TImpl : struct, IILFunctionImplementation - { - TImpl impl = default; - var sharedMemory = InclusiveScanImplementation( - value); - boundaries = new ScanBoundaries( - sharedMemory[0], - sharedMemory[impl.ThreadDimension - 1]); - return sharedMemory[impl.ThreadIndex]; - } - - /// - /// Performs a group-wide inclusive scan. - /// - /// The element type. - /// The type of the warp scan logic. - /// The internal implementation type. - /// The value to scan. - /// The resulting value for the current lane. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static ArrayView InclusiveScanImplementation< - T, - TScanOperation, - TImpl>( - T value) - where T : unmanaged - where TScanOperation : struct, IScanReduceOperation - where TImpl : struct, IILFunctionImplementation - { - TImpl impl = default; - - // Load values into shared memory - var sharedMemory = SharedMemory.Allocate(impl.MaxNumThreads); - Debug.Assert( - impl.ThreadDimension <= impl.MaxNumThreads, - "Invalid group/warp size"); - sharedMemory[impl.ThreadIndex] = value; - impl.Barrier(); - - // First thread performs all operations - if (impl.IsFirstThread) - { - TScanOperation scanOperation = default; - for (int i = 1; i < impl.ThreadDimension; ++i) - { - sharedMemory[i] = scanOperation.Apply( - sharedMemory[i - 1], - sharedMemory[i]); - } - } - impl.Barrier(); - - return sharedMemory; - } - - #endregion - } -} diff --git a/Src/ILGPU.Algorithms/IL/ILGroupExtensions.cs b/Src/ILGPU.Algorithms/IL/ILGroupExtensions.cs index af0d43fad..f4f1db1db 100644 --- a/Src/ILGPU.Algorithms/IL/ILGroupExtensions.cs +++ b/Src/ILGPU.Algorithms/IL/ILGroupExtensions.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU Algorithms -// Copyright (c) 2019-2021 ILGPU Project +// Copyright (c) 2019-2022 ILGPU Project // www.ilgpu.net // // File: ILGroupExtensions.cs @@ -10,8 +10,9 @@ // --------------------------------------------------------------------------------------- using ILGPU.Algorithms.ScanReduceOperations; +using System; +using System.Diagnostics; using System.Runtime.CompilerServices; -using static ILGPU.Algorithms.IL.ILFunctions; namespace ILGPU.Algorithms.IL { @@ -20,54 +21,6 @@ namespace ILGPU.Algorithms.IL /// static class ILGroupExtensions { - #region Nested Types - - /// - /// Implements ILFunctions for groups. - /// - private readonly struct GroupImplementation : IILFunctionImplementation - { - /// - /// Returns 1024. - /// - /// - /// TODO: refine the implementation to avoid a hard-coded constant. - /// - public readonly int MaxNumThreads => 1024; - - /// - /// Returns true if this is the first group thread. - /// - public readonly bool IsFirstThread => Group.IsFirstThread; - - /// - /// Returns current linear group index. - /// - public readonly int ThreadIndex => Group.LinearIndex; - - /// - /// Returns the linear group dimension. - /// - public readonly int ThreadDimension => Group.Dimension.Size; - - /// - /// Returns 1. - /// - public readonly int ReduceSegments => 1; - - /// - /// Returns 0. - /// - public readonly int ReduceSegmentIndex => 0; - - /// - /// Performs a group-wide barrier. - /// - public readonly void Barrier() => Group.Barrier(); - } - - #endregion - #region Reduce /// @@ -81,8 +34,23 @@ public static T Reduce(T value) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static T AllReduce(T value) where T : unmanaged - where TReduction : IScanReduceOperation => - AllReduce(value); + where TReduction : IScanReduceOperation + { + TReduction reduction = default; + + ref var sharedMemory = ref SharedMemory.Allocate(); + if (Group.IsFirstThread) + sharedMemory = reduction.Identity; + Group.Barrier(); + + // Reduce inside all warps first + var firstLaneReduced = ILWarpExtensions.Reduce(value); + if (Warp.IsFirstLane) + reduction.AtomicApply(ref sharedMemory, firstLaneReduced); + + Group.Barrier(); + return sharedMemory; + } #endregion @@ -93,14 +61,14 @@ public static T AllReduce(T value) public static T ExclusiveScan(T value) where T : unmanaged where TScanOperation : struct, IScanReduceOperation => - ExclusiveScan(value); + ExclusiveScanWithBoundaries(value, out var _); /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static T InclusiveScan(T value) where T : unmanaged where TScanOperation : struct, IScanReduceOperation => - InclusiveScan(value); + InclusiveScanWithBoundaries(value, out var _); /// @@ -109,10 +77,16 @@ public static T ExclusiveScanWithBoundaries( T value, out ScanBoundaries boundaries) where T : unmanaged - where TScanOperation : struct, IScanReduceOperation => - ExclusiveScanWithBoundaries( - value, - out boundaries); + where TScanOperation : struct, IScanReduceOperation + { + var sharedMemory = InclusiveScanImplementation(value); + boundaries = new ScanBoundaries( + sharedMemory[0], + sharedMemory[Math.Max(0, Group.Dimension.Size - 2)]); + return Group.IsFirstThread + ? default(TScanOperation).Identity + : sharedMemory[Group.LinearIndex - 1]; + } /// @@ -121,10 +95,54 @@ public static T InclusiveScanWithBoundaries( T value, out ScanBoundaries boundaries) where T : unmanaged - where TScanOperation : struct, IScanReduceOperation => - InclusiveScanWithBoundaries( - value, - out boundaries); + where TScanOperation : struct, IScanReduceOperation + { + var sharedMemory = InclusiveScanImplementation( + value); + boundaries = new ScanBoundaries( + sharedMemory[0], + sharedMemory[Group.Dimension.Size - 1]); + return sharedMemory[Group.LinearIndex]; + } + + /// + /// Performs a group-wide inclusive scan. + /// + /// The element type. + /// The type of the warp scan logic. + /// The value to scan. + /// The resulting value for the current lane. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ArrayView InclusiveScanImplementation( + T value) + where T : unmanaged + where TScanOperation : struct, IScanReduceOperation + { + const int MaxNumThreads = 2048; + + // Load values into shared memory + var sharedMemory = SharedMemory.Allocate(MaxNumThreads); + Debug.Assert( + Group.Dimension.Size <= MaxNumThreads, + "Invalid group/warp size"); + sharedMemory[Group.LinearIndex] = value; + Group.Barrier(); + + // First thread performs all operations + if (Group.IsFirstThread) + { + TScanOperation scanOperation = default; + for (int i = 1; i < Group.Dimension.Size; ++i) + { + sharedMemory[i] = scanOperation.Apply( + sharedMemory[i - 1], + sharedMemory[i]); + } + } + Group.Barrier(); + + return sharedMemory; + } /// /// Prepares for the next iteration of a group-wide exclusive scan within the diff --git a/Src/ILGPU.Algorithms/IL/ILWarpExtensions.cs b/Src/ILGPU.Algorithms/IL/ILWarpExtensions.cs index 7b1b56069..f4f987400 100644 --- a/Src/ILGPU.Algorithms/IL/ILWarpExtensions.cs +++ b/Src/ILGPU.Algorithms/IL/ILWarpExtensions.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU Algorithms -// Copyright (c) 2019-2021 ILGPU Project +// Copyright (c) 2019-2022 ILGPU Project // www.ilgpu.net // // File: ILWarpExtensions.cs @@ -9,65 +9,18 @@ // Source License. See LICENSE.txt for details. // --------------------------------------------------------------------------------------- +using ILGPU.Algorithms.PTX; using ILGPU.Algorithms.ScanReduceOperations; using System.Runtime.CompilerServices; -using static ILGPU.Algorithms.IL.ILFunctions; namespace ILGPU.Algorithms.IL { /// - /// Custom IL-specific implementations. + /// Custom IL-specific implementations that fall back to PTX-specific implementations + /// as the CPU runtime is fully compatible with the PTX runtime. /// static class ILWarpExtensions { - #region Nested Types - - /// - /// Implements ILFunctions for warps. - /// - private readonly struct WarpImplementation : IILFunctionImplementation - { - /// - /// Returns 256. - /// - /// - /// TODO: refine the implementation to avoid a hard-coded constant. - /// - public readonly int MaxNumThreads => 256; - - /// - /// Returns true if this is the first warp thread. - /// - public readonly bool IsFirstThread => Warp.IsFirstLane; - - /// - /// Returns current lane index. - /// - public readonly int ThreadIndex => Warp.LaneIdx; - - /// - /// Returns the warp size. - /// - public readonly int ThreadDimension => Warp.WarpSize; - - /// - /// Returns the number of warps per group. - /// - public readonly int ReduceSegments => MaxNumThreads / Warp.WarpSize; - - /// - /// Returns the current warp index. - /// - public readonly int ReduceSegmentIndex => Warp.WarpIdx; - - /// - /// Performs a warp-wide barrier. - /// - public readonly void Barrier() => Warp.Barrier(); - } - - #endregion - #region Reduce /// @@ -75,14 +28,14 @@ static class ILWarpExtensions public static T Reduce(T value) where T : unmanaged where TReduction : IScanReduceOperation => - AllReduce(value); + PTXWarpExtensions.Reduce(value); /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static T AllReduce(T value) where T : unmanaged where TReduction : IScanReduceOperation => - AllReduce(value); + PTXWarpExtensions.AllReduce(value); #endregion @@ -93,14 +46,14 @@ public static T AllReduce(T value) public static T ExclusiveScan(T value) where T : unmanaged where TScanOperation : struct, IScanReduceOperation => - ExclusiveScan(value); + PTXWarpExtensions.ExclusiveScan(value); /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static T InclusiveScan(T value) where T : unmanaged where TScanOperation : struct, IScanReduceOperation => - InclusiveScan(value); + PTXWarpExtensions.InclusiveScan(value); #endregion }