From 096b95d26366e63771a2719f0655ec8d1dfff9b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Reboud?=
Date: Fri, 5 Jan 2024 18:24:11 +0100
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(core):=20GELU=20vs=20GELUAppro?=
 =?UTF-8?q?x=20(#113)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md                                  |   1 +
 .../GrAIdient/Core/Function/Activation.swift  | 100 ++++++++++++++++--
 Sources/GrAIdient/LayerSeq/ValueSeq.swift     |  43 +++++---
 .../GrAIdient/Metal/Kernel/Activation.metal   |  94 +++++++++++++++-
 Sources/GrAIdient/Metal/Kernel/LayerSeq.metal |  28 +----
 Sources/GrAIdient/Metal/MetalConfig.swift     |   2 +
 Sources/GrAIdient/Utils/Concurrency.swift     |  56 +++++++---
 Tests/GrAIExamples/TransformerBenchmark.swift |   4 +-
 Tests/GrAITests/Activation1DTests.swift       |  34 ++++++
 Tests/GrAITests/Activation2DTests.swift       |  51 +++++++++
 Tests/GrAITests/ActivationSeqTests.swift      |  34 ++++++
 11 files changed, 383 insertions(+), 64 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af5d348b..c79f216d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
 
 ## [unreleased]
 
+⚙️ **core:** GELU vs GELUApprox ([113](https://github.com/owkin/GrAIdient/pull/113))\
 🚀 **perf:** QuerySelf & ValueSelf ([112](https://github.com/owkin/GrAIdient/pull/112))\
 🚀 **perf:** benchmark ViT base model ([111](https://github.com/owkin/GrAIdient/pull/111))\
 🐛 **fix:** run on Apple Silicon ([110](https://github.com/owkin/GrAIdient/pull/110))\
diff --git a/Sources/GrAIdient/Core/Function/Activation.swift b/Sources/GrAIdient/Core/Function/Activation.swift
index 6171a184..edb79edd 100644
--- a/Sources/GrAIdient/Core/Function/Activation.swift
+++ b/Sources/GrAIdient/Core/Function/Activation.swift
@@ -767,23 +767,23 @@ public class Sigmoid: ActivationFunction
     }
 }
 
-/// GELU activation function.
-public class GELU: ActivationFunction
+/// GELU approximate activation function.
+public class GELUApprox: ActivationFunction
 {
-    public static let str = "GELU"
+    public static let str = "GELUApprox"
 
     /// Forward GPU kernel.
     public override var forwardKernel: String
     {
         get {
-            return "forwardGELU"
+            return "forwardGELUApprox"
         }
     }
     /// Backward GPU kernel.
     public override var backwardKernel: String
     {
         get {
-            return "backwardGELU"
+            return "backwardGELUApprox"
         }
     }
@@ -865,6 +865,83 @@ public class GELU: ActivationFunction
     }
 }
 
+/// GELU activation function.
+public class GELU: ActivationFunction
+{
+    public static let str = "GELU"
+
+    /// Forward GPU kernel.
+    public override var forwardKernel: String
+    {
+        get {
+            return "forwardGELU"
+        }
+    }
+    /// Backward GPU kernel.
+    public override var backwardKernel: String
+    {
+        get {
+            return "backwardGELU"
+        }
+    }
+
+    ///
+    /// Coefficient to apply during the weights initialization.
+    ///
+    /// - Returns: The coefficient.
+    ///
+    open override var coeffInitWeights: Float
+    {
+        get {
+            return Float(sqrt(2.0))
+        }
+    }
+
+    /// Create a GELU activation function.
+    init()
+    {
+        super.init(GELU.str)
+    }
+
+    ///
+    /// Decode from the disk.
+    ///
+    /// Throw an error if reading from the decoder fails, or
+    /// if the data read is corrupted or otherwise invalid.
+    ///
+    /// - Parameter decoder: The decoder to read data from.
+    ///
+    required public init(from decoder: Decoder) throws
+    {
+        try super.init(from: decoder)
+    }
+
+    ///
+    /// Forward CPU.
+    ///
+    /// - Parameter x: The input.
+    /// - Returns: The output.
+    ///
+    public override func apply(_ x: Double) -> Double
+    {
+        return 0.5 * x * (1 + erf(x / sqrt(2.0)))
+    }
+
+    ///
+    /// Backward CPU.
+    ///
+    /// - Parameter x: The input.
+    /// - Returns: The derivative.
+    ///
+    public override func derivate(_ x: Double) -> Double
+    {
+        let tmp1 = 0.5 * (1.0 + erf(x / sqrt(2.0)))
+        let tmp2 = x / sqrt(2.0 * Double.pi) * exp(-x * x / 2.0)
+        let derivative = tmp1 + tmp2
+        return derivative
+    }
+}
+
 /// Factory API to build an activation function.
 public protocol ActivationKernel
 {
@@ -886,6 +963,7 @@ class ActivationKernelImpl: ActivationKernel
         LeakyReLU.str: LeakyReLUKernel(),
         SoftReLU.str: SoftReLUKernel(),
         Sigmoid.str: SigmoidKernel(),
+        GELUApprox.str: GELUApproxKernel(),
         GELU.str: GELUKernel()
     ]
 
@@ -954,7 +1032,17 @@ private class SigmoidKernel: ActivationKernelImpl
     }
 }
 
-/// Factory to build a Sigmoid function.
+/// Factory to build a GELU approximate function.
+private class GELUApproxKernel: ActivationKernelImpl
+{
+    /// Build a GELU approximate function.
+    override func build() -> ActivationFunction
+    {
+        return GELUApprox()
+    }
+}
+
+/// Factory to build a GELU function.
 private class GELUKernel: ActivationKernelImpl
 {
     /// Build a Sigmoid function.
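For reference: the new GELU class above implements the exact formulation 0.5 * x * (1 + erf(x / sqrt(2))), while the renamed GELUApprox keeps the previous implementation, whose body lies outside the visible hunks and is presumably the usual tanh-based approximation. A minimal standalone Swift sketch contrasting the two under that assumption:

    import Foundation

    // Exact GELU, as introduced by this patch.
    func gelu(_ x: Double) -> Double
    {
        return 0.5 * x * (1.0 + erf(x / sqrt(2.0)))
    }

    // Tanh-based approximation (an assumption: the body of the renamed
    // GELUApprox class is not shown in the hunks; this is the usual formula).
    func geluApprox(_ x: Double) -> Double
    {
        let c = sqrt(2.0 / Double.pi)
        return 0.5 * x * (1.0 + tanh(c * (x + 0.044715 * x * x * x)))
    }

    for x in [-3.0, -1.0, 0.0, 1.0, 3.0]
    {
        print(x, gelu(x), geluApprox(x), abs(gelu(x) - geluApprox(x)))
    }

Over typical activation ranges the two stay within roughly 1e-3 of each other, which is why exposing both a cheap approximation and an exact variant is a reasonable split.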
diff --git a/Sources/GrAIdient/LayerSeq/ValueSeq.swift b/Sources/GrAIdient/LayerSeq/ValueSeq.swift
index e68c841e..09d6b70a 100644
--- a/Sources/GrAIdient/LayerSeq/ValueSeq.swift
+++ b/Sources/GrAIdient/LayerSeq/ValueSeq.swift
@@ -998,6 +998,16 @@ public class ValueSelfSeq: LayerMergeSeq
 
         if _layersPrev[0].computeDelta
         {
+            if _layersPrev[0].dirty
+            {
+                for elem in 0..
[...]
+    if (t > 0.927734375f)
+    {
+        // maximum error 0.99527 ulp
+        r = metal::fma(-1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12
+        u = metal::fma(-3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6
+        r = metal::fma(r, s, u);
+        r = metal::fma(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4
+        r = metal::fma(r, t, -6.34846687e-1f); // -0x1.450aa0p-1
+        r = metal::fma(r, t, -1.28717512e-1f); // -0x1.079d0cp-3
+        r = metal::fma(r, t, -t);
+        // TODO: replace with expm1 when implemented.
+        r = 1.0f - metal::exp(r);
+        r = metal::copysign(r, a);
+    }
+    else
+    {
+        // maximum error 0.98929 ulp
+        r = -5.96761703e-4f;                   // -0x1.38e000p-11
+        r = metal::fma(r, s, 4.99119423e-3f);  //  0x1.471a58p-8
+        r = metal::fma(r, s, -2.67681349e-2f); // -0x1.b691b2p-6
+        r = metal::fma(r, s, 1.12819925e-1f);  //  0x1.ce1c44p-4
+        r = metal::fma(r, s, -3.76125336e-1f); // -0x1.812700p-2
+        r = metal::fma(r, s, 1.28379166e-1f);  //  0x1.06eba8p-3
+        r = metal::fma(r, a, a);
+    }
+    return r;
+}
+
+kernel void forwardGELU(
+    constant uint * pNbElems,
+    device float * tmps,
+    device float * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+    
+    if (pNbElems)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+    
+    if (id >= nbElems)
+    {
+        return ;
+    }
+    
+    float x = outs[id];
+    tmps[id] = x;
+    outs[id] = 0.5 * x * (1 + erf(x / sqrt(2.0)));
+}
+
+kernel void backwardGELU(
+    const device float * tmps,
+    constant uint * pNbElems,
+    device float * delta,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+    
+    if (pNbElems)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+    
+    if (id >= nbElems)
+    {
+        return ;
+    }
+    
+    float x = tmps[id];
+    float tmp1 = 0.5 * (1.0 + erf(x / sqrt(2.0)));
+    float tmp2 = x / sqrt(2.0 * M_PI_F) * exp(-x * x / 2.0);
+    float derivative = tmp1 + tmp2;
+    delta[id] = delta[id] * derivative;
+}
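The forwardGELU / backwardGELU kernels above mirror the CPU apply / derivate pair: d/dx [0.5 * x * (1 + erf(x / √2))] = 0.5 * (1 + erf(x / √2)) + x * exp(-x² / 2) / √(2π). The head of this Metal diff is lost in the [...] gap above; it presumably contains the erf helper's declaration (the visible branch matches the well-known single-precision polynomial erf with t = |a| and s = a * a) and the forwardGELUApprox / backwardGELUApprox renames. A standalone Swift sketch checking the analytic derivative against a central finite difference:

    import Foundation

    func gelu(_ x: Double) -> Double
    {
        return 0.5 * x * (1.0 + erf(x / sqrt(2.0)))
    }

    // Analytic derivative, same formula as backwardGELU / derivate.
    func geluDerivative(_ x: Double) -> Double
    {
        let tmp1 = 0.5 * (1.0 + erf(x / sqrt(2.0)))
        let tmp2 = x / sqrt(2.0 * Double.pi) * exp(-x * x / 2.0)
        return tmp1 + tmp2
    }

    let eps = 1e-6
    for x in [-2.0, -0.5, 0.0, 0.5, 2.0]
    {
        // Central difference; should agree with the analytic value to ~1e-9.
        let numeric = (gelu(x + eps) - gelu(x - eps)) / (2.0 * eps)
        print(x, geluDerivative(x), numeric)
    }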
a/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal
+++ b/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal
@@ -2401,7 +2401,6 @@ kernel void valueSelfValueSeqBackward(
     constant uint * pGlobalOffset,
     constant uint * pNbBatch,
     constant uint * pSequence,
-    constant uint * pDirty,
     device float * value,
     uint2 id [[ thread_position_in_grid ]])
 {
@@ -2414,10 +2413,9 @@ kernel void valueSelfValueSeqBackward(
     uint nbBatch;
     uint sequence;
     uint size;
-    uint dirty;
     
     if (pNbHeads && pNbNeurons && pNbNeuronsPrev &&
-        pNbBlocksPrev && pGlobalOffset && pNbBatch && pSequence && pDirty &&
+        pNbBlocksPrev && pGlobalOffset && pNbBatch && pSequence &&
         value && score && delta)
     {
         nbHeads = *pNbHeads;
@@ -2429,7 +2427,6 @@ kernel void valueSelfValueSeqBackward(
         nbBatch = *pNbBatch;
         sequence = *pSequence;
         size = nbNeurons2 / nbHeads;
-        dirty = *pDirty;
     }
     else
         return ;
@@ -2459,14 +2456,7 @@ kernel void valueSelfValueSeqBackward(
     uint offsetValue = depth + valueOffset * nbNeurons2 +
         nbNeurons1 * seqK + sequence * nbNeurons1 * elem;
     
-    if (dirty)
-    {
-        value[offsetValue] = tmp;
-    }
-    else
-    {
-        value[offsetValue] += tmp;
-    }
+    value[offsetValue] += tmp;
 }
 
 kernel void valueSelfValueSeq4Backward(
@@ -2479,7 +2469,6 @@ kernel void valueSelfValueSeq4Backward(
     constant uint * pGlobalOffset,
     constant uint * pNbBatch,
     constant uint * pSequence,
-    constant uint * pDirty,
     device float4 * value,
     uint2 id [[ thread_position_in_grid ]])
 {
@@ -2492,10 +2481,9 @@ kernel void valueSelfValueSeq4Backward(
     uint nbBatch;
     uint sequence;
     uint size;
-    uint dirty;
     
     if (pNbHeads && pNbNeurons && pNbNeuronsPrev &&
-        pNbBlocksPrev && pGlobalOffset && pNbBatch && pSequence && pDirty &&
+        pNbBlocksPrev && pGlobalOffset && pNbBatch && pSequence &&
        value && score && delta)
     {
         nbHeads = *pNbHeads;
@@ -2507,7 +2495,6 @@ kernel void valueSelfValueSeq4Backward(
         nbBatch = *pNbBatch;
         sequence = *pSequence;
         size = nbNeurons2 / nbHeads;
-        dirty = *pDirty;
     }
     else
         return ;
@@ -2538,14 +2525,7 @@ kernel void valueSelfValueSeq4Backward(
     uint offsetValue = (depth + valueOffset * nbNeurons2 +
         nbNeurons1 * seqK + sequence * nbNeurons1 * elem) / 4;
     
-    if (dirty)
-    {
-        value[offsetValue] = tmp;
-    }
-    else
-    {
-        value[offsetValue] += tmp;
-    }
+    value[offsetValue] += tmp;
 }
 
 kernel void valueSelfScoreSeqBackward(
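The hunks above remove the pDirty branch from both value-backward kernels: the kernels now always accumulate with +=, and the dirty case is handled once on the Swift side (the if _layersPrev[0].dirty reset added in ValueSeq.swift). A CPU-side Swift sketch of that pattern, with hypothetical names:

    // Hypothetical sketch of the new control flow: reset the gradient buffer
    // once when the previous layer is dirty, then let every backward pass
    // accumulate unconditionally, as the Metal kernels now do.
    func accumulateGradient(
        dirty: Bool, grad: inout [Float], contribution: [Float]
    )
    {
        if dirty
        {
            // One-time reset, replacing the per-thread `dirty` branch.
            for i in 0..<grad.count
            {
                grad[i] = 0
            }
        }
        for i in 0..<grad.count
        {
            grad[i] += contribution[i]  // kernel side: always `+=`
        }
    }

Moving the branch out of the kernel removes a divergent per-element conditional and lets the same kernel serve both the first and subsequent gradient contributions.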
diff --git a/Sources/GrAIdient/Metal/MetalConfig.swift b/Sources/GrAIdient/Metal/MetalConfig.swift
index 6b1e04e7..3e8f3151 100644
--- a/Sources/GrAIdient/Metal/MetalConfig.swift
+++ b/Sources/GrAIdient/Metal/MetalConfig.swift
@@ -16,6 +16,8 @@ let CONFIG_KERNELS =
         "backwardSoftReLU",
         "forwardSigmoid",
         "backwardSigmoid",
+        "forwardGELUApprox",
+        "backwardGELUApprox",
         "forwardGELU",
         "backwardGELU",
     ],
diff --git a/Sources/GrAIdient/Utils/Concurrency.swift b/Sources/GrAIdient/Utils/Concurrency.swift
index 7c28366c..cb62a1f2 100644
--- a/Sources/GrAIdient/Utils/Concurrency.swift
+++ b/Sources/GrAIdient/Utils/Concurrency.swift
@@ -7,6 +7,40 @@
 
 import Foundation
 
+///
+/// Split a set of elements into "balanced" batches.
+///
+/// - Parameters:
+///     - nbElems: The number of elements in the set.
+///     - nbSplits: The number of batch splits.
+/// - Returns: The list of (start, end) indices for the different batches.
+///
+func splitBatch(
+    nbElems: Int, nbSplits: Int
+) -> [(start: Int, end: Int)]
+{
+    var batchRanges = [(start: Int, end: Int)]()
+    let batchSize = nbElems / nbSplits
+    let remaining = nbElems % nbSplits
+
+    var cur = 0
+    for block in 0..
[...]
 ())
 {
-    let nbThreads = ProcessInfo.processInfo.activeProcessorCount
-    if nbElems >= nbThreads
+    let nbThreads = min(
+        nbElems, ProcessInfo.processInfo.activeProcessorCount
+    )
+    if nbThreads > 1
     {
+        let batchRanges = splitBatch(nbElems: nbElems, nbSplits: nbThreads)
         DispatchQueue.concurrentPerform(iterations: nbThreads)
         {
             (thread: Int) in
-            let nbElemsPerThread = nbElems / nbThreads
-            let start = thread * nbElemsPerThread
-            let end = min(nbElems, (thread+1) * nbElemsPerThread)
-            
-            for elem in start..
[...]
-    else if nbElems > 1
-    {
-        DispatchQueue.concurrentPerform(iterations: nbElems)
-        {
-            (thread: Int) in
-            block(thread)
-        }
-    }
     else if nbElems == 1
     {
         block(0)
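The tail of splitBatch and the head of concurrentForEach are lost in the [...] gaps above, but the visible pieces (batchSize, remaining, cur) imply the usual balancing scheme: batchSize elements per split, with the remainder spread one extra element at a time. A self-contained Swift sketch of that logic, offered as an assumption rather than the elided code itself:

    // Assumed completion of the balancing logic: the first `remaining`
    // batches each take one extra element, so sizes differ by at most 1.
    func splitBatchSketch(
        nbElems: Int, nbSplits: Int
    ) -> [(start: Int, end: Int)]
    {
        let batchSize = nbElems / nbSplits
        let remaining = nbElems % nbSplits
        var ranges = [(start: Int, end: Int)]()
        var cur = 0
        for block in 0..<nbSplits
        {
            let extra = block < remaining ? 1 : 0
            ranges.append((start: cur, end: cur + batchSize + extra))
            cur += batchSize + extra
        }
        return ranges
    }

splitBatchSketch(nbElems: 10, nbSplits: 4) yields [(0, 3), (3, 6), (6, 8), (8, 10)]. Note that the deleted code used a floor division per thread with end = min(nbElems, (thread+1) * nbElemsPerThread), which silently dropped the trailing remainder (10 elements over 4 threads only covered indices 0..<8); the balanced split fixes that as well as the load imbalance.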
diff --git a/Tests/GrAIExamples/TransformerBenchmark.swift b/Tests/GrAIExamples/TransformerBenchmark.swift
index ae7c2455..3265c401 100644
--- a/Tests/GrAIExamples/TransformerBenchmark.swift
+++ b/Tests/GrAIExamples/TransformerBenchmark.swift
@@ -215,7 +215,7 @@ final class TransformerBenchmark: XCTestCase
     }
     
     /// Test: train a ViT model.
-    func test_TrainTransformer()
+    func _test_TrainTransformer()
     {
         // Get optimizer parameters for iterating over batch size elements.
         let params = _getOptimizerParams(nbLoops: _batchSize)
@@ -329,7 +329,7 @@ final class TransformerBenchmark: XCTestCase
     }
     
     /// Test: evaluate a ViT model.
-    func test_EvalTransformer()
+    func _test_EvalTransformer()
     {
         // Build a model with randomly initialized weights.
         let transformer = _buildModel(
diff --git a/Tests/GrAITests/Activation1DTests.swift b/Tests/GrAITests/Activation1DTests.swift
index 67716e23..4b3aa426 100644
--- a/Tests/GrAITests/Activation1DTests.swift
+++ b/Tests/GrAITests/Activation1DTests.swift
@@ -164,6 +164,23 @@ class Activation1DGradTests: Input1DMSE1DCase
         run(trainer)
     }
     
+    func testFLGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "FullyConnected", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
+    func testFLGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "FullyConnected", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
     func testFLGELUCPU() throws
     {
         GrAI.Opti.CPU = true
@@ -249,6 +266,23 @@ class Activation1DGradTests: Input1DMSE1DCase
         run(trainer)
     }
     
+    func testGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
+    func testGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
     func testGELUCPU() throws
     {
         GrAI.Opti.CPU = true
diff --git a/Tests/GrAITests/Activation2DTests.swift b/Tests/GrAITests/Activation2DTests.swift
index 852e19f2..0f821e63 100644
--- a/Tests/GrAITests/Activation2DTests.swift
+++ b/Tests/GrAITests/Activation2DTests.swift
@@ -256,6 +256,40 @@ class Activation2DGradTests: Input2DMSE1DCase
         run(trainer)
     }
     
+    func testConvGELUApproxNoBNCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Convolution", activation: GELUApprox.str, bn: false
+        )
+        run(trainer)
+    }
+    
+    func testConvGELUApproxBNCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Convolution", activation: GELUApprox.str, bn: true
+        )
+        run(trainer)
+    }
+    
+    func testConvGELUApproxNoBNGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Convolution", activation: GELUApprox.str, bn: false
+        )
+        run(trainer)
+    }
+    
+    func testConvGELUApproxBNGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Convolution", activation: GELUApprox.str, bn: true
+        )
+        run(trainer)
+    }
+    
     func testConvGELUNoBNCPU() throws
     {
         GrAI.Opti.CPU = true
@@ -358,6 +392,23 @@ class Activation2DGradTests: Input2DMSE1DCase
         run(trainer)
     }
     
+    func testGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str, bn: false
+        )
+        run(trainer)
+    }
+    
+    func testGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str, bn: false
+        )
+        run(trainer)
+    }
+    
     func testGELUCPU() throws
     {
         GrAI.Opti.CPU = true
diff --git a/Tests/GrAITests/ActivationSeqTests.swift b/Tests/GrAITests/ActivationSeqTests.swift
index 5eda7487..da7bb90c 100644
--- a/Tests/GrAITests/ActivationSeqTests.swift
+++ b/Tests/GrAITests/ActivationSeqTests.swift
@@ -171,6 +171,23 @@ class ActivationSeqGradTests: Input2DMSE1DCase
         run(trainer)
     }
     
+    func testFLGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "FullyConnected", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
+    func testFLGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "FullyConnected", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
     func testFLGELUCPU() throws
     {
         GrAI.Opti.CPU = true
@@ -256,6 +273,23 @@ class ActivationSeqGradTests: Input2DMSE1DCase
         run(trainer)
     }
    
+    func testGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
+    func testGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
     func testGELUCPU() throws
     {
         GrAI.Opti.CPU = true