diff --git a/CHANGELOG.md b/CHANGELOG.md
index c8e6aff9..a383b263 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,8 @@ All notable changes to this project will be documented in this file.
 
 ## [unreleased]
 
-🪜 **feat:** LayerCAM2D -> VQGrad2D, LayerCAMSeq -> VQGradSeq ([#114](https://github.com/owkin/GrAIdient/pull/114))\
+🚀 **perf:** Convolution2D ([118](https://github.com/owkin/GrAIdient/pull/118))\
+🪜 **feat:** LayerCAM2D -> VQGrad2D, LayerCAMSeq -> VQGradSeq ([#117](https://github.com/owkin/GrAIdient/pull/117))\
 ⚙️ **core:** GELU vs GELUApprox ([113](https://github.com/owkin/GrAIdient/pull/113))\
 🚀 **perf:** QuerySelf & ValueSelf ([112](https://github.com/owkin/GrAIdient/pull/112))\
 🚀 **perf:** benchmark ViT base model ([111](https://github.com/owkin/GrAIdient/pull/111))\
diff --git a/Sources/GrAIdient/Layer2D/Convolution2D.swift b/Sources/GrAIdient/Layer2D/Convolution2D.swift
index 9f0da6b3..6ac4c757 100644
--- a/Sources/GrAIdient/Layer2D/Convolution2D.swift
+++ b/Sources/GrAIdient/Layer2D/Convolution2D.swift
@@ -1373,8 +1373,21 @@ public class Convolution2D: BN2D, LayerWeightInit
             UInt32(weightHeight)]
         let pNbBatch: [UInt32] = [UInt32(batchSize)]
         
+        let kernel: String
+        let coeff: Int
+        if forwardKernel == "convForward" && nbChannels % 16 == 0
+        {
+            kernel = "conv16Forward"
+            coeff = 16
+        }
+        else
+        {
+            kernel = forwardKernel
+            coeff = 1
+        }
+        
         let command = MetalKernel.get.createCommand(
-            forwardKernel, deviceID: deviceID
+            kernel, deviceID: deviceID
         )
         command.setBuffer(layerPrev.outs.metal, atIndex: 0)
         command.setBuffer(_wBuffers.w.metal, atIndex: 1)
@@ -1390,7 +1403,7 @@ public class Convolution2D: BN2D, LayerWeightInit
         command.setBuffer(outs.metal, atIndex: 11)
         
         command.dispatchThreads(
-            width: nbChannels * width,
+            width: (nbChannels / coeff) * width,
             height: batchSize * height
         )
         command.enqueue()
@@ -1556,8 +1569,21 @@ public class Convolution2D: BN2D, LayerWeightInit
         let pNbBatch: [UInt32] = [UInt32(batchSize)]
         let pDirty: [UInt32] = layerPrev.dirty ? [1] : [0]
         
+        let kernel: String
+        let coeff: Int
+        if backwardKernel == "convBackward" && nbChannelsPrev % 16 == 0
+        {
+            kernel = "conv16Backward"
+            coeff = 16
+        }
+        else
+        {
+            kernel = backwardKernel
+            coeff = 1
+        }
+        
         let command = MetalKernel.get.createCommand(
-            backwardKernel, deviceID: deviceID
+            kernel, deviceID: deviceID
         )
         command.setBuffer(delta.metal, atIndex: 0)
         command.setBuffer(_wBuffers.w.metal, atIndex: 1)
@@ -1573,7 +1599,7 @@ public class Convolution2D: BN2D, LayerWeightInit
         command.setBuffer(layerPrev.delta.metal, atIndex: 11)
         
         command.dispatchThreads(
-            width: nbChannelsPrev * layerPrev.width,
+            width: (nbChannelsPrev / coeff) * layerPrev.width,
             height: batchSize * layerPrev.height
         )
         command.enqueue()
@@ -1609,27 +1635,56 @@ public class Convolution2D: BN2D, LayerWeightInit
         var command: MetalCommand
         if GrAI.Gradient.batch
         {
-            command = MetalKernel.get.createCommand(
-                batchDerWeightsKernel, deviceID: deviceID
-            )
-            command.setBuffer(layerPrev.outs.metal, atIndex: 0)
-            command.setBuffer(delta.metal, atIndex: 1)
-            command.setBytes(pStart, atIndex: 2)
-            command.setBytes(pStride, atIndex: 3)
-            command.setBytes(pNbChannels, atIndex: 4)
-            command.setBytes(pNbChannelsPrev, atIndex: 5)
-            command.setBytes(pDimensions, atIndex: 6)
-            command.setBytes(pDimensionsPrev, atIndex: 7)
-            command.setBytes(pDimWeights, atIndex: 8)
-            command.setBytes(pNbBatch, atIndex: 9)
-            command.setBytes(pAccumulate, atIndex: 10)
-            command.setBuffer(_wBuffers.g.metal, atIndex: 11)
-            
-            command.dispatchThreads(
-                width: nbChannels * weightWidth,
-                height: nbChannelsPrev * weightHeight
-            )
-            command.enqueue()
+            if batchDerWeightsKernel == "convBatchDerWeights" &&
+               _stride == 1 &&
+               layerPrev.width == width &&
+               layerPrev.height == height &&
+               weightWidth == 3 && weightHeight == 3 &&
+               height % 2 == 0 && width % 4 == 0
+            {
+                command = MetalKernel.get.createCommand(
+                    "conv34BatchDerWeights", deviceID: deviceID
+                )
+                command.setBuffer(layerPrev.outs.metal, atIndex: 0)
+                command.setBuffer(delta.metal, atIndex: 1)
+                command.setBytes(pNbChannels, atIndex: 2)
+                command.setBytes(pNbChannelsPrev, atIndex: 3)
+                command.setBytes(pDimensions, atIndex: 4)
+                command.setBytes(pDimensionsPrev, atIndex: 5)
+                command.setBytes(pNbBatch, atIndex: 6)
+                command.setBytes(pAccumulate, atIndex: 7)
+                command.setBuffer(_wBuffers.g.metal, atIndex: 8)
+                
+                command.dispatchThreads(
+                    width: nbChannels,
+                    height: nbChannelsPrev
+                )
+                command.enqueue()
+            }
+            else
+            {
+                command = MetalKernel.get.createCommand(
+                    batchDerWeightsKernel, deviceID: deviceID
+                )
+                command.setBuffer(layerPrev.outs.metal, atIndex: 0)
+                command.setBuffer(delta.metal, atIndex: 1)
+                command.setBytes(pStart, atIndex: 2)
+                command.setBytes(pStride, atIndex: 3)
+                command.setBytes(pNbChannels, atIndex: 4)
+                command.setBytes(pNbChannelsPrev, atIndex: 5)
+                command.setBytes(pDimensions, atIndex: 6)
+                command.setBytes(pDimensionsPrev, atIndex: 7)
+                command.setBytes(pDimWeights, atIndex: 8)
+                command.setBytes(pNbBatch, atIndex: 9)
+                command.setBytes(pAccumulate, atIndex: 10)
+                command.setBuffer(_wBuffers.g.metal, atIndex: 11)
+                
+                command.dispatchThreads(
+                    width: nbChannels * weightWidth,
+                    height: nbChannelsPrev * weightHeight
+                )
+                command.enqueue()
+            }
         
         if _updateBiases
         {
diff --git a/Sources/GrAIdient/Metal/Kernel/Convolution.metal b/Sources/GrAIdient/Metal/Kernel/Convolution.metal
index 220e4c0b..9a688895 100644
--- a/Sources/GrAIdient/Metal/Kernel/Convolution.metal
+++ b/Sources/GrAIdient/Metal/Kernel/Convolution.metal
@@ -104,6 +104,108 @@ kernel void convForward(
     outs[offset] = tmp;
 }
 
+kernel void conv16Forward(
+    const device float * outsPrev,
+    const device float * weights,
+    const device float * biases,
+    constant int * pStart,
+    constant uint * pStride,
+    constant uint * pNbChannels,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pDimWeights,
+    constant uint * pNbBatch,
+    device float * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint weightHeight, weightWidth;
+    uint nbChannels;
+    uint nbChannelsPrev;
+    int startI, startJ;
+    int endI, endJ;
+    int offI, offJ;
+    uint stride;
+    uint nbBatch;
+    
+    if (pStart && pStride && pNbChannels && pNbChannelsPrev &&
+        pDimensions && pDimensionsPrev && pDimWeights && pNbBatch &&
+        outsPrev && weights && biases && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        weightWidth = pDimWeights[0];
+        weightHeight = pDimWeights[1];
+        nbChannels = *pNbChannels;
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+        startI = pStart[0];
+        endI = pStart[1];
+        startJ = pStart[2];
+        endJ = pStart[3];
+        offI = pStart[4];
+        offJ = pStart[5];
+        stride = pStride[0];
+    }
+    else
+        return ;
+    
+    uint coeff = 16;
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+    
+    if (i * elem >= height * nbBatch ||
+        j * depth * coeff >= width * nbChannels)
+    {
+        return ;
+    }
+    
+    float tmp[16] = {0};
+    for (uint depthPrev=0; depthPrev<nbChannelsPrev; depthPrev++)
+    {
+        uint offsetStartPrev =
+            (depthPrev + nbChannelsPrev * elem) * heightPrev;
+        uint offsetStartWeights =
+            (depthPrev + nbChannelsPrev * coeff * depth) * weightHeight;
+        
+        for (int k=startI; k<=endI; k++){
+        for (int l=startJ; l<=endJ; l++)
+        {
+            if ((int)(stride*j)+l-offJ >= 0 &&
+                (int)(stride*j)+l-offJ < (int)widthPrev &&
+                (int)(stride*i)+k-offI >= 0 &&
+                (int)(stride*i)+k-offI < (int)heightPrev)
+            {
+                uint offsetPrev = (int)(stride*j)+l-offJ +
+                    (offsetStartPrev + (int)(stride*i)+k-offI)*widthPrev;
+                float outPrev = outsPrev[offsetPrev];
+                
+                for (uint c=0; c<coeff; c++)
+                {
+                    uint offsetWeights = l-startJ +
+                        (offsetStartWeights +
+                         c * nbChannelsPrev * weightHeight +
+                         k-startI) * weightWidth;
+                    float w = weights[offsetWeights];
+                    tmp[c] += outPrev * w;
+                }
+            }
+        }}
+    }
+    
+    for (uint c=0; c<coeff; c++)
+    {
+        uint offsetStart =
+            (coeff * depth + c + nbChannels * elem) * height;
+        uint offset = j + (offsetStart + i) * width;
+        outs[offset] = tmp[c] + biases[coeff * depth + c];
+    }
+}
+
+kernel void conv16Backward(
+    const device float * delta,
+    const device float * weights,
+    constant int * pStart,
+    constant uint * pStride,
+    constant uint * pNbChannels,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pDimWeights,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device float * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint weightHeight, weightWidth;
+    uint nbChannels;
+    uint nbChannelsPrev;
+    int startI, startJ;
+    int endI, endJ;
+    int offI, offJ;
+    uint stride;
+    uint nbBatch;
+    uint dirty;
+    
+    if (pStart && pStride && pNbChannels && pNbChannelsPrev &&
+        pDimensions && pDimensionsPrev && pDimWeights && pNbBatch &&
+        pDirty && delta && weights && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        weightWidth = pDimWeights[0];
+        weightHeight = pDimWeights[1];
+        nbChannels = *pNbChannels;
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+        startI = pStart[0];
+        endI = pStart[1];
+        startJ = pStart[2];
+        endJ = pStart[3];
+        offI = pStart[4];
+        offJ = pStart[5];
+        stride = pStride[0];
+        dirty = *pDirty;
+    }
+    else
+        return ;
+    
+    uint coeff = 16;
+    uint depthPrev = id[0] / widthPrev;
+    uint elem = id[1] / heightPrev;
+    uint i = id[1] % heightPrev;
+    uint j = id[0] % widthPrev;
+    
+    if (i * elem >= heightPrev * nbBatch ||
+        j * depthPrev * coeff >= widthPrev * nbChannelsPrev)
+    {
+        return ;
+    }
+    
+    float tmp[16] = {0};
+    for (uint depth=0; depth<nbChannels; depth++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offsetStartWeights =
+            (coeff * depthPrev + nbChannelsPrev * depth) * weightHeight;
+        
+        for (int k=startI; k<=endI; k++){
+        for (int l=startJ; l<=endJ; l++)
+        {
+            int i1, j1;
+            if ((i-k+offI) % stride != 0){}
+            else if ((j-l+offJ) % stride != 0){}
+            else
+            {
+                i1 = (i-k+offI) / stride;
+                j1 = (j-l+offJ) / stride;
+                
+                if (j1 >= 0 && j1 < (int)width &&
+                    i1 >= 0 && i1 < (int)height)
+                {
+                    uint offset = j1 + (offsetStart + i1) * width;
+                    float deltaCur = delta[offset];
+                    
+                    for (uint c=0; c<coeff; c++)
+                    {
+                        uint offsetWeights = l-startJ +
+                            (offsetStartWeights +
+                             c * weightHeight + k-startI) * weightWidth;
+                        float w = weights[offsetWeights];
+                        tmp[c] += deltaCur * w;
+                    }
+                }
+            }
+        }}
+    }
+    
+    for (uint c=0; c<coeff; c++)
+    {
+        uint offsetStartPrev =
+            (coeff * depthPrev + c + nbChannelsPrev * elem) * heightPrev;
+        uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+        
+        if (dirty)
+        {
+            deltaPrev[offsetPrev] = tmp[c];
+        }
+        else
+        {
+            deltaPrev[offsetPrev] += tmp[c];
+        }
+    }
+}
+
+kernel void conv34BatchDerWeights(
+    const device float4 * outsPrev,
+    const device float4 * delta,
+    constant uint * pNbChannels,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    constant uint * pAccumulate,
+    device float * grads,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbChannelsPrev;
+    uint nbBatch;
+    uint accumulate;
+    
+    if (pNbChannels && pNbChannelsPrev && pDimensions && pDimensionsPrev &&
+        pNbBatch && pAccumulate && outsPrev && delta && grads)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannels = *pNbChannels;
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+        accumulate = *pAccumulate;
+    }
+    else
+        return ;
+    
+    uint depth = id[0];
+    uint depthPrev = id[1];
+    
+    if (id[0] >= nbChannels ||
+        id[1] >= nbChannelsPrev)
+    {
+        return ;
+    }
+    
+    float tmp[9] = {0.0};
+    for (uint elem=0; elem<nbBatch; elem++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offsetStartPrev =
+            (depthPrev + nbChannelsPrev * elem) * heightPrev;
+        
+        for (uint k=0; k<height/2; k++){
+        for (uint l=0; l<width/4; l++)
+        {
+            uint offset4 = (l*4 + (offsetStart + k*2) * width) / 4;
+            uint offset7 = (l*4 + (offsetStart + k*2+1) * width) / 4;
+            float4 delta4 = delta[offset4];
+            float4 delta7 = delta[offset7];
+            
+            if (k > 0 && l > 0)
+            {
+                uint offsetPrev0 =
+                    ((l-1)*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4;
+                float outPrev0 = outsPrev[offsetPrev0][3];
+                
+                tmp[0] += outPrev0 * delta4[0];
+            }
+            if (k > 0)
+            {
+                uint offsetPrev1 =
+                    (l*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4;
+                float4 outPrev1 = outsPrev[offsetPrev1];
+                
+                tmp[0] += outPrev1[0] * delta4[1];
+                tmp[0] += outPrev1[1] * delta4[2];
+                tmp[0] += outPrev1[2] * delta4[3];
+                
+                float4 sum = outPrev1 * delta4;
+                tmp[1] += sum[0] + sum[1] + sum[2] + sum[3];
+                
+                tmp[2] += outPrev1[1] * delta4[0];
+                tmp[2] += outPrev1[2] * delta4[1];
+                tmp[2] += outPrev1[3] * delta4[2];
+            }
+            if (k > 0 && (l+1)*4 < width)
+            {
+                uint offsetPrev2 =
+                    ((l+1)*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4;
+                float outPrev2 = outsPrev[offsetPrev2][0];
+                
+                tmp[2] += outPrev2 * delta4[3];
+            }
+            
+            if (l > 0)
+            {
+                uint offsetPrev3 =
+                    ((l-1)*4 + (offsetStartPrev + k*2) * widthPrev) / 4;
+                uint offsetPrev6 =
+                    ((l-1)*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4;
+                float outPrev3 = outsPrev[offsetPrev3][3];
+                float outPrev6 = outsPrev[offsetPrev6][3];
+                
+                tmp[0] += outPrev3 * delta7[0];
+                tmp[3] += outPrev3 * delta4[0];
+                tmp[3] += outPrev6 * delta7[0];
+                tmp[6] += outPrev6 * delta4[0];
+            }
+            
+            uint offsetPrev4 =
+                (l*4 + (offsetStartPrev + k*2) * widthPrev) / 4;
+            uint offsetPrev7 =
+                (l*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4;
+            float4 outPrev4 = outsPrev[offsetPrev4];
+            float4 outPrev7 = outsPrev[offsetPrev7];
+            
+            tmp[0] += outPrev4[0] * delta7[1];
+            tmp[0] += outPrev4[1] * delta7[2];
+            tmp[0] += outPrev4[2] * delta7[3];
+            
+            float4 sum = outPrev4 * delta7;
+            tmp[1] += sum[0] + sum[1] + sum[2] + sum[3];
+            
+            tmp[2] += outPrev4[1] * delta7[0];
+            tmp[2] += outPrev4[2] * delta7[1];
+            tmp[2] += outPrev4[3] * delta7[2];
+            
+            tmp[3] += outPrev4[0] * delta4[1];
+            tmp[3] += outPrev4[1] * delta4[2];
+            tmp[3] += outPrev4[2] * delta4[3];
+            tmp[3] += outPrev7[0] * delta7[1];
+            tmp[3] += outPrev7[1] * delta7[2];
+            tmp[3] += outPrev7[2] * delta7[3];
+            
+            sum = outPrev4 * delta4;
+            tmp[4] += sum[0] + sum[1] + sum[2] + sum[3];
+            sum = outPrev7 * delta7;
+            tmp[4] += sum[0] + sum[1] + sum[2] + sum[3];
+            
+            tmp[5] += outPrev4[1] * delta4[0];
+            tmp[5] += outPrev4[2] * delta4[1];
+            tmp[5] += outPrev4[3] * delta4[2];
+            tmp[5] += outPrev7[1] * delta7[0];
+            tmp[5] += outPrev7[2] * delta7[1];
+            tmp[5] += outPrev7[3] * delta7[2];
+            
+            tmp[6] += outPrev7[0] * delta4[1];
+            tmp[6] += outPrev7[1] * delta4[2];
+            tmp[6] += outPrev7[2] * delta4[3];
+            
+            sum = outPrev7 * delta4;
+            tmp[7] += sum[0] + sum[1] + sum[2] + sum[3];
+            
+            tmp[8] += outPrev7[1] * delta4[0];
+            tmp[8] += outPrev7[2] * delta4[1];
+            tmp[8] += outPrev7[3] * delta4[2];
+            
+            if ((l+1)*4 < width)
+            {
+                uint offsetPrev5 =
+                    ((l+1)*4 + (offsetStartPrev + k*2) * widthPrev) / 4;
+                uint offsetPrev8 =
+                    ((l+1)*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4;
+                float outPrev5 = outsPrev[offsetPrev5][0];
+                float outPrev8 = outsPrev[offsetPrev8][0];
+                
+                tmp[2] += outPrev5 * delta7[3];
+                tmp[5] += outPrev5 * delta4[3];
+                tmp[5] += outPrev8 * delta7[3];
+                tmp[8] += outPrev8 * delta4[3];
+            }
+            
+            if ((k+1)*2 < height && l > 0)
+            {
+                uint offsetPrev9 =
+                    ((l-1)*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4;
+                float outPrev9 = outsPrev[offsetPrev9][3];
+                
+                tmp[6] += outPrev9 * delta7[0];
+            }
+            if ((k+1)*2 < height)
+            {
+                uint offsetPrev10 =
+                    (l*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4;
+                float4 outPrev10 = outsPrev[offsetPrev10];
+                
+                tmp[6] += outPrev10[0] * delta7[1];
+                tmp[6] += outPrev10[1] * delta7[2];
+                tmp[6] += outPrev10[2] * delta7[3];
+                
+                float4 sum = outPrev10 * delta7;
+                tmp[7] += sum[0] + sum[1] + sum[2] + sum[3];
+                
+                tmp[8] += outPrev10[1] * delta7[0];
+                tmp[8] += outPrev10[2] * delta7[1];
+                tmp[8] += outPrev10[3] * delta7[2];
+            }
+            if ((k+1)*2 < height && (l+1)*4 < width)
+            {
+                uint offsetPrev11 =
+                    ((l+1)*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4;
+                float outPrev11 = outsPrev[offsetPrev11][0];
+                
+                tmp[8] += outPrev11 * delta7[3];
+            }
+        }}
+    }
+    
+    uint offsetStartWeights = (depthPrev + nbChannelsPrev * depth) * 3;
+    uint offsetWeights0 = 0 + (offsetStartWeights + 0) * 3;
+    uint offsetWeights1 = 1 + (offsetStartWeights + 0) * 3;
+    uint offsetWeights2 = 2 + (offsetStartWeights + 0) * 3;
+    uint offsetWeights3 = 0 + (offsetStartWeights + 1) * 3;
+    uint offsetWeights4 = 1 + (offsetStartWeights + 1) * 3;
+    uint offsetWeights5 = 2 + (offsetStartWeights + 1) * 3;
+    uint offsetWeights6 = 0 + (offsetStartWeights + 2) * 3;
+    uint offsetWeights7 = 1 + (offsetStartWeights + 2) * 3;
+    uint offsetWeights8 = 2 + (offsetStartWeights + 2) * 3;
+    
+    if (accumulate)
+    {
+        grads[offsetWeights0] += tmp[0];
+        grads[offsetWeights1] += tmp[1];
+        grads[offsetWeights2] += tmp[2];
+        grads[offsetWeights3] += tmp[3];
+        grads[offsetWeights4] += tmp[4];
+        grads[offsetWeights5] += tmp[5];
+        grads[offsetWeights6] += tmp[6];
+        grads[offsetWeights7] += tmp[7];
+        grads[offsetWeights8] += tmp[8];
+    }
+    else
+    {
+        grads[offsetWeights0] = tmp[0];
+        grads[offsetWeights1] = tmp[1];
+        grads[offsetWeights2] = tmp[2];
+        grads[offsetWeights3] = tmp[3];
+        grads[offsetWeights4] = tmp[4];
+        grads[offsetWeights5] = tmp[5];
+        grads[offsetWeights6] = tmp[6];
+        grads[offsetWeights7] = tmp[7];
+        grads[offsetWeights8] = tmp[8];
+    }
+}
+
 kernel void convBatchDerBiases(
     const device float * delta,
     constant uint * pNbChannels,
diff --git a/Sources/GrAIdient/Metal/MetalConfig.swift b/Sources/GrAIdient/Metal/MetalConfig.swift
index cad15f5c..8776d4d4 100644
--- a/Sources/GrAIdient/Metal/MetalConfig.swift
+++ b/Sources/GrAIdient/Metal/MetalConfig.swift
@@ -35,8 +35,11 @@ let CONFIG_KERNELS =
     ],
     "Convolution": [
         "convForward",
+        "conv16Forward",
         "convBackward",
+        "conv16Backward",
         "convBatchDerWeights",
+        "conv34BatchDerWeights",
         "convBatchDerBiases",
         "convDerWeights",
         "convDerBiases",
diff --git a/Tests/GrAIExamples/VGGBenchmark.swift b/Tests/GrAIExamples/VGGBenchmark.swift
new file mode 100644
index 00000000..0a3bbd99
--- /dev/null
+++ b/Tests/GrAIExamples/VGGBenchmark.swift
@@ -0,0 +1,395 @@
+//
+// VGGBenchmark.swift
+// GrAIExamples
+//
+// Created by Jean-François Reboud on 24/02/2024.
+//
+
+import XCTest
+import GrAIdient
+
+/// Benchmark time spent for training and evaluating a VGG model with fake data.
+final class VGGBenchmark: XCTestCase
+{
+    /// Batch size of data.
+    let _batchSize = 64
+    /// Size of one image (height and width are the same).
+    let _size = 224
+    
+    /// Initialize test.
+    override func setUp()
+    {
+        setPythonLib()
+        _ = MetalKernel.get
+        GrAI.Opti.GPU = true
+    }
+    
+    ///
+    /// Get optimizer parameters for model training.
+    ///
+    /// - Parameter nbLoops: Number of steps per epoch.
+    /// - Returns: The optimizer parameters.
+    ///
+    func _getOptimizerParams(nbLoops: Int) -> GrAI.Optimizer.Params
+    {
+        var optimizerParams = GrAI.Optimizer.Params()
+        optimizerParams.nbLoops = nbLoops
+        
+        // Simple optimizer scheduler: always the same optimizer during
+        // the training.
+        optimizerParams.optimizer = ConstEpochsScheduler(
+            GrAI.Optimizer.Class.AdamRectified
+        )
+        
+        // Simple variable scheduler: always the same variable during
+        // the training.
+        optimizerParams.variables["alpha"] = ConstEpochsVar(
+            value: ConstVal(1e-3)
+        )
+        optimizerParams.variables["lambda"] = ConstEpochsVar(
+            value: ConstVal(1e-6)
+        )
+        
+        // Other schedulers can be built thanks to `GrAI.Optimizer.Params`.
+        return optimizerParams
+    }
+    
+    ///
+    /// Build a simple model.
+    ///
+    /// - Parameter bn: Whether to use batch normalization or not.
+    /// - Returns: The model built.
+    ///
+    func _buildModel(bn: Bool) -> Model
+    {
+        // Create the context to build a graph of layers where
+        // there is no previous model dependency: layer id starts at 0.
+        let context = ModelContext(name: "VGG16", models: [])
+        let params = GrAI.Model.Params(context: context)
+        
+        var layer: Layer2D
+        layer = Input2D(
+            nbChannels: 3,
+            width: _size, height: _size,
+            params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 64, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 64, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 128, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 128, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 256, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 256, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 256, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = AdaptiveAvgPool2D(layerPrev: layer, size: 7, params: params)
+        
+        var head: Layer1D = try! FullyConnected(
+            layerPrev: layer,
+            nbNeurons: 4096,
+            activation: ReLU.str,
+            biases: true,
+            params: params
+        )
+        head = try! FullyConnected(
+            layerPrev: head,
+            nbNeurons: 4096,
+            activation: ReLU.str,
+            biases: true,
+            params: params
+        )
+        head = try! FullyConnected(
+            layerPrev: head,
+            nbNeurons: 1,
+            activation: ReLU.str,
+            biases: true,
+            params: params
+        )
+        
+        _ = MSE1D(layerPrev: head, params: params)
+        
+        // Retrieve base model in the context and initialize a
+        // real model (with `layerPrev` links updated).
+        let model = Model(model: context.model, modelsPrev: [])
+        return model
+    }
+    
+    /// Test: train a VGG model.
+    func _test_TrainVGG()
+    {
+        // Get optimizer parameters for iterating over batch size elements.
+        let params = _getOptimizerParams(nbLoops: _batchSize)
+        
+        // Build a model with randomly initialized weights.
+        let vgg = _buildModel(bn: false)
+        
+        // Initialize for training.
+        vgg.initialize(params: params, phase: .Training)
+        
+        let firstLayer: Input2D = vgg.layers.first as! Input2D
+        let lastLayer: MSE1D = vgg.layers.last as! MSE1D
+        
+        // Initialize the ground truth once and for all.
+        let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0)
+        let buffer = groundTruth.buffer
+        for elem in 0..<_batchSize / 2
+        {
+            buffer[elem] = 0.0
+        }
+        for elem in _batchSize / 2..<_batchSize
+        {
+            buffer[elem] = 1.0
+        }
+        groundTruth.upload()
+        
+        // Initialize data once and for all.
+        let data = MetalPrivateBuffer<Float>(
+            _batchSize * 3 * _size * _size, deviceID: 0
+        )
+        let dataBuffer = data.shared.buffer
+        for i in 0..<_batchSize * 3 * _size * _size
+        {
+            dataBuffer[i] = Float.random(in: -1..<1)
+        }
+        data.upload()
+        
+        let nbEpochs = 1
+        let nbSteps = 20
+        for epoch in 0..<nbEpochs
+        {
+            print("EPOCH \(epoch+1)/\(nbEpochs).")
+            
+            let start1 = Date()
+            for step in 0..<nbSteps
+            {
+                let start2 = Date()
+                
+                // Reset gradient validity for backward pass
+                // and update the batch size (although it stays the same here).
+                vgg.updateKernel(batchSize: _batchSize)
+                
+                // Set data.
+                try! firstLayer.setDataGPU(
+                    data, batchSize: _batchSize,
+                    nbChannels: 3, height: _size, width: _size
+                )
+                
+                // Forward.
+                try! vgg.forward()
+                
+                // Apply loss derivative.
+                try! lastLayer.lossDerivativeGPU(
+                    groundTruth, batchSize: _batchSize, nbNeurons: 1
+                )
+                
+                // Backward.
+                try! vgg.backward()
+                
+                // Update weights.
+                try! vgg.update()
+                
+                let end2 = Date().timeIntervalSince(start2)
+                print("Step \(step+1)/\(nbSteps): \(end2)s.")
+            }
+            
+            let end1 = Date().timeIntervalSince(start1)
+            print("Epoch \(epoch+1)/\(nbEpochs), in \(end1)s.")
+        }
+    }
+    
+    /// Test: evaluate a VGG model.
+    func _test_EvalVGG()
+    {
+        // Build a model with randomly initialized weights.
+        let vgg = _buildModel(bn: false)
+        
+        // Initialize for inference.
+        vgg.initKernel(phase: .Inference)
+        
+        let firstLayer: Input2D = vgg.layers.first as! Input2D
+        let lastLayer: MSE1D = vgg.layers.last as! MSE1D
+        
+        // Initialize the ground truth once and for all.
+        let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0)
+        let gtBuffer = groundTruth.buffer
+        for elem in 0..<_batchSize / 2
+        {
+            gtBuffer[elem] = 0.0
+        }
+        for elem in _batchSize / 2..<_batchSize
+        {
+            gtBuffer[elem] = 1.0
+        }
+        groundTruth.upload()
+        
+        // Initialize data once and for all.
+        let data = MetalPrivateBuffer<Float>(
+            _batchSize * 3 * _size * _size, deviceID: 0
+        )
+        let dataBuffer = data.shared.buffer
+        for i in 0..<_batchSize * 3 * _size * _size
+        {
+            dataBuffer[i] = Float.random(in: -1..<1)
+        }
+        data.upload()
+        
+        let nbEpochs = 2
+        let nbSteps = 20
+        for epoch in 0..<nbEpochs
+        {
+            print("EPOCH \(epoch+1)/\(nbEpochs).")
+            
+            let start1 = Date()
+            for step in 0..<nbSteps
+            {
+                let start2 = Date()
+                
+                // Update the batch size (although it stays the same here).
+                vgg.updateKernel(batchSize: _batchSize)
+                
+                // Set data.
+                try! firstLayer.setDataGPU(
+                    data, batchSize: _batchSize,
+                    nbChannels: 3, height: _size, width: _size
+                )
+                
+                // Forward.
+                try! vgg.forward()
+                
+                // Get the loss on the last layer.
+                let loss = try! lastLayer.getLossGPU(
+                    groundTruth, batchSize: _batchSize, nbNeurons: 1
+                )
+                
+                let end2 = Date().timeIntervalSince(start2)
+                print("Step \(step+1)/\(nbSteps): \(end2)s, loss: \(loss).")
+            }
+            
+            let end1 = Date().timeIntervalSince(start1)
+            print("Epoch \(epoch+1)/\(nbEpochs), in \(end1)s.")
+        }
+    }
+}
diff --git a/Tests/GrAIdientTests/Layer2DTests.swift b/Tests/GrAIdientTests/Layer2DTests.swift
--- a/Tests/GrAIdientTests/Layer2DTests.swift
+++ b/Tests/GrAIdientTests/Layer2DTests.swift
+class Layer2DConvFlowTests: Input2DMSE1DCase
+{
+    private func _buildTrainer(model: String, bn: Bool) -> FlowTrainer
+    {
+        let trainer = FlowTrainer(
+            name: "Layer2D",
+            params: optimizerParams
+        )
+        trainer.build()
+        {
+            (context: ModelContext) in
+            buildModel(model: model, bn: bn, context: context)
+        }
+        return trainer
+    }
+    
+    func buildModel(model: String, bn: Bool, context: ModelContext)
+    {
+        let params = GrAI.Model.Params(context: context)
+        
+        var layer: Layer2D = Input2D(
+            nbChannels: 1, width: width, height: height, params: params
+        )
+        var head: Layer1D? = nil
+        
+        layer = Convolution2D(
+            layerPrev: layer, size: 1, nbChannels: 32, stride: 1,
+            activation: LeakyReLU.str, biases: true, bn: false, params: params
+        )
+        
+        switch model
+        {
+        case "Convolution1":
+            layer = Convolution2D(
+                layerPrev: layer, size: 3, nbChannels: 32, stride: 1,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        case "Convolution2":
+            layer = Convolution2D(
+                layerPrev: layer, size: 2, nbChannels: 32, stride: 1,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        case "ConvolutionStride1":
+            layer = Convolution2D(
+                layerPrev: layer, size: 3, nbChannels: 32, stride: 2,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        case "ConvolutionStride2":
+            layer = Convolution2D(
+                layerPrev: layer, size: 2, nbChannels: 32, stride: 2,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        case "Deconvolution":
+            layer = Deconvolution2D(
+                layerPrev: layer, size: 3, nbChannels: 16, stride: 1,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        default:
+            fatalError("Unreachable.")
+        }
+        
+        if head == nil
+        {
+            head = AvgPool2D(layerPrev: layer, params: params)
+        }
+        
+        head = try! FullyConnected(
+            layerPrev: head!, nbNeurons: 1,
+            activation: LeakyReLU.str, biases: true, params: params
+        )
+        
+        head = MSE1D(layerPrev: head!, params: params)
+    }
+    
+    func testConvolution1() throws
+    {
+        let trainer = _buildTrainer(model: "Convolution1", bn: false)
+        run(trainer)
+    }
+    
+    func testConvolution2() throws
+    {
+        let trainer = _buildTrainer(model: "Convolution2", bn: false)
+        run(trainer)
+    }
+    
+    func testConvolutionStride1() throws
+    {
+        let trainer = _buildTrainer(model: "ConvolutionStride1", bn: false)
+        run(trainer)
+    }
+    
+    func testConvolutionStride2() throws
+    {
+        let trainer = _buildTrainer(model: "ConvolutionStride2", bn: false)
+        run(trainer)
+    }
+    
+    func testDeconvolution() throws
+    {
+        let trainer = _buildTrainer(model: "Deconvolution", bn: false)
+        run(trainer)
+    }
+}
+
 // -----------------------------------------------------------------------------
 // Compare GPU gradients with CPU ones through time.
 // We expect to see errors ~ 1e-7 and less.
@@ -2194,7 +2322,7 @@ class Layer2DFlowResetTests: Layer2DFlowTests
     override func testInstanceNorm() throws
     {
         let trainer = _buildTrainer(model: "InstanceNorm", bn: false)
-        run(trainer)
+        run(trainer, diffThreshold: 0.0001)
     }
     
     override func testAdaIN() throws
@@ -2600,7 +2728,7 @@ class Layer2DFlowReverseTests: Layer2DFlowTests
     override func testAdaIN() throws
    {
         let trainer = _buildTrainer(model: "AdaIN", bn: false)
-        run(trainer)
+        run(trainer, diffThreshold: 0.0001)
     }
     
     override func testConstant() throws
@@ -2947,7 +3075,7 @@ class Layer2DFlowAccumulateTests: Input2DMSE1DCase
     func testInstanceNorm() throws
     {
         let trainer = _buildTrainer(model: "InstanceNorm", bn: false)
-        run(trainer)
+        run(trainer, diffThreshold: 0.0001)
     }
     
     func testConstant() throws
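
The grid arithmetic behind the kernel selection in `Convolution2D.swift` can be sketched outside the diff. This is a minimal illustrative helper, not part of GrAIdient's API; the name `forwardDispatch` and its tuple return are assumptions for the example:

```swift
// Illustrative sketch of the dispatch logic introduced above:
// one `conv16Forward` thread accumulates 16 output channels at a single
// spatial position, so the thread-grid width shrinks by the same factor.
// `forwardDispatch` is a hypothetical helper, not a GrAIdient API.
func forwardDispatch(nbChannels: Int, width: Int) -> (kernel: String, gridWidth: Int)
{
    if nbChannels % 16 == 0
    {
        return ("conv16Forward", (nbChannels / 16) * width)
    }
    return ("convForward", nbChannels * width)
}

// Example: 64 channels on a 224x224 map gives a grid width of
// (64 / 16) * 224 = 896 instead of 64 * 224 = 14336; each thread then
// does 16 times the work, reusing the same input activation across
// the 16 weight slices it reads.
```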