diff --git a/CHANGELOG.md b/CHANGELOG.md
index c8e6aff9..a383b263 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,8 @@ All notable changes to this project will be documented in this file.
 
 ## [unreleased]
 
-🪜 **feat:** LayerCAM2D -> VQGrad2D, LayerCAMSeq -> VQGradSeq ([#114](https://github.com/owkin/GrAIdient/pull/114))\
+🚀 **perf:** Convolution2D ([118](https://github.com/owkin/GrAIdient/pull/118))\
+🪜 **feat:** LayerCAM2D -> VQGrad2D, LayerCAMSeq -> VQGradSeq ([#117](https://github.com/owkin/GrAIdient/pull/117))\
 ⚙️ **core:** GELU vs GELUApprox ([113](https://github.com/owkin/GrAIdient/pull/113))\
 🚀 **perf:** QuerySelf & ValueSelf ([112](https://github.com/owkin/GrAIdient/pull/112))\
 🚀 **perf:** benchmark ViT base model ([111](https://github.com/owkin/GrAIdient/pull/111))\
diff --git a/Sources/GrAIdient/Layer2D/Convolution2D.swift b/Sources/GrAIdient/Layer2D/Convolution2D.swift
index 9f0da6b3..6ac4c757 100644
--- a/Sources/GrAIdient/Layer2D/Convolution2D.swift
+++ b/Sources/GrAIdient/Layer2D/Convolution2D.swift
@@ -1373,8 +1373,21 @@ public class Convolution2D: BN2D, LayerWeightInit
             UInt32(weightHeight)]
         let pNbBatch: [UInt32] = [UInt32(batchSize)]
         
+        let kernel: String
+        let coeff: Int
+        if forwardKernel == "convForward" && nbChannels % 16 == 0
+        {
+            kernel = "conv16Forward"
+            coeff = 16
+        }
+        else
+        {
+            kernel = forwardKernel
+            coeff = 1
+        }
+        
         let command = MetalKernel.get.createCommand(
-            forwardKernel, deviceID: deviceID
+            kernel, deviceID: deviceID
         )
         command.setBuffer(layerPrev.outs.metal, atIndex: 0)
         command.setBuffer(_wBuffers.w.metal, atIndex: 1)
@@ -1390,7 +1403,7 @@ public class Convolution2D: BN2D, LayerWeightInit
         command.setBuffer(outs.metal, atIndex: 11)
         
         command.dispatchThreads(
-            width: nbChannels * width,
+            width: (nbChannels / coeff) * width,
             height: batchSize * height
         )
         command.enqueue()
@@ -1556,8 +1569,21 @@ public class Convolution2D: BN2D, LayerWeightInit
         let pNbBatch: [UInt32] = [UInt32(batchSize)]
         let pDirty: [UInt32] = layerPrev.dirty ? [1] : [0]
         
+        let kernel: String
+        let coeff: Int
+        if backwardKernel == "convBackward" && nbChannelsPrev % 16 == 0
+        {
+            kernel = "conv16Backward"
+            coeff = 16
+        }
+        else
+        {
+            kernel = backwardKernel
+            coeff = 1
+        }
+        
         let command = MetalKernel.get.createCommand(
-            backwardKernel, deviceID: deviceID
+            kernel, deviceID: deviceID
         )
         command.setBuffer(delta.metal, atIndex: 0)
         command.setBuffer(_wBuffers.w.metal, atIndex: 1)
@@ -1573,7 +1599,7 @@ public class Convolution2D: BN2D, LayerWeightInit
         command.setBuffer(layerPrev.delta.metal, atIndex: 11)
         
         command.dispatchThreads(
-            width: nbChannelsPrev * layerPrev.width,
+            width: (nbChannelsPrev / coeff) * layerPrev.width,
             height: batchSize * layerPrev.height
         )
         command.enqueue()
@@ -1609,27 +1635,56 @@ public class Convolution2D: BN2D, LayerWeightInit
         var command: MetalCommand
         if GrAI.Gradient.batch
         {
-            command = MetalKernel.get.createCommand(
-                batchDerWeightsKernel, deviceID: deviceID
-            )
-            command.setBuffer(layerPrev.outs.metal, atIndex: 0)
-            command.setBuffer(delta.metal, atIndex: 1)
-            command.setBytes(pStart, atIndex: 2)
-            command.setBytes(pStride, atIndex: 3)
-            command.setBytes(pNbChannels, atIndex: 4)
-            command.setBytes(pNbChannelsPrev, atIndex: 5)
-            command.setBytes(pDimensions, atIndex: 6)
-            command.setBytes(pDimensionsPrev, atIndex: 7)
-            command.setBytes(pDimWeights, atIndex: 8)
-            command.setBytes(pNbBatch, atIndex: 9)
-            command.setBytes(pAccumulate, atIndex: 10)
-            command.setBuffer(_wBuffers.g.metal, atIndex: 11)
-            
-            command.dispatchThreads(
-                width: nbChannels * weightWidth,
-                height: nbChannelsPrev * weightHeight
-            )
-            command.enqueue()
+            if batchDerWeightsKernel == "convBatchDerWeights" &&
+               _stride == 1 &&
+               layerPrev.width == width &&
+               layerPrev.height == height &&
+               weightWidth == 3 && weightHeight == 3 &&
+               height % 2 == 0 && width % 4 == 0
+            {
+                command = MetalKernel.get.createCommand(
+                    "conv34BatchDerWeights", deviceID: deviceID
+                )
+                command.setBuffer(layerPrev.outs.metal, atIndex: 0)
+                command.setBuffer(delta.metal, atIndex: 1)
+                command.setBytes(pNbChannels, atIndex: 2)
+                command.setBytes(pNbChannelsPrev, atIndex: 3)
+                command.setBytes(pDimensions, atIndex: 4)
+                command.setBytes(pDimensionsPrev, atIndex: 5)
+                command.setBytes(pNbBatch, atIndex: 6)
+                command.setBytes(pAccumulate, atIndex: 7)
+                command.setBuffer(_wBuffers.g.metal, atIndex: 8)
+                
+                command.dispatchThreads(
+                    width: nbChannels,
+                    height: nbChannelsPrev
+                )
+                command.enqueue()
+            }
+            else
+            {
+                command = MetalKernel.get.createCommand(
+                    batchDerWeightsKernel, deviceID: deviceID
+                )
+                command.setBuffer(layerPrev.outs.metal, atIndex: 0)
+                command.setBuffer(delta.metal, atIndex: 1)
+                command.setBytes(pStart, atIndex: 2)
+                command.setBytes(pStride, atIndex: 3)
+                command.setBytes(pNbChannels, atIndex: 4)
+                command.setBytes(pNbChannelsPrev, atIndex: 5)
+                command.setBytes(pDimensions, atIndex: 6)
+                command.setBytes(pDimensionsPrev, atIndex: 7)
+                command.setBytes(pDimWeights, atIndex: 8)
+                command.setBytes(pNbBatch, atIndex: 9)
+                command.setBytes(pAccumulate, atIndex: 10)
+                command.setBuffer(_wBuffers.g.metal, atIndex: 11)
+                
+                command.dispatchThreads(
+                    width: nbChannels * weightWidth,
+                    height: nbChannelsPrev * weightHeight
+                )
+                command.enqueue()
+            }
         
         if _updateBiases
         {
diff --git a/Sources/GrAIdient/Metal/Kernel/Convolution.metal b/Sources/GrAIdient/Metal/Kernel/Convolution.metal
index 220e4c0b..9a688895 100644
--- a/Sources/GrAIdient/Metal/Kernel/Convolution.metal
+++ b/Sources/GrAIdient/Metal/Kernel/Convolution.metal
@@ -104,6 +104,108 @@ kernel void convForward(
     outs[offset] = tmp;
 }
 
+kernel void conv16Forward(
+    const device float * outsPrev,
+    const device float * weights,
+    const device float * biases,
+    constant int * pStart,
+    constant uint * pStride,
+    constant uint * pNbChannels,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pDimWeights,
+    constant uint * pNbBatch,
+    device float * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint weightHeight, weightWidth;
+    uint nbChannels;
+    uint nbChannelsPrev;
+    int startI, startJ;
+    int endI, endJ;
+    int offI, offJ;
+    uint stride;
+    uint nbBatch;
+    
+    if (pStart && pStride && pNbChannels && pNbChannelsPrev &&
+        pDimensions && pDimensionsPrev && pDimWeights && pNbBatch &&
+        outsPrev && weights && biases && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        weightWidth = pDimWeights[0];
+        weightHeight = pDimWeights[1];
+        nbChannels = *pNbChannels;
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+        startI = pStart[0];
+        endI = pStart[1];
+        startJ = pStart[2];
+        endJ = pStart[3];
+        offI = pStart[4];
+        offJ = pStart[5];
+        stride = pStride[0];
+    }
+    else
+        return ;
+    
+    uint coeff = 16;
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+    
+    if (i * elem >= height * nbBatch ||
+        j * depth * coeff >= width * nbChannels)
+    {
+        return ;
+    }
+    
+    float tmp[16] = {0};
+    for (uint depthPrev=0; depthPrev<nbChannelsPrev; depthPrev++)
+    {
+        uint offsetStartPrev =
+            (depthPrev + nbChannelsPrev * elem) * heightPrev;
+        uint offsetStartWeights =
+            (depthPrev + nbChannelsPrev * coeff * depth) * weightHeight;
+        
+        for (int k=startI; k<=endI; k++){
+        for (int l=startJ; l<=endJ; l++)
+        {
+            if ((int)(stride*j)+l-offJ >= 0 &&
+                (int)(stride*j)+l-offJ < (int)widthPrev &&
+                (int)(stride*i)+k-offI >= 0 &&
+                (int)(stride*i)+k-offI < (int)heightPrev)
+            {
+                uint offsetPrev = (int)(stride*j)+l-offJ +
+                    (offsetStartPrev + (int)(stride*i)+k-offI)*widthPrev;
+                float outPrev = outsPrev[offsetPrev];
+                
+                for (uint c=0; c<coeff; c++)
+                {
+                    uint offsetWeights = l-startJ +
+                        (offsetStartWeights +
+                         c * nbChannelsPrev * weightHeight +
+                         k-startI) * weightWidth;
+                    float w = weights[offsetWeights];
+                    tmp[c] += outPrev * w;
+                }
+            }
+        }}
+    }
+    
+    for (uint c=0; c<coeff; c++)
+    {
+        uint offsetStart =
+            (coeff * depth + c + nbChannels * elem) * height;
+        uint offset = j + (offsetStart + i) * width;
+        outs[offset] = tmp[c] + biases[coeff * depth + c];
+    }
+}
+
+kernel void conv16Backward(
+    const device float * delta,
+    const device float * weights,
+    constant int * pStart,
+    constant uint * pStride,
+    constant uint * pNbChannels,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pDimWeights,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device float * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint weightHeight, weightWidth;
+    uint nbChannels;
+    uint nbChannelsPrev;
+    int startI, startJ;
+    int endI, endJ;
+    int offI, offJ;
+    uint stride;
+    uint nbBatch;
+    uint dirty;
+    
+    if (pStart && pStride && pNbChannels && pNbChannelsPrev &&
+        pDimensions && pDimensionsPrev && pDimWeights && pNbBatch &&
+        pDirty && delta && weights && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        weightWidth = pDimWeights[0];
+        weightHeight = pDimWeights[1];
+        nbChannels = *pNbChannels;
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+        startI = pStart[0];
+        endI = pStart[1];
+        startJ = pStart[2];
+        endJ = pStart[3];
+        offI = pStart[4];
+        offJ = pStart[5];
+        stride = pStride[0];
+        dirty = *pDirty;
+    }
+    else
+        return ;
+    
+    uint coeff = 16;
+    uint depthPrev = id[0] / widthPrev;
+    uint elem = id[1] / heightPrev;
+    uint i = id[1] % heightPrev;
+    uint j = id[0] % widthPrev;
+    
+    if (i * elem >= heightPrev * nbBatch ||
+        j * depthPrev * coeff >= widthPrev * nbChannelsPrev)
+    {
+        return ;
+    }
+    
+    float tmp[16] = {0};
+    for (uint depth=0; depth<nbChannels; depth++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offsetStartWeights =
+            (coeff * depthPrev + nbChannelsPrev * depth) * weightHeight;
+        
+        for (int k=startI; k<=endI; k++){
+        for (int l=startJ; l<=endJ; l++)
+        {
+            int i1, j1;
+            if ((i-k+offI) % stride != 0){}
+            else if ((j-l+offJ) % stride != 0){}
+            else
+            {
+                i1 = (i-k+offI) / stride;
+                j1 = (j-l+offJ) / stride;
+                
+                if (j1 >= 0 && j1 < (int)width &&
+                    i1 >= 0 && i1 < (int)height)
+                {
+                    uint offset = j1 + (offsetStart + i1) * width;
+                    float deltaCur = delta[offset];
+                    
+                    for (uint c=0; c<coeff; c++)
+                    {
+                        uint offsetWeights = l-startJ +
+                            (offsetStartWeights +
+                             c * weightHeight + k-startI) * weightWidth;
+                        float w = weights[offsetWeights];
+                        tmp[c] += deltaCur * w;
+                    }
+                }
+            }
+        }}
+    }
+    
+    for (uint c=0; c<coeff; c++)
+    {
+        uint offsetStartPrev =
+            (coeff * depthPrev + c + nbChannelsPrev * elem) * heightPrev;
+        uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+        
+        if (dirty)
+        {
+            deltaPrev[offsetPrev] = tmp[c];
+        }
+        else
+        {
+            deltaPrev[offsetPrev] += tmp[c];
+        }
+    }
+}
+
+kernel void conv34BatchDerWeights(
+    const device float4 * outsPrev,
+    const device float4 * delta,
+    constant uint * pNbChannels,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    constant uint * pAccumulate,
+    device float * grads,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbChannelsPrev;
+    uint nbBatch;
+    uint accumulate;
+    
+    if (pNbChannels && pNbChannelsPrev && pDimensions && pDimensionsPrev &&
+        pNbBatch && pAccumulate && outsPrev && delta && grads)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannels = *pNbChannels;
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+        accumulate = *pAccumulate;
+    }
+    else
+        return ;
+    
+    uint depth = id[0];
+    uint depthPrev = id[1];
+    
+    if (id[0] >= nbChannels ||
+        id[1] >= nbChannelsPrev)
+    {
+        return ;
+    }
+    
+    float tmp[9] = {0.0};
+    for (uint elem=0; elem<nbBatch; elem++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offsetStartPrev =
+            (depthPrev + nbChannelsPrev * elem) * heightPrev;
+        
+        for (uint k=0; k<height/2; k++){
+        for (uint l=0; l<width/4; l++)
+        {
+            uint offset4 = (l*4 + (offsetStart + k*2) * width) / 4;
+            uint offset7 = (l*4 + (offsetStart + k*2+1) * width) / 4;
+            float4 delta4 = delta[offset4];
+            float4 delta7 = delta[offset7];
+            
+            if (k > 0 && l > 0)
+            {
+                uint offsetPrev0 =
+                    ((l-1)*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4;
+                float outPrev0 = outsPrev[offsetPrev0][3];
+                
+                tmp[0] += outPrev0 * delta4[0];
+            }
+            if (k > 0)
+            {
+                uint offsetPrev1 =
+                    (l*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4;
+                float4 outPrev1 = outsPrev[offsetPrev1];
+                
+                tmp[0] += outPrev1[0] * delta4[1];
+                tmp[0] += outPrev1[1] * delta4[2];
+                tmp[0] += outPrev1[2] * delta4[3];
+                
+                float4 sum = outPrev1 * delta4;
+                tmp[1] += sum[0] + sum[1] + sum[2] + sum[3];
+                
+                tmp[2] += outPrev1[1] * delta4[0];
+                tmp[2] += outPrev1[2] * delta4[1];
+                tmp[2] += outPrev1[3] * delta4[2];
+            }
+            if (k > 0 && (l+1)*4 < width)
+            {
+                uint offsetPrev2 =
+                    ((l+1)*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4;
+                float outPrev2 = outsPrev[offsetPrev2][0];
+                
+                tmp[2] += outPrev2 * delta4[3];
+            }
+            
+            if (l > 0)
+            {
+                uint offsetPrev3 =
+                    ((l-1)*4 + (offsetStartPrev + k*2) * widthPrev) / 4;
+                uint offsetPrev6 =
+                    ((l-1)*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4;
+                float outPrev3 = outsPrev[offsetPrev3][3];
+                float outPrev6 = outsPrev[offsetPrev6][3];
+                
+                tmp[0] += outPrev3 * delta7[0];
+                tmp[3] += outPrev3 * delta4[0];
+                tmp[3] += outPrev6 * delta7[0];
+                tmp[6] += outPrev6 * delta4[0];
+            }
+            
+            uint offsetPrev4 =
+                (l*4 + (offsetStartPrev + k*2) * widthPrev) / 4;
+            uint offsetPrev7 =
+                (l*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4;
+            float4 outPrev4 = outsPrev[offsetPrev4];
+            float4 outPrev7 = outsPrev[offsetPrev7];
+            
+            tmp[0] += outPrev4[0] * delta7[1];
+            tmp[0] += outPrev4[1] * delta7[2];
+            tmp[0] += outPrev4[2] * delta7[3];
+            
+            float4 sum = outPrev4 * delta7;
+            tmp[1] += sum[0] + sum[1] + sum[2] + sum[3];
+            
+            tmp[2] += outPrev4[1] * delta7[0];
+            tmp[2] += outPrev4[2] * delta7[1];
+            tmp[2] += outPrev4[3] * delta7[2];
+            
+            tmp[3] += outPrev4[0] * delta4[1];
+            tmp[3] += outPrev4[1] * delta4[2];
+            tmp[3] += outPrev4[2] * delta4[3];
+            tmp[3] += outPrev7[0] * delta7[1];
+            tmp[3] += outPrev7[1] * delta7[2];
+            tmp[3] += outPrev7[2] * delta7[3];
+            
+            sum = outPrev4 * delta4;
+            tmp[4] += sum[0] + sum[1] + sum[2] + sum[3];
+            sum = outPrev7 * delta7;
+            tmp[4] += sum[0] + sum[1] + sum[2] + sum[3];
+            
+            tmp[5] += outPrev4[1] * delta4[0];
+            tmp[5] += outPrev4[2] * delta4[1];
+            tmp[5] += outPrev4[3] * delta4[2];
+            tmp[5] += outPrev7[1] * delta7[0];
+            tmp[5] += outPrev7[2] * delta7[1];
+            tmp[5] += outPrev7[3] * delta7[2];
+            
+            tmp[6] += outPrev7[0] * delta4[1];
+            tmp[6] += outPrev7[1] * delta4[2];
+            tmp[6] += outPrev7[2] * delta4[3];
+            
+            sum = outPrev7 * delta4;
+            tmp[7] += sum[0] + sum[1] + sum[2] + sum[3];
+            
+            tmp[8] += outPrev7[1] * delta4[0];
+            tmp[8] += outPrev7[2] * delta4[1];
+            tmp[8] += outPrev7[3] * delta4[2];
+            
+            if ((l+1)*4 < width)
+            {
+                uint offsetPrev5 =
+                    ((l+1)*4 + (offsetStartPrev + k*2) * widthPrev) / 4;
+                uint offsetPrev8 =
+                    ((l+1)*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4;
+                float outPrev5 = outsPrev[offsetPrev5][0];
+                float outPrev8 = outsPrev[offsetPrev8][0];
+                
+                tmp[2] += outPrev5 * delta7[3];
+                tmp[5] += outPrev5 * delta4[3];
+                tmp[5] += outPrev8 * delta7[3];
+                tmp[8] += outPrev8 * delta4[3];
+            }
+            
+            if ((k+1)*2 < height && l > 0)
+            {
+                uint offsetPrev9 =
+                    ((l-1)*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4;
+                float outPrev9 = outsPrev[offsetPrev9][3];
+                
+                tmp[6] += outPrev9 * delta7[0];
+            }
+            if ((k+1)*2 < height)
+            {
+                uint offsetPrev10 =
+                    (l*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4;
+                float4 outPrev10 = outsPrev[offsetPrev10];
+                
+                tmp[6] += outPrev10[0] * delta7[1];
+                tmp[6] += outPrev10[1] * delta7[2];
+                tmp[6] += outPrev10[2] * delta7[3];
+                
+                float4 sum = outPrev10 * delta7;
+                tmp[7] += sum[0] + sum[1] + sum[2] + sum[3];
+                
+                tmp[8] += outPrev10[1] * delta7[0];
+                tmp[8] += outPrev10[2] * delta7[1];
+                tmp[8] += outPrev10[3] * delta7[2];
+            }
+            if ((k+1)*2 < height && (l+1)*4 < width)
+            {
+                uint offsetPrev11 =
+                    ((l+1)*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4;
+                float outPrev11 = outsPrev[offsetPrev11][0];
+                
+                tmp[8] += outPrev11 * delta7[3];
+            }
+        }}
+    }
+    
+    uint offsetStartWeights = (depthPrev + nbChannelsPrev * depth) * 3;
+    uint offsetWeights0 = 0 + (offsetStartWeights + 0) * 3;
+    uint offsetWeights1 = 1 + (offsetStartWeights + 0) * 3;
+    uint offsetWeights2 = 2 + (offsetStartWeights + 0) * 3;
+    uint offsetWeights3 = 0 + (offsetStartWeights + 1) * 3;
+    uint offsetWeights4 = 1 + (offsetStartWeights + 1) * 3;
+    uint offsetWeights5 = 2 + (offsetStartWeights + 1) * 3;
+    uint offsetWeights6 = 0 + (offsetStartWeights + 2) * 3;
+    uint offsetWeights7 = 1 + (offsetStartWeights + 2) * 3;
+    uint offsetWeights8 = 2 + (offsetStartWeights + 2) * 3;
+    
+    if (accumulate)
+    {
+        grads[offsetWeights0] += tmp[0];
+        grads[offsetWeights1] += tmp[1];
+        grads[offsetWeights2] += tmp[2];
+        grads[offsetWeights3] += tmp[3];
+        grads[offsetWeights4] += tmp[4];
+        grads[offsetWeights5] += tmp[5];
+        grads[offsetWeights6] += tmp[6];
+        grads[offsetWeights7] += tmp[7];
+        grads[offsetWeights8] += tmp[8];
+    }
+    else
+    {
+        grads[offsetWeights0] = tmp[0];
+        grads[offsetWeights1] = tmp[1];
+        grads[offsetWeights2] = tmp[2];
+        grads[offsetWeights3] = tmp[3];
+        grads[offsetWeights4] = tmp[4];
+        grads[offsetWeights5] = tmp[5];
+        grads[offsetWeights6] = tmp[6];
+        grads[offsetWeights7] = tmp[7];
+        grads[offsetWeights8] = tmp[8];
+    }
+}
+
 kernel void convBatchDerBiases(
     const device float * delta,
     constant uint * pNbChannels,
diff --git a/Sources/GrAIdient/Metal/MetalConfig.swift b/Sources/GrAIdient/Metal/MetalConfig.swift
index cad15f5c..8776d4d4 100644
--- a/Sources/GrAIdient/Metal/MetalConfig.swift
+++ b/Sources/GrAIdient/Metal/MetalConfig.swift
@@ -35,8 +35,11 @@ let CONFIG_KERNELS =
     ],
     "Convolution": [
         "convForward",
+        "conv16Forward",
         "convBackward",
+        "conv16Backward",
         "convBatchDerWeights",
+        "conv34BatchDerWeights",
         "convBatchDerBiases",
         "convDerWeights",
         "convDerBiases",
diff --git a/Tests/GrAIExamples/VGGBenchmark.swift b/Tests/GrAIExamples/VGGBenchmark.swift
new file mode 100644
index 00000000..0a3bbd99
--- /dev/null
+++ b/Tests/GrAIExamples/VGGBenchmark.swift
@@ -0,0 +1,395 @@
+//
+// VGGBenchmark.swift
+// GrAIExamples
+//
+// Created by Jean-François Reboud on 24/02/2024.
+//
+
+import XCTest
+import GrAIdient
+
+/// Benchmark time spent for training and evaluating a VGG model with fake data.
+final class VGGBenchmark: XCTestCase
+{
+    /// Batch size of data.
+    let _batchSize = 64
+    /// Size of one image (height and width are the same).
+    let _size = 224
+    
+    /// Initialize test.
+    override func setUp()
+    {
+        setPythonLib()
+        _ = MetalKernel.get
+        GrAI.Opti.GPU = true
+    }
+    
+    ///
+    /// Get optimizer parameters for model training.
+    ///
+    /// - Parameter nbLoops: Number of steps per epoch.
+    /// - Returns: The optimizer parameters.
+    ///
+    func _getOptimizerParams(nbLoops: Int) -> GrAI.Optimizer.Params
+    {
+        var optimizerParams = GrAI.Optimizer.Params()
+        optimizerParams.nbLoops = nbLoops
+        
+        // Simple optimizer scheduler: always the same optimizer during
+        // the training.
+        optimizerParams.optimizer = ConstEpochsScheduler(
+            GrAI.Optimizer.Class.AdamRectified
+        )
+        
+        // Simple variable scheduler: always the same variable during
+        // the training.
+        optimizerParams.variables["alpha"] = ConstEpochsVar(
+            value: ConstVal(1e-3)
+        )
+        optimizerParams.variables["lambda"] = ConstEpochsVar(
+            value: ConstVal(1e-6)
+        )
+        
+        // Other schedulers can be built thanks to `GrAI.Optimizer.Params`.
+        return optimizerParams
+    }
+    
+    ///
+    /// Build a simple model.
+    ///
+    /// - Parameter bn: Whether to use batch normalization or not.
+    /// - Returns: The model built.
+    ///
+    func _buildModel(bn: Bool) -> Model
+    {
+        // Create the context to build a graph of layers where
+        // there is no previous model dependency: layer id starts at 0.
+        let context = ModelContext(name: "VGG16", models: [])
+        let params = GrAI.Model.Params(context: context)
+        
+        var layer: Layer2D
+        layer = Input2D(
+            nbChannels: 3,
+            width: _size, height: _size,
+            params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 64, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 64, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 128, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 128, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 256, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 256, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 256, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        layer = Convolution2D(
+            layerPrev: layer,
+            size: 3, nbChannels: 512, stride: 1,
+            activation: ReLU.str, biases: true, bn: bn,
+            params: params
+        )
+        
+        layer = MaxPool2D(
+            layerPrev: layer, size: 2, stride: 2, params: params
+        )
+        
+        layer = AdaptiveAvgPool2D(layerPrev: layer, size: 7, params: params)
+        
+        var head: Layer1D = try! FullyConnected(
+            layerPrev: layer,
+            nbNeurons: 4096,
+            activation: ReLU.str,
+            biases: true,
+            params: params
+        )
+        head = try! FullyConnected(
+            layerPrev: head,
+            nbNeurons: 4096,
+            activation: ReLU.str,
+            biases: true,
+            params: params
+        )
+        head = try! FullyConnected(
+            layerPrev: head,
+            nbNeurons: 1,
+            activation: ReLU.str,
+            biases: true,
+            params: params
+        )
+        
+        _ = MSE1D(layerPrev: head, params: params)
+        
+        // Retrieve base model in the context and initialize a
+        // real model (with `layerPrev` links updated).
+        let model = Model(model: context.model, modelsPrev: [])
+        return model
+    }
+    
+    /// Test: train a VGG model.
+    func _test_TrainVGG()
+    {
+        // Get optimizer parameters for iterating over batch size elements.
+        let params = _getOptimizerParams(nbLoops: _batchSize)
+        
+        // Build a model with randomly initialized weights.
+        let vgg = _buildModel(bn: false)
+        
+        // Initialize for training.
+        vgg.initialize(params: params, phase: .Training)
+        
+        let firstLayer: Input2D = vgg.layers.first as! Input2D
+        let lastLayer: MSE1D = vgg.layers.last as! MSE1D
+        
+        // Initialize the ground truth once and for all.
+        let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0)
+        let buffer = groundTruth.buffer
+        for elem in 0..<_batchSize / 2
+        {
+            buffer[elem] = 0.0
+        }
+        for elem in _batchSize / 2..<_batchSize
+        {
+            buffer[elem] = 1.0
+        }
+        groundTruth.upload()
+        
+        // Initialize data once and for all.
+        let data = MetalPrivateBuffer<Float>(
+            _batchSize * 3 * _size * _size, deviceID: 0
+        )
+        let dataBuffer = data.shared.buffer
+        for i in 0..<_batchSize * 3 * _size * _size
+        {
+            dataBuffer[i] = Float.random(in: -1..<1)
+        }
+        data.upload()
+        
+        let nbEpochs = 1
+        let nbSteps = 20
+        for epoch in 0..<nbEpochs
+        {
+            print("EPOCH \(epoch+1)/\(nbEpochs).")
+            
+            let start1 = Date()
+            for step in 0..<nbSteps
+            {
+                let start2 = Date()
+                
+                // Reset gradient validity for backward pass
+                // and update the batch size (although it stays the same here).
+                vgg.updateKernel(batchSize: _batchSize)
+                
+                // Set data.
+                try! firstLayer.setDataGPU(
+                    data, batchSize: _batchSize,
+                    nbChannels: 3, height: _size, width: _size
+                )
+                
+                // Forward.
+                try! vgg.forward()
+                
+                // Apply loss derivative.
+                try! lastLayer.lossDerivativeGPU(
+                    groundTruth, batchSize: _batchSize, nbNeurons: 1
+                )
+                
+                // Backward.
+                try! vgg.backward()
+                
+                // Update weights.
+                try! vgg.update()
+                
+                let end2 = Date().timeIntervalSince(start2)
+                print("Step \(step+1)/\(nbSteps): \(end2)s.")
+            }
+            
+            let end1 = Date().timeIntervalSince(start1)
+            print("Epoch \(epoch+1)/\(nbEpochs), in \(end1)s.")
+        }
+    }
+    
+    /// Test: evaluate a VGG model.
+    func _test_EvalVGG()
+    {
+        // Build a model with randomly initialized weights.
+        let vgg = _buildModel(bn: false)
+        
+        // Initialize for inference.
+        vgg.initKernel(phase: .Inference)
+        
+        let firstLayer: Input2D = vgg.layers.first as! Input2D
+        let lastLayer: MSE1D = vgg.layers.last as! MSE1D
+        
+        // Initialize the ground truth once and for all.
+        let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0)
+        let gtBuffer = groundTruth.buffer
+        for elem in 0..<_batchSize / 2
+        {
+            gtBuffer[elem] = 0.0
+        }
+        for elem in _batchSize / 2..<_batchSize
+        {
+            gtBuffer[elem] = 1.0
+        }
+        groundTruth.upload()
+        
+        // Initialize data once and for all.
+        let data = MetalPrivateBuffer<Float>(
+            _batchSize * 3 * _size * _size, deviceID: 0
+        )
+        let dataBuffer = data.shared.buffer
+        for i in 0..<_batchSize * 3 * _size * _size
+        {
+            dataBuffer[i] = Float.random(in: -1..<1)
+        }
+        data.upload()
+        
+        let nbEpochs = 2
+        let nbSteps = 20
+        for epoch in 0..<nbEpochs
+        {
+            print("EPOCH \(epoch+1)/\(nbEpochs).")
+            
+            let start1 = Date()
+            for step in 0..<nbSteps
+            {
+                let start2 = Date()
+                
+                // Update the batch size (although it stays the same here).
+                vgg.updateKernel(batchSize: _batchSize)
+                
+                // Set data.
+                try! firstLayer.setDataGPU(
+                    data, batchSize: _batchSize,
+                    nbChannels: 3, height: _size, width: _size
+                )
+                
+                // Forward.
+                try! vgg.forward()
+                
+                // Get the loss on the last layer.
+                let loss = try! lastLayer.getLossGPU(
+                    groundTruth, batchSize: _batchSize, nbNeurons: 1
+                )
+                
+                let end2 = Date().timeIntervalSince(start2)
+                print("Step \(step+1)/\(nbSteps): \(end2)s, loss: \(loss).")
+            }
+            
+            let end1 = Date().timeIntervalSince(start1)
+            print("Epoch \(epoch+1)/\(nbEpochs), in \(end1)s.")
+        }
+    }
+}
diff --git a/Tests/GrAIdientTests/Layer2DTests.swift b/Tests/GrAIdientTests/Layer2DTests.swift
--- a/Tests/GrAIdientTests/Layer2DTests.swift
+++ b/Tests/GrAIdientTests/Layer2DTests.swift
+class Layer2DConvFlowTests: Input2DMSE1DCase
+{
+    private func _buildTrainer(model: String, bn: Bool) -> FlowTrainer
+    {
+        let trainer = FlowTrainer(
+            name: "Layer2D",
+            params: optimizerParams
+        )
+        trainer.build()
+        {
+            (context: ModelContext) in
+            buildModel(model: model, bn: bn, context: context)
+        }
+        return trainer
+    }
+    
+    func buildModel(model: String, bn: Bool, context: ModelContext)
+    {
+        let params = GrAI.Model.Params(context: context)
+        
+        var layer: Layer2D = Input2D(
+            nbChannels: 1, width: width, height: height, params: params
+        )
+        var head: Layer1D? = nil
+        
+        layer = Convolution2D(
+            layerPrev: layer, size: 1, nbChannels: 32, stride: 1,
+            activation: LeakyReLU.str, biases: true, bn: false, params: params
+        )
+        
+        switch model
+        {
+        case "Convolution1":
+            layer = Convolution2D(
+                layerPrev: layer, size: 3, nbChannels: 32, stride: 1,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        case "Convolution2":
+            layer = Convolution2D(
+                layerPrev: layer, size: 2, nbChannels: 32, stride: 1,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        case "ConvolutionStride1":
+            layer = Convolution2D(
+                layerPrev: layer, size: 3, nbChannels: 32, stride: 2,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        case "ConvolutionStride2":
+            layer = Convolution2D(
+                layerPrev: layer, size: 2, nbChannels: 32, stride: 2,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        case "Deconvolution":
+            layer = Deconvolution2D(
+                layerPrev: layer, size: 3, nbChannels: 16, stride: 1,
+                activation: LeakyReLU.str, biases: !bn, bn: bn, params: params
+            )
+            
+        default:
+            fatalError("Unreachable.")
+        }
+        
+        if head == nil
+        {
+            head = AvgPool2D(layerPrev: layer, params: params)
+        }
+        
+        head = try! FullyConnected(
+            layerPrev: head!, nbNeurons: 1,
+            activation: LeakyReLU.str, biases: true, params: params
+        )
+        
+        head = MSE1D(layerPrev: head!, params: params)
+    }
+    
+    func testConvolution1() throws
+    {
+        let trainer = _buildTrainer(model: "Convolution1", bn: false)
+        run(trainer)
+    }
+    
+    func testConvolution2() throws
+    {
+        let trainer = _buildTrainer(model: "Convolution2", bn: false)
+        run(trainer)
+    }
+    
+    func testConvolutionStride1() throws
+    {
+        let trainer = _buildTrainer(model: "ConvolutionStride1", bn: false)
+        run(trainer)
+    }
+    
+    func testConvolutionStride2() throws
+    {
+        let trainer = _buildTrainer(model: "ConvolutionStride2", bn: false)
+        run(trainer)
+    }
+    
+    func testDeconvolution() throws
+    {
+        let trainer = _buildTrainer(model: "Deconvolution", bn: false)
+        run(trainer)
+    }
+}
+
 // -----------------------------------------------------------------------------
 // Compare GPU gradients with CPU ones through time.
 // We expect to see errors ~ 1e-7 and less.
@@ -2194,7 +2322,7 @@ class Layer2DFlowResetTests: Layer2DFlowTests
     override func testInstanceNorm() throws
     {
         let trainer = _buildTrainer(model: "InstanceNorm", bn: false)
-        run(trainer)
+        run(trainer, diffThreshold: 0.0001)
     }
     
     override func testAdaIN() throws
@@ -2600,7 +2728,7 @@ class Layer2DFlowReverseTests: Layer2DFlowTests
     override func testAdaIN() throws
    {
         let trainer = _buildTrainer(model: "AdaIN", bn: false)
-        run(trainer)
+        run(trainer, diffThreshold: 0.0001)
     }
     
     override func testConstant() throws
@@ -2947,7 +3075,7 @@ class Layer2DFlowAccumulateTests: Input2DMSE1DCase
     func testInstanceNorm() throws
     {
         let trainer = _buildTrainer(model: "InstanceNorm", bn: false)
-        run(trainer)
+        run(trainer, diffThreshold: 0.0001)
     }
     
     func testConstant() throws
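
The grid arithmetic behind the kernel selection in `Convolution2D.swift` can be sketched outside the diff. This is a minimal illustrative helper, not part of GrAIdient's API; the name `forwardDispatch` and its tuple return are assumptions for the example:

```swift
// Illustrative sketch of the dispatch logic introduced above:
// one `conv16Forward` thread accumulates 16 output channels at a single
// spatial position, so the thread-grid width shrinks by the same factor.
// `forwardDispatch` is a hypothetical helper, not a GrAIdient API.
func forwardDispatch(nbChannels: Int, width: Int) -> (kernel: String, gridWidth: Int)
{
    if nbChannels % 16 == 0
    {
        return ("conv16Forward", (nbChannels / 16) * width)
    }
    return ("convForward", nbChannels * width)
}

// Example: 64 channels on a 224x224 map gives a grid width of
// (64 / 16) * 224 = 896 instead of 64 * 224 = 14336; each thread then
// does 16 times the work, reusing the same input activation across
// the 16 weight slices it reads.
```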