From 54b4a30091d8bd35280c9305c1354e206e85f798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Reboud?= Date: Fri, 19 Jul 2024 10:40:23 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(layer=5Fseq):=20LLM=20sliding?= =?UTF-8?q?=20window=20(#131)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 +- Sources/GrAIdient/LayerSeq/QuerySeq.swift | 150 +++-- Sources/GrAIdient/LayerSeq/SoftmaxSeq.swift | 10 +- Sources/GrAIdient/LayerSeq/ValueSeq.swift | 148 +++-- Sources/GrAIdient/Metal/Kernel/NLPFloat.metal | 4 +- Sources/GrAIdient/Metal/Kernel/NLPHalf.metal | 4 +- Tests/GrAIExamples/LLMExample.swift | 1 + Tests/GrAITests/NLPTests.swift | 520 +++++++++++++++++- 8 files changed, 764 insertions(+), 76 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 409fd909..bcf6fbd8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,8 @@ All notable changes to this project will be documented in this file. ## [unreleased] -🚀 **examples**: 3 LLMs examples ([#130](https://github.com/owkin/GrAIdient/pull/130))\ +✨ **layer_seq:** LLM sliding window ([#131](https://github.com/owkin/GrAIdient/pull/131))\ +🚀 **examples:** 3 LLMs examples ([#130](https://github.com/owkin/GrAIdient/pull/130))\ 📚 **docs:** LLM doc & split tests ([129](https://github.com/owkin/GrAIdient/pull/129))\ ✨ **layer_seq:** LLM generate ([128](https://github.com/owkin/GrAIdient/pull/128))\ ✨ **layer_seq:** MultiplySeq, SiLU & LLM test ([127](https://github.com/owkin/GrAIdient/pull/127))\ diff --git a/Sources/GrAIdient/LayerSeq/QuerySeq.swift b/Sources/GrAIdient/LayerSeq/QuerySeq.swift index e4c4fd06..11330ecb 100644 --- a/Sources/GrAIdient/LayerSeq/QuerySeq.swift +++ b/Sources/GrAIdient/LayerSeq/QuerySeq.swift @@ -1270,20 +1270,25 @@ public class QueryCausalSeq: LayerMergeSeq if cacheKey != nil && cacheSeq != nil && cacheKey.nbElems != batchSize * cacheSeqMax * nbNeuronsPrevKey { - _cacheKeyTmp = FloatBuffer( + let cacheKeyTmp = FloatBuffer( nbElems: batchSize * cacheSeqMax * nbNeuronsPrevKey, deviceID: deviceID ) let nbElems = batchSize * cacheSeq * nbNeuronsPrevKey - _copyGPU(nbElems: nbElems, from: cacheKey, to: _cacheKeyTmp) + _copyGPU(nbElems: nbElems, from: cacheKey, to: cacheKeyTmp) cacheKey = FloatBuffer( nbElems: batchSize * cacheSeqMax * nbNeuronsPrevKey, deviceID: deviceID ) - _copyGPU(nbElems: nbElems, from: _cacheKeyTmp, to: cacheKey) + _copyGPU(nbElems: nbElems, from: cacheKeyTmp, to: cacheKey) + + if batchSize > 1 + { + _cacheKeyTmp = cacheKeyTmp + } } } @@ -1664,13 +1669,13 @@ public class QueryCausalSeq: LayerMergeSeq throw LayerError.Init(message: "`sequence` should be 1.") } - _concatGPU() + _mergeCacheGPU() let query = layersPrev[0] as! LayerSeq let key = layersPrev[1] as! LayerSeq let nbNeuronsPrevQuery = query.nbNeurons let nbNeuronsPrevKey = key.nbNeurons - let nbNeurons = (cacheSeq + 1) * _nbHeadsQuery + let nbNeurons = min(cacheSeq + 1, cacheSeqMax) * _nbHeadsQuery let pNbHeadsQuery: [UInt32] = [UInt32(_nbHeadsQuery)] let pNbHeadsKey: [UInt32] = [UInt32(_nbHeadsKey)] @@ -1678,7 +1683,7 @@ public class QueryCausalSeq: LayerMergeSeq let pNbNeuronsPrevQuery: [UInt32] = [UInt32(nbNeuronsPrevQuery)] let pNbNeuronsPrevKey: [UInt32] = [UInt32(nbNeuronsPrevKey)] let pNbBatch: [UInt32] = [UInt32(batchSize)] - let pSequence: [UInt32] = [UInt32(cacheSeq + 1)] + let pSequence: [UInt32] = [UInt32(min(cacheSeq + 1, cacheSeqMax))] let kernel = (nbNeuronsPrevQuery / _nbHeadsQuery) % 4 == 0 ? 
"queryCausalSeq4Generate" : "queryCausalSeqGenerate" @@ -1686,7 +1691,7 @@ public class QueryCausalSeq: LayerMergeSeq kernel, deviceID: deviceID ) command.setBuffer(query.outs.metal, atIndex: 0) - command.setBuffer(_cacheKeyTmp.metal, atIndex: 1) + command.setBuffer(_getKeyCacheOutputGPU()!.metal, atIndex: 1) command.setBytes(pNbHeadsQuery, atIndex: 2) command.setBytes(pNbHeadsKey, atIndex: 3) command.setBytes(pNbNeurons, atIndex: 4) @@ -1702,22 +1707,29 @@ public class QueryCausalSeq: LayerMergeSeq ) command.enqueue() - let nbElems = batchSize * (cacheSeq + 1) * nbNeuronsPrevKey - _copyGPU(nbElems: nbElems, from: _cacheKeyTmp, to: cacheKey) - cacheSeq += 1 } - /// Concatenate cache to key. - private func _concatGPU() + /// Merge cache to key. + private func _mergeCacheGPU() { + let slidingWindow: Bool + if cacheSeq >= cacheSeqMax + { + slidingWindow = true + } + else + { + slidingWindow = false + } + let key = layersPrev[1] as! LayerSeq let nbNeuronsPrevKey = key.nbNeurons let nbNeurons = nbNeuronsPrevKey let pNbNeurons: [UInt32] = [UInt32(nbNeurons)] let pNbBatch: [UInt32] = [UInt32(batchSize)] - let pSequence: [UInt32] = [UInt32(cacheSeq + 1)] + let pSequence: [UInt32] = [UInt32(min(cacheSeq + 1, cacheSeqMax))] let pSequenceCache: [UInt32] = [UInt32(cacheSeq)] let pSequenceKey: [UInt32] = [UInt32(1)] @@ -1725,32 +1737,41 @@ public class QueryCausalSeq: LayerMergeSeq var command: MetalCommand var globalOffset = 0 - - var pGlobalOffset: [UInt32] = [UInt32(globalOffset)] - let kernel = nbNeurons % 4 == 0 ? "concat1Seq4Forward" : "concat1SeqForward" let coeff = nbNeurons % 4 == 0 ? 4 : 1 - command = metalKernel.createCommand( - kernel, deviceID: deviceID - ) - command.setBuffer(cacheKey.metal, atIndex: 0) - command.setBytes(pGlobalOffset, atIndex: 1) - command.setBytes(pNbNeurons, atIndex: 2) - command.setBytes(pNbBatch, atIndex: 3) - command.setBytes(pSequence, atIndex: 4) - command.setBytes(pSequenceCache, atIndex: 5) - command.setBuffer(_cacheKeyTmp.metal, atIndex: 6) - command.dispatchThreads( - width: nbNeurons / coeff, - height: batchSize * cacheSeq - ) - command.enqueue() + if batchSize != 1 && !slidingWindow + { + let pGlobalOffset: [UInt32] = [UInt32(globalOffset)] + + command = metalKernel.createCommand( + kernel, deviceID: deviceID + ) + command.setBuffer(_getKeyCacheInputGPU()!.metal, atIndex: 0) + command.setBytes(pGlobalOffset, atIndex: 1) + command.setBytes(pNbNeurons, atIndex: 2) + command.setBytes(pNbBatch, atIndex: 3) + command.setBytes(pSequence, atIndex: 4) + command.setBytes(pSequenceCache, atIndex: 5) + command.setBuffer(_getKeyCacheOutputGPU()!.metal, atIndex: 6) + + command.dispatchThreads( + width: nbNeurons / coeff, + height: batchSize * cacheSeq + ) + command.enqueue() + } - globalOffset += cacheSeq + globalOffset += cacheSeq % cacheSeqMax + // TODO: when using sliding window with an instruct model, + // it is risky to erase the header information! 
+ // if cacheSeq >= cacheSeqMax + // { + // globalOffset += 5 + // } - pGlobalOffset = [UInt32(globalOffset)] + let pGlobalOffset = [UInt32(globalOffset)] command = metalKernel.createCommand( kernel, deviceID: deviceID @@ -1761,7 +1782,7 @@ public class QueryCausalSeq: LayerMergeSeq command.setBytes(pNbBatch, atIndex: 3) command.setBytes(pSequence, atIndex: 4) command.setBytes(pSequenceKey, atIndex: 5) - command.setBuffer(_cacheKeyTmp.metal, atIndex: 6) + command.setBuffer(_getKeyCacheOutputGPU()!.metal, atIndex: 6) command.dispatchThreads( width: nbNeurons / coeff, @@ -1770,6 +1791,67 @@ public class QueryCausalSeq: LayerMergeSeq command.enqueue() } + /// + /// Get key cache buffer to use as input in Metal kernel. + /// + /// - Returns: key cache to use as input. + /// + private func _getKeyCacheInputGPU() -> FloatBuffer? + { + if cacheSeq != nil + { + if cacheSeq % 2 == 0 + { + return _cacheKeyTmp + } + else + { + return cacheKey + } + } + return nil + } + + /// + /// Get key cache buffer to use as input in Metal kernel. + /// + /// - Returns: key cache to use as input. + /// + private func _getKeyCacheOutputGPU() -> FloatBuffer? + { + if cacheSeq != nil + { + if batchSize == 1 + { + return cacheKey + } + else + { + if cacheSeq >= cacheSeqMax // sliding window + { + // The cache key has not changed. + if (cacheSeqMax - 1) % 2 == 0 + { + return cacheKey + } + else + { + return _cacheKeyTmp + } + } + else if cacheSeq % 2 == 0 + { + return cacheKey + } + else + { + return _cacheKeyTmp + } + } + } + return nil + } + /// Apply the forward pass in the GPU execution context. private func _forwardGPU() { diff --git a/Sources/GrAIdient/LayerSeq/SoftmaxSeq.swift b/Sources/GrAIdient/LayerSeq/SoftmaxSeq.swift index bff11333..375ea688 100644 --- a/Sources/GrAIdient/LayerSeq/SoftmaxSeq.swift +++ b/Sources/GrAIdient/LayerSeq/SoftmaxSeq.swift @@ -363,11 +363,15 @@ public class SoftmaxSeq: LayerSeq /// public class SoftmaxCausalSeq: SoftmaxSeq { + /// Maximal sequence of cache. + public var cacheSeqMax = 128 + /// Current cache sequence. public var cacheSeq: Int! = nil private enum Keys: String, CodingKey { + case cacheSeqMax case cacheSeq } @@ -401,6 +405,7 @@ public class SoftmaxCausalSeq: SoftmaxSeq public required init(from decoder: Decoder) throws { let values = try decoder.container(keyedBy: Keys.self) + cacheSeqMax = try values.decode(Int.self, forKey: Keys.cacheSeqMax) cacheSeq = try values.decodeIfPresent(Int.self, forKey: .cacheSeq) try super.init(from: decoder) } @@ -419,6 +424,7 @@ public class SoftmaxCausalSeq: SoftmaxSeq public override func encode(to encoder: Encoder) throws { var container = encoder.container(keyedBy: Keys.self) + try container.encode(cacheSeqMax, forKey: Keys.cacheSeqMax) if cacheSeq != nil { try container.encode(cacheSeq, forKey: Keys.cacheSeq) @@ -453,6 +459,8 @@ public class SoftmaxCausalSeq: SoftmaxSeq nbHeads: _nbHeads, params: params ) + + layer.cacheSeqMax = cacheSeqMax layer.cacheSeq = cacheSeq return layer @@ -507,7 +515,7 @@ public class SoftmaxCausalSeq: SoftmaxSeq if let layerPrev = self.layerPrev as? 
LayerSeq { - let nbNeurons = (cacheSeq + 1) * _nbHeads + let nbNeurons = min(cacheSeq + 1, cacheSeqMax) * _nbHeads let pNbHeads: [UInt32] = [UInt32(_nbHeads)] let pNbNeurons: [UInt32] = [UInt32(nbNeurons)] diff --git a/Sources/GrAIdient/LayerSeq/ValueSeq.swift b/Sources/GrAIdient/LayerSeq/ValueSeq.swift index 2c5d2e59..6267b718 100644 --- a/Sources/GrAIdient/LayerSeq/ValueSeq.swift +++ b/Sources/GrAIdient/LayerSeq/ValueSeq.swift @@ -1344,20 +1344,25 @@ public class ValueCausalSeq: LayerMergeSeq if cacheValue != nil && cacheSeq != nil && cacheValue.nbElems != batchSize * cacheSeqMax * nbNeuronsPrevValue { - _cacheValueTmp = FloatBuffer( + let cacheValueTmp = FloatBuffer( nbElems: batchSize * cacheSeqMax * nbNeuronsPrevValue, deviceID: deviceID ) let nbElems = batchSize * cacheSeq * nbNeuronsPrevValue - _copyGPU(nbElems: nbElems, from: cacheValue, to: _cacheValueTmp) + _copyGPU(nbElems: nbElems, from: cacheValue, to: cacheValueTmp) cacheValue = FloatBuffer( nbElems: batchSize * cacheSeqMax * nbNeuronsPrevValue, deviceID: deviceID ) - _copyGPU(nbElems: nbElems, from: _cacheValueTmp, to: cacheValue) + _copyGPU(nbElems: nbElems, from: cacheValueTmp, to: cacheValue) + + if batchSize > 1 + { + _cacheValueTmp = cacheValueTmp + } } } @@ -1658,12 +1663,12 @@ public class ValueCausalSeq: LayerMergeSeq throw LayerError.Init(message: "`sequence` should be 1.") } - _concatGPU() + _mergeCacheGPU() let value = layersPrev[0] as! LayerSeq let score = layersPrev[1] as! LayerSeq let nbNeuronsPrevValue = value.nbNeurons - let nbNeuronsPrevScore = score.nbNeurons + let nbNeuronsPrevScore = min(cacheSeq + 1, cacheSeqMax) * _nbHeadsScore let pNbHeadsValue: [UInt32] = [UInt32(_nbHeadsValue)] let pNbHeadsScore: [UInt32] = [UInt32(_nbHeadsScore)] @@ -1671,7 +1676,7 @@ public class ValueCausalSeq: LayerMergeSeq let pNbNeuronsPrevValue: [UInt32] = [UInt32(nbNeuronsPrevValue)] let pNbNeuronsPrevScore: [UInt32] = [UInt32(nbNeuronsPrevScore)] let pNbBatch: [UInt32] = [UInt32(batchSize)] - let pSequence: [UInt32] = [UInt32(cacheSeq + 1)] + let pSequence: [UInt32] = [UInt32(min(cacheSeq + 1, cacheSeqMax))] let kernel = (nbNeurons / _nbHeadsScore) % 4 == 0 ? "valueCausalSeq4Generate" : "valueCausalSeqGenerate" @@ -1679,7 +1684,7 @@ public class ValueCausalSeq: LayerMergeSeq let command = MetalKernel.get.createCommand( kernel, deviceID: deviceID ) - command.setBuffer(_cacheValueTmp.metal, atIndex: 0) + command.setBuffer(_getValueCacheOutputGPU()!.metal, atIndex: 0) command.setBuffer(score.outs.metal, atIndex: 1) command.setBytes(pNbHeadsValue, atIndex: 2) command.setBytes(pNbHeadsScore, atIndex: 3) @@ -1696,22 +1701,29 @@ public class ValueCausalSeq: LayerMergeSeq ) command.enqueue() - let nbElems = batchSize * (cacheSeq + 1) * nbNeuronsPrevValue - _copyGPU(nbElems: nbElems, from: _cacheValueTmp, to: cacheValue) - cacheSeq += 1 } /// Concatenate cache to key. - private func _concatGPU() + private func _mergeCacheGPU() { + let slidingWindow: Bool + if cacheSeq >= cacheSeqMax + { + slidingWindow = true + } + else + { + slidingWindow = false + } + let value = layersPrev[0] as! 
LayerSeq let nbNeuronsPrevValue = value.nbNeurons let nbNeurons = nbNeuronsPrevValue let pNbNeurons: [UInt32] = [UInt32(nbNeurons)] let pNbBatch: [UInt32] = [UInt32(batchSize)] - let pSequence: [UInt32] = [UInt32(cacheSeq + 1)] + let pSequence: [UInt32] = [UInt32(min(cacheSeq + 1, cacheSeqMax))] let pSequenceCache: [UInt32] = [UInt32(cacheSeq)] let pSequenceValue: [UInt32] = [UInt32(1)] @@ -1719,32 +1731,41 @@ public class ValueCausalSeq: LayerMergeSeq var command: MetalCommand var globalOffset = 0 - - var pGlobalOffset: [UInt32] = [UInt32(globalOffset)] - let kernel = nbNeurons % 4 == 0 ? "concat1Seq4Forward" : "concat1SeqForward" let coeff = nbNeurons % 4 == 0 ? 4 : 1 - command = metalKernel.createCommand( - kernel, deviceID: deviceID - ) - command.setBuffer(cacheValue.metal, atIndex: 0) - command.setBytes(pGlobalOffset, atIndex: 1) - command.setBytes(pNbNeurons, atIndex: 2) - command.setBytes(pNbBatch, atIndex: 3) - command.setBytes(pSequence, atIndex: 4) - command.setBytes(pSequenceCache, atIndex: 5) - command.setBuffer(_cacheValueTmp.metal, atIndex: 6) - command.dispatchThreads( - width: nbNeurons / coeff, - height: batchSize * cacheSeq - ) - command.enqueue() + if batchSize != 1 && !slidingWindow + { + let pGlobalOffset: [UInt32] = [UInt32(globalOffset)] + + command = metalKernel.createCommand( + kernel, deviceID: deviceID + ) + command.setBuffer(_getValueCacheInputGPU()!.metal, atIndex: 0) + command.setBytes(pGlobalOffset, atIndex: 1) + command.setBytes(pNbNeurons, atIndex: 2) + command.setBytes(pNbBatch, atIndex: 3) + command.setBytes(pSequence, atIndex: 4) + command.setBytes(pSequenceCache, atIndex: 5) + command.setBuffer(_getValueCacheOutputGPU()!.metal, atIndex: 6) + + command.dispatchThreads( + width: nbNeurons / coeff, + height: batchSize * cacheSeq + ) + command.enqueue() + } - globalOffset += cacheSeq + globalOffset += cacheSeq % cacheSeqMax + // TODO: when using sliding window with an instruct model, + // it is risky to erase the header information! + // if cacheSeq >= cacheSeqMax + // { + // globalOffset += 5 + // } - pGlobalOffset = [UInt32(globalOffset)] + let pGlobalOffset = [UInt32(globalOffset)] command = metalKernel.createCommand( kernel, deviceID: deviceID @@ -1755,7 +1776,7 @@ public class ValueCausalSeq: LayerMergeSeq command.setBytes(pNbBatch, atIndex: 3) command.setBytes(pSequence, atIndex: 4) command.setBytes(pSequenceValue, atIndex: 5) - command.setBuffer(_cacheValueTmp.metal, atIndex: 6) + command.setBuffer(_getValueCacheOutputGPU()!.metal, atIndex: 6) command.dispatchThreads( width: nbNeurons / coeff, @@ -1764,6 +1785,67 @@ public class ValueCausalSeq: LayerMergeSeq command.enqueue() } + /// + /// Get value cache buffer to use as input in Metal kernel. + /// + /// - Returns: value cache to use as input. + /// + private func _getValueCacheInputGPU() -> FloatBuffer? + { + if cacheSeq != nil + { + if cacheSeq % 2 == 0 + { + return _cacheValueTmp + } + else + { + return cacheValue + } + } + return nil + } + + /// + /// Get value cache buffer to use as input in Metal kernel. + /// + /// - Returns: value cache to use as input. + /// + private func _getValueCacheOutputGPU() -> FloatBuffer? + { + if cacheSeq != nil + { + if batchSize == 1 + { + return cacheValue + } + else + { + if cacheSeq >= cacheSeqMax // sliding window + { + // The cache key has not changed. 
+ if (cacheSeqMax - 1) % 2 == 0 + { + return cacheValue + } + else + { + return _cacheValueTmp + } + } + else if cacheSeq % 2 == 0 + { + return cacheValue + } + else + { + return _cacheValueTmp + } + } + } + return nil + } + /// Apply the forward pass in the GPU execution context. private func _forwardGPU() { diff --git a/Sources/GrAIdient/Metal/Kernel/NLPFloat.metal b/Sources/GrAIdient/Metal/Kernel/NLPFloat.metal index decae419..d20a6a9b 100644 --- a/Sources/GrAIdient/Metal/Kernel/NLPFloat.metal +++ b/Sources/GrAIdient/Metal/Kernel/NLPFloat.metal @@ -714,7 +714,7 @@ kernel void valueCausalSeqGenerateFloat( uint depthValue = j + headValue * size; float tmp = 0.0; - for (uint seqK=0; seqK<=sequence; seqK++) + for (uint seqK=0; seqK= maxTokens - tmpSeq - 1 + { + for score in scores + { + XCTAssert(score != 0.0) + } + } + + // Test that sum of scores equal to 1. + scores = score2Layer.outs.download() + var sum = 0.0 + for (j, score) in scores.enumerated() + { + sum += Double(score) + + // Every seqK is not yet used: we still have 0.0 in the + // context cache. + if (j + 1) == scores.count && i < maxTokens - tmpSeq - 1 + { + XCTAssert(sum == 0.0) + } + // Every seqK is used: there should not be any 0.0 as + // the context cache is full. + else if (j + 1) == scores.count + { + let value = round(sum * 100) / 100.0 + XCTAssert(value == 1.0) + } + // Nominal case, we are feeding `sum`. + else if (j + 1) % (min(nbTokens + 1, maxTokens)) == 0 + { + if sum != 0.0 + { + let value = round(sum * 100) / 100.0 + XCTAssert(value == 1.0) + } + sum = 0.0 + } + } + + // Get result. + let out = (model.layers.last as! LayerSeq).outs.download() + + // Compute prediction for each token. + for seq in 0..
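
Note on the sliding-window arithmetic: the sketch below is a minimal, self-contained illustration of the cache indexing only; the type `SlidingKVCache` and its members are hypothetical and are not part of the GrAIdient API. It mirrors the two expressions used throughout this patch, the merged sequence length `min(cacheSeq + 1, cacheSeqMax)` and the write offset `cacheSeq % cacheSeqMax` that overwrites the oldest cached position once the window is full. It does not model the double buffering the patch adds for `batchSize > 1`, where the merged result alternates between `cacheKey` and `_cacheKeyTmp` depending on the parity of `cacheSeq` and stays pinned to one buffer once the window is full.

import Foundation

/// Minimal, illustrative sliding-window KV cache (hypothetical type, not GrAIdient API).
/// It mirrors only the indexing introduced by the patch:
/// - new tokens are written at slot `cacheSeq % cacheSeqMax` once the cache is full,
/// - attention runs over `min(cacheSeq + 1, cacheSeqMax)` cached positions.
struct SlidingKVCache
{
    /// Maximal number of cached positions (the sliding window size).
    let cacheSeqMax: Int
    /// Number of features stored per position.
    let nbNeurons: Int
    /// Number of tokens cached so far.
    private(set) var cacheSeq = 0
    /// Flat storage: `cacheSeqMax` rows of `nbNeurons` values.
    private var storage: [Float]

    init(cacheSeqMax: Int, nbNeurons: Int)
    {
        self.cacheSeqMax = cacheSeqMax
        self.nbNeurons = nbNeurons
        storage = [Float](repeating: 0.0, count: cacheSeqMax * nbNeurons)
    }

    /// Number of positions attention looks at for the next generated token.
    var effectiveSequence: Int
    {
        return min(cacheSeq + 1, cacheSeqMax)
    }

    /// Append the key (or value) row of a newly generated token,
    /// overwriting the oldest slot once the window is full.
    mutating func append(row: [Float])
    {
        precondition(row.count == nbNeurons)
        let slot = cacheSeq % cacheSeqMax // ring-buffer write index
        for j in 0..<nbNeurons
        {
            storage[slot * nbNeurons + j] = row[j]
        }
        cacheSeq += 1
    }

    /// Read back one cached row (used by the checks below).
    func row(at slot: Int) -> [Float]
    {
        return Array(storage[slot * nbNeurons ..< (slot + 1) * nbNeurons])
    }
}

// Usage: with a window of 4 positions, the 5th appended token lands in slot 0.
var cache = SlidingKVCache(cacheSeqMax: 4, nbNeurons: 2)
for token in 0..<5
{
    cache.append(row: [Float(token), Float(token)])
}
assert(cache.effectiveSequence == 4)
assert(cache.row(at: 0) == [4.0, 4.0]) // oldest position was overwritten

As the commented-out TODO in the patch points out, this overwrite starts with the earliest cached tokens, which for an instruct model can include the prompt header.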