diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14317c73..a6727d9d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
 
 ## [unreleased]
 
+📚 **docs:** LLM doc & split tests ([129](https://github.com/owkin/GrAIdient/pull/129))\
 ✨ **layer_seq:** LLM generate ([128](https://github.com/owkin/GrAIdient/pull/128))\
 ✨ **layer_seq:** MultiplySeq, SiLU & LLM test ([127](https://github.com/owkin/GrAIdient/pull/127))\
 ✨ **layer_seq:** ValueCausalSeq ([126](https://github.com/owkin/GrAIdient/pull/126))\
diff --git a/Docs/Examples/AutoEncoder.md b/Docs/Examples/AutoEncoder.md
index eb9b1451..aef3a7c3 100644
--- a/Docs/Examples/AutoEncoder.md
+++ b/Docs/Examples/AutoEncoder.md
@@ -64,7 +64,19 @@ conda env remove --name graiexamples
 
 ## Steps
 
-1. Dump the training dataset.
+Each training example uses a `CIFARAutoEncoderTrainer`,
+which is responsible for initializing the training dataset
+before the actual training takes place.
+
 1. Train a simple auto encoder model.
 1. Train a UNet like auto encoder model.
 1. Train a StyleGAN like auto encoder model.
+
+## Further tests
+
+Further tests are available at
+[AutoEncoderTests](../../Tests/GrAIExamples/AutoEncoderTests.swift).
+
+The test `testTrain` compares the training of a `SimpleAutoEncoder`
+in GrAIdient and in PyTorch to show that the same `loss` is computed
+throughout the training.
diff --git a/Docs/Examples/EXAMPLES.md b/Docs/Examples/EXAMPLES.md
index 21f388b8..7f2cbcab 100644
--- a/Docs/Examples/EXAMPLES.md
+++ b/Docs/Examples/EXAMPLES.md
@@ -12,3 +12,4 @@ The following examples are currently available:
 - [VGG](VGG.md)
 - [Vision Transformer](VisionTransformer.md)
 - [Auto Encoder](AutoEncoder.md)
+- [NLP](NLP.md)
diff --git a/Docs/Examples/NLP.md b/Docs/Examples/NLP.md
new file mode 100644
index 00000000..882a2be6
--- /dev/null
+++ b/Docs/Examples/NLP.md
@@ -0,0 +1,50 @@
+# 🚀 NLP Example
+
+This is the documentation for running
+[LLMs](../../Tests/GrAIExamples/NLPExample.swift) on the GPU.
+
+## Setup
+
+This example has some `Python` dependencies. In order to run
+the example, we first have to set up the environment:
+
+```bash
+conda create --name graiexamples python=3.9
+conda activate graiexamples
+cd Tests/GrAIExamples/Base
+pip install -e .
+```
+
+Then:
+- Download the weights from
+[MistralAI](https://docs.mistral.ai/getting-started/open_weight_models/).
+- Update `_modelPath` in the
+[NLPExample](../../Tests/GrAIExamples/NLPExample.swift) file with the
+path to the downloaded weights.
+- Optionally update `_prompt`.
+- Rename `_testGenerate` to `testGenerate`.
+- Run the test.
+
+It is finally possible to clean the environment 🌍
+
+```bash
+conda deactivate
+conda env remove --name graiexamples
+```
+
+## Steps
+
+1. Generate text from a prompt.
+
+## Further tests
+
+Further tests are available at
+[NLPExampleTests](../../Tests/GrAIExamples/NLPExampleTests.swift).
+In order to run them, rename
+`_testPredict1` and `_testPredict32` to `testPredict1` and `testPredict32`.
+
+The test `testPredict1` compares the first step of generation
+of a toy LLM (just one transformer block) in GrAIdient and in PyTorch.
+
+The test `testPredict32` runs the first step of generation of a full LLM
+in GrAIdient and compares the result with the expected output from PyTorch.
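The `encode`, `decode` and `predict` helpers that the Swift tests drive through PythonKit can also be exercised directly, which is handy for checking the setup above before touching Swift. A minimal sketch, assuming `python_lib` was installed with `pip install -e .` as described; the weights path is a placeholder:

```python
from python_lib.nlp.generate import encode, decode, predict

# Placeholder: point this at your local MistralAI weights directory.
model_path = "/path/to/mistral/weights/"

# Tokenize the prompt, then map the tokens back to text.
tokens = encode(prompt="How do you do?", model_path=model_path)
print(decode(prompt=tokens, model_path=model_path))

# Logits of the first generation step, restricted to the first
# transformer block (n_layers=1), as in `testPredict1`.
logits = predict(prompt="How do you do?", model_path=model_path, n_layers=1)
print(logits.shape)
```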
diff --git a/Docs/Examples/VGG.md b/Docs/Examples/VGG.md
index 40f3db74..9f34de73 100644
--- a/Docs/Examples/VGG.md
+++ b/Docs/Examples/VGG.md
@@ -91,3 +91,17 @@ conda env remove --name graiexamples
 1. Train a model on the training dataset.
 1. Evaluate the trained model on the testing dataset: watch a better performance.
+
+## Benchmarks
+
+To benchmark the time performance of the VGG model, look at
+[VGGBenchmark](../../Tests/GrAIExamples/VGGBenchmark.swift) and rename
+`_test_TrainVGG` and `_test_EvalVGG` to `test_TrainVGG` and `test_EvalVGG`.
+
+The test `test_TrainVGG` measures the time spent training the VGG
+model for 20 steps.
+
+The test `test_EvalVGG` measures the time spent running the VGG model
+in inference for 20 steps.
+
+Note that for both tests, the data is random, generated only once,
+then reused at every step.
diff --git a/Docs/Examples/VisionTransformer.md b/Docs/Examples/VisionTransformer.md
index 6dfdf405..b347e7aa 100644
--- a/Docs/Examples/VisionTransformer.md
+++ b/Docs/Examples/VisionTransformer.md
@@ -86,3 +86,20 @@ conda env remove --name graiexamples
 1. Dump the training dataset.
 1. Train a simple Vision Transformer model.
+
+## Benchmarks
+
+To benchmark the time performance of the Vision Transformer model,
+look at
+[TransformerBenchmark](../../Tests/GrAIExamples/TransformerBenchmark.swift)
+and rename
+`_test_TrainTransformer` and `_test_EvalTransformer` to
+`test_TrainTransformer` and `test_EvalTransformer`.
+
+The test `test_TrainTransformer` measures the time spent training the
+VisionTransformer model for 20 steps.
+
+The test `test_EvalTransformer` measures the time spent running the
+VisionTransformer model in inference for 20 steps.
+
+Note that for both tests, the data is random, generated only once,
+then reused at every step.
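Both benchmarks follow the same protocol: generate random data once, then time a fixed number of steps over that same data. A rough PyTorch transcription of the evaluation loop, with a stand-in network (the real benchmarks time GrAIdient's VGG and Vision Transformer models):

```python
import time
import torch

# Stand-in network: the actual benchmarks run GrAIdient models.
model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 16, 3, padding=1),
    torch.nn.ReLU(),
    torch.nn.Flatten(),
    torch.nn.Linear(16 * 32 * 32, 10),
)

# Random data, generated once and reused at every step.
data = torch.randn(64, 3, 32, 32)

start = time.time()
for _ in range(20):  # 20 steps, as in the test_Train* / test_Eval* tests.
    with torch.no_grad():
        model(data)
print(f"elapsed: {time.time() - start:.3f}s")
```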
""" state = torch.load(str(Path(model_path) / "consolidated.00.pth")) tokenizer = Tokenizer(str(Path(model_path) / "tokenizer.model")) @@ -178,14 +186,15 @@ def _predict( ) tokens = _predict_no_cache( - prompt, model, temp + prompt, model, temp, n_layers ).squeeze(dim=0).cpu().numpy().tolist() print(tokenizer.decode(tokens)) def predict( prompt: str, - model_path: str + model_path: str, + n_layers: Optional[int] = None ) -> np.ndarray: """ Predict text based on the given prompt and model. @@ -196,6 +205,8 @@ def predict( The input prompt. model_path: str Path to the model on the disk. + n_layers: int + Modifier of the number of Transformer blocks. """ state = torch.load(str(Path(model_path) / "consolidated.00.pth")) tokenizer = Tokenizer(str(Path(model_path) / "tokenizer.model")) @@ -213,7 +224,7 @@ def predict( prompt = torch.tensor( tokenizer.encode(prompt), dtype=torch.long, device="mps" ) - out, _ = model(prompt[None]) + out, _ = model(prompt[None], n_layers=n_layers) return out.detach().cpu().numpy().flatten() @@ -255,12 +266,14 @@ def decode( if __name__ == "__main__": model_path = "" + prompt = "How do you do?" + _generate( prompt="How do you do?", model_path=model_path ) prompt = encode( - prompt="How do you do?", + prompt=prompt, model_path=model_path ) prompt = decode( @@ -268,10 +281,12 @@ def decode( model_path=model_path ) _predict( - prompt="How do you do?", + prompt=prompt, model_path=model_path, + n_layers=None ) predict( - prompt="How do you do?", - model_path=model_path + prompt=prompt, + model_path=model_path, + n_layers=1 ) diff --git a/Tests/GrAIExamples/Base/python_lib/nlp/model.py b/Tests/GrAIExamples/Base/python_lib/nlp/model.py index 9eabbdf4..27ad866f 100644 --- a/Tests/GrAIExamples/Base/python_lib/nlp/model.py +++ b/Tests/GrAIExamples/Base/python_lib/nlp/model.py @@ -377,6 +377,7 @@ def forward( self, x: torch.Tensor, cache=None, + n_layers=None ) -> Tuple[torch.Tensor, Optional[list]]: """ Forward pass. @@ -388,6 +389,8 @@ def forward( cache: (key_cache, value_cache): (torch.Tensor, torch.Tensor) cache for keys and values for generating tokens with past context. + n_layers: Int + Modifier of the number of Transformer blocks. Returns ------- @@ -424,9 +427,11 @@ def forward( cache = [None] * len(self.layers) for e, layer in enumerate(self.layers): + if n_layers is not None and e == n_layers: + break + h, cache[e] = layer( h, rotation_matrix=rotation_matrix, mask=mask, cache=cache[e] ) - break return self.output(self.norm(h)), cache diff --git a/Tests/GrAIExamples/NLPExample.swift b/Tests/GrAIExamples/NLPExample.swift index afd351d4..6a7b7fa4 100644 --- a/Tests/GrAIExamples/NLPExample.swift +++ b/Tests/GrAIExamples/NLPExample.swift @@ -399,140 +399,6 @@ final class NLPExample: XCTestCase } } - /// Predict text from prompt. - func _testPredict1() throws - { - let nbBlocks = 1 - let hiddenDim = 4096 - let headDim = 128 - let mlpDim = 14336 - let nbHeadsQuery = 32 - let nbHeadsKV = 8 - let vocabularySize = 32000 - - // Encode prompt. - let pythonLib = Python.import("python_lib") - let prompt = [Int](pythonLib.encode( - _prompt, - _modelPath - ))! - - // Compute reference. - let arrayRef = [Float](numpy: pythonLib.predict( - _prompt, - _modelPath - ))! - - // Load pre trained model. - let model = _buildModel( - modelPath: _modelPath, - sequence: prompt.count, - nbBlocks: nbBlocks, - hiddenDim: hiddenDim, - headDim: headDim, - mlpDim: mlpDim, - nbHeadsQuery: nbHeadsQuery, - nbHeadsKV: nbHeadsKV, - vocabularySize: vocabularySize - ) - - // Initialize for inference. 
diff --git a/Tests/GrAIExamples/Base/python_lib/nlp/model.py b/Tests/GrAIExamples/Base/python_lib/nlp/model.py
index 9eabbdf4..27ad866f 100644
--- a/Tests/GrAIExamples/Base/python_lib/nlp/model.py
+++ b/Tests/GrAIExamples/Base/python_lib/nlp/model.py
@@ -377,6 +377,7 @@ def forward(
         self,
         x: torch.Tensor,
         cache=None,
+        n_layers=None
     ) -> Tuple[torch.Tensor, Optional[list]]:
         """
         Forward pass.
@@ -388,6 +389,8 @@ def forward(
         cache: (key_cache, value_cache): (torch.Tensor, torch.Tensor)
             cache for keys and values
             for generating tokens with past context.
+        n_layers: Optional[int]
+            If not None, only run the first `n_layers` Transformer blocks.
 
         Returns
         -------
@@ -424,9 +427,11 @@ def forward(
             cache = [None] * len(self.layers)
 
         for e, layer in enumerate(self.layers):
+            if n_layers is not None and e == n_layers:
+                break
+
             h, cache[e] = layer(
                 h, rotation_matrix=rotation_matrix, mask=mask, cache=cache[e]
             )
-            break
 
         return self.output(self.norm(h)), cache
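Two things happen in the last hunk: the stray unconditional `break`, which previously stopped the loop after the first transformer block, is removed, and an opt-in early exit is added in its place. Stripped of the transformer machinery, the control flow reduces to this:

```python
# Toy blocks standing in for the transformer layers.
layers = [lambda h: h + 1, lambda h: h * 2, lambda h: h - 3]

def forward(h, n_layers=None):
    for e, layer in enumerate(layers):
        if n_layers is not None and e == n_layers:
            break  # skip the remaining blocks
        h = layer(h)
    return h

assert forward(0) == -1             # all blocks: ((0 + 1) * 2) - 3
assert forward(0, n_layers=2) == 2  # first two blocks only: (0 + 1) * 2
```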
diff --git a/Tests/GrAIExamples/NLPExample.swift b/Tests/GrAIExamples/NLPExample.swift
index afd351d4..6a7b7fa4 100644
--- a/Tests/GrAIExamples/NLPExample.swift
+++ b/Tests/GrAIExamples/NLPExample.swift
@@ -399,140 +399,6 @@ final class NLPExample: XCTestCase
         }
     }
 
-    /// Predict text from prompt.
-    func _testPredict1() throws
-    {
-        let nbBlocks = 1
-        let hiddenDim = 4096
-        let headDim = 128
-        let mlpDim = 14336
-        let nbHeadsQuery = 32
-        let nbHeadsKV = 8
-        let vocabularySize = 32000
-
-        // Encode prompt.
-        let pythonLib = Python.import("python_lib")
-        let prompt = [Int](pythonLib.encode(
-            _prompt,
-            _modelPath
-        ))!
-
-        // Compute reference.
-        let arrayRef = [Float](numpy: pythonLib.predict(
-            _prompt,
-            _modelPath
-        ))!
-
-        // Load pre trained model.
-        let model = _buildModel(
-            modelPath: _modelPath,
-            sequence: prompt.count,
-            nbBlocks: nbBlocks,
-            hiddenDim: hiddenDim,
-            headDim: headDim,
-            mlpDim: mlpDim,
-            nbHeadsQuery: nbHeadsQuery,
-            nbHeadsKV: nbHeadsKV,
-            vocabularySize: vocabularySize
-        )
-
-        // Initialize for inference.
-        model.initKernel(phase: .Inference)
-        model.updateKernel(batchSize: 1)
-
-        // Forward.
-        let firstLayer: EmbeddingSeq = model.layers.first as! EmbeddingSeq
-        try! firstLayer.setDataGPU(
-            [prompt], batchSize: 1, sequence: prompt.count
-        )
-        try! model.forward()
-
-        // Get result.
-        let arrayOut = (model.layers.last as! LayerSeq).outs.download()
-
-        // Compare difference.
-        for (elemOut, elemRef) in zip(arrayOut, arrayRef)
-        {
-            if elemRef == 0.0
-            {
-                XCTAssert(elemOut == 0.0)
-            }
-            else
-            {
-                let diffPercent = abs(elemOut - elemRef) / abs(elemRef) * 100.0
-                if diffPercent > 1
-                {
-                    print(diffPercent)
-                }
-                XCTAssert(diffPercent < 1)
-            }
-        }
-    }
-
-    /// Predict text from prompt.
-    func _testPredict32() throws
-    {
-        let nbBlocks = 32
-        let hiddenDim = 4096
-        let headDim = 128
-        let mlpDim = 14336
-        let nbHeadsQuery = 32
-        let nbHeadsKV = 8
-        let vocabularySize = 32000
-
-        // Encode prompt.
-        let pythonLib = Python.import("python_lib")
-        let prompt = [Int](pythonLib.encode(
-            _prompt,
-            _modelPath
-        ))!
-
-        // Load pre trained model.
-        let model = _buildModel(
-            modelPath: _modelPath,
-            sequence: prompt.count,
-            nbBlocks: nbBlocks,
-            hiddenDim: hiddenDim,
-            headDim: headDim,
-            mlpDim: mlpDim,
-            nbHeadsQuery: nbHeadsQuery,
-            nbHeadsKV: nbHeadsKV,
-            vocabularySize: vocabularySize
-        )
-
-        // Initialize for inference.
-        model.initKernel(phase: .Inference)
-        model.updateKernel(batchSize: 1)
-
-        // Forward.
-        let firstLayer: EmbeddingSeq = model.layers.first as! EmbeddingSeq
-        try! firstLayer.setDataGPU(
-            [prompt], batchSize: 1, sequence: prompt.count
-        )
-        try! model.forward()
-
-        // Get result.
-        let out = (model.layers.last as! LayerSeq).outs.download()
-
-        // Compute prediction for each token.
-        var predictions = [Int]()
-        for seq in 0..
diff --git a/Tests/GrAIExamples/NLPExampleTests.swift b/Tests/GrAIExamples/NLPExampleTests.swift
new file mode 100644
--- /dev/null
+++ b/Tests/GrAIExamples/NLPExampleTests.swift
+    func _argmax(_ array: [Float]) -> Int?
+    {
+        if array.isEmpty
+        {
+            return nil
+        }
+
+        var maxIndex = 0
+        var maxValue = array[0]
+        for i in 1..<array.count
+        {
+            if array[i] > maxValue
+            {
+                maxIndex = i
+                maxValue = array[i]
+            }
+        }
+        return maxIndex
+    }
+
+    ///
+    /// Build LLM model.
+    ///
+    /// - Parameters:
+    ///     - modelPath: Model path on the disk.
+    ///     - sequence: Length of the sequence.
+    ///     - nbBlocks: Number of transformer + MLP blocks.
+    ///     - hiddenDim: Dimension of neurons in the main branch.
+    ///     - headDim: Dimension of neurons in the transformer branches.
+    ///     - mlpDim: Dimension of neurons in the MLP branches.
+    ///     - nbHeadsQuery: Number of heads (groups) of neurons for queries.
+    ///     - nbHeadsKV: Number of heads (groups) of neurons for keys and values.
+    ///     - vocabularySize: Vocabulary size.
+    /// - Returns: The model built.
+    ///
+    func _buildModel(
+        modelPath: String,
+        sequence: Int,
+        nbBlocks: Int,
+        hiddenDim: Int,
+        headDim: Int,
+        mlpDim: Int,
+        nbHeadsQuery: Int,
+        nbHeadsKV: Int,
+        vocabularySize: Int) -> Model
+    {
+        let context = ModelContext(name: "NLP", curID: 0)
+        let params = GrAI.Model.Params(context: context)
+
+        var curPyTorch = 0
+        var curGrAIdient = 0
+        var dicoGrAIdient2PyTorch = [Int: Int]()
+
+        var layer: LayerSeq = EmbeddingSeq(
+            sequence: sequence,
+            vocabularySize: vocabularySize,
+            nbNeurons: hiddenDim, params: params
+        )
+        dicoGrAIdient2PyTorch[curGrAIdient] = curPyTorch
+        curGrAIdient += 1
+        curPyTorch += 1 + 2
+
+        for _ in 0..
+
+        for layer in model.layers
+        {
+            if let layerTmp = layer as? EmbeddingSeq
+            {
+                let idGrAIdient = layerTmp.id
+                let idPyTorch = dicoGrAIdient2PyTorch[idGrAIdient]!
+
+                let weightsTmp: [Float] = Array(
+                    numpy: weightsNumpy[idPyTorch]!
+                )!
+                layerTmp.weightsCPU = weightsTmp
+
+                weightsNumpy[idPyTorch] = nil
+            }
+            if let layerTmp = layer as? RMSNormSeq
+            {
+                let idGrAIdient = layerTmp.id
+                let idPyTorch = dicoGrAIdient2PyTorch[idGrAIdient]!
+
+                let weightsTmp: [Float] = Array(
+                    numpy: weightsNumpy[idPyTorch]!
+                )!
+                layerTmp.weightsCPU = weightsTmp
+
+                weightsNumpy[idPyTorch] = nil
+            }
+            if let layerTmp = layer as? FullyConnectedSeq
+            {
+                let idGrAIdient = layerTmp.id
+                let idPyTorch = dicoGrAIdient2PyTorch[idGrAIdient]!
+
+                let weightsTmp: [Float] = Array(
+                    numpy: weightsNumpy[idPyTorch]!
+                )!
+                layerTmp.weightsCPU = weightsTmp
+
+                weightsNumpy[idPyTorch] = nil
+            }
+        }
+        return model
+    }
+
+    /// Predict text from prompt.
+    func _testPredict1() throws
+    {
+        let nbBlocks = 1
+        let hiddenDim = 4096
+        let headDim = 128
+        let mlpDim = 14336
+        let nbHeadsQuery = 32
+        let nbHeadsKV = 8
+        let vocabularySize = 32000
+
+        // Encode prompt.
+        let pythonLib = Python.import("python_lib")
+        let prompt = [Int](pythonLib.encode(
+            _prompt,
+            _modelPath
+        ))!
+
+        // Compute reference.
+        let arrayRef = [Float](numpy: pythonLib.predict(
+            _prompt,
+            _modelPath,
+            1
+        ))!
+
+        // Load pre-trained model.
+        let model = _buildModel(
+            modelPath: _modelPath,
+            sequence: prompt.count,
+            nbBlocks: nbBlocks,
+            hiddenDim: hiddenDim,
+            headDim: headDim,
+            mlpDim: mlpDim,
+            nbHeadsQuery: nbHeadsQuery,
+            nbHeadsKV: nbHeadsKV,
+            vocabularySize: vocabularySize
+        )
+
+        // Initialize for inference.
+        model.initKernel(phase: .Inference)
+        model.updateKernel(batchSize: 1)
+
+        // Forward.
+        let firstLayer: EmbeddingSeq = model.layers.first as! EmbeddingSeq
+        try! firstLayer.setDataGPU(
+            [prompt], batchSize: 1, sequence: prompt.count
+        )
+        try! model.forward()
+
+        // Get result.
+        let arrayOut = (model.layers.last as! LayerSeq).outs.download()
+
+        // Compare difference.
+        for (elemOut, elemRef) in zip(arrayOut, arrayRef)
+        {
+            if elemRef == 0.0
+            {
+                XCTAssert(elemOut == 0.0)
+            }
+            else
+            {
+                let diffPercent = abs(elemOut - elemRef) / abs(elemRef) * 100.0
+                if diffPercent > 1
+                {
+                    print(diffPercent)
+                }
+                XCTAssert(diffPercent < 1)
+            }
+        }
+    }
+
+    /// Predict text from prompt.
+    func _testPredict32() throws
+    {
+        let nbBlocks = 32
+        let hiddenDim = 4096
+        let headDim = 128
+        let mlpDim = 14336
+        let nbHeadsQuery = 32
+        let nbHeadsKV = 8
+        let vocabularySize = 32000
+
+        // Encode prompt.
+        let pythonLib = Python.import("python_lib")
+        let prompt = [Int](pythonLib.encode(
+            _prompt,
+            _modelPath
+        ))!
+
+        // Load pre-trained model.
+        let model = _buildModel(
+            modelPath: _modelPath,
+            sequence: prompt.count,
+            nbBlocks: nbBlocks,
+            hiddenDim: hiddenDim,
+            headDim: headDim,
+            mlpDim: mlpDim,
+            nbHeadsQuery: nbHeadsQuery,
+            nbHeadsKV: nbHeadsKV,
+            vocabularySize: vocabularySize
+        )
+
+        // Initialize for inference.
+        model.initKernel(phase: .Inference)
+        model.updateKernel(batchSize: 1)
+
+        // Forward.
+        let firstLayer: EmbeddingSeq = model.layers.first as! EmbeddingSeq
+        try! firstLayer.setDataGPU(
+            [prompt], batchSize: 1, sequence: prompt.count
+        )
+        try! model.forward()
+
+        // Get result.
+        let out = (model.layers.last as! LayerSeq).outs.download()
+
+        // Compute prediction for each token.
+        var predictions = [Int]()
+        for seq in 0..
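The element-wise comparison in `testPredict1` amounts to a relative-error check: an exact match wherever the PyTorch reference is zero, and less than 1% relative difference everywhere else. The same check, transposed to NumPy (a sketch, not part of the PR):

```python
import numpy as np

def check_close(out: np.ndarray, ref: np.ndarray, percent: float = 1.0) -> bool:
    """Mirror the Swift loop: exact zeros, < 1% relative error elsewhere."""
    zero = ref == 0.0
    if not np.all(out[zero] == 0.0):
        return False
    diff = np.abs(out[~zero] - ref[~zero]) / np.abs(ref[~zero]) * 100.0
    return bool(np.all(diff < percent))
```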