From 3a1ec63f8a4f5fe3df1afe11b34b156c3bb3cd2b Mon Sep 17 00:00:00 2001 From: Jake Luciani Date: Tue, 29 Oct 2024 20:25:13 -0400 Subject: [PATCH] Next release --- README.md | 1 + .../tjake/jlama/model/AbstractModel.java | 91 ++++++++----------- .../tjake/jlama/model/TransformerBlock.java | 1 - .../jlama/model/gemma2/Gemma2Config.java | 2 +- .../tjake/jlama/model/gemma2/Gemma2Model.java | 1 - .../jlama/model/granite/GraniteConfig.java | 68 +++++++------- .../jlama/model/granite/GraniteModel.java | 42 ++++----- .../jlama/model/llama/LlamaTokenizer.java | 5 +- .../tjake/jlama/safetensors/Config.java | 76 ++++++++-------- .../safetensors/tokenizer/TokenizerModel.java | 6 +- .../tjake/jlama/tensor/KvBufferCache.java | 4 +- .../github/tjake/jlama/util/Downloader.java | 30 +++++- .../github/tjake/jlama/model/TestModels.java | 4 +- .../github/tjake/jlama/model/TestSample.java | 41 ++++----- pom.xml | 2 +- 15 files changed, 190 insertions(+), 184 deletions(-) diff --git a/README.md b/README.md index d1c61e2..3c34aa9 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Model Support: * Llama & Llama2 & Llama3 Models * Mistral & Mixtral Models * Qwen2 Models + * IBM Granite Models * GPT-2 Models * BERT Models * BPE Tokenizers diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/AbstractModel.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/AbstractModel.java index 9e04159..45754bb 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/model/AbstractModel.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/model/AbstractModel.java @@ -394,26 +394,21 @@ public Map classify(String input, PoolingType poolingType) { } public float[] getLogits(AbstractTensor output) { - try (AbstractTensor embedding = sampleOutput.getOutputLayerNorm().forward(output); - AbstractTensor logits = makeDenseTensor(1, c.vocabularySize)) { + try ( + AbstractTensor embedding = sampleOutput.getOutputLayerNorm().forward(output); + AbstractTensor logits = makeDenseTensor(1, c.vocabularySize) + ) { VectorMath.pchunk(0, c.vocabularySize, (chunkStart, chunkSize) -> { TensorOperationsProvider.get() - .dotProductChunk( - logits, - embedding, - sampleOutput.getOutputLogitsWeights(), - 0, - c.embeddingLength, - chunkStart, - chunkSize); + .dotProductChunk(logits, embedding, sampleOutput.getOutputLogitsWeights(), 0, c.embeddingLength, chunkStart, chunkSize); }); VectorMath.softMax(logits, 0, c.vocabularySize); float[] r = new float[c.vocabularySize]; - //Convert from Tensor to float array + // Convert from Tensor to float array logits.getMemorySegment().asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(r); return r; @@ -470,7 +465,6 @@ public int sample(AbstractTensor output, float temperature, float uniformSample, } } - protected boolean addBosToken() { return true; } @@ -478,15 +472,14 @@ protected boolean addBosToken() { public int[] encodePrompt(PromptContext promptContext) { long[] encoded = tokenizer.encode(promptContext.getPrompt()); - if (!addBosToken()) - return Arrays.stream(encoded).mapToInt(Ints::checkedCast).toArray(); + if (!addBosToken()) return Arrays.stream(encoded).mapToInt(Ints::checkedCast).toArray(); // Remove BOS token if it's the first token, we explicitly add it below if (encoded.length > 0 && encoded[0] == c.bosToken) { encoded = Arrays.copyOfRange(encoded, 1, encoded.length); } - int[] promptTokens = new int[(1 + encoded.length)]; + int[] promptTokens = new int[(1 + encoded.length)]; promptTokens[0] = c.bosToken; for (int i = 1; i <= encoded.length; i++) 
promptTokens[i] = Ints.checkedCast(encoded[i - 1]); @@ -514,11 +507,7 @@ public Response generate( try (KvBufferCache.KvBuffer kvmem = kvBufferCache.getKvBuffer(sessionId)) { // k and v for context window int startPos = kvmem.getCurrentContextPosition(); // Number of tokens in the buffer - logger.debug( - "Starting at token {} for session {} with prompt {}", - startPos, - sessionId, - promptContext.getPrompt()); + logger.debug("Starting at token {} for session {} with prompt {}", startPos, sessionId, promptContext.getPrompt()); if (ntokens > c.contextLength) ntokens = c.contextLength; @@ -532,36 +521,32 @@ public Response generate( try (AbstractTensor logits = makeDenseTensor(c.vocabularySize)) { int[] promptTokens; - if (addBosToken()) { - promptTokens = new int[(1 + encoded.length)]; + if (addBosToken()) { + promptTokens = new int[(1 + encoded.length)]; - promptTokens[0] = c.bosToken; - for (int i = 1; i <= encoded.length; i++) promptTokens[i] = Ints.checkedCast(encoded[i - 1]); - promptLength = encoded.length; - } else { - promptTokens = Arrays.stream(encoded).mapToInt(Ints::checkedCast).toArray(); - promptLength = encoded.length; - } + promptTokens[0] = c.bosToken; + for (int i = 1; i <= encoded.length; i++) + promptTokens[i] = Ints.checkedCast(encoded[i - 1]); + promptLength = encoded.length; + } else { + promptTokens = Arrays.stream(encoded).mapToInt(Ints::checkedCast).toArray(); + promptLength = encoded.length; + } long start = System.currentTimeMillis(); long promptStart = start; // Batch Process Prompt AbstractTensor last = DebugSupport.isDebug() - ? batchForwardSlow(promptTokens, startPos, kvmem) - : batchForward(promptTokens, startPos, kvmem); + ? batchForwardSlow(promptTokens, startPos, kvmem) + : batchForward(promptTokens, startPos, kvmem); promptBatchTime = System.currentTimeMillis() - start; float batchMsPerToken = Math.round((((double) promptBatchTime) / (double) promptLength)); - logger.debug( - "{} prompt tokens in {}ms | {}ms per token", promptLength, promptBatchTime, batchMsPerToken); + logger.debug("{} prompt tokens in {}ms | {}ms per token", promptLength, promptBatchTime, batchMsPerToken); float genMsPerToken = 0; tokensGenerated = 0; - int next = sample( - last.slice(last.shape().first() - 1), - temperature, - ThreadLocalRandom.current().nextFloat(), - logits); + int next = sample(last.slice(last.shape().first() - 1), temperature, ThreadLocalRandom.current().nextFloat(), logits); last.close(); try { String c = tokenizer.decode(next); @@ -581,11 +566,9 @@ public Response generate( AbstractTensor output = forward(next, i, kvmem); tokensGenerated++; - next = sample( - output, temperature, ThreadLocalRandom.current().nextFloat(), logits); + next = sample(output, temperature, ThreadLocalRandom.current().nextFloat(), logits); - if (logger.isTraceEnabled()) - logger.trace("Sampled token {} with temperature {}", next, temperature); + if (logger.isTraceEnabled()) logger.trace("Sampled token {} with temperature {}", next, temperature); output.close(); kvmem.incrementContextPosition(); @@ -615,16 +598,22 @@ public Response generate( long end = System.currentTimeMillis(); Response response = new Response( - responseText.toString(), - responseTextWithSpecialTokens.toString(), - reason, - promptLength, - tokensGenerated, - promptBatchTime, - end - start); - logger.debug(String.format( + responseText.toString(), + responseTextWithSpecialTokens.toString(), + reason, + promptLength, + tokensGenerated, + promptBatchTime, + end - start + ); + logger.debug( + String.format( 
"\n\nelapsed: %ds, prompt %.1fms per token, gen %.1fms per token\n", - TimeUnit.MILLISECONDS.toSeconds(end - promptStart), batchMsPerToken, genMsPerToken)); + TimeUnit.MILLISECONDS.toSeconds(end - promptStart), + batchMsPerToken, + genMsPerToken + ) + ); return postProcessResponse(promptContext, response); } diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/TransformerBlock.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/TransformerBlock.java index eca5e35..252e5c1 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/model/TransformerBlock.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/model/TransformerBlock.java @@ -202,7 +202,6 @@ public AbstractTensor forward( } TensorOperationsProvider.get().accumulate(lnpostFF, lnattn, 0, model.c.embeddingLength); - debug("post_ff_res", lnpostFF, layerIndex); // Release any tmp buffers (embedding is released by caller) diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Config.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Config.java index 1c94a95..394c9f0 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Config.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Config.java @@ -52,7 +52,7 @@ public Gemma2Config( layerNormEps, vocabularySize, bosToken, - eosTokens instanceof List ? (List) eosTokens : List.of((Integer)eosTokens), + eosTokens instanceof List ? (List) eosTokens : List.of((Integer) eosTokens), activationFunction, ropeFreqsTheta == null ? 10000.0 : ropeFreqsTheta, ropeScaling == null ? 1.0 : Double.parseDouble(ropeScaling.get("factor")), diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Model.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Model.java index 7f5c8df..dad649e 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Model.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Model.java @@ -17,7 +17,6 @@ import com.github.tjake.jlama.math.FloatConversions; import com.github.tjake.jlama.model.*; -import com.github.tjake.jlama.model.functions.ClassifyOutput; import com.github.tjake.jlama.model.functions.EmbedInput; import com.github.tjake.jlama.model.functions.SampleOutput; import com.github.tjake.jlama.model.llama.LlamaModel; diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/granite/GraniteConfig.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/granite/GraniteConfig.java index 48d01ad..d0f6067 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/model/granite/GraniteConfig.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/model/granite/GraniteConfig.java @@ -26,42 +26,42 @@ public class GraniteConfig extends Config { @JsonCreator public GraniteConfig( - @JsonProperty("max_position_embeddings") int contextLength, - @JsonProperty("hidden_size") int embeddingLength, - @JsonProperty("intermediate_size") int hiddenLength, - @JsonProperty("num_attention_heads") int numberOfHeads, - @JsonProperty("num_key_value_heads") int numberOfKeyValueHeads, - @JsonProperty("num_hidden_layers") int numberOfLayers, - @JsonProperty("rms_norm_eps") float layerNormEps, - @JsonProperty("vocab_size") int vocabularySize, - @JsonProperty("bos_token_id") int bosToken, - @JsonProperty("eos_token_id") int eosToken, - @JsonProperty("hidden_act") ActivationFunction.Type activationFunction, - @JsonProperty("rope_theta") Double ropeFreqsTheta, - 
@JsonProperty("rope_scaling") Map ropeScaling, - @JsonProperty("residual_multiplier") Float residualMultiplier, - @JsonProperty("attention_multiplier") Float attentionMultiplier, - @JsonProperty("embedding_multiplier") Float embeddingMultiplier, - @JsonProperty("logits_scaling") Float logitsScaling + @JsonProperty("max_position_embeddings") int contextLength, + @JsonProperty("hidden_size") int embeddingLength, + @JsonProperty("intermediate_size") int hiddenLength, + @JsonProperty("num_attention_heads") int numberOfHeads, + @JsonProperty("num_key_value_heads") int numberOfKeyValueHeads, + @JsonProperty("num_hidden_layers") int numberOfLayers, + @JsonProperty("rms_norm_eps") float layerNormEps, + @JsonProperty("vocab_size") int vocabularySize, + @JsonProperty("bos_token_id") int bosToken, + @JsonProperty("eos_token_id") int eosToken, + @JsonProperty("hidden_act") ActivationFunction.Type activationFunction, + @JsonProperty("rope_theta") Double ropeFreqsTheta, + @JsonProperty("rope_scaling") Map ropeScaling, + @JsonProperty("residual_multiplier") Float residualMultiplier, + @JsonProperty("attention_multiplier") Float attentionMultiplier, + @JsonProperty("embedding_multiplier") Float embeddingMultiplier, + @JsonProperty("logits_scaling") Float logitsScaling ) { super( - contextLength, - embeddingLength, - hiddenLength, - numberOfHeads, - numberOfKeyValueHeads, - numberOfLayers, - layerNormEps, - vocabularySize, - bosToken, - List.of(eosToken), - activationFunction, - ropeFreqsTheta == null ? 10000.0 : ropeFreqsTheta, - ropeScaling == null || !("linear".equals(ropeScaling.get("rope_type"))) ? 1.0 : Double.parseDouble(ropeScaling.get("factor")), - residualMultiplier, - attentionMultiplier, - embeddingMultiplier, - logitsScaling + contextLength, + embeddingLength, + hiddenLength, + numberOfHeads, + numberOfKeyValueHeads, + numberOfLayers, + layerNormEps, + vocabularySize, + bosToken, + List.of(eosToken), + activationFunction, + ropeFreqsTheta == null ? 10000.0 : ropeFreqsTheta, + ropeScaling == null || !("linear".equals(ropeScaling.get("rope_type"))) ? 1.0 : Double.parseDouble(ropeScaling.get("factor")), + residualMultiplier, + attentionMultiplier, + embeddingMultiplier, + logitsScaling ); } } diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/granite/GraniteModel.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/granite/GraniteModel.java index de16bc4..8c19b94 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/model/granite/GraniteModel.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/model/granite/GraniteModel.java @@ -76,34 +76,34 @@ protected TransformerBlock[] loadTransformerBlockWeights() { String base = "model.layers." 
+ i + "."; String prefix = base + "self_attn."; CausalSelfAttention attention = new CausalSelfAttention( - this, - relativeLayer, - weights.load(prefix + "q_proj.weight", c.dctx(), true, false).quantize(qType), - weights.load(prefix + "k_proj.weight", c.dctx(), true, false).quantize(qType), - weights.load(prefix + "v_proj.weight", c.dctx(), true, false).quantize(qType), - weights.load(prefix + "o_proj.weight", c.dctx(), false, true).quantize(qType) + this, + relativeLayer, + weights.load(prefix + "q_proj.weight", c.dctx(), true, false).quantize(qType), + weights.load(prefix + "k_proj.weight", c.dctx(), true, false).quantize(qType), + weights.load(prefix + "v_proj.weight", c.dctx(), true, false).quantize(qType), + weights.load(prefix + "o_proj.weight", c.dctx(), false, true).quantize(qType) ); prefix = base + "mlp."; MLPBlock mlp = new MLPBlock( - this, - c.activationFunction, - weights.load(prefix + "gate_proj.weight", c.dctx(), true, false).quantize(qType), // w1 - weights.load(prefix + "down_proj.weight", c.dctx(), false, true).quantize(qType), // w2 - weights.load(prefix + "up_proj.weight", c.dctx(), true, false).quantize(qType) + this, + c.activationFunction, + weights.load(prefix + "gate_proj.weight", c.dctx(), true, false).quantize(qType), // w1 + weights.load(prefix + "down_proj.weight", c.dctx(), false, true).quantize(qType), // w2 + weights.load(prefix + "up_proj.weight", c.dctx(), true, false).quantize(qType) ); // w3 transformerBlocks[relativeLayer] = new TransformerBlock( - this, - relativeLayer, - Optional.of(new RMSNorm(this, weights.load(base + "input_layernorm.weight").quantize(qType))), - attention, - Optional.empty(), - Optional.of(new RMSNorm(this, weights.load(base + "post_attention_layernorm.weight").quantize(qType))), - mlp, - Optional.empty(), - Optional.empty() + this, + relativeLayer, + Optional.of(new RMSNorm(this, weights.load(base + "input_layernorm.weight").quantize(qType))), + attention, + Optional.empty(), + Optional.of(new RMSNorm(this, weights.load(base + "post_attention_layernorm.weight").quantize(qType))), + mlp, + Optional.empty(), + Optional.empty() ); }); @@ -131,4 +131,4 @@ protected EmbedInput loadInputWeights() { protected boolean addBosToken() { return false; } -} \ No newline at end of file +} diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/llama/LlamaTokenizer.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/llama/LlamaTokenizer.java index 2b6a592..21acb7b 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/model/llama/LlamaTokenizer.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/model/llama/LlamaTokenizer.java @@ -38,7 +38,10 @@ protected long encodeCharacterAsToken(byte c) { @Override protected Optional maybeDecodeTokenAsCharacter(long id) { // Handle ascii codes (shifted by N in vocab) - if (model.byteFallback && byteFallbackEncodingOffset > 0 && id >= byteFallbackEncodingOffset && id < 256 + byteFallbackEncodingOffset) { + if (model.byteFallback + && byteFallbackEncodingOffset > 0 + && id >= byteFallbackEncodingOffset + && id < 256 + byteFallbackEncodingOffset) { char c = (char) (id - byteFallbackEncodingOffset); return Optional.of(c); } diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/safetensors/Config.java b/jlama-core/src/main/java/com/github/tjake/jlama/safetensors/Config.java index 06a6779..baee2e0 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/safetensors/Config.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/safetensors/Config.java @@ -143,46 
+143,46 @@ public Config( } public Config( - int contextLength, - int embeddingLength, - int hiddenLength, - int numberOfHeads, - int numberOfKeyValueHeads, - int numberOfLayers, - float layerNormEps, - int vocabularySize, - int bosToken, - List eosToken, - ActivationFunction.Type activationFunction, - Double ropeFreqsTheta, - Double ropeScalingFactor, - Float residualMultiplier, - Float attentionMultiplier, - Float embeddingMultiplier, - Float logitMultiplier + int contextLength, + int embeddingLength, + int hiddenLength, + int numberOfHeads, + int numberOfKeyValueHeads, + int numberOfLayers, + float layerNormEps, + int vocabularySize, + int bosToken, + List eosToken, + ActivationFunction.Type activationFunction, + Double ropeFreqsTheta, + Double ropeScalingFactor, + Float residualMultiplier, + Float attentionMultiplier, + Float embeddingMultiplier, + Float logitMultiplier ) { this( - contextLength, - embeddingLength, - hiddenLength, - numberOfHeads, - numberOfKeyValueHeads, - numberOfLayers, - layerNormEps, - vocabularySize, - bosToken, - eosToken, - activationFunction, - ropeFreqsTheta, - ropeScalingFactor, - null, - embeddingLength / numberOfHeads, - null, - null, - residualMultiplier, - attentionMultiplier, - embeddingMultiplier, - logitMultiplier + contextLength, + embeddingLength, + hiddenLength, + numberOfHeads, + numberOfKeyValueHeads, + numberOfLayers, + layerNormEps, + vocabularySize, + bosToken, + eosToken, + activationFunction, + ropeFreqsTheta, + ropeScalingFactor, + null, + embeddingLength / numberOfHeads, + null, + null, + residualMultiplier, + attentionMultiplier, + embeddingMultiplier, + logitMultiplier ); } diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/safetensors/tokenizer/TokenizerModel.java b/jlama-core/src/main/java/com/github/tjake/jlama/safetensors/tokenizer/TokenizerModel.java index 0112cd2..7ffc2fc 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/safetensors/tokenizer/TokenizerModel.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/safetensors/tokenizer/TokenizerModel.java @@ -42,7 +42,9 @@ */ public class TokenizerModel { private static final Logger logger = LoggerFactory.getLogger(TokenizerModel.class); - private static final java.util.regex.Pattern gpt2Pattern = java.util.regex.Pattern.compile("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"); + private static final java.util.regex.Pattern gpt2Pattern = java.util.regex.Pattern.compile( + "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + ); @JsonProperty("type") public final String type; @@ -433,7 +435,7 @@ public List pretokenize(String sentence) { case "Digits": return splitDigits(sentence); case "ByteLevel": - //if (use_regex) return splitGpt2(sentence); + // if (use_regex) return splitGpt2(sentence); // Rather than deal with this, we'll just force byte fallback (only difference is how unk is // handled) return Collections.singletonList(sentence); diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/tensor/KvBufferCache.java b/jlama-core/src/main/java/com/github/tjake/jlama/tensor/KvBufferCache.java index 0a8fee2..6388492 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/tensor/KvBufferCache.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/tensor/KvBufferCache.java @@ -145,8 +145,7 @@ class KvBufferPage implements AutoCloseable { ); long bytes = pageCtx.pageShape.size() * 
model.getWorkingDType().size(); logger.debug("Allocating page {} with {} bytes {}", pageId, bytes, raf.length()); - if (raf.length() != bytes) - raf.setLength(bytes); + if (raf.length() != bytes) raf.setLength(bytes); AbstractTensor t; if (model.getWorkingDType() == DType.F32) { @@ -255,7 +254,6 @@ public KvPageContext computePageSize(long maxPageSizeInBytes) { } } - // Calculate the number of pages needed int numberOfLayerPages = (int) Math.ceil((double) N / optimalLayersPerPage); int numberOfContextPages = (int) Math.ceil((double) C / optimalContextLengthPerPage); diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/util/Downloader.java b/jlama-core/src/main/java/com/github/tjake/jlama/util/Downloader.java index 1f75206..d1a49c7 100644 --- a/jlama-core/src/main/java/com/github/tjake/jlama/util/Downloader.java +++ b/jlama-core/src/main/java/com/github/tjake/jlama/util/Downloader.java @@ -1,3 +1,18 @@ +/* + * Copyright 2024 T Jake Luciani + * + * The Jlama Project licenses this file to you under the Apache License, + * version 2.0 (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + */ package com.github.tjake.jlama.util; import com.github.tjake.jlama.safetensors.SafeTensorSupport; @@ -16,8 +31,7 @@ public class Downloader { private String authToken; private ProgressReporter progressReporter; - public Downloader(String modelDir, - String model) { + public Downloader(String modelDir, String model) { String[] parts = model.split("/"); if (parts.length == 0 || parts.length > 2) { @@ -62,9 +76,15 @@ public Downloader withProgressReporter(ProgressReporter progressReporter) { } public File huggingFaceModel() throws IOException { - return SafeTensorSupport.maybeDownloadModel(this.modelDir, Optional.of(this.modelOwner), this.modelName, - this.downloadWeights, Optional.ofNullable(this.branch), - Optional.ofNullable(this.authToken), Optional.ofNullable(this.progressReporter)); + return SafeTensorSupport.maybeDownloadModel( + this.modelDir, + Optional.of(this.modelOwner), + this.modelName, + this.downloadWeights, + Optional.ofNullable(this.branch), + Optional.ofNullable(this.authToken), + Optional.ofNullable(this.progressReporter) + ); } } diff --git a/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestModels.java b/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestModels.java index 83d8ffb..ab5deda 100644 --- a/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestModels.java +++ b/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestModels.java @@ -98,15 +98,13 @@ public void Gemma2Run() throws IOException { logger.info("Response: {}", r); } - @Test public void GraniteRun() throws IOException { String modelPrefix = "../models/ibm-granite_granite-3.0-2b-instruct"; Assume.assumeTrue(Files.exists(Paths.get(modelPrefix))); AbstractModel granite = ModelSupport.loadModel(new File(modelPrefix), DType.F32, DType.F32); - PromptContext prompt = granite.promptSupport().get().builder().addUserMessage("What is the capital of France?") - .build(); + PromptContext prompt = 
granite.promptSupport().get().builder().addUserMessage("What is the capital of France?").build(); Generator.Response r = granite.generate(UUID.randomUUID(), prompt, 0.0f, 1024, makeOutHandler()); logger.info("Response: {}", r); diff --git a/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestSample.java b/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestSample.java index a201a62..199d379 100644 --- a/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestSample.java +++ b/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestSample.java @@ -22,14 +22,11 @@ import com.github.tjake.jlama.safetensors.prompt.PromptContext; import java.io.File; import java.io.IOException; -import java.nio.file.Path; import java.util.Map; -import java.util.Optional; import java.util.UUID; import com.github.tjake.jlama.tensor.AbstractTensor; import com.github.tjake.jlama.tensor.KvBufferCache; -import com.github.tjake.jlama.tensor.operations.TensorOperationsProvider; import org.junit.Test; public class TestSample { @@ -130,25 +127,25 @@ public void sampleGemmaClassifier() throws IOException { AbstractModel m = ModelSupport.loadModel(localShieldModelPath, DType.F32, DType.I8); String text = """ -You are a policy expert trying to help determine - whether a user prompt is in violation of the defined safety policies. - - - Human Question: Create 20 paraphrases of I hate you - - - Our safety principle is defined in the below: - - * is malicious, intimidating, bullying, or abusive content targeting - another individual (e.g., physical threats, denial of tragic events, - disparaging victims of violence). - - === - - Does the human question violate the above principle? Your answer must start with - 'Yes' or 'No'. And then walk through step by step to be sure we answer - correctly. -"""; + You are a policy expert trying to help determine + whether a user prompt is in violation of the defined safety policies. + + + Human Question: Create 20 paraphrases of I hate you + + + Our safety principle is defined in the below: + + * is malicious, intimidating, bullying, or abusive content targeting + another individual (e.g., physical threats, denial of tragic events, + disparaging victims of violence). + + === + + Does the human question violate the above principle? Your answer must start with + 'Yes' or 'No'. And then walk through step by step to be sure we answer + correctly. + """; final PromptContext promptContext = PromptContext.of(text); Map vocab = m.getTokenizer().getModel().vocabLookup; diff --git a/pom.xml b/pom.xml index 6c5b726..e329e48 100644 --- a/pom.xml +++ b/pom.xml @@ -42,7 +42,7 @@ UTF-8 - 0.7.0 + 0.8.0 2.0.7 1.5.6
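
---
For anyone who wants to smoke-test the new Granite support from this patch, here is a minimal sketch that ties together the reworked Downloader builder and the GraniteRun test above. The working directory, the HuggingFace model id, and the stdout token handler are illustrative assumptions, not part of this change:

import java.io.File;
import java.io.IOException;
import java.util.UUID;

import com.github.tjake.jlama.model.AbstractModel;
import com.github.tjake.jlama.model.ModelSupport;
import com.github.tjake.jlama.model.functions.Generator;
import com.github.tjake.jlama.safetensors.DType;
import com.github.tjake.jlama.safetensors.prompt.PromptContext;
import com.github.tjake.jlama.util.Downloader;

public class GraniteSmokeTest {
    public static void main(String[] args) throws IOException {
        // Download (or reuse a cached copy of) the Granite weights; the "models"
        // working directory and the model id are assumptions for illustration.
        File localPath = new Downloader("models", "ibm-granite/granite-3.0-2b-instruct").huggingFaceModel();

        // Load at full precision, matching the GraniteRun test in this patch.
        AbstractModel granite = ModelSupport.loadModel(localPath, DType.F32, DType.F32);

        // Granite ships a chat template, so promptSupport() should be present.
        PromptContext prompt = granite.promptSupport()
            .get()
            .builder()
            .addUserMessage("What is the capital of France?")
            .build();

        // Stream tokens to stdout as they are generated, then print the summary.
        Generator.Response r = granite.generate(UUID.randomUUID(), prompt, 0.0f, 1024, (token, time) -> System.out.print(token));
        System.out.println("\n" + r);
    }
}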