Commit

InfiniGPT config added
ArionDas committed Jun 25, 2024
1 parent c9da909 commit ae52ec2
Showing 4 changed files with 32 additions and 5 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,5 +2,5 @@ torch
pandas
ipykernel
tiktoken
-numpy
+numpy==1.26.4
transformers
15 changes: 13 additions & 2 deletions src/infini_attention.py
@@ -5,8 +5,9 @@
from transformers.modeling_utils import Cache
from transformers import AutoConfig
from rotary_embeddings import RotaryEmbedding
+from infini_gpt_config import INFINIGPT_CONFIG

-### Rotary Embeddings copied from jlamprou repo
+### Rotary Embeddings from jlamprou repo
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
@@ -137,6 +138,10 @@ def forward(
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)

"""
bsz = batch_size
q_len = sequence_length
"""
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1,2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -179,11 +184,17 @@ def forward(
dropout_p=self.attention_dropout if self.training else 0.0,
)

-combined_output = self.long_term_injection_(attn_output, memory_output)
+combined_output = self.long_term_memory_injection_(attn_output, memory_output)

#### output for this segment
combined_output = combined_output.transpose(1,2).contiguous()
combined_output = combined_output.view(bsz, q_len, self.hidden_size)
final_output = self.o_proj(combined_output)
return final_output, None, past_key_value


""" Confusions :
1) **Cache** ?? How to use it to store the past key value states in the input stream?
2) Tensor dimensions have to be matched from the dataloader with that of infini_attention.
"""

14 changes: 14 additions & 0 deletions src/infini_gpt_config.py
@@ -0,0 +1,14 @@
INFINIGPT_CONFIG = {
"vocab_size": 50257, # Vocabulary size
"context_length": 1024, # Context length
"emb_dim": 128, # Embedding dimension
"num_attention_heads": 8, # Number of attention heads
"n_layers": 12, # Number of layers
"drop_rate": 0.1, # Dropout rate
"qkv_bias": False, # Query-Key-Value bias
"hidden_size": 4096, # Hidden size
}

## segment length = 2048
## sequence length = 32768
## num of segments = 32768/2048 = 16
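
For illustration only (not part of this commit), this is how a full-length sequence could be split into the 16 segments described above; segment_length and sequence_length are taken from the comments, not from keys of INFINIGPT_CONFIG.

import torch
from infini_gpt_config import INFINIGPT_CONFIG

segment_length = 2048
sequence_length = 32768
num_segments = sequence_length // segment_length                # 32768 // 2048 = 16

tokens = torch.randint(0, INFINIGPT_CONFIG["vocab_size"], (1, sequence_length))
segments = tokens.split(segment_length, dim=1)                  # 16 tensors of shape (1, 2048)
print(len(segments), segments[0].shape)                         # 16 torch.Size([1, 2048])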
6 changes: 4 additions & 2 deletions src/main.py
@@ -7,15 +7,17 @@
from torch.utils.data import Dataset, DataLoader
from data_preprocessing import InfiniGPTDataset, InfiniGPTDataLoader
from attention import CausalSelfAttention, MultiHeadAttention
+from infini_gpt_config import INFINIGPT_CONFIG


def main():

### Hyperparameters
-vocab_size = 50257
+config = INFINIGPT_CONFIG
+vocab_size = config["vocab_size"]
output_dim = 256
max_length = 4
-context_length = 1024
+context_length = config["context_length"]

### Embeddings
token_embedding_layer = nn.Embedding(vocab_size, output_dim)
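
A quick shape check (not from the commit) of what these hyperparameters imply for the token embeddings; the input batch here is synthetic.

import torch
import torch.nn as nn
from infini_gpt_config import INFINIGPT_CONFIG

vocab_size = INFINIGPT_CONFIG["vocab_size"]
token_embedding_layer = nn.Embedding(vocab_size, 256)           # output_dim = 256, as in main()
batch = torch.randint(0, vocab_size, (8, 4))                    # (batch_size=8, max_length=4)
print(token_embedding_layer(batch).shape)                       # torch.Size([8, 4, 256])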
