diff --git a/backend/generate_answer.py b/backend/generate_answer.py
index 2dde683..4bce9b6 100644
--- a/backend/generate_answer.py
+++ b/backend/generate_answer.py
@@ -80,20 +80,28 @@ def generate_note(page_content, note_content, model, tokenizer):
         state = model.get_state(page_content, note_content)
         outputs = model.act(state)
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
-    else:
-        # fallback to original implementation
-        inputs = tokenizer(f"I did not like this note: {note_content}. Generate new notes for the given content: {page_content}", return_tensors="pt")
-        # move inputs to same device as model
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-
-        outputs = model.generate(
-            **inputs,
-            max_length=2048,
-            num_return_sequences=1,
-            pad_token_id=tokenizer.eos_token_id,
-            temperature=0.7,
-        )
-        return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # fallback to original implementation
+    inputs = tokenizer(f"I did not like this note: {note_content}. Generate new notes for the given content: {page_content}", return_tensors="pt")
+    # move inputs to same device as model
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    outputs = model.generate(
+        **inputs,
+        max_length=2048,
+        pad_token_id=tokenizer.eos_token_id,
+        num_return_sequences=1,
+        temperature=0.7,
+        output_scores=False,  # Exclude unnecessary scores
+        return_dict_in_generate=True,  # Return generation metadata
+    )
+
+    # Extract the generated tokens beyond the input tokens
+    generated_tokens = outputs.sequences[0][inputs['input_ids'].shape[-1]:]
+
+    # Decode the generated tokens
+    final_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+    return final_output
 
 
 def generate_quiz_review(origin_content, wrong_questions, student_history):
     rl_model = RLModel(Config.MODEL_NAME)
diff --git a/backend/lora.py b/backend/lora.py
index 04fdd59..e495882 100644
--- a/backend/lora.py
+++ b/backend/lora.py
@@ -9,6 +9,7 @@ def fine_tune_and_save_lora_weights(model_name, data, output_dir="./lora_weights
     """
     Fine-tunes the model using the given dataset and saves the LoRA weights.
""" + device = "cuda" if torch.cuda.is_available() else "cpu" dataset = Dataset.from_list(data) # Use GPU if available @@ -21,14 +22,15 @@ def fine_tune_and_save_lora_weights(model_name, data, output_dir="./lora_weights ) # Load tokenizer and model - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="right", truncation_side="right") + tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="right", truncation_side="right", max_next_tokens=300, token=Config.HUGGINGFACE_ACCESS_TOKEN) tokenizer.pad_token = tokenizer.eos_token model = AutoModelForCausalLM.from_pretrained( model_name, device_map="auto", quantization_config=bnb_config, - torch_dtype=torch.float16 + torch_dtype=torch.float16, + token=Config.HUGGINGFACE_ACCESS_TOKEN ).to(device) model.config.pad_token_id = tokenizer.pad_token_id model.config.use_cache = False @@ -125,12 +127,13 @@ def apply_lora_weights_to_model(base_model_name, lora_weights_dir): ) # Load tokenizer and base model - tokenizer = AutoTokenizer.from_pretrained(base_model_name, padding_side="right", truncation_side="right") + tokenizer = AutoTokenizer.from_pretrained(base_model_name, padding_side="right", truncation_side="right", token=Config.HUGGINGFACE_ACCESS_TOKEN, max_new_tokens=300) base_model = AutoModelForCausalLM.from_pretrained( base_model_name, device_map="auto", quantization_config=bnb_config, torch_dtype=torch.float16, + token=Config.HUGGINGFACE_ACCESS_TOKEN, ).to(device) # Apply LoRA weights