Commit
comments resolved and functional test
transcendingvictor committed Feb 17, 2024
1 parent 67bf0f2 commit a3fff6f
Showing 3 changed files with 71 additions and 14 deletions.
5 changes: 3 additions & 2 deletions scripts/generate_logprobs.sh
@@ -3,7 +3,8 @@
# Define the batch size
BATCH_SIZE=80 # This worked well on my CPU, but 200 was too much
DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"
TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaa" # your Hugging Face API token
USERNAME="transcendingvictor" # your Hugging Face username
TOKEN="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" # your Hugging Face API token


# List of models
@@ -21,7 +22,7 @@ declare -a MODEL_NAMES=("delphi-suite/delphi-llama2-100k"
for MODEL_NAME in "${MODEL_NAMES[@]}"
do
echo "Processing $MODEL_NAME"
python scripts/inference_delete.py "$MODEL_NAME" --batch_size "$BATCH_SIZE" --token "$TOKEN"
python scripts/inference.py "$MODEL_NAME" --batch-size "$BATCH_SIZE" --dataset-name "$DATASET_NAME" --username "$USERNAME" --token "$TOKEN"
done

echo "All models processed."
58 changes: 46 additions & 12 deletions scripts/inference.py
@@ -13,39 +13,57 @@
torch.set_grad_enabled(False)


def main(model_name: str, batch_size: Int, dataset_name: str, token: str):
def main(
model_name: str,
batch_size: Int,
dataset_name: str,
username: str,
token: str,
funct_test: bool = False,
):
"""
Outputs the log probabilities of the next token for each token in the validation dataset
and uploads the resulting dataset to the Hugging Face Hub.
Args:
- model_name: The name of the model to use for inference
- batch_size: The batch size for processing. 80 worked well on CPU.
- dataset_name: The name of the dataset from which validation set will be loaded
- username: Hugging Face API username
- token: Hugging Face API token
"""
val_ds = load_validation_dataset(dataset_name)

# model accepts 2D tensors (batch_size, seq_len)
val_sequences = torch.tensor([s["tokens"] for s in val_ds])
model = AutoModelForCausalLM.from_pretrained(model_name)

accumulated_logprobs = torch.tensor([], dtype=torch.float32)
if funct_test:
val_sequences = val_sequences[:320]

model = AutoModelForCausalLM.from_pretrained(model_name)

logprobs_list = []
for i in tqdm(range(0, len(val_sequences), batch_size)):
batch_sequences = val_sequences[i : i + batch_size]
_, next_logprobs = get_all_and_next_logprobs(model, val_sequences)
accumulated_logprobs = torch.cat((accumulated_logprobs, next_logprobs), dim=0)
_, next_logprobs = get_all_and_next_logprobs(model, batch_sequences)
logprobs_list.append(next_logprobs)
accumulated_logprobs = torch.cat(logprobs_list, dim=0)

nan_tensor = torch.full((accumulated_logprobs.size(0), 1), float("nan"))
extended_next_logprobs = torch.cat([nan_tensor, next_logprobs], dim=1) # 513 tokens
extended_next_logprobs = torch.cat(
[nan_tensor, accumulated_logprobs], dim=1
) # 513 tokens

df_dataset = pd.DataFrame({"logprobs": extended_next_logprobs.tolist()})
hf_dataset = Dataset.from_pandas(df_dataset)

# change the repo_id to your hf username
# change the token in generate_logprobs.sh
# change the repo_id to your hf username in generate_logprobs.sh
# change the token to your hf token in generate_logprobs.sh

repo_id = f"{username}/{model_name.rsplit('/', 1)[-1]}-validation-logprobs"
if funct_test:
repo_id += "-funct-test"
hf_dataset.push_to_hub(
repo_id=f"transcendingvictor/{model_name.rsplit('/', 1)[-1]}-validation-logprobs",
repo_id=repo_id,
split="validation",
private=False,
token=token,
@@ -60,24 +78,40 @@ def main(model_name: str, batch_size: Int, dataset_name: str, token: str):
"model_name", type=str, help="Model name with or without delphi-suite/ prefix"
)
parser.add_argument(
"--batch_size",
"--batch-size",
type=int,
default=80,
help="Batch size for processing (default: 80)",
)
parser.add_argument(
"--dataset_name",
"--dataset-name",
type=str,
help="Dataset name with or without delphi-suite/ prefix",
)
parser.add_argument(
"--username",
type=str,
help="Hugging Face API username",
)
parser.add_argument(
"--token",
type=str,
help="Hugging Face API token",
)
parser.add_argument(
"--test-funct", action="store_true", help="Enable test function mode"
)

args = parser.parse_args()

if "/" not in args.model_name:
args.model_name = "delphi-suite/" + args.model_name

main(args.model_name, args.batch_size, args.dataset_name, args.token)
main(
args.model_name,
args.batch_size,
args.dataset_name,
args.username,
args.token,
args.test_funct,
)
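
The batching loop relies on get_all_and_next_logprobs returning, for each position, the log probability the model assigned to the token that actually followed; that output has seq_len - 1 columns, which is why a NaN column is prepended to realign it with the original 513-token sequences. A minimal sketch of that computation under those assumptions (an illustration, not the repo's actual helper):

import torch

def next_logprobs_sketch(model, sequences: torch.Tensor) -> torch.Tensor:
    # sequences: (batch, seq_len) token ids
    logits = model(sequences).logits              # (batch, seq_len, vocab)
    logprobs = torch.log_softmax(logits, dim=-1)
    # Position i predicts token i+1: drop the last prediction and gather
    # the logprob of each realized next token.
    next_ids = sequences[:, 1:].unsqueeze(-1)     # (batch, seq_len-1, 1)
    return torch.gather(logprobs[:, :-1], 2, next_ids).squeeze(-1)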
22 changes: 22 additions & 0 deletions tests/scripts/functional_test_generate_logprobs.sh
@@ -0,0 +1,22 @@
#!/bin/bash
#test to check whether inference.py uploads log probabilities to Hugging Face.
#similar to generate_logprobs.sh, but much smaller.

BATCH_SIZE=80
DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"
USERNAME="transcendingvictor" # Your Hugging Face username
TOKEN="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" # Your Hugging Face API token

# List of models
declare -a MODEL_NAMES=("delphi-suite/delphi-llama2-100k"
"delphi-suite/delphi-llama2-200k"
)

# Loop through each model and generate log probabilities
for MODEL_NAME in "${MODEL_NAMES[@]}"
do
echo "Processing $MODEL_NAME"
python scripts/inference.py "$MODEL_NAME" --batch-size "$BATCH_SIZE" --dataset-name "$DATASET_NAME" --username "$USERNAME" --token "$TOKEN" --test-funct
done

echo "All models processed."
