From 25b22283e5bd9a7ae3480ca3100e451dd184ca7c Mon Sep 17 00:00:00 2001
From: VICTOR ABIA
Date: Tue, 13 Feb 2024 22:24:42 +0100
Subject: [PATCH] Remove duplicate eval inference scripts and debug truncation

---
 .gitignore                                 |  2 +-
 scripts/inference.py                       |  1 -
 src/delphi/eval/generate_logprobs.sh       | 27 -------
 src/delphi/eval/inference.py               | 93 ----------------------
 src/delphi/eval/inference_on_validation.py | 63 ---------------
 5 files changed, 1 insertion(+), 185 deletions(-)
 delete mode 100644 src/delphi/eval/generate_logprobs.sh
 delete mode 100644 src/delphi/eval/inference.py
 delete mode 100644 src/delphi/eval/inference_on_validation.py

diff --git a/.gitignore b/.gitignore
index db94f212..60d4632b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,4 +159,4 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-Correct_logprobs1/
\ No newline at end of file
+Correct_logprobs/
\ No newline at end of file
diff --git a/scripts/inference.py b/scripts/inference.py
index 2038afae..fe4a42de 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -43,7 +43,6 @@ def main(model_name, dataset_split, batch_size):
 
     # model accepts 2D tensors (batch_size, seq_len)
     val_sequences = torch.tensor([s["tokens"] for s in val_ds])
-    val_sequences = val_sequences[:220]
 
     output_folder = "Correct_logprobs"
     os.makedirs(output_folder, exist_ok=True)
diff --git a/src/delphi/eval/generate_logprobs.sh b/src/delphi/eval/generate_logprobs.sh
deleted file mode 100644
index 78087e49..00000000
--- a/src/delphi/eval/generate_logprobs.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# Define the dataset split
-DATASET_SPLIT="validation" # Change this to your desired dataset split
-
-# Define the batch size
-BATCH_SIZE=80 # Change this if you want to use a different batch size
-
-# List of models
-declare -a MODEL_NAMES=("delphi-suite/delphi-llama2-100k"
-                        "delphi-suite/delphi-llama2-200k"
-                        "delphi-suite/delphi-llama2-400k"
-                        "delphi-suite/delphi-llama2-800k"
-                        "delphi-suite/delphi-llama2-1.6m"
-                        "delphi-suite/delphi-llama2-3.2m"
-                        "delphi-suite/delphi-llama2-6.4m"
-                        "delphi-suite/delphi-llama2-12.8m"
-                        "delphi-suite/delphi-llama2-25.6m")
-
-# Loop through each model and generate log probabilities
-for MODEL_NAME in "${MODEL_NAMES[@]}"
-do
-    echo "Processing $MODEL_NAME"
-    python inference.py "$MODEL_NAME" "$DATASET_SPLIT" --batch_size "$BATCH_SIZE"
-done
-
-echo "All models processed."
diff --git a/src/delphi/eval/inference.py b/src/delphi/eval/inference.py
deleted file mode 100644
index 2038afae..00000000
--- a/src/delphi/eval/inference.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import argparse
-import os
-
-import pandas as pd
-import torch
-from datasets import load_dataset
-from tqdm.auto import tqdm
-from transformers import AutoModelForCausalLM
-
-torch.set_grad_enabled(False)
-
-
-def get_correct_logprobs(model, samples_tok):
-    # logits: seq, pos, d_vocab
-    logits = model(samples_tok).logits
-    # logprobs: [batch_size, seq_length, vocab_size]
-    logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
-
-    # make probs a list of lists of correct token LOG probabilities.
-    list_logprob = []
-    for i, sample in enumerate(samples_tok):
-        valid_length = len(sample) - 1  # Last token doesn't have a next token
-        sample_logprobs = logprobs[i, :valid_length, :]  # [valid_length, vocab_size]
-
-        # Extract the probabilities of the actual next tokens
-        next_tokens = sample[
-            1 : valid_length + 1
-        ]  # Tokens that follow each token in the sequence
-        correct_logprobs = sample_logprobs[torch.arange(valid_length), next_tokens]
-
-        list_logprob.append(correct_logprobs)
-    return list_logprob
-
-    # outputs a list of lists of correct token LOG probabilities.
-    # correct_logprobs = get_correct_logprobs(model, val_sequences[:10])
-
-
-def main(model_name, dataset_split, batch_size):
-    val_ds = load_dataset(
-        "delphi-suite/tinystories-v2-clean-tokenized", split=dataset_split
-    )
-    # val_ds[0]["tokens"] # access first sample
-
-    # model accepts 2D tensors (batch_size, seq_len)
-    val_sequences = torch.tensor([s["tokens"] for s in val_ds])
-    val_sequences = val_sequences[:220]
-
-    output_folder = "Correct_logprobs"
-    os.makedirs(output_folder, exist_ok=True)
-
-    # Initialize an empty DataFrame to accumulate log probabilities
-    accumulated_df = pd.DataFrame()
-
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-
-    # Loop over the validation dataset in batches
-    for i in tqdm(range(0, len(val_sequences), batch_size)):
-        batch_sequences = val_sequences[i : i + batch_size]
-        batch_logprobs = get_correct_logprobs(model, batch_sequences)
-        # Convert batch log probabilities to a DataFrame
-        batch_df = pd.DataFrame([logprob.tolist() for logprob in batch_logprobs])
-        # Append the batch DataFrame to the accumulated DataFrame
-        accumulated_df = pd.concat([accumulated_df, batch_df], ignore_index=True)
-
-    # Save the accumulated DataFrame to a Parquet file
-    output_file = os.path.join(output_folder, f'{model_name.replace("/", "-")}.parquet')
-    accumulated_df.to_parquet(output_file)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Run inference and generate log probabilities."
-    )
-    parser.add_argument(
-        "model_name", type=str, help="Model name with or without delphi-suite/ prefix"
-    )
-    parser.add_argument(
-        "dataset_split", type=str, help="Dataset split (e.g., train, validation, test)"
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=80,
-        help="Batch size for processing (default: 80)",
-    )
-
-    args = parser.parse_args()
-
-    # Default prefix handling
-    if "/" not in args.model_name:
-        args.model_name = "delphi-suite/" + args.model_name
-
-    main(args.model_name, args.dataset_split, args.batch_size)
diff --git a/src/delphi/eval/inference_on_validation.py b/src/delphi/eval/inference_on_validation.py
deleted file mode 100644
index c38e94ca..00000000
--- a/src/delphi/eval/inference_on_validation.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# %% load validation dataset
-from datasets import load_dataset
-from tqdm.auto import tqdm
-
-val_ds = load_dataset("delphi-suite/tinystories-v2-clean-tokenized", split="validation")
-
-# %% Models are stored in "model" variable
-import torch
-
-torch.set_grad_enabled(False)
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_name = "delphi-suite/delphi-llama2-100k"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name)
-
-
-# %% get their logits in parallel
-def get_logits(model, samples_tok: list[str]) -> torch.Tensor:
-    # Check that samples_tok is a list of sequences
-    assert all(
-        isinstance(seq, (list, torch.Tensor)) for seq in samples_tok
-    ), "samples_tok must be a list of sequences"
-    # Check that each tensor in samples_tok is 1D, and print debug info if not
-    for seq in samples_tok:
-        if isinstance(seq, torch.Tensor) and seq.ndim != 1:
-            print(f"Found a non-1D tensor: {seq}")
-            print(f"Shape: {seq.shape}")
-            raise AssertionError("All tensors must be 1D")
-
-    padded_matrix_samples = pad_sequences(samples_tok)
-    logits = model(padded_matrix_samples).logits
-    return logits  # (num_seqs, max_seq_len, vocab_size)
-
-
-logits = get_logits(model, val_ds[:10])
-print(logits.shape)
-
-
-# %%
-
-
-def get_correct_logprobs(model, samples_tok):
-    # logits: seq, pos, d_vocab
-    logits = get_logits(model, samples_tok)
-    # probs: seq, pos, d_vocab
-    probs = torch.softmax(logits, dim=-1)
-    logprobs = torch.log(probs)
-
-    # make probs a list of lists of correct token LOG probabilities.
-    list_logprob = []
-    for i, sample in enumerate(samples_tok):
-        valid_length = len(sample) - 1  # Last token doesn't have a next token
-        sample_logprobs = logprobs[i, :valid_length, :]  # [valid_length, vocab_size]
-
-        # Extract the probabilities of the actual next tokens
-        next_tokens = sample[
-            1 : valid_length + 1
-        ]  # Tokens that follow each token in the sequence
-        correct_logprobs = sample_logprobs[torch.arange(valid_length), next_tokens]
-
-        list_logprob.append(correct_logprobs)
-    return list_logprob
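
Note (not part of the patch): both deleted copies of get_correct_logprobs extract the log probability of each actual next token with a Python loop over samples. On the rectangular [batch_size, seq_len] batches that the retained scripts/inference.py feeds it, the same extraction can be done with a single gather. A minimal sketch under that assumption; the function name and the toy shapes below are hypothetical, not code from the repo:

    import torch

    def correct_next_token_logprobs(logits: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor:
        # logits: [batch, seq_len, vocab]; tokens: [batch, seq_len]
        logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
        # Position p predicts token p+1: drop the last position and the first token.
        next_tokens = tokens[:, 1:].unsqueeze(-1)  # [batch, seq_len - 1, 1]
        return logprobs[:, :-1, :].gather(-1, next_tokens).squeeze(-1)

    # Toy check with random logits; no model download needed.
    logits = torch.randn(2, 5, 11)
    tokens = torch.randint(0, 11, (2, 5))
    print(correct_next_token_logprobs(logits, tokens).shape)  # torch.Size([2, 4])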
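The deleted inference_on_validation.py also calls a pad_sequences helper that is defined nowhere in this patch, one reason the prototype was worth removing. A plausible stand-in, assuming right-padding with a pad id of 0 (an assumption, not code from the repo):

    import torch

    def pad_sequences(samples_tok, pad_value: int = 0) -> torch.Tensor:
        # Right-pad variable-length 1D token sequences into a single
        # [num_seqs, max_seq_len] matrix; pad id 0 is assumed here.
        tensors = [torch.as_tensor(s) for s in samples_tok]
        return torch.nn.utils.rnn.pad_sequence(
            tensors, batch_first=True, padding_value=pad_value
        )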