Commit
comments resolved and functional test
transcendingvictor committed Feb 17, 2024
1 parent 67bf0f2 commit a3fff6f
Showing 3 changed files with 71 additions and 14 deletions.
5 changes: 3 additions & 2 deletions scripts/generate_logprobs.sh
@@ -3,7 +3,8 @@
# Define the batch size
BATCH_SIZE=80 # This worked well on my CPU, but 200 was too much
DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"
TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaa" # your Hugging Face API token
USERNAME="transcendingvictor" # your Hugging Face username
TOKEN="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" # your Hugging Face API token


# List of models
@@ -21,7 +22,7 @@ declare -a MODEL_NAMES=("delphi-suite/delphi-llama2-100k"
for MODEL_NAME in "${MODEL_NAMES[@]}"
do
echo "Processing $MODEL_NAME"
python scripts/inference_delete.py "$MODEL_NAME" --batch_size "$BATCH_SIZE" --token "$TOKEN"
python scripts/inference.py "$MODEL_NAME" --batch-size "$BATCH_SIZE" --dataset-name "$DATASET_NAME" --username "$USERNAME" --token "$TOKEN"
done

echo "All models processed."
58 changes: 46 additions & 12 deletions scripts/inference.py
@@ -13,39 +13,57 @@
torch.set_grad_enabled(False)


def main(model_name: str, batch_size: Int, dataset_name: str, token: str):
def main(
model_name: str,
batch_size: Int,
dataset_name: str,
username: str,
token: str,
funct_test: bool = False,
):
"""
Outputs the log probabilities of the next token for each token in the validation dataset
and uploads the resulting dataset to the Hugging Face Hub.
Args:
- model_name: The name of the model to use for inference
- batch_size: The batch size for processing. 80 worked well on CPU.
- dataset_name: The name of the dataset from which validation set will be loaded
- username: Hugging Face API username
- token: Hugging Face API token
"""
val_ds = load_validation_dataset(dataset_name)

# model accepts 2D tensors (batch_size, seq_len)
val_sequences = torch.tensor([s["tokens"] for s in val_ds])
model = AutoModelForCausalLM.from_pretrained(model_name)

accumulated_logprobs = torch.tensor([], dtype=torch.float32)
if funct_test:
val_sequences = val_sequences[:320]

model = AutoModelForCausalLM.from_pretrained(model_name)

logprobs_list = []
for i in tqdm(range(0, len(val_sequences), batch_size)):
batch_sequences = val_sequences[i : i + batch_size]
_, next_logprobs = get_all_and_next_logprobs(model, val_sequences)
accumulated_logprobs = torch.cat((accumulated_logprobs, next_logprobs), dim=0)
_, next_logprobs = get_all_and_next_logprobs(model, batch_sequences)
logprobs_list.append(next_logprobs)
accumulated_logprobs = torch.cat(logprobs_list, dim=0)

nan_tensor = torch.full((accumulated_logprobs.size(0), 1), float("nan"))
extended_next_logprobs = torch.cat([nan_tensor, next_logprobs], dim=1) # 513 tokens
extended_next_logprobs = torch.cat(
[nan_tensor, accumulated_logprobs], dim=1
) # 513 tokens

df_dataset = pd.DataFrame({"logprobs": extended_next_logprobs.tolist()})
hf_dataset = Dataset.from_pandas(df_dataset)

# change the repo_id to your hf username
# change the token in generate_logprobs.sh
# change the repo_id to your hf username in generate_logprobs.sh
# change the token to your hf token in generate_logprobs.sh

repo_id = f"{username}/{model_name.rsplit('/', 1)[-1]}-validation-logprobs"
if funct_test:
repo_id += "-funct-test"
hf_dataset.push_to_hub(
repo_id=f"transcendingvictor/{model_name.rsplit('/', 1)[-1]}-validation-logprobs",
repo_id=repo_id,
split="validation",
private=False,
token=token,
@@ -60,24 +78,40 @@ def main(model_name: str, batch_size: Int, dataset_name: str, token: str):
"model_name", type=str, help="Model name with or without delphi-suite/ prefix"
)
parser.add_argument(
"--batch_size",
"--batch-size",
type=int,
default=80,
help="Batch size for processing (default: 80)",
)
parser.add_argument(
"--dataset_name",
"--dataset-name",
type=str,
help="Dataset name with or without delphi-suite/ prefix",
)
parser.add_argument(
"--username",
type=str,
help="Hugging Face API username",
)
parser.add_argument(
"--token",
type=str,
help="Hugging Face API token",
)
parser.add_argument(
"--test-funct", action="store_true", help="Enable test function mode"
)

args = parser.parse_args()

if "/" not in args.model_name:
args.model_name = "delphi-suite/" + args.model_name

main(args.model_name, args.batch_size, args.dataset_name, args.token)
main(
args.model_name,
args.batch_size,
args.dataset_name,
args.username,
args.token,
args.test_funct,
)
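
The batching loop relies on get_all_and_next_logprobs returning, for each position, the log probability the model assigned to the token that actually followed; that output has seq_len - 1 columns, which is why a NaN column is prepended to realign it with the original 513-token sequences. A minimal sketch of that computation under those assumptions (an illustration, not the repo's actual helper):

import torch

def next_logprobs_sketch(model, sequences: torch.Tensor) -> torch.Tensor:
    # sequences: (batch, seq_len) token ids
    logits = model(sequences).logits              # (batch, seq_len, vocab)
    logprobs = torch.log_softmax(logits, dim=-1)
    # Position i predicts token i+1: drop the last prediction and gather
    # the logprob of each realized next token.
    next_ids = sequences[:, 1:].unsqueeze(-1)     # (batch, seq_len-1, 1)
    return torch.gather(logprobs[:, :-1], 2, next_ids).squeeze(-1)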
22 changes: 22 additions & 0 deletions tests/scripts/functional_test_generate_logprobs.sh
@@ -0,0 +1,22 @@
#!/bin/bash
#test to check whether inference.py uploads log probabilities to Hugging Face.
#similar to generate_logprobs.sh, but much smaller.

BATCH_SIZE=80
DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"
USERNAME="transcendingvictor" # Your Hugging Face username
TOKEN="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" # Your Hugging Face API token

# List of models
declare -a MODEL_NAMES=("delphi-suite/delphi-llama2-100k"
"delphi-suite/delphi-llama2-200k"
)

# Loop through each model and generate log probabilities
for MODEL_NAME in "${MODEL_NAMES[@]}"
do
echo "Processing $MODEL_NAME"
python scripts/inference.py "$MODEL_NAME" --batch-size "$BATCH_SIZE" --dataset-name "$DATASET_NAME" --username "$USERNAME" --token "$TOKEN" --test-funct
done

echo "All models processed."
