Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inference no copy #43

Merged
merged 6 commits into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/generate_logprobs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
BATCH_SIZE=80 # This worked well on my CPU, but 200 was too much
DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"
USERNAME="transcendingvictor" # your Hugging Face username
TOKEN="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" # your Hugging Face API token
TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaaa" # your Hugging Face API token


# List of models
Expand Down
18 changes: 7 additions & 11 deletions scripts/inference.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
Expand Down Expand Up @@ -35,27 +36,22 @@ def main(

model = AutoModelForCausalLM.from_pretrained(model_name)

logprobs_list = []
# logprobs_list = []
transcendingvictor marked this conversation as resolved.
Show resolved Hide resolved
total_sequences = (
len(val_ds) if not funct_test else 320
) # Use only 320 sequences if funct_test is True

logprobs = np.empty((total_sequences, 513))
logprobs[:, 0] = float("nan")
for i in tqdm(range(0, total_sequences, batch_size)):
batch_end = min(i + batch_size, total_sequences)
batch_sequences = [val_ds[j]["tokens"] for j in range(i, batch_end)]
batch_sequences_tensor = torch.tensor(batch_sequences)

_, next_logprobs = get_all_and_next_logprobs(model, batch_sequences_tensor)
logprobs_list.append(next_logprobs)
logprobs_tensor = get_all_and_next_logprobs(model, batch_sequences_tensor)[1]
logprobs[i:batch_end, 1:] = logprobs_tensor.cpu().numpy()

accumulated_logprobs = torch.cat(logprobs_list, dim=0)

nan_tensor = torch.full((accumulated_logprobs.size(0), 1), float("nan"))
extended_next_logprobs = torch.cat(
[nan_tensor, accumulated_logprobs], dim=1
) # 513 tokens

df_dataset = pd.DataFrame({"logprobs": extended_next_logprobs.tolist()})
df_dataset = pd.DataFrame({"logprobs": [row for row in logprobs]})
hf_dataset = Dataset.from_pandas(df_dataset)

# Change the repo_id to your Hugging Face username in generate_logprobs.sh
Expand Down
2 changes: 1 addition & 1 deletion tests/scripts/functional_test_generate_logprobs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
BATCH_SIZE=80
DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"
USERNAME="transcendingvictor" # Your Hugging Face username
TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaaaaaa" # Your Hugging Face API token
TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaa" # Your Hugging Face API token

# List of models
declare -a MODEL_NAMES=("delphi-suite/delphi-llama2-100k"
Expand Down
Loading