map_tokens from risky pickle to safe hf
transcendingvictor committed Feb 21, 2024
1 parent 75e68aa commit a5b5e63
Showing 1 changed file with 37 additions and 6 deletions.
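
For context on the "risky pickle" in the title: unpickling a file executes whatever code its producer embedded in it, so distributing the token map as a .pkl asks every consumer to trust the bytes blindly. A minimal sketch of the hazard this commit removes (the Payload class is illustrative, not from this repo):

import pickle

class Payload:
    # __reduce__ tells pickle to call print(...) when the blob is loaded,
    # i.e. loading a pickle can run whatever callable the producer chose
    def __reduce__(self):
        return (print, ("arbitrary code ran during unpickling",))

blob = pickle.dumps(Payload())
pickle.loads(blob)  # prints the message: loading is code execution

Datasets pushed to the Hugging Face Hub are stored as plain data (Parquet), which is presumably the "safe hf" side of the title.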
43 changes: 37 additions & 6 deletions scripts/map_tokens.py
@@ -1,22 +1,53 @@
 #!/usr/bin/env python3
 
 import argparse
-import pickle
+
+import pandas as pd
+from datasets import Dataset
 
 from delphi.eval.token_map import token_map
 from delphi.eval.utils import load_validation_dataset
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="")
+
+    parser.add_argument(
+        "dataset_name",
+        type=str,
+        help="Dataset from huggingface to run token_map on",
+    )
+    parser.add_argument(
+        "--username",
+        type=str,
+        help="Hugging Face API username",
+    )
     parser.add_argument(
-        "dataset_name", help="Dataset from huggingface to run token_map on"
+        "--token",
+        type=str,
+        help="Hugging Face API token",
     )
-    parser.add_argument("--output", help="Output file name", default="token_map.pkl")
     args = parser.parse_args()
 
     dataset = load_validation_dataset(args.dataset_name)
 
-    mapping = token_map(dataset)
+    mapping = token_map(
+        dataset
+    )  # outputs the dictionary: dict[int, list[tuple[int, int]]]
 
-    with open(f"data/{args.output}", "wb") as f:
-        pickle.dump(mapping, file=f)
+    df_dataset = pd.DataFrame.from_dict(
+        {"token_id": list(mapping.keys()), "token_positions": list(mapping.values())}
+    )
+    df_dataset.set_index("token_id", inplace=True)  # set token_id as the index
+
+    hf_dataset = Dataset.from_pandas(df_dataset)
+
+    dataset_name = args.dataset_name.split("/")[-1]
+
+    repo_id = f"{args.username}/{dataset_name}-token-map"  # location on hf
+
+    hf_dataset.push_to_hub(
+        repo_id=repo_id,
+        split="validation",
+        private=False,
+        token=args.token,
+    )
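
After this change the map lives on the Hub rather than in data/token_map.pkl, so consumers load it with the datasets library instead of unpickling a file. A minimal sketch, assuming the script was invoked as in the comment below; the dataset name and username are placeholders, and the repo id follows the f-string in the diff:

# Hypothetical invocation:
#   python scripts/map_tokens.py some-org/some-dataset --username your-hf-name --token hf_xxx
from datasets import load_dataset

# repo id pattern from the diff: {username}/{dataset_name}-token-map
token_map_ds = load_dataset("your-hf-name/some-dataset-token-map", split="validation")
print(token_map_ds[0])  # e.g. {"token_id": 0, "token_positions": [[doc_idx, pos_idx], ...]}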
