forked from slee-lab/llama-recipes
-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathsamsum_dataset.py
32 lines (25 loc) · 1.04 KB
/
samsum_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
# For dataset details visit: https://huggingface.co/datasets/samsum
import datasets
from .utils import Concatenator
def get_preprocessed_samsum(dataset_config, tokenizer, split):
    """Load a SAMSum split and return it as tokenized, concatenated samples.

    Every record is rendered into a single ``"text"`` field with a
    summarization prompt, tokenized in batched mode, and finally passed
    through ``Concatenator`` (also in batched mode).

    Args:
        dataset_config: Not referenced inside this function; kept so the
            signature stays the same for callers.
        tokenizer: Called on the ``"text"`` column of each batch; its
            ``eos_token`` attribute is appended to every rendered prompt.
        split: Split name forwarded to ``datasets.load_dataset``.

    Returns:
        The dataset produced by the final ``map(Concatenator(), batched=True)``.
    """
    # The doubled braces survive the f-string as single-brace placeholders,
    # which are filled in later by ``str.format``.
    template = (
        f"Summarize this dialog:\n{{dialog}}\n---\nSummary:\n{{summary}}{{eos_token}}"
    )

    def render_prompt(record):
        """Build the prompt text for one dialogue/summary record."""
        text = template.format(
            dialog=record["dialogue"],
            summary=record["summary"],
            eos_token=tokenizer.eos_token,
        )
        return {"text": text}

    def tokenize_batch(batch):
        """Run the tokenizer over the ``"text"`` column of a batch."""
        return tokenizer(batch["text"])

    raw = datasets.load_dataset("samsum", split=split)

    # Drop every original column so only the rendered "text" remains.
    prompts = raw.map(render_prompt, remove_columns=list(raw.features))

    # Drop the columns of the templated dataset so only tokenizer output remains.
    tokenized = prompts.map(
        tokenize_batch,
        batched=True,
        remove_columns=list(prompts.features),
    )

    return tokenized.map(Concatenator(), batched=True)