-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalpaca_processing.py
26 lines (21 loc) · 1.04 KB
/
alpaca_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import json
import datasets
import transformers
import json
import tqdm
CHAT_SYSTEM_PROMPT ="""<|SYSTEM|>You are to follow instructions faithfully from the user, which will begin in the next message."""
if __name__ == '__main__':
data = datasets.load_dataset("c-s-ale/alpaca-gpt4-data")
tokenizer = transformers.AutoTokenizer.from_pretrained(
"stablelm_tokenizer"
)
output_list = list()
for item in tqdm.tqdm(data['train']):
system_info = "<|SYSTEM|>You are to follow instructions faithfully from the user, which will begin in the next message.<|USER|>" + item['instruction']
user_string = "" if item['input'].strip() == "" else "\n" + item['input']
assist_string = "<|ASSISTANT|>" + item['output']
output_string = system_info + user_string + assist_string
if len(tokenizer(output_string)['input_ids']) < 4096:
output_list.append(output_string)
with open("alpaca-gpt4-data.json", 'w', encoding='latin1') as f:
json.dump(output_list, f, indent=2)