-
Notifications
You must be signed in to change notification settings - Fork 0
/
inference.py
150 lines (125 loc) · 5.74 KB
/
inference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from typing import Optional
import torch
import time
from pathlib import Path
import json
from sentencepiece import SentencePieceProcessor
from tqdm import tqdm
from model import ModelArgs, Transformer
class LLaMA:
    """Bundle a ``Transformer`` with its SentencePiece tokenizer for text completion.

    The constructor stores its three arguments unchanged as ``model``,
    ``tokenizer`` and ``args``; use :meth:`build` to create all three from a
    checkpoint directory.
    """

    def __init__(self, model: "Transformer", tokenizer: "SentencePieceProcessor", model_args: "ModelArgs"):
        self.model = model
        self.tokenizer = tokenizer
        self.args = model_args

    @staticmethod
    def build(checkpoints_dir: str, tokenizer_path: str, load_model: bool,
              max_seq_len: int, max_batch_size: int, device: str) -> "LLaMA":
        """Create a ready-to-use ``LLaMA`` from files on disk.

        Args:
            checkpoints_dir: Directory holding ``params.json`` and, when
                ``load_model`` is true, at least one ``*.pth`` checkpoint.
            tokenizer_path: Path of the SentencePiece model file.
            load_model: When true, load the first ``*.pth`` file (in sorted
                order) into the model; otherwise the model keeps its freshly
                initialised weights.
            max_seq_len: Forwarded to ``ModelArgs``.
            max_batch_size: Forwarded to ``ModelArgs``.
            device: Forwarded to ``ModelArgs`` and used for ``model.to(device)``.

        Returns:
            A ``LLaMA`` wrapping the model, the tokenizer and the model args.

        Raises:
            AssertionError: If ``load_model`` is true and no ``*.pth`` file exists.
        """
        prev_time = time.time()
        checkpoint = None
        if load_model:
            checkpoints = sorted(Path(checkpoints_dir).glob('*.pth'))
            assert len(checkpoints) > 0, "No checkpoints files found"
            chk_path = checkpoints[0]
            print(f"Loading ckp {chk_path}")
            # NOTE(review): torch.load unpickles the file; only point this at
            # checkpoints from a trusted source.
            checkpoint = torch.load(chk_path, map_location='cpu')
            print(f"Loaded ckp in {(time.time()-prev_time):.2f}s")
            prev_time = time.time()

        with open(Path(checkpoints_dir) / "params.json", "r", encoding="utf-8") as f:
            params = json.load(f)
        model_args: "ModelArgs" = ModelArgs(
            max_seq_len=max_seq_len,
            max_batch_size=max_batch_size,
            device=device,
            **params,
        )

        tokenizer = SentencePieceProcessor()
        tokenizer.load(tokenizer_path)
        # The vocabulary size comes from the tokenizer, not from params.json.
        model_args.vocab_size = tokenizer.vocab_size()

        # NOTE(review): this changes a process-wide default and the API is
        # deprecated in recent PyTorch releases - consider migrating.
        if device == 'cuda':
            torch.set_default_tensor_type(torch.cuda.HalfTensor)
        else:
            torch.set_default_tensor_type(torch.BFloat16Tensor)

        model = Transformer(model_args).to(device)

        if load_model:
            # Dropped before the strict load so the key is not reported as
            # unexpected; tolerate checkpoints that do not contain it.
            # Presumably the model computes its rotary frequencies itself -
            # confirm against model.py.
            checkpoint.pop("rope.freqs", None)
            model.load_state_dict(checkpoint, strict=True)
            print(f"Loaded state dict in {time.time() -prev_time:.2f}s")

        return LLaMA(model, tokenizer, model_args)

    def text_completion(self, prompts: list[str], temperature: float = 0.6, top_p: float = 0.9,
                        max_gen_len: Optional[int] = None):
        """Generate a continuation for every prompt in ``prompts``.

        Args:
            prompts: Texts to complete, processed together as one batch.
            temperature: Softmax temperature. Values ``<= 0`` switch to greedy
                (argmax) decoding.
            top_p: Cumulative-probability threshold used by top-p sampling.
            max_gen_len: Maximum number of new tokens; defaults to
                ``self.args.max_seq_len - 1``.

        Returns:
            ``(out_tokens, out_text)``: per prompt, the token ids (prompt
            included) cut at the first EOS token, and their decoded text.
            Both lists are empty when ``prompts`` is empty.

        Raises:
            AssertionError: If the batch is larger than
                ``self.args.max_batch_size`` or a prompt has more tokens than
                ``self.args.max_seq_len``.
        """
        if not prompts:
            return [], []
        if max_gen_len is None:
            max_gen_len = self.args.max_seq_len - 1

        # Use the device the model was built for rather than a module-level
        # global that only exists when this file runs as a script.
        device = self.args.device
        tokenizer = self.tokenizer
        eos_id = tokenizer.eos_id()
        pad_id = tokenizer.pad_id()

        prompt_tokens = [
            tokenizer.encode(prompt, out_type=int, add_bos=True, add_eos=False)
            for prompt in prompts
        ]
        batch_size = len(prompt_tokens)
        assert batch_size <= self.args.max_batch_size
        max_prompt_len = max(len(prompt) for prompt in prompt_tokens)
        assert max_prompt_len <= self.args.max_seq_len
        total_len = min(self.args.max_seq_len, max_gen_len + max_prompt_len)

        # Buffer holding the prompt followed by the generated tokens; slots
        # that are not filled yet contain pad_id.
        tokens = torch.full((batch_size, total_len), pad_id, dtype=torch.long, device=device)
        for k, t in enumerate(prompt_tokens):
            tokens[k, :len(t)] = torch.tensor(t, dtype=torch.long, device=device)

        eos_reached = torch.zeros(batch_size, dtype=torch.bool, device=device)
        # True wherever the buffer holds a prompt token (captured before generation).
        prompt_tokens_mask = tokens != pad_id

        for curr_pos in tqdm(range(1, total_len), desc='Generating Tokens'):
            # Only the previous token is fed to the model at each step.
            with torch.no_grad():
                logits = self.model.forward(tokens[:, curr_pos - 1:curr_pos], curr_pos)
            if temperature > 0:
                probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
                next_token = self._sample_top_p(probs, top_p)
            else:
                # Greedy decoding.
                next_token = torch.argmax(logits[:, -1], dim=-1)
            next_token = next_token.reshape(-1)

            # Keep the prompt token where the prompt is still running; use the
            # predicted token only in slots that were padding.
            is_prompt = prompt_tokens_mask[:, curr_pos]
            next_token = torch.where(is_prompt, tokens[:, curr_pos], next_token)
            tokens[:, curr_pos] = next_token

            # An EOS only counts when it was generated, not when it is part of a prompt.
            eos_reached |= (~is_prompt) & (next_token == eos_id)
            if eos_reached.all():
                break

        out_tokens = []
        out_text = []
        for curr_prompt_tokens in tokens.tolist():
            # Cut at the EOS token, if there is one.
            if eos_id in curr_prompt_tokens:
                eos_idx = curr_prompt_tokens.index(eos_id)
                curr_prompt_tokens = curr_prompt_tokens[:eos_idx]
            out_tokens.append(curr_prompt_tokens)
            out_text.append(tokenizer.decode(curr_prompt_tokens))
        return (out_tokens, out_text)

    def _sample_top_p(self, probs, p):
        """Sample one index per row from ``probs`` using top-p filtering.

        Probabilities are sorted in descending order along the last dimension;
        an entry is zeroed when the cumulative mass of the entries before it
        already exceeds ``p``. The survivors are renormalised and one entry
        is drawn with ``torch.multinomial``.

        Returns:
            Tensor of sampled indices into the last dimension of ``probs``,
            with a trailing dimension of size 1.
        """
        probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
        probs_sum = torch.cumsum(probs_sort, dim=-1)
        # Subtracting probs_sort gives the mass *before* each entry, so the
        # most likely entry always survives.
        mask = probs_sum - probs_sort > p
        probs_sort[mask] = 0.0
        probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
        next_token = torch.multinomial(probs_sort, num_samples=1)
        # Map positions in the sorted order back to the original indices.
        next_token = torch.gather(probs_idx, -1, next_token)
        return next_token
if __name__ == '__main__':
    # Fixed seed so that sampled completions are repeatable between runs.
    torch.manual_seed(0)

    # CUDA is used only when it is both available and explicitly allowed here.
    allow_cuda = False
    if torch.cuda.is_available() and allow_cuda:
        device = 'cuda'
    else:
        device = 'cpu'

    # Few-shot prompt.
    few_shot_prompt = """Translate English to French:
sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche
cheese =>"""

    # Zero-shot prompt.
    zero_shot_prompt = """Tell me if the following person is actually Doraemon disguised as human:
Name: Umar Jamil
Decision:
"""

    prompts = [
        "Simply put, the theory of relativity states that ",
        "If Google was an Italian company founded in Milan, it would",
        few_shot_prompt,
        zero_shot_prompt,
    ]

    # The batch size is sized to fit exactly the prompts above.
    model = LLaMA.build(
        checkpoints_dir='llama-2-7b/',
        tokenizer_path='tokenizer.model',
        load_model=True,
        max_seq_len=1024,
        max_batch_size=len(prompts),
        device=device,
    )

    out_tokens, out_texts = model.text_completion(prompts, max_gen_len=64)
    assert len(out_texts) == len(prompts)

    # Print each completion followed by a separator line.
    for completion in out_texts:
        print(f'{completion}')
        print('-' * 50)