Skip to content

Commit

Permalink
Add new inference bench
Browse files Browse the repository at this point in the history
  • Loading branch information
pierre.delaunay committed Nov 6, 2023
1 parent 8e62d37 commit fdc3c54
Show file tree
Hide file tree
Showing 14 changed files with 319 additions and 9 deletions.
2 changes: 1 addition & 1 deletion benchmarks/accelerate_opt/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ evaluate
accelerate
deepspeed
rich
voir>=0.2.9,<0.3
voir
2 changes: 1 addition & 1 deletion benchmarks/dlrm/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ torchx
tensorboard

# Following limits are for milabench
voir>=0.2.9,<0.3
voir
2 changes: 1 addition & 1 deletion benchmarks/huggingface/requirements.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
torch
transformers
voir>=0.2.9,<0.3
voir
18 changes: 18 additions & 0 deletions benchmarks/llama/benchfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import uuid

from milabench.executors import TorchRunExecutor
from milabench.pack import Package


class LLAMA(Package):
    """Milabench package definition for the LLAMA inference benchmark."""

    # Requirements file and entry-point script used by this benchmark.
    base_requirements = "requirements.in"
    main_script = "main.py"

    def make_env(self):
        """Return the parent environment extended with ``OMP_NUM_THREADS``.

        The thread count is read from the ``cpus_per_gpu`` config entry and
        falls back to 8 when that entry is absent.
        """
        env = dict(super().make_env())
        env["OMP_NUM_THREADS"] = str(self.config.get("cpus_per_gpu", 8))
        return env


# Module-level name exposing the package class.
# NOTE(review): presumably the hook milabench looks up to find the pack -- confirm.
__pack__ = LLAMA
25 changes: 25 additions & 0 deletions benchmarks/llama/config/llama2_13b_chat_hf.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"_name_or_path": null,
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 13824,
"max_position_embeddings": 4096,
"model_type": "llama",
"num_attention_heads": 40,
"num_hidden_layers": 40,
"num_key_value_heads": 40,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.31.0.dev0",
"use_cache": true,
"vocab_size": 32000
}
25 changes: 25 additions & 0 deletions benchmarks/llama/config/llama2_70b_chat_hf.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"_name_or_path": "meta-llama/Llama-2-70b-chat-hf",
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 8192,
"initializer_range": 0.02,
"intermediate_size": 28672,
"max_position_embeddings": 4096,
"model_type": "llama",
"num_attention_heads": 64,
"num_hidden_layers": 80,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.31.0.dev0",
"use_cache": true,
"vocab_size": 32000
}
25 changes: 25 additions & 0 deletions benchmarks/llama/config/llama2_7b_chat_hf.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 4096,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 32,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.32.0.dev0",
"use_cache": true,
"vocab_size": 32000
}
212 changes: 212 additions & 0 deletions benchmarks/llama/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@

import json
import os
import argparse
import time
import sys
import multiprocessing

import torch

from voir.smuggle import SmuggleWriter
from voir.instruments.gpu import get_gpu_info

root = os.path.dirname(__file__)


def available_models():
    """Return the supported models keyed by their command-line name.

    Each value is a dict holding the Hugging Face model ``name`` and the file
    name of the matching ``config`` for that model size.
    """
    sizes = ("7b", "13b", "70b")
    return {
        f'llama2-{size}': {
            "name": f"meta-llama/Llama-2-{size}-chat-hf",
            "config": f"llama2_{size}_chat_hf.config",
        }
        for size in sizes
    }


def _worker(state, queue, func, delay):
import time

while state['running']:
queue.put(func())
time.sleep(delay)

class Monitor:
    """Run ``func`` periodically in a child process and queue its results.

    Args:
        delay: Seconds the worker sleeps between two calls to ``func``.
        func: Zero-argument callable executed in the child process; each
            return value is put on ``results``.

    Attributes set in ``__init__``:
        manager / state: Manager-backed dict whose ``'running'`` flag tells
            the worker to keep going.
        results: Queue the worker fills and the parent reads from.
        leftover: Results still queued when ``stop()`` was called.
        process: The (not yet started) worker process.
    """

    def __init__(self, delay, func):
        self.manager = multiprocessing.Manager()
        self.state = self.manager.dict()
        self.state['running'] = True
        self.results = multiprocessing.Queue()
        self.leftover = []
        self.process = multiprocessing.Process(
            target=_worker,
            args=(self.state, self.results, func, delay),
        )

    def start(self):
        """Launch the worker process."""
        self.process.start()

    def stop(self):
        """Ask the worker to finish and wait for it to exit.

        A process that has put items on a ``multiprocessing.Queue`` may not
        terminate until those items have been consumed, so joining it with
        unread results can block forever. The queue is therefore drained into
        ``leftover`` while waiting for the worker to exit.
        """
        from queue import Empty

        self.state['running'] = False

        while self.process.is_alive():
            try:
                self.leftover.append(self.results.get(timeout=0.1))
            except Empty:
                # Nothing queued right now; check again whether the worker exited.
                pass

        self.process.join()


def setupvoir():
    """Start background GPU sampling and build the metric logger.

    Returns:
        A ``(log, monitor)`` pair. ``log(data)`` stamps ``data`` with the
        current time under ``"t"``, writes it as one JSON line and then
        writes every GPU sample currently waiting in ``monitor.results``.
        ``monitor`` is the already started ``Monitor``; the caller is
        responsible for calling ``stop()`` on it.
    """
    # NOTE(review): SmuggleWriter presumably tags the lines written to stdout
    # so the voir/milabench harness can pick them up -- confirm against voir.
    sink = SmuggleWriter(sys.stdout)

    def monitor_fn():
        """Take one snapshot of every GPU reported by ``get_gpu_info``."""
        snapshot = {}
        for gpu in get_gpu_info()["gpus"].values():
            device = gpu["device"]
            memory = gpu["memory"]
            snapshot[device] = {
                "memory": [memory["used"], memory["total"]],
                "load": gpu["utilization"]["compute"],
                "temperature": gpu["temperature"],
                "power": gpu["power"],
            }
        return {"task": "main", "gpudata": snapshot, "t": time.time()}

    # Sample the GPUs every half second in a separate process.
    monitor = Monitor(0.5, monitor_fn)

    def log(data):
        """Write ``data`` followed by all pending GPU samples."""
        if sink is None:
            return

        # The caller's dict is modified in place.
        data["t"] = time.time()
        print(json.dumps(data), file=sink)

        # Forward whatever the monitor produced since the previous call.
        while not monitor.results.empty():
            print(json.dumps(monitor.results.get()), file=sink)

    monitor.start()
    return log, monitor


class WrappedTokenizer:
    """Tokenizer proxy that remembers how many tokens the last call produced.

    Calls are forwarded to the wrapped tokenizer; ``count`` holds the number
    of input ids returned by the most recent call (0 before any call). Every
    other attribute is looked up on the wrapped tokenizer.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.count = 0

    @staticmethod
    def _count_tokens(ids):
        """Return the number of token ids held by ``ids``.

        Handles array-likes exposing ``shape``, a flat sequence of ids and a
        batch given as a sequence of sequences.
        """
        shape = getattr(ids, "shape", None)
        if shape is not None:
            total = 1
            for dim in shape:
                total *= dim
            return int(total)

        if len(ids) > 0 and isinstance(ids[0], (list, tuple)):
            return sum(len(row) for row in ids)

        return len(ids)

    def __call__(self, *args, **kwargs):
        """Tokenize with the wrapped tokenizer and record the token count."""
        encoded = self.tokenizer(*args, **kwargs)

        # The tokenizer output is a mapping (input_ids, attention_mask, ...):
        # its own length is the number of fields, not the number of tokens.
        try:
            ids = encoded["input_ids"]
        except (KeyError, TypeError, IndexError):
            # Output is already a bare sequence of ids.
            ids = encoded

        self.count = self._count_tokens(ids)
        return encoded

    def __getattr__(self, attr):
        """Delegate attributes that are missing on the wrapper.

        Raises:
            AttributeError: if the wrapped tokenizer lacks ``attr`` too.
        """
        # Only reached when normal lookup failed. If ``tokenizer`` itself is
        # missing (e.g. while the instance is being copied or unpickled),
        # delegating would recurse forever.
        if attr == "tokenizer":
            raise AttributeError(attr)

        try:
            return getattr(self.tokenizer, attr)
        except AttributeError:
            raise AttributeError(
                f"'{type(self.tokenizer).__name__}' object has no attribute '{attr}'"
            ) from None


def huggingface_main(args, model, config):
    """Run text generation over wikitext and report the throughput.

    Args:
        args: Parsed command-line arguments; only ``args.prepare`` is read.
        model: Model name from ``available_models``. It is not used to load
            weights: the name is rebound to a randomly initialised model.
        config: Dict of Llama configuration values passed to
            ``LlamaConfig.from_dict``.

    Returns:
        ``0`` when ``args.prepare`` is set (dataset and tokenizer have been
        fetched, nothing is generated), otherwise ``None``.
    """
    # Huggingface imported AFTER setup
    import transformers
    from transformers import LlamaForCausalLM, LlamaTokenizerFast
    from transformers.models.llama.configuration_llama import LlamaConfig

    from datasets import load_dataset

    # Dataset here
    dataset = load_dataset(
        "wikitext",
        "wikitext-103-v1"
    )

    # LLAMA tokenizer official tokenizer is hidden behind a login
    tokenizer = WrappedTokenizer(
        LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
    )

    # Prepare is done
    if args.prepare:
        return 0

    # We do not download LLAMA because it takes too long
    # we just instantiate an untrained one
    model = LlamaForCausalLM(LlamaConfig.from_dict(config))

    pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        torch_dtype=torch.float16,
        device_map="auto",
        tokenizer=tokenizer,
    )

    in_token_count = 0
    out_token_count = 0

    start = time.time()

    log, monitor = setupvoir()

    # The monitor runs in its own process; always stop it, even when
    # generation fails, so the sampling process is not left behind.
    try:
        for entry in dataset["train"]:
            text = entry["text"].strip()

            # Skip blank lines and titles. The text has been stripped, so a
            # heading starts with "= " and never with a leading space.
            if text == "" or text.startswith("= "):
                continue

            sequences = pipeline(
                text,
                do_sample=True,
                top_k=10,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                max_length=400,
            )

            # NOTE(review): this adds the length of the generated *string*
            # (characters), while the rate below is reported as "Tok/s" --
            # confirm whether a token count is intended here.
            for seq in sequences:
                out_token_count += len(seq["generated_text"])

            # Token count recorded by the wrapper on its latest call.
            in_token_count += tokenizer.count
            total = out_token_count + in_token_count

            elapsed = time.time() - start
            print(total / elapsed)

            # Report and open a new measurement window once enough has been
            # accumulated.
            if total > 100:
                out_token_count = 0
                in_token_count = 0
                start = time.time()

                if log is not None:
                    log({
                        "task": "train",
                        "rate": total / elapsed,
                        "units": "Tok/s"
                    })
    finally:
        monitor.stop()

def main():
    """Parse the command line, load the model config and run the benchmark.

    Returns whatever ``huggingface_main`` returns.
    """
    catalog = available_models()

    cli = argparse.ArgumentParser()
    cli.add_argument("--model", default="llama2-7b", choices=catalog.keys())
    cli.add_argument("--prepare", action="store_true")
    cli.add_argument("--cache", required=True, type=str)
    args = cli.parse_args()

    # Set before huggingface_main imports the Hugging Face libraries.
    # NOTE(review): presumably so they pick up this cache location -- confirm.
    os.environ["XDG_CACHE_HOME"] = str(args.cache)

    selected = catalog[args.model]

    # Config files live in the "config" directory next to this script.
    config_path = os.path.join(root, 'config', selected["config"])
    with open(config_path, 'r') as handle:
        config = json.load(handle)

    return huggingface_main(args, selected["name"], config)


if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions benchmarks/llama/requirements.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
torch
fairscale
fire
sentencepiece
voir
datasets
2 changes: 1 addition & 1 deletion benchmarks/stable_baselines3/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ seaborn
tqdm

# Following limits are for milabench
voir>=0.2.9,<0.3.0
voir
2 changes: 1 addition & 1 deletion benchmarks/stargan/requirements.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
numpy
torch
torchvision
voir>=0.2.9,<0.3
voir
3 changes: 1 addition & 2 deletions benchmarks/super-slomo/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@ torchvision
numpy
tqdm
opencv-python

voir>=0.2.9,<0.3
voir
2 changes: 1 addition & 1 deletion benchmarks/timm/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ torchvision
pyyaml
huggingface_hub
safetensors>=0.2
voir>=0.2.9,<0.3
voir
2 changes: 1 addition & 1 deletion benchmarks/torchvision/requirements.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
torch
torchvision
tqdm
voir>=0.2.9,<0.3
voir

0 comments on commit fdc3c54

Please sign in to comment.