Skip to content

Commit

Permalink
Add new inference bench
Browse files Browse the repository at this point in the history
  • Loading branch information
pierre.delaunay committed Nov 6, 2023
1 parent 8e62d37 commit 6c514f2
Show file tree
Hide file tree
Showing 15 changed files with 335 additions and 9 deletions.
2 changes: 1 addition & 1 deletion benchmarks/accelerate_opt/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ evaluate
accelerate
deepspeed
rich
voir>=0.2.9,<0.3
voir
2 changes: 1 addition & 1 deletion benchmarks/dlrm/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ torchx
tensorboard

# Following limits are for milabench
voir>=0.2.9,<0.3
voir
2 changes: 1 addition & 1 deletion benchmarks/huggingface/requirements.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
torch
transformers
voir>=0.2.9,<0.3
voir
33 changes: 33 additions & 0 deletions benchmarks/llama/benchfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import uuid

from milabench.executors import TorchRunExecutor
from milabench.pack import Package

# Repository that LLAMA.install() clones into the benchmark's code directory.
GITHUB = "https://github.com/facebookresearch/llama"
# Revision handed to clone_subtree(); despite the name this is a pinned commit
# SHA rather than a branch name.
BRANCH = "54d44631054deae836aec8ceff92dcf8f20ca9e7"


class LLAMA(Package):
    """Milabench package definition for the LLaMA inference benchmark.

    Extends the base ``Package`` with an ``OMP_NUM_THREADS`` environment
    entry, a checkout of the upstream llama sources at install time, and a
    run plan wrapped in ``TorchRunExecutor``.
    """

    base_requirements = "requirements.in"
    main_script = "main.py"

    def make_env(self):
        """Return the parent environment with ``OMP_NUM_THREADS`` set.

        The value is the ``cpus_per_gpu`` config entry (8 when absent),
        rendered as a string.
        """
        env = {**super().make_env()}
        env["OMP_NUM_THREADS"] = str(self.config.get("cpus_per_gpu", 8))
        return env

    async def install(self):
        """Run the parent install, then fetch the llama sources if missing."""
        await super().install()

        checkout = self.dirs.code / "llama"
        if checkout.exists():
            # Sources already present: nothing to clone.
            return
        checkout.clone_subtree(GITHUB, BRANCH)

    def build_run_plan(self):
        """Wrap the parent run plan in a ``TorchRunExecutor``."""
        # self.config is not the right config for this
        return TorchRunExecutor(super().build_run_plan(), use_stdout=True)


# Module-level alias for the package class defined above.
# NOTE(review): presumably the name milabench looks up when it loads this
# benchfile -- confirm against the milabench loader.
__pack__ = LLAMA
25 changes: 25 additions & 0 deletions benchmarks/llama/config/llama2_13b_chat_hf.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"_name_or_path": null,
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 13824,
"max_position_embeddings": 4096,
"model_type": "llama",
"num_attention_heads": 40,
"num_hidden_layers": 40,
"num_key_value_heads": 40,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.31.0.dev0",
"use_cache": true,
"vocab_size": 32000
}
25 changes: 25 additions & 0 deletions benchmarks/llama/config/llama2_70b_chat_hf.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"_name_or_path": "meta-llama/Llama-2-70b-chat-hf",
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 8192,
"initializer_range": 0.02,
"intermediate_size": 28672,
"max_position_embeddings": 4096,
"model_type": "llama",
"num_attention_heads": 64,
"num_hidden_layers": 80,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.31.0.dev0",
"use_cache": true,
"vocab_size": 32000
}
25 changes: 25 additions & 0 deletions benchmarks/llama/config/llama2_7b_chat_hf.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 4096,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 32,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.32.0.dev0",
"use_cache": true,
"vocab_size": 32000
}
11 changes: 11 additions & 0 deletions benchmarks/llama/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import List, Optional

import fire

from llama import Llama, Dialog

# Model hyper-parameters keyed by LLaMA 2 size tag.
# NOTE(review): vocab_size is -1 for every entry, presumably a placeholder
# that is resolved elsewhere (e.g. from the tokenizer) -- confirm.
config = {
    "7b": {
        "dim": 4096,
        "multiple_of": 256,
        "n_heads": 32,
        "n_layers": 32,
        "norm_eps": 1e-05,
        "vocab_size": -1,
    },
    "13b": {
        "dim": 5120,
        "multiple_of": 256,
        "n_heads": 40,
        "n_layers": 40,
        "norm_eps": 1e-05,
        "vocab_size": -1,
    },
    # Only the 70b entry sets ffn_dim_multiplier and n_kv_heads.
    "70b": {
        "dim": 8192,
        "multiple_of": 4096,
        "ffn_dim_multiplier": 1.3,
        "n_heads": 64,
        "n_kv_heads": 8,
        "n_layers": 80,
        "norm_eps": 1e-05,
        "vocab_size": -1,
    },
}
202 changes: 202 additions & 0 deletions benchmarks/llama/main_hugging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@

import argparse
import json
import multiprocessing
import os
import sys
import time

from voir.instruments.gpu import get_gpu_info
from voir.smuggle import SmuggleWriter

# Directory containing this script; main() resolves the bundled
# config/<model>.config files relative to it.
root = os.path.dirname(__file__)


def available_models():
    """Return the table of supported models.

    Keys are the CLI names (``llama2-7b``, ``llama2-13b``, ``llama2-70b``);
    each value holds the Hugging Face model id under ``"name"`` and the file
    name of the bundled configuration under ``"config"``.
    """
    return {
        f'llama2-{size}': {
            "name": f"meta-llama/Llama-2-{size}-chat-hf",
            "config": f"llama2_{size}_chat_hf.config"
        }
        for size in ("7b", "13b", "70b")
    }


def _worker(state, queue, func, delay):
    """Sample ``func`` repeatedly until ``state['running']`` becomes falsy.

    Args:
        state: mapping whose ``'running'`` entry is re-read before every sample.
        queue: object with a ``put`` method that receives each ``func()`` result.
        func: zero-argument callable producing one sample.
        delay: seconds to sleep after each sample.
    """
    from time import sleep

    while True:
        if not state['running']:
            return
        queue.put(func())
        sleep(delay)

class Monitor:
    """Run a sampling function periodically in a child process.

    The child executes ``_worker``, which calls ``func`` and puts each result
    on ``self.results`` until ``stop()`` clears the shared ``running`` flag.

    NOTE(review): the multiprocessing guidelines warn that joining a process
    which still has undelivered items on a queue can block; callers should
    keep draining ``results`` (as ``setupvoir``'s ``log`` does) -- confirm
    this is sufficient for long gaps between log calls.
    """

    def __init__(self, delay, func):
        """Prepare (but do not start) the sampling process.

        Args:
            delay: seconds the worker sleeps between two calls to ``func``.
            func: zero-argument callable; its return values are queued on
                ``self.results``.
        """
        self.manager = multiprocessing.Manager()
        # Manager-backed dict so the flag written by stop() in this process
        # is the same one the worker reads in the child process.
        self.state = self.manager.dict()
        self.state['running'] = True
        self.results = multiprocessing.Queue()
        self.process = multiprocessing.Process(
            target=_worker,
            args=(self.state, self.results, func, delay),
        )

    def start(self):
        """Launch the sampling process."""
        self.process.start()

    def stop(self):
        """Ask the worker to finish and wait for it.

        Safe to call when ``start()`` was never reached: ``Process.join()``
        asserts on a process that has not been started, so the join is
        skipped while the process has no pid yet.
        """
        self.state['running'] = False
        if self.process.pid is not None:
            self.process.join()


def setupvoir():
    """Start GPU monitoring and build the metric logger.

    Returns:
        A ``(log, monitor)`` pair. ``log(data)`` writes ``data`` as one JSON
        line with a ``"t"`` timestamp added, then writes every GPU sample the
        monitor has queued so far. ``monitor`` is the already started
        ``Monitor``; the caller is responsible for calling ``monitor.stop()``.
    """
    # NOTE(review): SmuggleWriter comes from voir.smuggle; it presumably
    # routes these JSON lines to voir's data channel instead of the plain
    # stdout stream -- confirm against the voir documentation.
    data_file = SmuggleWriter(sys.stdout)

    def log(data):
        # Stamp a copy so the dict owned by the caller is left untouched.
        record = {**data, "t": time.time()}
        print(json.dumps(record), file=data_file)

        # Forward the GPU samples gathered since the previous call.
        while not monitor.results.empty():
            print(json.dumps(monitor.results.get()), file=data_file)

    def monitor_fn():
        # One entry per GPU reported by voir, keyed by its "device" field.
        data = {
            gpu["device"]: {
                "memory": [
                    gpu["memory"]["used"],
                    gpu["memory"]["total"],
                ],
                "load": gpu["utilization"]["compute"],
                "temperature": gpu["temperature"],
                "power": gpu["power"]
            }
            for gpu in get_gpu_info()["gpus"].values()
        }
        return {"task": "main", "gpudata": data, "t": time.time()}

    # Sample the GPUs every half second in a separate process.
    monitor = Monitor(0.5, monitor_fn)
    monitor.start()
    return log, monitor


class WrappedTokenizer:
    """Tokenizer proxy that records how many tokens the last call produced.

    Calls are forwarded to the wrapped tokenizer and the number of token ids
    in the result is stored in ``count``. Any other attribute access is
    delegated to the wrapped tokenizer.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        # Token count of the most recent __call__ (0 before the first call).
        self.count = 0

    def __call__(self, *args, **kwargs):
        """Tokenize through the wrapped tokenizer and remember the token count."""
        encoded = self.tokenizer(*args, **kwargs)
        self.count = self._count_tokens(encoded)
        return encoded

    @staticmethod
    def _count_tokens(encoded):
        """Count token ids in an encoding (mapping with ``input_ids`` or raw ids)."""
        try:
            ids = encoded["input_ids"]
        except (KeyError, TypeError, IndexError):
            # Not a mapping with "input_ids": treat the result as the ids.
            ids = encoded

        # Tensor-like results (e.g. return_tensors="pt") expose numel().
        numel = getattr(ids, "numel", None)
        if callable(numel):
            return int(numel())
        # Array-like results expose an integer ``size``.
        size = getattr(ids, "size", None)
        if isinstance(size, int):
            return size

        ids = list(ids)
        if ids and isinstance(ids[0], (list, tuple)):
            # Batched input: one id list per text.
            return sum(len(row) for row in ids)
        return len(ids)

    def __getattr__(self, attr):
        # Only reached when normal lookup fails. If ``tokenizer`` itself is
        # missing (e.g. during copy/unpickling) delegating would recurse
        # forever, so fail immediately instead.
        if attr == "tokenizer":
            raise AttributeError(attr)
        try:
            return getattr(self.tokenizer, attr)
        except AttributeError:
            raise AttributeError(
                f"'{type(self.tokenizer).__name__}' object has no attribute '{attr}'"
            ) from None


def huggingface_main(args, model, config):
    """Run the LLaMA text-generation benchmark and log token throughput.

    Args:
        args: parsed command-line namespace; only ``args.prepare`` is read.
        model: Hugging Face model id chosen on the command line. It is not
            used to download weights: the network is instantiated untrained
            from ``config``.
        config: dict loaded from one of the bundled ``config/*.config``
            files, passed to ``LlamaConfig.from_dict``.

    Returns:
        0 when ``args.prepare`` is set (dataset and tokenizer have been
        fetched at that point), otherwise ``None`` once the dataset has been
        consumed.
    """
    # Huggingface imported AFTER setup: main() sets XDG_CACHE_HOME before
    # calling this function.
    import torch
    import transformers
    from transformers import LlamaForCausalLM, LlamaTokenizerFast
    from transformers.models.llama.configuration_llama import LlamaConfig

    from datasets import load_dataset

    # Dataset here
    dataset = load_dataset(
        "wikitext",
        "wikitext-103-v1"
    )

    # LLAMA tokenizer official tokenizer is hidden behind a login
    tokenizer = WrappedTokenizer(
        LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
    )

    # Prepare is done
    if args.prepare:
        return 0

    # We do not download LLAMA because it takes too long
    # we just instantiate an untrained one
    llama = LlamaForCausalLM(LlamaConfig.from_dict(config))

    # The tokenizer must be passed explicitly: the pipeline cannot infer one
    # from a model instance, and routing tokenization through the wrapper is
    # what makes ``tokenizer.count`` reflect the prompt length.
    # NOTE(review): with a model instance (not a model id) confirm that
    # torch_dtype / device_map are actually applied by the pipeline.
    pipeline = transformers.pipeline(
        "text-generation",
        model=llama,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    in_token_count = 0
    out_token_count = 0

    start = time.time()

    log, monitor = setupvoir()

    try:
        for entry in dataset["train"]:
            text = entry["text"].strip()

            # Skip blank lines and section titles. After strip() a title
            # starts with "=", so the previous " = " prefix never matched.
            if text == "" or text.startswith("="):
                continue

            sequences = pipeline(
                text,
                do_sample=True,
                top_k=10,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                max_length=200,
                # Return token ids so throughput is measured in tokens
                # rather than in characters of decoded text.
                return_tensors=True,
            )

            # Set by the wrapper when the pipeline tokenized the prompt.
            prompt_tokens = tokenizer.count
            in_token_count += prompt_tokens

            for seq in sequences:
                # generated_token_ids holds the prompt followed by the new
                # tokens; count only the newly generated part here.
                out_token_count += max(
                    len(seq["generated_token_ids"]) - prompt_tokens, 0
                )

            if out_token_count > 1000:
                elapsed = time.time() - start
                total = out_token_count + in_token_count

                out_token_count = 0
                in_token_count = 0
                start = time.time()

                if log is not None:
                    log({
                        "task": "train",
                        "rate": total / elapsed,
                        "units": "Tok/s"
                    })
    finally:
        # Always stop the sampling process, even if generation fails.
        monitor.stop()

def main():
    """Parse the command line, load the model configuration and run the benchmark.

    Options: ``--model`` (one of the ``available_models()`` keys, default
    ``llama2-7b``), ``--prepare`` (flag) and ``--cache`` (required), whose
    value is exported as ``XDG_CACHE_HOME`` before the benchmark starts.

    Returns:
        Whatever ``huggingface_main`` returns.
    """
    models = available_models()

    cli = argparse.ArgumentParser()
    cli.add_argument("--model", default="llama2-7b", choices=models.keys())
    cli.add_argument("--prepare", action="store_true")
    cli.add_argument("--cache", required=True, type=str)
    args = cli.parse_args()

    # Must be set before huggingface_main imports the Hugging Face libraries.
    os.environ["XDG_CACHE_HOME"] = str(args.cache)

    selected = models[args.model]
    model_name = selected["name"]
    config_path = os.path.join(root, 'config', selected["config"])

    with open(config_path, 'r') as handle:
        model_config = json.load(handle)

    return huggingface_main(args, model_name, model_config)


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions benchmarks/llama/requirements.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
torch
fairscale
fire
sentencepiece
voir
datasets
2 changes: 1 addition & 1 deletion benchmarks/stable_baselines3/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ seaborn
tqdm

# Following limits are for milabench
voir>=0.2.9,<0.3.0
voir
2 changes: 1 addition & 1 deletion benchmarks/stargan/requirements.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
numpy
torch
torchvision
voir>=0.2.9,<0.3
voir
3 changes: 1 addition & 2 deletions benchmarks/super-slomo/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@ torchvision
numpy
tqdm
opencv-python

voir>=0.2.9,<0.3
voir
2 changes: 1 addition & 1 deletion benchmarks/timm/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ torchvision
pyyaml
huggingface_hub
safetensors>=0.2
voir>=0.2.9,<0.3
voir
2 changes: 1 addition & 1 deletion benchmarks/torchvision/requirements.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
torch
torchvision
tqdm
voir>=0.2.9,<0.3
voir

0 comments on commit 6c514f2

Please sign in to comment.