# Makefile (forked from huggingface/transformers-bloom-inference)

gen-proto:
	mkdir -p inference_server/model_handler/grpc_utils/pb
	python -m grpc_tools.protoc -Iinference_server/model_handler/grpc_utils/proto --python_out=inference_server/model_handler/grpc_utils/pb --grpc_python_out=inference_server/model_handler/grpc_utils/pb inference_server/model_handler/grpc_utils/proto/generation.proto
# rewrite the generated stubs' absolute `import ..._pb2` lines into relative
# imports so the pb directory works as a package
	find inference_server/model_handler/grpc_utils/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
	touch inference_server/model_handler/grpc_utils/__init__.py
	touch inference_server/model_handler/grpc_utils/pb/__init__.py
# BSD sed (macOS) consumes the -e flag as the -i backup suffix, leaving *.py-e
# backup files behind; remove them so the pb package stays clean
	rm -rf inference_server/model_handler/grpc_utils/pb/*.py-e
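
# Usage sketch for the target above (assumes the grpcio-tools package is
# installed, e.g. via `pip install grpcio-tools`, which provides grpc_tools.protoc):
#   make gen-proto
# Re-run it whenever generation.proto changes to refresh the generated stubs.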

# ------------------------- DS inference -------------------------
bloom-176b:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=bigscience/bloom \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=ds_inference \
	DTYPE=fp16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
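
# Once one of the server targets in this file is running, it can be queried
# over HTTP. A minimal sketch (assumption: the Flask app in
# inference_server/server.py exposes a /generate/ route taking a JSON body with
# a "text" list plus generation kwargs; check the actual routes before relying
# on this):
#   curl 'http://127.0.0.1:5000/generate/' \
#       -H 'Content-Type: application/json' \
#       -d '{"text": ["DeepSpeed is"], "max_new_tokens": 40}'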

# loads faster than bloom-176b above: this checkpoint is pre-sharded for
# DeepSpeed-Inference tensor parallelism, so it skips resharding at startup
microsoft-bloom-176b:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=ds_inference \
	DTYPE=fp16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

bloomz-176b:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=bigscience/bloomz \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=ds_inference \
	DTYPE=fp16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

bloom-176b-int8:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=microsoft/bloom-deepspeed-inference-int8 \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=ds_inference \
	DTYPE=int8 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
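
# Note on the ds_inference targets above: all eight visible GPUs are used for
# tensor parallelism. In fp16 the 176B parameters alone are roughly 350 GB of
# weights (about half that for int8), so expect on the order of 45 GB per GPU
# before activations and KV cache.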

# ------------------------- HF accelerate -------------------------
bloom-560m:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=bigscience/bloom-560m \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=hf_accelerate \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=32 \
	CUDA_VISIBLE_DEVICES=0 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

flan-t5-xxl:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=google/flan-t5-xxl \
	MODEL_CLASS=AutoModelForSeq2SeqLM \
	DEPLOYMENT_FRAMEWORK=hf_accelerate \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

ul2:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=google/ul2 \
	MODEL_CLASS=AutoModelForSeq2SeqLM \
	DEPLOYMENT_FRAMEWORK=hf_accelerate \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

codegen-mono:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=Salesforce/codegen-16B-mono \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=hf_accelerate \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
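
# Template for serving another model with hf_accelerate (hypothetical target
# name and model id; copy, uncomment, and adjust the env vars, keeping the tab
# indentation make requires):
# my-model:
#	TOKENIZERS_PARALLELISM=false \
#	MODEL_NAME=org/model-id \
#	MODEL_CLASS=AutoModelForCausalLM \
#	DEPLOYMENT_FRAMEWORK=hf_accelerate \
#	DTYPE=bf16 \
#	MAX_INPUT_LENGTH=2048 \
#	MAX_BATCH_SIZE=4 \
#	CUDA_VISIBLE_DEVICES=0 \
#	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'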

# ------------------------- HF CPU -------------------------
bloom-560m-cpu:
	MODEL_NAME=bigscience/bloom-560m \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=hf_cpu \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=32 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

flan-t5-base-cpu:
	MODEL_NAME=google/flan-t5-base \
	MODEL_CLASS=AutoModelForSeq2SeqLM \
	DEPLOYMENT_FRAMEWORK=hf_cpu \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=32 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
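
# Note: DTYPE=bf16 on CPU assumes your PyTorch build and CPU support bfloat16
# (recent Intel Xeons do); if generation fails or is very slow, fp32 is the
# safe fallback, assuming the server accepts DTYPE=fp32.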