Merge branch 'inference' into optimize_attn
xinhaoc authored Sep 30, 2023
2 parents 5ed09ed + 0e68bb7 commit 5afbe36
Showing 9 changed files with 86 additions and 40 deletions.
1 change: 1 addition & 0 deletions .github/workflows/gpu-ci-daemon.yml
@@ -34,5 +34,6 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py --daemon
1 change: 1 addition & 0 deletions .github/workflows/gpu-ci.yml
@@ -56,6 +56,7 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py
1 change: 1 addition & 0 deletions .github/workflows/multinode-test.yml
@@ -25,6 +25,7 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -630,7 +630,7 @@ install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST})
install(TARGETS flexflow DESTINATION ${LIB_DEST})
# install python
if (FF_USE_PYTHON)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT FF_BUILD_FROM_PYPI)
install(
DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/
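Note on the new PY_DEST command: the Python one-liner passed to ${PYTHON_EXECUTABLE} -c now picks the first writable site-packages directory instead of asking the distutils module, which is deprecated and removed in Python 3.12. As a readability aid only, here is an unrolled sketch of that one-liner; it is an expanded form of the command in the diff, not additional FlexFlow code:

    # Unrolled equivalent of the one-liner used to compute PY_DEST:
    # collect system and user site-packages directories, keep the writable ones,
    # and return the first match.
    import os
    import site

    def first_writable_site_packages() -> str:
        candidates = []
        for entry in (site.getsitepackages(), site.getusersitepackages()):
            # getsitepackages() returns a list, getusersitepackages() a single path
            candidates.extend([entry] if isinstance(entry, str) else entry)
        writable = [pkg for pkg in candidates if os.access(pkg, os.W_OK)]
        return writable[0]  # like the original [0], fails if nothing is writable

    print(first_writable_site_packages())
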
2 changes: 1 addition & 1 deletion cmake/pip_install/CMakeLists.txt
@@ -1,6 +1,6 @@
# Use setup.py script to re-install the Python bindings library with the right library paths
if (FF_USE_PYTHON)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
if(FF_BUILD_FROM_PYPI)
install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")")
# CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install
18 changes: 9 additions & 9 deletions python/flexflow/core/flexflowlib.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os, platform
import site, os, platform
from typing import Any, Union

from .flexflow_cffi_header import flexflow_header
@@ -47,14 +47,14 @@ def get_shared_library(self) -> str:
libname = "libflexflow" + self.get_library_extension()

# If we installed with pip, use the full path instead of just the library name, because the library will not be in the LD_LIBRARY_PATH
packages_dir = sysconfig.get_python_lib(plat_specific=False, standard_lib=False)
ff_lib_path = os.path.join(packages_dir, "flexflow", "lib", libname)
installed_with_pip = os.path.exists(ff_lib_path)

if installed_with_pip:
return ff_lib_path
else:
return libname
candidate_package_dirs = [pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func)]
candidate_package_dirs += sysconfig.get_python_lib(plat_specific=False, standard_lib=False)
for packages_dir in candidate_package_dirs:
ff_lib_path = os.path.join(packages_dir, "flexflow", "lib", libname)
installed_with_pip = os.path.exists(ff_lib_path)
if installed_with_pip:
return ff_lib_path
return libname

def get_c_header(self) -> str:
return self._header
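The rewritten lookup above packs the candidate collection into one list comprehension (the same flattening trick as the PY_DEST one-liner sketched earlier, plus the legacy sysconfig path) and then probes each directory for the pip-installed flexflow/lib copy, falling back to the bare soname, which the dynamic loader must then resolve via LD_LIBRARY_PATH. A standalone sketch of that lookup, with a hypothetical function name and no FlexFlow internals:

    # Hypothetical standalone version of the lookup logic above (not FlexFlow API).
    import os
    import site

    def find_flexflow_library(libname: str = "libflexflow.so") -> str:
        candidates = []
        for entry in (site.getsitepackages(), site.getusersitepackages()):
            candidates.extend([entry] if isinstance(entry, str) else entry)
        for packages_dir in candidates:
            ff_lib_path = os.path.join(packages_dir, "flexflow", "lib", libname)
            if os.path.exists(ff_lib_path):
                return ff_lib_path  # pip install: absolute path, loader needs no help
        return libname  # source build: rely on LD_LIBRARY_PATH / rpath
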
18 changes: 9 additions & 9 deletions python/flexflow/serve/__init__.py
@@ -48,19 +48,19 @@ def init(
fusion: Optional[bool] = None,
):
"""
Configure FlexFlow Serve and start the runtime.
The function takes, alternatively, configs_dict (a positional argument of type dictionary),
or three mandatory named parameters, plus some additional optional named parameters. When passing
a configs_dict, no named parameter should be specified, and the dictionary should have keys matching
at least the mandatory named parameters.
The three mandatory parameters, which cannot be changed after starting the runtime, are:
- num_gpus: the number of GPUs to reserve for the runtime
- memory_per_gpu: the amount of memory (in MB) to pre-allocate on each GPU
- zero_copy_memory_per_node: the amount of zero-copy memory (in MB) to pre-allocate for each node
The optional parameters are:
- num_cpus: the number of CPU processors to reserve for the runtime, defaults to 4
- legion_utility_processors: number of Legion utility threads to create per process, defaults to 1
- data_parallelism_degree: the degree of parallelization in the data parallel dimension, defaults to 1
@@ -72,7 +72,7 @@ def init(
- use_8bit_quantization: whether to use 8-bit quantization, defaults to False
- profiling: whether to enable the FlexFlow profiling mode, defaults to False
- fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments.
@@ -106,7 +106,7 @@ def init(
:type profiling: Optional[bool], optional
:param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
:type fusion: Optional[bool], optional
:raises ValueError: this function will raise an exception if the user passes both a configs_dict and some named parameters
:raises TypeError: this function will raise an exception if the configs_dict is not a dictionary
:raises ValueError: this function will raise an exception if the mandatory FlexFlow initialization parameters are missing, or are not positive integers: num_gpus, memory_per_gpu, zero_copy_memory_per_node
@@ -152,7 +152,7 @@ def init(
"use_4bit_quantization": use_4bit_quantization,
"use_8bit_quantization": use_8bit_quantization,
"profiling": profiling,
"fusion": fusion
"fusion": fusion,
}

# Check that mandatory configs are present
@@ -188,7 +188,7 @@ def init(
if configs_dict.get("offload", None) is None:
configs_dict["offload"] = False
if configs_dict.get("offload_reserve_space_size", None) is None:
configs_dict["offload_reserve_space_size"] = 1024 ** 2
configs_dict["offload_reserve_space_size"] = 1024**2
if configs_dict.get("use_4bit_quantization", None) is None:
configs_dict["use_4bit_quantization"] = False
if configs_dict.get("use_8bit_quantization", None) is None:
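Since this hunk is mostly the init() docstring, a minimal usage sketch may be helpful. It assumes the package is imported as flexflow.serve and uses only the three mandatory parameters named in the docstring, with illustrative memory sizes; everything else keeps its documented default:

    # Minimal sketch of starting the runtime as described by the docstring above.
    import flexflow.serve as ff

    # Named-parameter form: only the three mandatory settings are required.
    ff.init(num_gpus=1, memory_per_gpu=14000, zero_copy_memory_per_node=10000)

    # Equivalent configs_dict form (no other named parameters may be mixed in):
    # ff.init({"num_gpus": 1, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 10000})
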
21 changes: 17 additions & 4 deletions python/flexflow/serve/models/falcon.py
@@ -27,9 +27,17 @@ def __init__(self, hf_config):
self.hidden_size = hf_config.hidden_size
self.layer_norm_epsilon = hf_config.layer_norm_epsilon
self.multi_query = hf_config.multi_query
self.n_head = hf_config.n_head
self.n_head = (
hf_config.n_head
if "n_head" in hf_config.__dict__
else hf_config.num_attention_heads
)
self.n_head_kv = hf_config.n_head_kv if "n_head_kv" in hf_config.__dict__ else 1
self.n_layer = hf_config.n_layer
self.n_layer = (
hf_config.n_layer
if "n_layer" in hf_config.__dict__
else hf_config.num_hidden_layers
)
self.parallel_attn = hf_config.parallel_attn
self.vocab_size = hf_config.vocab_size

@@ -234,6 +242,11 @@ def build_model(self):

def convert_hf_model(model, dst_folder):
os.makedirs(dst_folder, exist_ok=True)
n_head = (
model.config.n_head
if "n_head" in model.config.__dict__
else model.config.num_attention_heads
)
for name, params in model.named_parameters():
name = (
name.replace(".", "_")
@@ -250,8 +263,8 @@ def convert_hf_model(model, dst_folder):
params,
[
model.config.hidden_size,
model.config.hidden_size // model.config.n_head,
model.config.hidden_size // model.config.n_head,
model.config.hidden_size // n_head,
model.config.hidden_size // n_head,
],
0,
)
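The falcon.py edits above cope with HuggingFace Falcon checkpoints that expose either the legacy config names (n_head, n_layer) or the newer ones (num_attention_heads, num_hidden_layers) by probing hf_config.__dict__. The same intent can be captured by a small helper; this is a hypothetical sketch, not the code the commit adds:

    # Hypothetical helper mirroring the __dict__ checks in the diff: return the value
    # of the first attribute (old or new spelling) that the config actually defines.
    def config_attr(hf_config, *names):
        for name in names:
            if name in hf_config.__dict__:
                return hf_config.__dict__[name]
        raise AttributeError(
            f"none of {names} found on {type(hf_config).__name__}"
        )

    # Usage matching the hunks above:
    #   n_head  = config_attr(hf_config, "n_head", "num_attention_heads")
    #   n_layer = config_attr(hf_config, "n_layer", "num_hidden_layers")
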
62 changes: 46 additions & 16 deletions python/flexflow/serve/serve.py
@@ -52,12 +52,15 @@ def __init__(
self.topp = topp
self.topk = topk


class GenerationResult:
"""A class to store the output of a generation request."""

def __init__(self, text: str = None, tokens: list = None):
self.output_text = text
self.output_tokens = tokens


class LLM:
"""This class creates a LLM (Large-Language Model) object based on a model from HuggingFace"""

@@ -87,6 +90,7 @@ def __init__(
"LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA),
"OPTForCausalLM": (ModelType.OPT, FlexFlowOPT),
"RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon),
"FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon),
"GPTBigCodeForCausalLM": (ModelType.STARCODER, FlexFlowSTARCODER),
"MPTForCausalLM": (ModelType.MPT, FlexFlowMPT),
}
@@ -124,21 +128,27 @@ def download_hf_config(self):

def __get_revision_hashes(self, model_name: str, weights: bool):
ff_revision = None
ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") if weights else os.path.join(self.tokenizer_path, "rev_sha.txt")
ff_revision_file = (
os.path.join(self.weights_path, "rev_sha.txt")
if weights
else os.path.join(self.tokenizer_path, "rev_sha.txt")
)
if os.path.exists(ff_revision_file):
ff_revision = "".join(open(ff_revision_file).read().split())

if os.path.exists(model_name) and os.path.isdir(model_name):
# Local model
files = os.listdir(model_name)
state = files + [os.path.getmtime(os.path.join(model_name, f)) for f in files]
latest_revision = hashlib.md5(str(state).encode('utf-8')).hexdigest()
state = files + [
os.path.getmtime(os.path.join(model_name, f)) for f in files
]
latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest()
else:
# Remote HuggingFace model
hf_api = HfApi()
latest_revision = hf_api.model_info(self.model_name).sha
return ff_revision, ff_revision_file, latest_revision

def download_hf_weights_if_needed(self):
"""Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date.
If not, or if the refresh_cache parameter is set to True, download new weights.
@@ -168,7 +178,9 @@ def download_hf_weights_if_needed(self):
os.makedirs(self.weights_path, exist_ok=True)
print(f"Creating directory {self.weights_path} (if it doesn't exist)...")

ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights=True)
ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(
self.model_name, weights=True
)

# Download if needed
if ff_revision != latest_revision:
@@ -179,9 +191,13 @@ def download_hf_weights_if_needed(self):
)
else:
# Remote model
print(f"'{self.model_name}' local model weights were updated! Converting new weights now...")
print(
f"'{self.model_name}' local model weights were updated! Converting new weights now..."
)
# Download model from HuggingFace, or load it from the local folder
hf_model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True)
hf_model = AutoModelForCausalLM.from_pretrained(
self.model_name, trust_remote_code=True
)
# Print log message to notify user download of model has finished
if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):
print("Done downloading HF weights. Converting them now...")
@@ -217,15 +233,21 @@ def download_hf_tokenizer_if_needed(self):
os.makedirs(self.tokenizer_path, exist_ok=True)

# Get local revision SHA, check if it matches latest one on huggingface
ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights=False)
ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(
self.model_name, weights=False
)

if ff_revision != latest_revision:
if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):
# Local model
print(f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ...")
print(
f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..."
)
else:
# Remote model
print(f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now...")
print(
f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..."
)
# Download tokenizer from HuggingFace, or load it from the local folder
if self.model_type == ModelType.LLAMA:
hf_tokenizer = LlamaTokenizer.from_pretrained(
Expand All @@ -242,7 +264,7 @@ def download_hf_tokenizer_if_needed(self):
# Save new revision hash to file
with open(ff_revision_file, "w+") as f:
f.write(latest_revision)

else:
print(f"Loading '{self.model_name}' tokenizer from the cache...")

@@ -357,9 +379,15 @@ def compile(

# Create request manager
self.rm = RequestManager()
bos_token_id = -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id
eos_token_id = -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id
self.rm.register_tokenizer(self.model_type, bos_token_id, eos_token_id, self.tokenizer_path)
bos_token_id = (
-1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id
)
eos_token_id = (
-1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id
)
self.rm.register_tokenizer(
self.model_type, bos_token_id, eos_token_id, self.tokenizer_path
)
self.rm.register_output_filepath(self.output_file)

self.im.init_operators_inference(self.model.ffmodel)
@@ -382,7 +410,9 @@ def generate(self, prompts: Union[str, List[str]], max_length: int = 128):
elif type(prompts) == list:
if len(prompts) == 0:
return []
return [self.model.ffmodel.generate(prompt, max_length) for prompt in prompts]
return [
self.model.ffmodel.generate(prompt, max_length) for prompt in prompts
]
else:
assert False, "Please pass a non-empty string or list of strings"

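The serve.py reformatting above leaves the caching logic unchanged: __get_revision_hashes records the last converted revision in rev_sha.txt, fingerprints a local model directory by file names and modification times, and asks the HuggingFace Hub for the latest commit SHA of a remote model; weights or tokenizer files are re-converted only when the two values differ (or refresh_cache is set). A condensed standalone sketch of that freshness check, with a hypothetical function name and illustrative paths:

    # Condensed sketch of the cache-freshness decision used above
    # (hypothetical standalone function; not FlexFlow's API).
    import hashlib
    import os
    from huggingface_hub import HfApi

    def needs_refresh(model_name: str, cache_dir: str) -> bool:
        rev_file = os.path.join(cache_dir, "rev_sha.txt")
        cached = open(rev_file).read().strip() if os.path.exists(rev_file) else None

        if os.path.isdir(model_name):
            # Local model: fingerprint the directory contents and mtimes.
            files = os.listdir(model_name)
            state = files + [os.path.getmtime(os.path.join(model_name, f)) for f in files]
            latest = hashlib.md5(str(state).encode("utf-8")).hexdigest()
        else:
            # Remote model: ask the HuggingFace Hub for the latest commit SHA.
            latest = HfApi().model_info(model_name).sha

        return cached != latest
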
