Merge branch 'inference' into optimize_attn
xinhaoc authored Sep 30, 2023
2 parents 5ed09ed + 0e68bb7 commit 5afbe36
Showing 9 changed files with 86 additions and 40 deletions.
1 change: 1 addition & 0 deletions .github/workflows/gpu-ci-daemon.yml
@@ -34,5 +34,6 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py --daemon
1 change: 1 addition & 0 deletions .github/workflows/gpu-ci.yml
@@ -56,6 +56,7 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py
1 change: 1 addition & 0 deletions .github/workflows/multinode-test.yml
@@ -25,6 +25,7 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -630,7 +630,7 @@ install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST})
install(TARGETS flexflow DESTINATION ${LIB_DEST})
# install python
if (FF_USE_PYTHON)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT FF_BUILD_FROM_PYPI)
install(
DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/
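Note on the new PY_DEST command: the Python one-liner passed to ${PYTHON_EXECUTABLE} -c now picks the first writable site-packages directory instead of asking the distutils module, which is deprecated and removed in Python 3.12. As a readability aid only, here is an unrolled sketch of that one-liner; it is an expanded form of the command in the diff, not additional FlexFlow code:

    # Unrolled equivalent of the one-liner used to compute PY_DEST:
    # collect system and user site-packages directories, keep the writable ones,
    # and return the first match.
    import os
    import site

    def first_writable_site_packages() -> str:
        candidates = []
        for entry in (site.getsitepackages(), site.getusersitepackages()):
            # getsitepackages() returns a list, getusersitepackages() a single path
            candidates.extend([entry] if isinstance(entry, str) else entry)
        writable = [pkg for pkg in candidates if os.access(pkg, os.W_OK)]
        return writable[0]  # like the original [0], fails if nothing is writable

    print(first_writable_site_packages())
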
2 changes: 1 addition & 1 deletion cmake/pip_install/CMakeLists.txt
@@ -1,6 +1,6 @@
# Use setup.py script to re-install the Python bindings library with the right library paths
if (FF_USE_PYTHON)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
if(FF_BUILD_FROM_PYPI)
install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")")
# CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install
18 changes: 9 additions & 9 deletions python/flexflow/core/flexflowlib.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os, platform
import site, os, platform
from typing import Any, Union

from .flexflow_cffi_header import flexflow_header
@@ -47,14 +47,14 @@ def get_shared_library(self) -> str:
libname = "libflexflow" + self.get_library_extension()

# If we installed with pip, use the full path instead of just the library name, because the library will not be in the LD_LIBRARY_PATH
packages_dir = sysconfig.get_python_lib(plat_specific=False, standard_lib=False)
ff_lib_path = os.path.join(packages_dir, "flexflow", "lib", libname)
installed_with_pip = os.path.exists(ff_lib_path)

if installed_with_pip:
return ff_lib_path
else:
return libname
candidate_package_dirs = [pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func)]
candidate_package_dirs += sysconfig.get_python_lib(plat_specific=False, standard_lib=False)
for packages_dir in candidate_package_dirs:
ff_lib_path = os.path.join(packages_dir, "flexflow", "lib", libname)
installed_with_pip = os.path.exists(ff_lib_path)
if installed_with_pip:
return ff_lib_path
return libname

def get_c_header(self) -> str:
return self._header
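The rewritten lookup above packs the candidate collection into one list comprehension (the same flattening trick as the PY_DEST one-liner sketched earlier, plus the legacy sysconfig path) and then probes each directory for the pip-installed flexflow/lib copy, falling back to the bare soname, which the dynamic loader must then resolve via LD_LIBRARY_PATH. A standalone sketch of that lookup, with a hypothetical function name and no FlexFlow internals:

    # Hypothetical standalone version of the lookup logic above (not FlexFlow API).
    import os
    import site

    def find_flexflow_library(libname: str = "libflexflow.so") -> str:
        candidates = []
        for entry in (site.getsitepackages(), site.getusersitepackages()):
            candidates.extend([entry] if isinstance(entry, str) else entry)
        for packages_dir in candidates:
            ff_lib_path = os.path.join(packages_dir, "flexflow", "lib", libname)
            if os.path.exists(ff_lib_path):
                return ff_lib_path  # pip install: absolute path, loader needs no help
        return libname  # source build: rely on LD_LIBRARY_PATH / rpath
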
18 changes: 9 additions & 9 deletions python/flexflow/serve/__init__.py
@@ -48,19 +48,19 @@ def init(
fusion: Optional[bool] = None,
):
"""
Configure FlexFlow Serve and start the runtime.
The function takes, alternatively, configs_dict (a positional argument of type dictionary),
or three mandatory named parameters, plus some additional optional named parameters. When passing
a configs_dict, no named parameter should be specified, and the dictionary should have keys matching
at least the mandatory named parameters.
The three mandatory parameters, which cannot be changed after starting the runtime, are:
- num_gpus: the number of GPUs to reserve for the runtime
- memory_per_gpu: the amount of memory (in MB) to pre-allocate on each GPU
- zero_copy_memory_per_node: the amount of zero-copy memory (in MB) to pre-allocate for each node
The optional parameters are:
- num_cpus: the number of CPU processors to reserve for the runtime, defaults to 4
- legion_utility_processors: number of Legion utility threads to create per process, defaults to 1
- data_parallelism_degree: the degree of parallelization in the data parallel dimension, defaults to 1
@@ -72,7 +72,7 @@ def init(
- use_8bit_quantization: whether to use 8-bit quantization, defaults to False
- profiling: whether to enable the FlexFlow profiling mode, defaults to False
- fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments.
@@ -106,7 +106,7 @@ def init(
:type profiling: Optional[bool], optional
:param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
:type fusion: Optional[bool], optional
:raises ValueError: this function will raise an exception if the user passes both a configs_dict and some named parameters
:raises TypeError: this function will raise an exception if the configs_dict is not a dictionary
:raises ValueError: this function will raise an exception if the mandatory FlexFlow initialization parameters are missing, or are not positive integers: num_gpus, memory_per_gpu, zero_copy_memory_per_node
@@ -152,7 +152,7 @@ def init(
"use_4bit_quantization": use_4bit_quantization,
"use_8bit_quantization": use_8bit_quantization,
"profiling": profiling,
"fusion": fusion
"fusion": fusion,
}

# Check that mandatory configs are present
@@ -188,7 +188,7 @@ def init(
if configs_dict.get("offload", None) is None:
configs_dict["offload"] = False
if configs_dict.get("offload_reserve_space_size", None) is None:
configs_dict["offload_reserve_space_size"] = 1024 ** 2
configs_dict["offload_reserve_space_size"] = 1024**2
if configs_dict.get("use_4bit_quantization", None) is None:
configs_dict["use_4bit_quantization"] = False
if configs_dict.get("use_8bit_quantization", None) is None:
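Since this hunk is mostly the init() docstring, a minimal usage sketch may be helpful. It assumes the package is imported as flexflow.serve and uses only the three mandatory parameters named in the docstring, with illustrative memory sizes; everything else keeps its documented default:

    # Minimal sketch of starting the runtime as described by the docstring above.
    import flexflow.serve as ff

    # Named-parameter form: only the three mandatory settings are required.
    ff.init(num_gpus=1, memory_per_gpu=14000, zero_copy_memory_per_node=10000)

    # Equivalent configs_dict form (no other named parameters may be mixed in):
    # ff.init({"num_gpus": 1, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 10000})
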
21 changes: 17 additions & 4 deletions python/flexflow/serve/models/falcon.py
@@ -27,9 +27,17 @@ def __init__(self, hf_config):
self.hidden_size = hf_config.hidden_size
self.layer_norm_epsilon = hf_config.layer_norm_epsilon
self.multi_query = hf_config.multi_query
self.n_head = hf_config.n_head
self.n_head = (
hf_config.n_head
if "n_head" in hf_config.__dict__
else hf_config.num_attention_heads
)
self.n_head_kv = hf_config.n_head_kv if "n_head_kv" in hf_config.__dict__ else 1
self.n_layer = hf_config.n_layer
self.n_layer = (
hf_config.n_layer
if "n_layer" in hf_config.__dict__
else hf_config.num_hidden_layers
)
self.parallel_attn = hf_config.parallel_attn
self.vocab_size = hf_config.vocab_size

@@ -234,6 +242,11 @@ def build_model(self):

def convert_hf_model(model, dst_folder):
os.makedirs(dst_folder, exist_ok=True)
n_head = (
model.config.n_head
if "n_head" in model.config.__dict__
else model.config.num_attention_heads
)
for name, params in model.named_parameters():
name = (
name.replace(".", "_")
@@ -250,8 +263,8 @@ def convert_hf_model(model, dst_folder):
params,
[
model.config.hidden_size,
model.config.hidden_size // model.config.n_head,
model.config.hidden_size // model.config.n_head,
model.config.hidden_size // n_head,
model.config.hidden_size // n_head,
],
0,
)
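The falcon.py edits above cope with HuggingFace Falcon checkpoints that expose either the legacy config names (n_head, n_layer) or the newer ones (num_attention_heads, num_hidden_layers) by probing hf_config.__dict__. The same intent can be captured by a small helper; this is a hypothetical sketch, not the code the commit adds:

    # Hypothetical helper mirroring the __dict__ checks in the diff: return the value
    # of the first attribute (old or new spelling) that the config actually defines.
    def config_attr(hf_config, *names):
        for name in names:
            if name in hf_config.__dict__:
                return hf_config.__dict__[name]
        raise AttributeError(
            f"none of {names} found on {type(hf_config).__name__}"
        )

    # Usage matching the hunks above:
    #   n_head  = config_attr(hf_config, "n_head", "num_attention_heads")
    #   n_layer = config_attr(hf_config, "n_layer", "num_hidden_layers")
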
62 changes: 46 additions & 16 deletions python/flexflow/serve/serve.py
@@ -52,12 +52,15 @@ def __init__(
self.topp = topp
self.topk = topk


class GenerationResult:
"""A class to store the output of a generation request."""

def __init__(self, text: str = None, tokens: list = None):
self.output_text = text
self.output_tokens = tokens


class LLM:
"""This class creates a LLM (Large-Language Model) object based on a model from HuggingFace"""

@@ -87,6 +90,7 @@ def __init__(
"LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA),
"OPTForCausalLM": (ModelType.OPT, FlexFlowOPT),
"RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon),
"FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon),
"GPTBigCodeForCausalLM": (ModelType.STARCODER, FlexFlowSTARCODER),
"MPTForCausalLM": (ModelType.MPT, FlexFlowMPT),
}
@@ -124,21 +128,27 @@ def download_hf_config(self):

def __get_revision_hashes(self, model_name: str, weights: bool):
ff_revision = None
ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") if weights else os.path.join(self.tokenizer_path, "rev_sha.txt")
ff_revision_file = (
os.path.join(self.weights_path, "rev_sha.txt")
if weights
else os.path.join(self.tokenizer_path, "rev_sha.txt")
)
if os.path.exists(ff_revision_file):
ff_revision = "".join(open(ff_revision_file).read().split())

if os.path.exists(model_name) and os.path.isdir(model_name):
# Local model
files = os.listdir(model_name)
state = files + [os.path.getmtime(os.path.join(model_name, f)) for f in files]
latest_revision = hashlib.md5(str(state).encode('utf-8')).hexdigest()
state = files + [
os.path.getmtime(os.path.join(model_name, f)) for f in files
]
latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest()
else:
# Remote HuggingFace model
hf_api = HfApi()
latest_revision = hf_api.model_info(self.model_name).sha
return ff_revision, ff_revision_file, latest_revision

def download_hf_weights_if_needed(self):
"""Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date.
If not, or if the refresh_cache parameter is set to True, download new weights.
@@ -168,7 +178,9 @@ def download_hf_weights_if_needed(self):
os.makedirs(self.weights_path, exist_ok=True)
print(f"Creating directory {self.weights_path} (if it doesn't exist)...")

ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights=True)
ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(
self.model_name, weights=True
)

# Download if needed
if ff_revision != latest_revision:
@@ -179,9 +191,13 @@ def download_hf_weights_if_needed(self):
)
else:
# Remote model
print(f"'{self.model_name}' local model weights were updated! Converting new weights now...")
print(
f"'{self.model_name}' local model weights were updated! Converting new weights now..."
)
# Download model from HuggingFace, or load it from the local folder
hf_model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True)
hf_model = AutoModelForCausalLM.from_pretrained(
self.model_name, trust_remote_code=True
)
# Print log message to notify user download of model has finished
if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):
print("Done downloading HF weights. Converting them now...")
@@ -217,15 +233,21 @@ def download_hf_tokenizer_if_needed(self):
os.makedirs(self.tokenizer_path, exist_ok=True)

# Get local revision SHA, check if it matches latest one on huggingface
ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights=False)
ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(
self.model_name, weights=False
)

if ff_revision != latest_revision:
if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):
# Local model
print(f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ...")
print(
f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..."
)
else:
# Remote model
print(f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now...")
print(
f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..."
)
# Download tokenizer from HuggingFace, or load it from the local folder
if self.model_type == ModelType.LLAMA:
hf_tokenizer = LlamaTokenizer.from_pretrained(
Expand All @@ -242,7 +264,7 @@ def download_hf_tokenizer_if_needed(self):
# Save new revision hash to file
with open(ff_revision_file, "w+") as f:
f.write(latest_revision)

else:
print(f"Loading '{self.model_name}' tokenizer from the cache...")

@@ -357,9 +379,15 @@ def compile(

# Create request manager
self.rm = RequestManager()
bos_token_id = -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id
eos_token_id = -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id
self.rm.register_tokenizer(self.model_type, bos_token_id, eos_token_id, self.tokenizer_path)
bos_token_id = (
-1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id
)
eos_token_id = (
-1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id
)
self.rm.register_tokenizer(
self.model_type, bos_token_id, eos_token_id, self.tokenizer_path
)
self.rm.register_output_filepath(self.output_file)

self.im.init_operators_inference(self.model.ffmodel)
@@ -382,7 +410,9 @@ def generate(self, prompts: Union[str, List[str]], max_length: int = 128):
elif type(prompts) == list:
if len(prompts) == 0:
return []
return [self.model.ffmodel.generate(prompt, max_length) for prompt in prompts]
return [
self.model.ffmodel.generate(prompt, max_length) for prompt in prompts
]
else:
assert False, "Please pass a non-empty string or list of strings"

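The serve.py reformatting above leaves the caching logic unchanged: __get_revision_hashes records the last converted revision in rev_sha.txt, fingerprints a local model directory by file names and modification times, and asks the HuggingFace Hub for the latest commit SHA of a remote model; weights or tokenizer files are re-converted only when the two values differ (or refresh_cache is set). A condensed standalone sketch of that freshness check, with a hypothetical function name and illustrative paths:

    # Condensed sketch of the cache-freshness decision used above
    # (hypothetical standalone function; not FlexFlow's API).
    import hashlib
    import os
    from huggingface_hub import HfApi

    def needs_refresh(model_name: str, cache_dir: str) -> bool:
        rev_file = os.path.join(cache_dir, "rev_sha.txt")
        cached = open(rev_file).read().strip() if os.path.exists(rev_file) else None

        if os.path.isdir(model_name):
            # Local model: fingerprint the directory contents and mtimes.
            files = os.listdir(model_name)
            state = files + [os.path.getmtime(os.path.join(model_name, f)) for f in files]
            latest = hashlib.md5(str(state).encode("utf-8")).hexdigest()
        else:
            # Remote model: ask the HuggingFace Hub for the latest commit SHA.
            latest = HfApi().model_info(model_name).sha

        return cached != latest
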
