
Commit ac841c9

fix low_cpu_mem_usage (contiguous) (#1832)

lvyufeng authored Nov 22, 2024
1 parent 965d658 commit ac841c9
Showing 4 changed files with 23 additions and 5 deletions.
5 changes: 3 additions & 2 deletions llm/inference/tinyllama/app_jit.py
@@ -7,7 +7,7 @@

 if ON_ORANGE_PI:
     mindspore.set_context(
-        enable_compile_cache=True,
+        enable_graph_kernel=True,
         mode=mindspore.GRAPH_MODE,
         jit_config={
            "jit_level": "O2",
@@ -23,7 +23,8 @@

 model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 tokenizer = LlamaTokenizer.from_pretrained(model_id)
-model = LlamaForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16)
+model = LlamaForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16, low_cpu_mem_usage=True)
+model = model.npu()

 # quantize_cfg = w8x8(model.model.config)
 # quantize(model, cfg=quantize_cfg)
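The hunk above is the gist of the commit: `low_cpu_mem_usage=True` streams checkpoint weights into the model instead of materializing a full in-memory state dict first, and `Module.npu()` then moves the loaded parameters to the Ascend device. A minimal standalone sketch of that pattern follows; the imports and their exact module paths are assumptions, not lines from this diff.

```python
# Minimal sketch of the loading pattern this commit enables; the imports are
# assumed (mindnlp mirrors the HF transformers API) and are not part of the diff.
import mindspore
from mindnlp.transformers import LlamaForCausalLM, LlamaTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(
    model_id,
    ms_dtype=mindspore.float16,
    low_cpu_mem_usage=True,  # avoid building a full host-memory copy of the weights
)
model = model.npu()  # move every parameter tensor to the Ascend backend
```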
9 changes: 9 additions & 0 deletions llm/inference/tinyllama/readme.md
@@ -23,4 +23,13 @@ We offer an easy way to interact with Tinyllama. This guide explains how to set

 ```bash
 sudo sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
 ```
+
+```bash
+export TE_PARALLEL_COMPILER=1
+export MAX_COMPILE_CORE_NUMBER=1
+export MS_BUILD_PROCESS_NUM=1
+export MAX_RUNTIME_CORE_NUMBER=1
+# if use O2
+export MS_ENABLE_IO_REUSE=1
+```
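Judging by their names, the four `*_NUMBER`/`*_NUM` variables pin MindSpore's operator compilation and runtime to a single process or core, which keeps peak memory down on a small board such as the Orange Pi, while `MS_ENABLE_IO_REUSE` turns on graph input/output memory reuse for the `O2` jit level. A hedged alternative to exporting them in the shell is to set them in Python before `mindspore` is imported:

```python
# Hedged sketch: set the same variables from Python, before importing
# mindspore, so they are visible when the framework initializes.
import os

for var in (
    "TE_PARALLEL_COMPILER",
    "MAX_COMPILE_CORE_NUMBER",
    "MS_BUILD_PROCESS_NUM",
    "MAX_RUNTIME_CORE_NUMBER",
):
    os.environ[var] = "1"

os.environ["MS_ENABLE_IO_REUSE"] = "1"  # only needed when jit_level is "O2"

import mindspore  # noqa: E402  (import deliberately placed after env setup)
```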
2 changes: 1 addition & 1 deletion mindnlp/core/nn/modules/module.py
@@ -689,7 +689,7 @@ def __dir__(self):

     def cuda(self):
         return self._apply(lambda t: t.move_to('GPU'))

     def npu(self):
         return self._apply(lambda t: t.move_to('Ascend'))
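`npu()` sits next to the existing `cuda()` helper: both delegate to `_apply`, which walks the module tree and runs a function over every parameter tensor (`move_to('Ascend')` or `move_to('GPU')`). A toy sketch of that recursion, with hypothetical names and string stand-ins for tensors, not the mindnlp source:

```python
# Toy illustration of the _apply pattern behind cuda()/npu(); TinyModule and
# its string "tensors" are hypothetical stand-ins, not mindnlp classes.
class TinyModule:
    def __init__(self):
        self.params = {"weight": "tensor@CPU"}
        self.children = []

    def _apply(self, fn):
        for child in self.children:
            child._apply(fn)  # recurse into submodules first
        self.params = {name: fn(t) for name, t in self.params.items()}
        return self

    def npu(self):
        # analogous to Module.npu(): move every tensor to the Ascend device
        return self._apply(lambda t: t.replace("CPU", "Ascend"))

print(TinyModule().npu().params)  # {'weight': 'tensor@Ascend'}
```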
12 changes: 10 additions & 2 deletions mindnlp/transformers/modeling_utils.py
@@ -733,8 +733,8 @@ def _load_state_dict_into_meta_model(

         if dtype is None:
             param = param.to(old_param.dtype)

-        if old_param.is_contiguous():
-            param = param.contiguous()
+        # if old_param.is_contiguous():
+        #     param = param.contiguous()

         set_module_kwargs["value"] = param
@@ -2658,6 +2658,7 @@ def from_pretrained(

         use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False)

         gguf_file = kwargs.pop("gguf_file", None)
+        gguf_path = None

         if token is not None and adapter_kwargs is not None and "token" not in adapter_kwargs:
             adapter_kwargs["token"] = token
@@ -3100,6 +3101,13 @@ def from_pretrained(

         else:
             loaded_state_dict_keys = list(state_dict.keys())

+        if gguf_path is None and (low_cpu_mem_usage or use_keep_in_fp32_modules):
+            # In case some weights need to be kept in float32 and accelerate is not installed,
+            # we later on want to take the path where state_dict is not None, that is the one
+            # that do not require accelerate.
+            state_dict = None
+
+
         config.name_or_path = pretrained_model_name_or_path

         # Instantiate model.
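Taken together, the modeling_utils.py changes make the low-memory path self-consistent: `gguf_path` is defined before it is read, the eagerly loaded `state_dict` is dropped so loading falls through to the streaming path, and the `contiguous()` copy that the commit title flags is no longer forced on each parameter. A toy reconstruction of the path selection, not the mindnlp implementation:

```python
# Hypothetical reconstruction of the branch this commit fixes; the function
# name and return strings are illustrative only.
def pick_loading_path(state_dict, gguf_path, low_cpu_mem_usage, keep_in_fp32):
    if gguf_path is None and (low_cpu_mem_usage or keep_in_fp32):
        # discard the eager weights so the low-host-memory path is taken
        state_dict = None
    if state_dict is None:
        return "stream shards onto meta-initialized parameters"
    return "copy weights from the in-memory state_dict"

print(pick_loading_path({"w": 1}, gguf_path=None,
                        low_cpu_mem_usage=True, keep_in_fp32=False))
```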
