
Commit 12b6aa8
Merge branch 'huggingface:main' into fix-tokenizer-text-split
jiongjiongli authored Jan 3, 2025
2 parents fbd9036 + 4286586 commit 12b6aa8
Showing 7 changed files with 12 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/source/en/fsdp.md
@@ -58,7 +58,7 @@ Otherwise, you can choose a size-based wrapping policy where FSDP is applied to

### Checkpointing

-Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`]` method.
+Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`] method.

```py
# directory containing checkpoints
```
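A minimal sketch of what resuming from such a sharded checkpoint looks like with Accelerate; the directory name and surrounding setup are placeholders, not the exact snippet from fsdp.md:

```py
from accelerate import Accelerator

accelerator = Accelerator()
# ... prepare model, optimizer, and dataloaders with accelerator.prepare(...) ...

# load the sharded state dict saved with fsdp_state_dict_type: SHARDED_STATE_DICT
accelerator.load_state("output_dir/checkpoint-1000")  # placeholder path
```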
2 changes: 1 addition & 1 deletion docs/source/zh/fsdp.md
@@ -74,7 +74,7 @@ FSDP is applied by wrapping each layer in the network. Typically, wrapping is applied in a nested

Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT`,
because saving the full state dict on rank 0 takes a long time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting.
-You can load the sharded state dicts with the [`~accelerate.Accelerator.load_state`]` method to resume training.
+You can load the sharded state dicts with the [`~accelerate.Accelerator.load_state`] method to resume training.

```py
# directory containing checkpoints
```
2 changes: 1 addition & 1 deletion src/transformers/models/modernbert/modeling_modernbert.py
@@ -307,7 +307,7 @@ def eager_attention_forward(
dim: int,
output_attentions: Optional[bool] = False,
**_kwargs,
-) -> Tuple[torch.Tensor, torch.Tensor] | Tuple[torch.Tensor]:
+) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
# qkv: [batch_size, seqlen, 3, nheads, headdim]
cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
query, key, value = qkv.transpose(3, 1).unbind(dim=2)
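Presumably the motivation for this change is that PEP 604 `|` unions of `typing` generics raise `TypeError` on Python < 3.10, since the return annotation is evaluated when the function is defined. A small illustration with a hypothetical function, not taken from the source:

```py
from typing import Tuple, Union

# On Python 3.9, the following definition fails at import time with
# "TypeError: unsupported operand type(s) for |":
#     def attn() -> Tuple[int] | Tuple[int, int]: ...
# The Union spelling below works on every supported Python version.
def attn() -> Union[Tuple[int], Tuple[int, int]]:
    return (1,)
```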
2 changes: 1 addition & 1 deletion src/transformers/models/modernbert/modular_modernbert.py
@@ -532,7 +532,7 @@ def eager_attention_forward(
dim: int,
output_attentions: Optional[bool] = False,
**_kwargs,
-) -> Tuple[torch.Tensor, torch.Tensor] | Tuple[torch.Tensor]:
+) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
# qkv: [batch_size, seqlen, 3, nheads, headdim]
cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
query, key, value = qkv.transpose(3, 1).unbind(dim=2)
2 changes: 1 addition & 1 deletion src/transformers/models/paligemma/modeling_paligemma.py
@@ -519,7 +519,7 @@ def forward(
# mask out pad-token-ids in labels for BC
if labels is not None and self.pad_token_id in labels:
logger.warning_once(
-"`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
+"`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. "
"You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
)
labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
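The fix presumably relies on Python's implicit concatenation of adjacent string literals: with the trailing comma, the second string is passed to `logger.warning_once` as a separate positional argument instead of becoming part of the message. A minimal illustration of the difference:

```py
# Adjacent string literals are concatenated at compile time into one string:
msg = (
    "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. "
    "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46."
)

# With a trailing comma after the first literal, the parentheses instead build a
# 2-tuple of strings; inside a function call, the second string would arrive as a
# separate positional argument rather than as part of the message:
not_one_string = (
    "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
    "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
)

assert isinstance(msg, str) and isinstance(not_one_string, tuple)
```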
2 changes: 1 addition & 1 deletion tests/quantization/bnb/test_4bit.py
@@ -172,7 +172,7 @@ def test_memory_footprint(self):
mem_fp16 = self.model_fp16.get_memory_footprint()
mem_4bit = self.model_4bit.get_memory_footprint()

-self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE)
+self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE, delta=1e-5)
linear = get_some_linear_layer(self.model_4bit)
self.assertTrue(linear.weight.__class__ == Params4bit)

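For context, `assertAlmostEqual` without `delta` requires the difference to round to zero at 7 decimal places, which can be too strict for a memory-footprint ratio measured on real hardware; the explicit `delta=1e-5` loosens that tolerance. A self-contained sketch with illustrative values:

```py
import unittest


class ToleranceDemo(unittest.TestCase):
    def test_ratio_with_delta(self):
        measured = 1.78693812          # e.g. mem_fp16 / mem_4bit on some hardware
        expected = 1.7869331026479096  # an EXPECTED_RELATIVE_DIFFERENCE-style constant

        # Default behaviour (places=7) would fail here: round(measured - expected, 7) != 0.
        # With an explicit delta, the assertion passes as long as
        # abs(measured - expected) <= delta.
        self.assertAlmostEqual(measured, expected, delta=1e-5)


if __name__ == "__main__":
    unittest.main()
```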
7 changes: 6 additions & 1 deletion tests/quantization/bnb/test_mixed_int8.py
@@ -229,7 +229,7 @@ def test_memory_footprint(self):
mem_fp16 = self.model_fp16.get_memory_footprint()
mem_8bit = self.model_8bit.get_memory_footprint()

-self.assertAlmostEqual(mem_fp16 / mem_8bit, self.EXPECTED_RELATIVE_DIFFERENCE)
+self.assertAlmostEqual(mem_fp16 / mem_8bit, self.EXPECTED_RELATIVE_DIFFERENCE, delta=1e-5)
self.assertTrue(get_some_linear_layer(self.model_8bit).weight.__class__ == Int8Params)

def test_linear_are_8bit(self):
@@ -938,8 +938,13 @@ class MixedInt8LlamaTest(MixedInt8Test):
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
EXPECTED_RELATIVE_DIFFERENCE = 1.7869331026479096
EXPECTED_OUTPUTS = set()

# Expected on Intel XPU
EXPECTED_OUTPUTS.add("Hello my name is John Smith and I am a software engineer. I")

# Expected on NVIDIA T4
EXPECTED_OUTPUTS.add("Hello my name is John and I am a software engineer. I have")

def test_int8_from_pretrained(self):
r"""
Test whether loading a 8bit model from the Hub works as expected
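Keeping `EXPECTED_OUTPUTS` as a set lets the generation check accept whichever device-specific completion is produced; a rough sketch of such a membership check (illustrative, not the exact test body):

```py
# device-specific completions, as in MixedInt8LlamaTest above
EXPECTED_OUTPUTS = {
    "Hello my name is John Smith and I am a software engineer. I",  # Intel XPU
    "Hello my name is John and I am a software engineer. I have",   # NVIDIA T4
}

generated_text = "Hello my name is John and I am a software engineer. I have"
assert generated_text in EXPECTED_OUTPUTS, f"unexpected generation: {generated_text!r}"
```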
