
Commit 12b6aa8
Merge branch 'huggingface:main' into fix-tokenizer-text-split
jiongjiongli authored Jan 3, 2025
2 parents fbd9036 + 4286586 commit 12b6aa8
Showing 7 changed files with 12 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/source/en/fsdp.md
@@ -58,7 +58,7 @@ Otherwise, you can choose a size-based wrapping policy where FSDP is applied to

### Checkpointing

-Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`]` method.
+Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`] method.

```py
# directory containing checkpoints
```
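A minimal sketch of what resuming from such a sharded checkpoint looks like with Accelerate; the directory name and surrounding setup are placeholders, not the exact snippet from fsdp.md:

```py
from accelerate import Accelerator

accelerator = Accelerator()
# ... prepare model, optimizer, and dataloaders with accelerator.prepare(...) ...

# load the sharded state dict saved with fsdp_state_dict_type: SHARDED_STATE_DICT
accelerator.load_state("output_dir/checkpoint-1000")  # placeholder path
```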
2 changes: 1 addition & 1 deletion docs/source/zh/fsdp.md
@@ -74,7 +74,7 @@ FSDP is applied by wrapping each layer in the network. Typically, wrapping is applied in a nested

Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT`,
because saving the full state dict on rank 0 takes a long time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting.
-You can load the sharded state dicts with the [`~accelerate.Accelerator.load_state`]` method to resume training.
+You can load the sharded state dicts with the [`~accelerate.Accelerator.load_state`] method to resume training.

```py
# directory containing checkpoints
```
2 changes: 1 addition & 1 deletion src/transformers/models/modernbert/modeling_modernbert.py
@@ -307,7 +307,7 @@ def eager_attention_forward(
dim: int,
output_attentions: Optional[bool] = False,
**_kwargs,
-) -> Tuple[torch.Tensor, torch.Tensor] | Tuple[torch.Tensor]:
+) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
# qkv: [batch_size, seqlen, 3, nheads, headdim]
cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
query, key, value = qkv.transpose(3, 1).unbind(dim=2)
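Presumably the motivation for this change is that PEP 604 `|` unions of `typing` generics raise `TypeError` on Python < 3.10, since the return annotation is evaluated when the function is defined. A small illustration with a hypothetical function, not taken from the source:

```py
from typing import Tuple, Union

# On Python 3.9, the following definition fails at import time with
# "TypeError: unsupported operand type(s) for |":
#     def attn() -> Tuple[int] | Tuple[int, int]: ...
# The Union spelling below works on every supported Python version.
def attn() -> Union[Tuple[int], Tuple[int, int]]:
    return (1,)
```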
2 changes: 1 addition & 1 deletion src/transformers/models/modernbert/modular_modernbert.py
@@ -532,7 +532,7 @@ def eager_attention_forward(
dim: int,
output_attentions: Optional[bool] = False,
**_kwargs,
-) -> Tuple[torch.Tensor, torch.Tensor] | Tuple[torch.Tensor]:
+) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
# qkv: [batch_size, seqlen, 3, nheads, headdim]
cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
query, key, value = qkv.transpose(3, 1).unbind(dim=2)
2 changes: 1 addition & 1 deletion src/transformers/models/paligemma/modeling_paligemma.py
@@ -519,7 +519,7 @@ def forward(
# mask out pad-token-ids in labels for BC
if labels is not None and self.pad_token_id in labels:
logger.warning_once(
-"`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
+"`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. "
"You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
)
labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
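The fix presumably relies on Python's implicit concatenation of adjacent string literals: with the trailing comma, the second string is passed to `logger.warning_once` as a separate positional argument instead of becoming part of the message. A minimal illustration of the difference:

```py
# Adjacent string literals are concatenated at compile time into one string:
msg = (
    "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. "
    "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46."
)

# With a trailing comma after the first literal, the parentheses instead build a
# 2-tuple of strings; inside a function call, the second string would arrive as a
# separate positional argument rather than as part of the message:
not_one_string = (
    "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
    "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
)

assert isinstance(msg, str) and isinstance(not_one_string, tuple)
```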
2 changes: 1 addition & 1 deletion tests/quantization/bnb/test_4bit.py
@@ -172,7 +172,7 @@ def test_memory_footprint(self):
mem_fp16 = self.model_fp16.get_memory_footprint()
mem_4bit = self.model_4bit.get_memory_footprint()

-self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE)
+self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE, delta=1e-5)
linear = get_some_linear_layer(self.model_4bit)
self.assertTrue(linear.weight.__class__ == Params4bit)

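For context, `assertAlmostEqual` without `delta` requires the difference to round to zero at 7 decimal places, which can be too strict for a memory-footprint ratio measured on real hardware; the explicit `delta=1e-5` loosens that tolerance. A self-contained sketch with illustrative values:

```py
import unittest


class ToleranceDemo(unittest.TestCase):
    def test_ratio_with_delta(self):
        measured = 1.78693812          # e.g. mem_fp16 / mem_4bit on some hardware
        expected = 1.7869331026479096  # an EXPECTED_RELATIVE_DIFFERENCE-style constant

        # Default behaviour (places=7) would fail here: round(measured - expected, 7) != 0.
        # With an explicit delta, the assertion passes as long as
        # abs(measured - expected) <= delta.
        self.assertAlmostEqual(measured, expected, delta=1e-5)


if __name__ == "__main__":
    unittest.main()
```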
7 changes: 6 additions & 1 deletion tests/quantization/bnb/test_mixed_int8.py
@@ -229,7 +229,7 @@ def test_memory_footprint(self):
mem_fp16 = self.model_fp16.get_memory_footprint()
mem_8bit = self.model_8bit.get_memory_footprint()

-self.assertAlmostEqual(mem_fp16 / mem_8bit, self.EXPECTED_RELATIVE_DIFFERENCE)
+self.assertAlmostEqual(mem_fp16 / mem_8bit, self.EXPECTED_RELATIVE_DIFFERENCE, delta=1e-5)
self.assertTrue(get_some_linear_layer(self.model_8bit).weight.__class__ == Int8Params)

def test_linear_are_8bit(self):
@@ -938,8 +938,13 @@ class MixedInt8LlamaTest(MixedInt8Test):
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
EXPECTED_RELATIVE_DIFFERENCE = 1.7869331026479096
EXPECTED_OUTPUTS = set()

# Expected on Intel XPU
EXPECTED_OUTPUTS.add("Hello my name is John Smith and I am a software engineer. I")

# Expected on NVIDIA T4
EXPECTED_OUTPUTS.add("Hello my name is John and I am a software engineer. I have")

def test_int8_from_pretrained(self):
r"""
Test whether loading a 8bit model from the Hub works as expected
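Keeping `EXPECTED_OUTPUTS` as a set lets the generation check accept whichever device-specific completion is produced; a rough sketch of such a membership check (illustrative, not the exact test body):

```py
# device-specific completions, as in MixedInt8LlamaTest above
EXPECTED_OUTPUTS = {
    "Hello my name is John Smith and I am a software engineer. I",  # Intel XPU
    "Hello my name is John and I am a software engineer. I have",   # NVIDIA T4
}

generated_text = "Hello my name is John and I am a software engineer. I have"
assert generated_text in EXPECTED_OUTPUTS, f"unexpected generation: {generated_text!r}"
```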
