From 3ddbe300acf1d9ce0f566825fb6cba7da2ff7a3a Mon Sep 17 00:00:00 2001 From: lvyufeng Date: Tue, 22 Oct 2024 21:00:27 +0800 Subject: [PATCH 1/3] fix from_numpy caused error --- mindnlp/core/nn/modules/module.py | 3 +++ mindnlp/core/serialization.py | 14 +++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/mindnlp/core/nn/modules/module.py b/mindnlp/core/nn/modules/module.py index 9c7774c60..7a22d1d8a 100644 --- a/mindnlp/core/nn/modules/module.py +++ b/mindnlp/core/nn/modules/module.py @@ -572,6 +572,9 @@ def remove_from(*dicts_or_sets): d.discard(name) params = self.__dict__.get('_parameters') + + if isinstance(value, StubTensor): + value = value.stub_sync() if isinstance(value, Parameter): if params is None: raise AttributeError( diff --git a/mindnlp/core/serialization.py b/mindnlp/core/serialization.py index e08393568..13b7c3c66 100644 --- a/mindnlp/core/serialization.py +++ b/mindnlp/core/serialization.py @@ -805,7 +805,7 @@ def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, bac else: order = "C" array = array.reshape(size, order=order) - param = Tensor.from_numpy(array) + param = Tensor(array) return param def _rebuild_from_type_v2(func, new_type, args, state): @@ -1134,7 +1134,7 @@ def persistent_load(saved_id): if array.dtype == bfloat16 and not SUPPORT_BF16: logger.warning_once("MindSpore do not support bfloat16 dtype, we will automaticlly convert to float16") array = array.astype(np.float16) - new_result[k] = Tensor.from_numpy(array) + new_result[k] = Tensor(array) return new_result @@ -1380,9 +1380,9 @@ def legacy_safe_load_file(filename): arr = np.frombuffer(v["data"], dtype=dtype).reshape(v["shape"]) if (not SUPPORT_BF16 and dtype != bfloat16) or SUPPORT_BF16: - result[k] = Tensor.from_numpy(arr) + result[k] = Tensor(arr) else: - result[k] = Tensor.from_numpy(arr.astype(np.float16)) + result[k] = Tensor(arr.astype(np.float16)) return result @@ -1412,7 +1412,7 @@ def convert(info: dict[str, Any]): logger.warning_once("MindSpore do not support bfloat16 dtype, we will automaticlly convert to float16") array = array.astype(np.float16) - return Tensor.from_numpy(array) + return Tensor(array) with open(filename, "rb") as fp: header_size, = struct.unpack(' Date: Wed, 23 Oct 2024 11:06:51 +0800 Subject: [PATCH 2/3] fix hf_backbone ut --- mindnlp/core/serialization.py | 6 +++++- setup.py | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mindnlp/core/serialization.py b/mindnlp/core/serialization.py index 13b7c3c66..a06c5c168 100644 --- a/mindnlp/core/serialization.py +++ b/mindnlp/core/serialization.py @@ -1408,11 +1408,15 @@ def convert(info: dict[str, Any]): assert end - begin == math.prod(shape) * np.dtype(numpy_dtype).itemsize buf = byte_buf[begin:end] array = np.frombuffer(buf, dtype=numpy_dtype).reshape(shape) + if array.dtype == bfloat16 and not SUPPORT_BF16: logger.warning_once("MindSpore do not support bfloat16 dtype, we will automaticlly convert to float16") array = array.astype(np.float16) - return Tensor(array) + if info['dtype'] == 'I64': + array = array.astype(numpy_dtype) + out = Tensor(array) + return out with open(filename, "rb") as fp: header_size, = struct.unpack('=10.0.0' ], From 051525b3f4477b8566a64f93c4e32126c134e980 Mon Sep 17 00:00:00 2001 From: lvyufeng Date: Fri, 25 Oct 2024 00:20:12 +0800 Subject: [PATCH 3/3] add audio pipeline --- mindnlp/core/serialization.py | 22 +- mindnlp/transformers/cache_utils.py | 2 +- mindnlp/transformers/configuration_utils.py | 7 - 
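[Editorial note, not part of the patch] PATCH 1/3 and 2/3 above replace zero-copy `Tensor.from_numpy(...)` with a copying `Tensor(...)` construction when rebuilding checkpoint tensors. One plausible failure mode (an assumption, the commit message only says "from_numpy caused error") is that `np.frombuffer` hands MindSpore a read-only view that does not own its data, while `Tensor(array)` copies the buffer into a self-contained tensor. A minimal sketch:

```python
# Illustrative sketch only: why the copying constructor is the safer default
# when the NumPy array is a read-only view over checkpoint bytes.
import numpy as np
from mindspore import Tensor

raw = np.arange(6, dtype=np.float32).tobytes()    # stand-in for bytes read from a checkpoint
arr = np.frombuffer(raw, dtype=np.float32).reshape(2, 3)
assert not arr.flags.writeable                    # zero-copy, read-only, non-owning view

# Tensor.from_numpy keeps referencing arr's memory; Tensor(arr) copies it,
# which is what the patched serialization code now does.
param = Tensor(arr)
print(param.shape, param.dtype)                   # (2, 3) Float32
```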
mindnlp/transformers/generation/utils.py | 5 +- mindnlp/transformers/modeling_rope_utils.py | 12 +- mindnlp/transformers/modeling_utils.py | 4 +- mindnlp/transformers/models/__init__.py | 3 + mindnlp/transformers/models/auto/__init__.py | 2 + .../models/auto/configuration_auto.py | 4 + .../models/auto/image_processing_auto.py | 4 +- .../models/auto/modeling_graph_auto.py | 680 ------ .../models/cohere/modeling_cohere.py | 1 - .../models/gemma/modeling_gemma.py | 4 +- .../models/gemma2/modeling_gemma2.py | 12 +- .../transformers/models/git/modeling_git.py | 4 +- .../models/gpt_neox/modeling_gpt_neox.py | 4 +- .../transformers/models/gptj/modeling_gptj.py | 2 +- .../instructblip/modeling_instructblip.py | 4 +- .../models/llama/modeling_llama.py | 35 +- .../models/llava/modeling_llava.py | 1 - .../models/llava_next/modeling_llava_next.py | 3 +- .../models/mistral/modeling_mistral.py | 2 +- .../models/mllama/modeling_mllama.py | 2 +- .../models/mpt/configuration_mpt.py | 4 +- .../transformers/models/olmo/modeling_olmo.py | 2 +- .../models/persimmon/modeling_persimmon.py | 4 +- .../transformers/models/phi/modeling_phi.py | 4 +- .../models/qwen2/modeling_qwen2.py | 2 +- .../models/qwen2_vl/modeling_qwen2_vl.py | 9 +- .../models/speech_to_text/__init__.py | 12 +- .../models/speech_to_text_2/__init__.py | 26 + .../configuration_speech_to_text_2.py | 133 ++ .../modeling_speech_to_text_2.py | 898 ++++++++ .../processing_speech_to_text_2.py | 118 + .../tokenization_speech_to_text_2.py | 251 +++ .../models/stablelm/modeling_stablelm.py | 2 +- .../models/starcoder2/modeling_starcoder2.py | 3 +- .../models/vipllava/modeling_vipllava.py | 3 +- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- .../models/whisper/generation_whisper.py | 6 +- .../models/whisper/modeling_whisper.py | 2 +- mindnlp/transformers/pipelines/__init__.py | 14 +- .../pipelines/audio_classification.py | 220 ++ mindnlp/transformers/pipelines/audio_utils.py | 83 +- .../pipelines/automatic_speech_recognition.py | 364 +-- mindnlp/transformers/pipelines/base.py | 41 +- .../pipelines/depth_estimation.py | 0 .../pipelines/feature_extraction.py | 0 .../pipelines/image_classification.py | 0 .../pipelines/image_feature_extraction.py | 0 .../pipelines/image_segmentation.py | 0 .../transformers/pipelines/image_to_image.py | 0 .../transformers/pipelines/image_to_text.py | 0 .../transformers/pipelines/mask_generation.py | 0 .../pipelines/object_detection.py | 0 .../transformers/pipelines/text_to_audio.py | 0 .../pipelines/token_classification.py | 0 .../pipelines/video_classification.py | 0 .../pipelines/visual_question_answering.py | 0 .../zero_shot_audio_classification.py | 0 .../zero_shot_image_classification.py | 0 .../pipelines/zero_shot_object_detection.py | 0 mindnlp/transformers/processing_utils.py | 2 +- mindnlp/utils/download.py | 4 +- mindnlp/utils/testing_utils.py | 5 + .../test_pipelines_audio_classification.py | 139 ++ ..._pipelines_automatic_speech_recognition.py | 1954 +++++++++++++++++ 67 files changed, 4034 insertions(+), 1092 deletions(-) delete mode 100644 mindnlp/transformers/models/auto/modeling_graph_auto.py create mode 100644 mindnlp/transformers/models/speech_to_text_2/__init__.py create mode 100644 mindnlp/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py create mode 100644 mindnlp/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py create mode 100644 mindnlp/transformers/models/speech_to_text_2/processing_speech_to_text_2.py create mode 100644 
mindnlp/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
 create mode 100644 mindnlp/transformers/pipelines/audio_classification.py
 create mode 100644 mindnlp/transformers/pipelines/depth_estimation.py
 create mode 100644 mindnlp/transformers/pipelines/feature_extraction.py
 create mode 100644 mindnlp/transformers/pipelines/image_classification.py
 create mode 100644 mindnlp/transformers/pipelines/image_feature_extraction.py
 create mode 100644 mindnlp/transformers/pipelines/image_segmentation.py
 create mode 100644 mindnlp/transformers/pipelines/image_to_image.py
 create mode 100644 mindnlp/transformers/pipelines/image_to_text.py
 create mode 100644 mindnlp/transformers/pipelines/mask_generation.py
 create mode 100644 mindnlp/transformers/pipelines/object_detection.py
 create mode 100644 mindnlp/transformers/pipelines/text_to_audio.py
 create mode 100644 mindnlp/transformers/pipelines/token_classification.py
 create mode 100644 mindnlp/transformers/pipelines/video_classification.py
 create mode 100644 mindnlp/transformers/pipelines/visual_question_answering.py
 create mode 100644 mindnlp/transformers/pipelines/zero_shot_audio_classification.py
 create mode 100644 mindnlp/transformers/pipelines/zero_shot_image_classification.py
 create mode 100644 mindnlp/transformers/pipelines/zero_shot_object_detection.py
 create mode 100644 tests/ut/transformers/pipelines/test_pipelines_audio_classification.py
 create mode 100644 tests/ut/transformers/pipelines/test_pipelines_automatic_speech_recognition.py

diff --git a/mindnlp/core/serialization.py b/mindnlp/core/serialization.py
index a06c5c168..9cd12e7b6 100644
--- a/mindnlp/core/serialization.py
+++ b/mindnlp/core/serialization.py
@@ -1402,20 +1402,26 @@ def safe_load_file(filename):
     """
     def convert(info: dict[str, Any]):
         numpy_dtype = _NP_TYPES[info['dtype']]
+        ms_dtype = _MS_TYPES[info['dtype']]
         shape: list[int] = info['shape']
         begin, end = info['data_offsets']
         assert 0 <= begin <= end <= len(byte_buf)
         assert end - begin == math.prod(shape) * np.dtype(numpy_dtype).itemsize
         buf = byte_buf[begin:end]
-        array = np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)
-
-        if array.dtype == bfloat16 and not SUPPORT_BF16:
-            logger.warning_once("MindSpore do not support bfloat16 dtype, we will automaticlly convert to float16")
-            array = array.astype(np.float16)
-
-        if info['dtype'] == 'I64':
-            array = array.astype(numpy_dtype)
-        out = Tensor(array)
+        try:
+            if info['dtype'] == 'BF16' and not SUPPORT_BF16:
+                logger.warning_once("MindSpore does not support bfloat16 dtype, we will automatically convert to float16")
+                ms_dtype = mindspore.float16
+            out = Tensor.convert_bytes_to_tensor(buf, tuple(shape), ms_dtype)
+        except:
+            array = np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)
+
+            if array.dtype == bfloat16 and not SUPPORT_BF16:
+                logger.warning_once("MindSpore does not support bfloat16 dtype, we will automatically convert to float16")
+                array = array.astype(np.float16)
+            array = array.astype(array.dtype)
+            out = Tensor(array)
         return out
 
     with open(filename, "rb") as fp:

diff --git a/mindnlp/transformers/cache_utils.py b/mindnlp/transformers/cache_utils.py
index 93719c6aa..8b6710ce5 100644
--- a/mindnlp/transformers/cache_utils.py
+++ b/mindnlp/transformers/cache_utils.py
@@ -1146,7 +1146,7 @@ def _sliding_update(self, cache_position, layer_idx, key_states, value_states, k
             # into consideration when building kv cache instead of just throwing away tokens outside of the window
             return key_states, value_states
 
-        slicing = ops.ones(max_cache_len,
dtype=mindspore.int64).cumsum(0) + slicing = ops.ones(max_cache_len, dtype=mindspore.int32).cumsum(0) cache_position = cache_position.clamp(0, max_cache_len - 1) to_shift = cache_position >= max_cache_len - 1 indices = (slicing + to_shift[-1].int() - 1) % max_cache_len diff --git a/mindnlp/transformers/configuration_utils.py b/mindnlp/transformers/configuration_utils.py index e0c69ea42..9605aaac5 100644 --- a/mindnlp/transformers/configuration_utils.py +++ b/mindnlp/transformers/configuration_utils.py @@ -335,13 +335,6 @@ def __init__(self, **kwargs): "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid." ) - # TPU arguments - if kwargs.pop("xla_device", None) is not None: - logger.warning( - "The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can " - "safely remove it from your `config.json` file." - ) - # Name or path to the pretrained checkpoint self._name_or_path = str(kwargs.pop("name_or_path", "")) # Config hash diff --git a/mindnlp/transformers/generation/utils.py b/mindnlp/transformers/generation/utils.py index d0bfec4ad..f24a71b0d 100644 --- a/mindnlp/transformers/generation/utils.py +++ b/mindnlp/transformers/generation/utils.py @@ -1869,7 +1869,6 @@ def generate( # - `model_kwargs` may be updated in place with a cache as defined by the parameters in `generation_config`. # - different models have a different cache name expected by the model (default = "past_key_values") # - `max_length`, prepared above, is used to determine the maximum cache length - # TODO (joao): remove `user_defined_cache` after v4.47 (remove default conversion to legacy format) cache_name = "past_key_values" if "mamba" not in self.__class__.__name__.lower() else "cache_params" user_defined_cache = model_kwargs.get(cache_name) max_cache_length = generation_config.max_length @@ -2174,7 +2173,7 @@ def typeerror(): # Convert to legacy cache format if requested if ( - generation_config.return_legacy_cache is not False # Should check for `True` after v4.47 + generation_config.return_legacy_cache is not False and hasattr(result, "past_key_values") and hasattr(result.past_key_values, "to_legacy_cache") and result.past_key_values.to_legacy_cache is not None @@ -2192,7 +2191,7 @@ def typeerror(): ) if not is_user_defined_cache and is_default_cache_type: logger.warning_once( - "From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` " + "When a model cache is to be returned, `generate` will return a `Cache` " "instance instead by default (as opposed to the legacy tuple of tuples format). If you want to " "keep returning the legacy format, please set `return_legacy_cache=True`." ) diff --git a/mindnlp/transformers/modeling_rope_utils.py b/mindnlp/transformers/modeling_rope_utils.py index 22522d0bf..6f02e1805 100644 --- a/mindnlp/transformers/modeling_rope_utils.py +++ b/mindnlp/transformers/modeling_rope_utils.py @@ -40,7 +40,7 @@ def _compute_default_rope_parameters( seq_len (`int`, *optional*): The current sequence length. Unused for this type of RoPE. rope_kwargs (`Dict`, *optional*): - BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + BC compatibility with the previous RoPE class instantiation, will be removed. Returns: Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
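[Editorial note, not part of the patch] The hunk above only rewords the `_compute_default_rope_parameters` docstring; the contract it describes is the standard RoPE construction: one inverse frequency per pair of channels, plus a cos/sin scaling factor that stays at 1.0 for the default variant. A minimal NumPy sketch, with illustrative names (`rope_theta`, `head_dim`) standing in for the config fields:

```python
# Sketch of the default RoPE parameters the helper returns (docstring-only change above).
import numpy as np

rope_theta = 10000.0   # `base` in the model config
head_dim = 8           # dimension each attention head rotates over

inv_freq = 1.0 / (rope_theta ** (np.arange(0, head_dim, 2, dtype=np.float32) / head_dim))
attention_factor = 1.0  # post-processing scaling of cos/sin, unused for the default variant

print(inv_freq)  # one frequency per channel pair: [1.0, 0.1, 0.01, 0.001]
```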
@@ -78,7 +78,7 @@ def _compute_linear_scaling_rope_parameters( seq_len (`int`, *optional*): The current sequence length. Unused for this type of RoPE. rope_kwargs (`Dict`, *optional*): - BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + BC compatibility with the previous RoPE class instantiation, will be removed. Returns: Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). @@ -116,7 +116,7 @@ def _compute_dynamic_ntk_parameters( seq_len (`int`, *optional*): The current sequence length, used to update the dynamic RoPE at inference time. rope_kwargs (`Dict`, *optional*): - BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + BC compatibility with the previous RoPE class instantiation, will be removed. Returns: Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). @@ -162,7 +162,7 @@ def _compute_yarn_parameters( seq_len (`int`, *optional*): The current sequence length. Unused for this type of RoPE. rope_kwargs (`Dict`, *optional*): - BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + BC compatibility with the previous RoPE class instantiation, will be removed. Returns: Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin. @@ -238,7 +238,7 @@ def _compute_longrope_parameters( seq_len (`int`, *optional*): The current sequence length. Unused for this type of RoPE. rope_kwargs (`Dict`, *optional*): - BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + BC compatibility with the previous RoPE class instantiation, will be removed. Returns: Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin. @@ -300,7 +300,7 @@ def _compute_llama3_parameters( seq_len (`int`, *optional*): The current sequence length. Unused for this type of RoPE. rope_kwargs (`Dict`, *optional*): - BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + BC compatibility with the previous RoPE class instantiation, will be removed. Returns: Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin. diff --git a/mindnlp/transformers/modeling_utils.py b/mindnlp/transformers/modeling_utils.py index 1f862a734..e0aab2031 100644 --- a/mindnlp/transformers/modeling_utils.py +++ b/mindnlp/transformers/modeling_utils.py @@ -1233,7 +1233,7 @@ def can_generate(cls) -> bool: continue if "PreTrainedModel" not in str(base) and base.can_generate(): return True - # BC: Detects whether `prepare_inputs_for_generation` has been overwritten in the model. Prior to v4.45, this + # BC: Detects whether `prepare_inputs_for_generation` has been overwritten in the model. this # was how we detected whether a model could generate. 
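[Editorial note, not part of the patch] The `can_generate` comment just above describes a backward-compatibility heuristic: if a class overrides `prepare_inputs_for_generation` and the bound function does not come from `GenerationMixin`, the model is treated as an old-style generator. A self-contained sketch of that string-based check (simplified class names, not the library's real hierarchy):

```python
# Sketch of the BC detection idiom: does the qualified name of the override
# still point at GenerationMixin, or did the model class redefine it?
class GenerationMixin:
    def prepare_inputs_for_generation(self, *args, **kwargs):
        ...

class OldStyleModel:  # overrides the hook but never mixes in GenerationMixin
    def prepare_inputs_for_generation(self, *args, **kwargs):
        return {}

print("GenerationMixin" in str(GenerationMixin.prepare_inputs_for_generation))  # True
print("GenerationMixin" in str(OldStyleModel.prepare_inputs_for_generation))    # False -> assumed to generate
```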
if "GenerationMixin" not in str(cls.prepare_inputs_for_generation): logger.warning_once( @@ -2022,7 +2022,7 @@ def save_pretrained( "To avoid this behavior and this warning, we recommend you to overwrite the generation " "config model attribute before calling the model's `save_pretrained`, preferably also " "removing any generation kwargs from the model config. This warning will be raised to an " - "exception in v4.41." + "exception." ) model_to_save.generation_config.save_pretrained(save_directory) diff --git a/mindnlp/transformers/models/__init__.py b/mindnlp/transformers/models/__init__.py index 097a9f0c7..f686894de 100644 --- a/mindnlp/transformers/models/__init__.py +++ b/mindnlp/transformers/models/__init__.py @@ -203,6 +203,7 @@ sew_d, speech_encoder_decoder, speech_to_text, + speech_to_text_2, speecht5, stablelm, splinter, @@ -444,6 +445,7 @@ from .sew_d import * from .speech_encoder_decoder import * from .speech_to_text import * +from .speech_to_text_2 import * from .speecht5 import * from .stablelm import * from .splinter import * @@ -685,6 +687,7 @@ __all__.extend(sew_d.__all__) __all__.extend(speech_encoder_decoder.__all__) __all__.extend(speech_to_text.__all__) +__all__.extend(speech_to_text_2.__all__) __all__.extend(speecht5.__all__) __all__.extend(stablelm.__all__) __all__.extend(splinter.__all__) diff --git a/mindnlp/transformers/models/auto/__init__.py b/mindnlp/transformers/models/auto/__init__.py index 2f072bdb0..f663a51c1 100644 --- a/mindnlp/transformers/models/auto/__init__.py +++ b/mindnlp/transformers/models/auto/__init__.py @@ -34,6 +34,7 @@ from .processing_auto import PROCESSOR_MAPPING, AutoProcessor from .modeling_auto import ( + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING, MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_BACKBONE_MAPPING, @@ -116,6 +117,7 @@ "FEATURE_EXTRACTOR_MAPPING", "AutoFeatureExtractor", "IMAGE_PROCESSOR_MAPPING", "AutoImageProcessor", "PROCESSOR_MAPPING", "AutoProcessor", + "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", 'MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING', 'MODEL_FOR_AUDIO_XVECTOR_MAPPING', 'MODEL_FOR_BACKBONE_MAPPING', diff --git a/mindnlp/transformers/models/auto/configuration_auto.py b/mindnlp/transformers/models/auto/configuration_auto.py index 753ebb117..0228eeddc 100644 --- a/mindnlp/transformers/models/auto/configuration_auto.py +++ b/mindnlp/transformers/models/auto/configuration_auto.py @@ -185,9 +185,12 @@ ("roc_bert", "RoCBertConfig"), ("rwkv", "RwkvConfig"), ("sam", "SamConfig"), + ("seamless_m4t", "SeamlessM4TConfig"), + ("seamless_m4t_v2", "SeamlessM4Tv2Config"), ("segformer", "SegformerConfig"), ("speech-encoder-decoder", "SpeechEncoderDecoderConfig"), ("speech_to_text", "Speech2TextConfig"), + ("speech_to_text_2", "Speech2Text2Config"), ("speecht5", "SpeechT5Config"), ("stablelm", "StableLmConfig"), ("splinter", "SplinterConfig"), @@ -674,6 +677,7 @@ ("rwkv", "RWKV"), ("sam", "SAM"), ("seamless_m4t", "SeamlessM4T"), + ("seamless_m4t_v2", "SeamlessM4Tv2"), ("segformer", "SegFormer"), ("sew", "SEW"), ("sew-d", "SEW-D"), diff --git a/mindnlp/transformers/models/auto/image_processing_auto.py b/mindnlp/transformers/models/auto/image_processing_auto.py index 04f16d89b..84323c1a0 100644 --- a/mindnlp/transformers/models/auto/image_processing_auto.py +++ b/mindnlp/transformers/models/auto/image_processing_auto.py @@ -394,7 +394,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): "Could not find image processor class in the image processor config or the model config. 
Loading " "based on pattern matching with the model's feature extractor configuration. Please open a " "PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of " - "`feature_extractor_type`. This warning will be removed in v4.40." + "`feature_extractor_type`. This warning will be removed." ) image_processor_class = feature_extractor_class.replace("FeatureExtractor", "ImageProcessor") if "AutoFeatureExtractor" in config_dict.get("auto_map", {}): @@ -404,7 +404,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): "Could not find image processor auto map in the image processor config or the model config. " "Loading based on pattern matching with the model's feature extractor configuration. Please open a " "PR/issue to update `preprocessor_config.json` to use `AutoImageProcessor` instead of " - "`AutoFeatureExtractor`. This warning will be removed in v4.40." + "`AutoFeatureExtractor`. This warning will be removed." ) # If we don't find the image processor class in the image processor config, let's try the model config. diff --git a/mindnlp/transformers/models/auto/modeling_graph_auto.py b/mindnlp/transformers/models/auto/modeling_graph_auto.py deleted file mode 100644 index 743ddd820..000000000 --- a/mindnlp/transformers/models/auto/modeling_graph_auto.py +++ /dev/null @@ -1,680 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google MS Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" Auto Model class.""" - - -from collections import OrderedDict - -from mindnlp.utils import logging -from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping -from .configuration_auto import CONFIG_MAPPING_NAMES - - -logger = logging.get_logger(__name__) - - -FLAX_MODEL_MAPPING_NAMES = OrderedDict( - [ - # Base model mapping - ("albert", "MSAlbertModel"), - ("bart", "MSBartModel"), - ("beit", "MSBeitModel"), - ("bert", "MSBertModel"), - ("big_bird", "MSBigBirdModel"), - ("blenderbot", "MSBlenderbotModel"), - ("blenderbot-small", "MSBlenderbotSmallModel"), - ("bloom", "MSBloomModel"), - ("clip", "MSCLIPModel"), - ("deepseek_v2", "DeepseekV2Model"), - ("distilbert", "MSDistilBertModel"), - ("electra", "MSElectraModel"), - ("gpt-sw3", "MSGPT2Model"), - ("gpt2", "MSGPT2Model"), - ("gpt_neo", "MSGPTNeoModel"), - ("gptj", "MSGPTJModel"), - ("longt5", "MSLongT5Model"), - ("marian", "MSMarianModel"), - ("mbart", "MSMBartModel"), - ("mt5", "MSMT5Model"), - ("opt", "MSOPTModel"), - ("pegasus", "MSPegasusModel"), - ("regnet", "MSRegNetModel"), - ("resnet", "MSResNetModel"), - ("roberta", "MSRobertaModel"), - ("roberta-prelayernorm", "MSRobertaPreLayerNormModel"), - ("roformer", "MSRoFormerModel"), - ("t5", "MST5Model"), - ("vision-text-dual-encoder", "MSVisionTextDualEncoderModel"), - ("vit", "MSViTModel"), - ("wav2vec2", "MSWav2Vec2Model"), - ("whisper", "MSWhisperModel"), - ("xglm", "MSXGLMModel"), - ("xlm-roberta", "MSXLMRobertaModel"), - ] -) - -FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( - [ - # Model for pre-training mapping - ("albert", "MSAlbertForPreTraining"), - ("bart", "MSBartForConditionalGeneration"), - ("bert", "MSBertForPreTraining"), - ("big_bird", "MSBigBirdForPreTraining"), - ("electra", "MSElectraForPreTraining"), - ("longt5", "MSLongT5ForConditionalGeneration"), - ("mbart", "MSMBartForConditionalGeneration"), - ("mt5", "MSMT5ForConditionalGeneration"), - ("roberta", "MSRobertaForMaskedLM"), - ("roberta-prelayernorm", "MSRobertaPreLayerNormForMaskedLM"), - ("roformer", "MSRoFormerForMaskedLM"), - ("t5", "MST5ForConditionalGeneration"), - ("wav2vec2", "MSWav2Vec2ForPreTraining"), - ("whisper", "MSWhisperForConditionalGeneration"), - ("xlm-roberta", "MSXLMRobertaForMaskedLM"), - ] -) - -FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( - [ - # Model for Masked LM mapping - ("albert", "MSAlbertForMaskedLM"), - ("bart", "MSBartForConditionalGeneration"), - ("bert", "MSBertForMaskedLM"), - ("big_bird", "MSBigBirdForMaskedLM"), - ("distilbert", "MSDistilBertForMaskedLM"), - ("electra", "MSElectraForMaskedLM"), - ("mbart", "MSMBartForConditionalGeneration"), - ("roberta", "MSRobertaForMaskedLM"), - ("roberta-prelayernorm", "MSRobertaPreLayerNormForMaskedLM"), - ("roformer", "MSRoFormerForMaskedLM"), - ("xlm-roberta", "MSXLMRobertaForMaskedLM"), - ] -) - -FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( - [ - # Model for Seq2Seq Causal LM mapping - ("bart", "MSBartForConditionalGeneration"), - ("blenderbot", "MSBlenderbotForConditionalGeneration"), - ("blenderbot-small", "MSBlenderbotSmallForConditionalGeneration"), - ("encoder-decoder", "MSEncoderDecoderModel"), - ("longt5", "MSLongT5ForConditionalGeneration"), - ("marian", "MSMarianMTModel"), - ("mbart", "MSMBartForConditionalGeneration"), - ("mt5", "MSMT5ForConditionalGeneration"), - ("pegasus", "MSPegasusForConditionalGeneration"), - ("t5", "MST5ForConditionalGeneration"), - ] -) - -FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( - [ - # Model for Image-classsification - 
("beit", "MSBeitForImageClassification"), - ("regnet", "MSRegNetForImageClassification"), - ("resnet", "MSResNetForImageClassification"), - ("vit", "MSViTForImageClassification"), - ] -) - -FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict( - [ - ("vision-encoder-decoder", "MSVisionEncoderDecoderModel"), - ] -) - -FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( - [ - # Model for Causal LM mapping - ("bart", "MSBartForCausalLM"), - ("bert", "MSBertForCausalLM"), - ("big_bird", "MSBigBirdForCausalLM"), - ("bloom", "MSBloomForCausalLM"), - ("electra", "MSElectraForCausalLM"), - ("gpt-sw3", "MSGPT2LMHeadModel"), - ("gpt2", "MSGPT2LMHeadModel"), - ("gpt_neo", "MSGPTNeoForCausalLM"), - ("gptj", "MSGPTJForCausalLM"), - ("opt", "MSOPTForCausalLM"), - ("roberta", "MSRobertaForCausalLM"), - ("roberta-prelayernorm", "MSRobertaPreLayerNormForCausalLM"), - ("xglm", "MSXGLMForCausalLM"), - ("xlm-roberta", "MSXLMRobertaForCausalLM"), - ] -) - -FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( - [ - # Model for Sequence Classification mapping - ("albert", "MSAlbertForSequenceClassification"), - ("bart", "MSBartForSequenceClassification"), - ("bert", "MSBertForSequenceClassification"), - ("big_bird", "MSBigBirdForSequenceClassification"), - ("distilbert", "MSDistilBertForSequenceClassification"), - ("electra", "MSElectraForSequenceClassification"), - ("mbart", "MSMBartForSequenceClassification"), - ("roberta", "MSRobertaForSequenceClassification"), - ("roberta-prelayernorm", "MSRobertaPreLayerNormForSequenceClassification"), - ("roformer", "MSRoFormerForSequenceClassification"), - ("xlm-roberta", "MSXLMRobertaForSequenceClassification"), - ("deepseek_v2", "DeepseekV2ForSequenceClassification") - ] -) - -FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( - [ - # Model for Question Answering mapping - ("albert", "MSAlbertForQuestionAnswering"), - ("bart", "MSBartForQuestionAnswering"), - ("bert", "MSBertForQuestionAnswering"), - ("big_bird", "MSBigBirdForQuestionAnswering"), - ("distilbert", "MSDistilBertForQuestionAnswering"), - ("electra", "MSElectraForQuestionAnswering"), - ("mbart", "MSMBartForQuestionAnswering"), - ("roberta", "MSRobertaForQuestionAnswering"), - ("roberta-prelayernorm", "MSRobertaPreLayerNormForQuestionAnswering"), - ("roformer", "MSRoFormerForQuestionAnswering"), - ("xlm-roberta", "MSXLMRobertaForQuestionAnswering"), - ] -) - -FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( - [ - # Model for Token Classification mapping - ("albert", "MSAlbertForTokenClassification"), - ("bert", "MSBertForTokenClassification"), - ("big_bird", "MSBigBirdForTokenClassification"), - ("distilbert", "MSDistilBertForTokenClassification"), - ("electra", "MSElectraForTokenClassification"), - ("roberta", "MSRobertaForTokenClassification"), - ("roberta-prelayernorm", "MSRobertaPreLayerNormForTokenClassification"), - ("roformer", "MSRoFormerForTokenClassification"), - ("xlm-roberta", "MSXLMRobertaForTokenClassification"), - ] -) - -FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( - [ - # Model for Multiple Choice mapping - ("albert", "MSAlbertForMultipleChoice"), - ("bert", "MSBertForMultipleChoice"), - ("big_bird", "MSBigBirdForMultipleChoice"), - ("distilbert", "MSDistilBertForMultipleChoice"), - ("electra", "MSElectraForMultipleChoice"), - ("roberta", "MSRobertaForMultipleChoice"), - ("roberta-prelayernorm", "MSRobertaPreLayerNormForMultipleChoice"), - ("roformer", "MSRoFormerForMultipleChoice"), - ("xlm-roberta", 
"MSXLMRobertaForMultipleChoice"), - ] -) - -FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict( - [ - ("bert", "MSBertForNextSentencePrediction"), - ] -) - -FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( - [ - ("speech-encoder-decoder", "MSSpeechEncoderDecoderModel"), - ("whisper", "MSWhisperForConditionalGeneration"), - ] -) - -FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( - [ - ("whisper", "MSWhisperForAudioClassification"), - ] -) - -FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES) -FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES) -FLAX_MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES) -FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping( - CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES -) -FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping( - CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES -) -FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES) -FLAX_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) -FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping( - CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES -) -FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( - CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES -) -FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping( - CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES -) -FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping( - CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES -) -FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping( - CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES -) -FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping( - CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES -) -FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping( - CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES -) - - -class MSAutoModel(_BaseAutoModelClass): - - """ - MSAutoModel is a Python class that represents a model for a Microsoft Azure Machine Learning service. - It inherits from the _BaseAutoModelClass and provides functionality for creating and managing auto ML - models on the Azure platform. - """ - _model_mapping = FLAX_MODEL_MAPPING - - -class MSAutoModelForPreTraining(_BaseAutoModelClass): - - """ - Represents a model for pre-training in Microsoft's Machine Learning framework. - This class serves as a base class for different auto models used for pre-training. - It inherits functionality from the _BaseAutoModelClass class. - """ - _model_mapping = FLAX_MODEL_FOR_PRETRAINING_MAPPING - - -class MSAutoModelForCausalLM(_BaseAutoModelClass): - - """ - This class represents an auto-regressive language model for causal language modeling using Microsoft's AutoModel. - - The MSAutoModelForCausalLM class inherits from the _BaseAutoModelClass, providing additional functionality and customization options. - - Attributes: - base_model_name_or_path (str): The name or path of the base model to be used for language modeling. - config (AutoConfig): The configuration object for the auto-regressive language model. 
- tokenizer (AutoTokenizer): The tokenizer object for the auto-regressive language model. - model (AutoModelForCausalLM): The underlying model for the auto-regressive language model. - - Methods: - __init__: - Initializes a new instance of the MSAutoModelForCausalLM class. - - forward: - Performs a forward pass through the auto-regressive language model. - - generate: - Generates text using the auto-regressive language model. - - save_pretrained: - Saves the auto-regressive language model and its configuration and tokenizer to the specified directory. - - from_pretrained: - Instantiates a new instance of the MSAutoModelForCausalLM class from a pretrained model. - - from_config: - Instantiates a new instance of the MSAutoModelForCausalLM class from a configuration object. - - from_pretrained: - Instantiates a new instance of the MSAutoModelForCausalLM class from a pretrained model. - - from_pretrained: - Instantiates a new instance of the MSAutoModelForCausalLM class from a pretrained model. - """ - _model_mapping = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING - - -class MSAutoModelForMaskedLM(_BaseAutoModelClass): - - """ - A class representing an auto model for masked language modeling using Microsoft's AutoModel architecture. - - This class, MSAutoModelForMaskedLM, is a subclass of _BaseAutoModelClass and provides an implementation for - generating predictions for masked tokens in a given input sequence. It utilizes Microsoft's AutoModel architecture - which combines transformer-based models with language modeling techniques to achieve state-of-the-art performance - on masked language modeling tasks. - - The MSAutoModelForMaskedLM class inherits the core functionality from the _BaseAutoModelClass, which provides a - generic interface for auto models. It extends this base class by implementing specific methods and configurations - that are tailored for masked language modeling tasks. - - The MSAutoModelForMaskedLM class can be instantiated with various parameters to control the architecture, model - weights, tokenization, and other settings. It supports loading pre-trained models, fine-tuning on custom datasets, - and generating predictions for masked tokens. - - Example: - ```python - >>> model = MSAutoModelForMaskedLM(model_name='bert-base-uncased') - >>> input_sequence = "The [MASK] is blue." - ... - >>> # Generate predictions for masked tokens - >>> predictions = model.predict_masked_tokens(input_sequence) - ... - >>> print(predictions) - ``` - """ - _model_mapping = FLAX_MODEL_FOR_MASKED_LM_MAPPING - - -class MSAutoModelForSeq2SeqLM(_BaseAutoModelClass): - - """ - This class represents a pre-trained model for sequence-to-sequence language modeling with automatic architecture selection. - - It is a subclass of the '_BaseAutoModelClass' and inherits its methods and attributes. - The 'MSAutoModelForSeq2SeqLM' class provides an interface for automatically selecting and loading the appropriate - model architecture for sequence-to-sequence language modeling tasks. - - Attributes: - config_class (type): The class to use for instantiating the model configuration. - base_model_prefix (str): The prefix to use for the base model. - _keys_to_ignore_on_load_missing (List[str]): A list of keys to ignore when loading the model. - - Methods: - from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs): Class method to instantiate a - pre-trained 'MSAutoModelForSeq2SeqLM' instance from a pre-trained model. - forward(**inputs): Performs a forward pass of the model with the given inputs. 
- generate(**kwargs): Generates text using the model with the provided inputs. - prepare_inputs_for_generation(input_ids, **kwargs): Prepares the inputs for text generation. - save_pretrained(save_directory): Saves the model to the specified directory. - save_model(save_directory): Deprecated method. Use 'save_pretrained' instead. - - Note: - This class is designed for sequence-to-sequence language modeling tasks and provides a convenient way - to select and use the appropriate model architecture. - - Example: - ```python - >>> from transformers import MSAutoModelForSeq2SeqLM - ... - >>> model = MSAutoModelForSeq2SeqLM.from_pretrained("microsoft/MSDialog-GPT-large-finetuned-turbo") - >>> input_text = "What is the capital of France?" - >>> generated_text = model.generate(input_text) - >>> print(generated_text) - ``` - - This example demonstrates how to use the 'MSAutoModelForSeq2SeqLM' class to load a pre-trained model and generate text using the model. - """ - _model_mapping = FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - - -class MSAutoModelForSequenceClassification(_BaseAutoModelClass): - - """ - This class represents an auto model for sequence classification tasks in the Microsoft AutoML framework. - - The 'MSAutoModelForSequenceClassification' class is a subclass of '_BaseAutoModelClass' and - provides a convenient interface for training and evaluating sequence classification models. - It utilizes the power of AutoML to automatically search for the best model architecture - and hyperparameters for a given sequence classification task. - - The class inherits the functionalities of the '_BaseAutoModelClass' class, - which includes methods for loading and saving models, as well as performing inference using trained models. - - To use the 'MSAutoModelForSequenceClassification' class, first initialize an instance by providing - the required parameters such as the number of classes, input dimensions, and other relevant configuration - options. - Then, you can call the 'fit' method to start the automatic model search process. - This method takes in the training data, performs the model search, and returns the best model found. - - Once the model has been trained, you can use the 'evaluate' method to evaluate its performance on a separate validation or test dataset. - This method calculates various evaluation metrics such as accuracy, precision, recall, and F1-score. - - In addition to these core methods, the 'MSAutoModelForSequenceClassification' class provides various helper methods - for configuring and fine-tuning the model search process. - These include methods for setting the search space, defining custom metrics, specifying early stopping criteria, and more. - - Note that the automatic model search process may take some time to complete, depending on the size of the dataset - and the complexity of the search space. - However, it helps to alleviate the burden of manually tuning hyperparameters and allows you to focus on - other aspects of your sequence classification task. - - For more information on how to use the 'MSAutoModelForSequenceClassification' class and the Microsoft AutoML framework, - please refer to the official documentation and examples. - - """ - _model_mapping = FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING - - -class MSAutoModelForQuestionAnswering(_BaseAutoModelClass): - - """ - MSAutoModelForQuestionAnswering is a class that represents a pre-trained model for question answering tasks - using the Microsoft Azure Machine Learning service. 
- - This class inherits from _BaseAutoModelClass, which provides the base functionality for automatic modeling - in the Microsoft Azure Machine Learning service. - - The MSAutoModelForQuestionAnswering class provides methods and attributes specific to question answering tasks, - enabling users to fine-tune and deploy pre-trained models for question answering in a production environment. - - Attributes: - model_name_or_path (str): The name or path of the pre-trained model. - tokenizer_name_or_path (str): The name or path of the tokenizer associated with the pre-trained model. - config_name_or_path (Optional[str]): The name or path of the model configuration file. - cache_dir (Optional[str]): The directory where the pre-trained models and related files will be cached. - revision (Union[str, int]): The revision number of the model to load from the Hugging Face model hub. - use_auth_token (Union[str, bool]): The authentication token to use for downloading models from the Hugging Face model hub. - - Methods: - from_pretrained(cls, model_name_or_path, *args, **kwargs): - Class method to instantiate an instance of MSAutoModelForQuestionAnswering from a pre-trained model. - forward(self, ...): Method to perform the forward pass of the model, taking inputs and returning the predicted outputs. - train(self, ...): Method to train the model on a given dataset. - evaluate(self, ...): Method to evaluate the performance of the model on a given dataset. - save_pretrained(self, ...): Method to save the model and associated files to a specified directory. - from_pretrained(cls, ...): - Class method to load a pre-trained instance of MSAutoModelForQuestionAnswering from a specified directory. - generate(self, ...): Method to generate text using the model. - get_named_parameters(self, ...): Method to get named parameters of the model. - get_input_embeddings(self, ...): Method to get the input embeddings of the model. - set_input_embeddings(self, ...): Method to set the input embeddings of the model. - - Example: - ```python - >>> # Instantiate a pre-trained model for question answering - >>> model = MSAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') - ... - >>> # Perform the forward pass of the model - >>> outputs = model.forward(inputs) - ... - >>> # Train the model on a given dataset - >>> model.train(train_dataset) - ... - >>> # Evaluate the performance of the model on a given dataset - >>> model.evaluate(eval_dataset) - ... - >>> # Save the model and associated files to a specified directory - >>> model.save_pretrained('saved_model') - ... - >>> # Load a pre-trained instance of MSAutoModelForQuestionAnswering from a specified directory - >>> loaded_model = MSAutoModelForQuestionAnswering.from_pretrained('saved_model') - ... - >>> # Generate text using the model - >>> generated_text = model.generate(input_text) - ... - >>> # Get named parameters of the model - >>> params = model.get_named_parameters() - ... - >>> # Get the input embeddings of the model - >>> embeddings = model.get_input_embeddings() - ... - >>> # Set the input embeddings of the model - >>> model.set_input_embeddings(new_embeddings) - ``` - """ - _model_mapping = FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING - - -class MSAutoModelForTokenClassification(_BaseAutoModelClass): - - """ - Represents a model for token classification using Microsoft's AutoModel framework. - - This class inherits from _BaseAutoModelClass and provides functionality for token classification tasks. 
- It encapsulates the architecture and configuration of the model, including loading pre-trained weights - and performing inference on token sequences. - The model supports fine-tuning on specific token classification datasets - and provides a high-level interface for integrating with downstream applications. - - Note: - This docstring is a placeholder and should be updated with specific details about the class attributes, methods, and usage examples. - """ - _model_mapping = FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING - - -class MSAutoModelForMultipleChoice(_BaseAutoModelClass): - - """ - This class represents an automated model for multiple choice tasks in the Microsoft Azure Machine Learning service. - - The 'MSAutoModelForMultipleChoice' class inherits from the '_BaseAutoModelClass' class, - which provides the foundational functionality for automated model creation. - - The 'MSAutoModelForMultipleChoice' class is specifically designed to handle multiple choice tasks in the - Microsoft Azure Machine Learning service. - It streamlines the process of creating, training, and evaluating models for multiple choice tasks, - reducing the amount of manual effort required. - - To use this class, first instantiate an object of the 'MSAutoModelForMultipleChoice' class. - Then, call the appropriate methods to perform tasks such as loading data, preprocessing, training the model, and - evaluating its performance. - - This class encapsulates various methods and attributes that are essential for automating the model creation process - for multiple choice tasks. - It leverages the power of the Microsoft Azure Machine Learning service to provide a seamless - and efficient experience for users. - - Note: - This class requires the Microsoft Azure Machine Learning service to be properly set up and configured in order to function correctly. - - For detailed information on how to use this class and its methods, please refer to the documentation and examples provided. - """ - _model_mapping = FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING - - -class MSAutoModelForNextSentencePrediction(_BaseAutoModelClass): - - """ - This class represents an implementation of a pre-trained model for next sentence prediction using the Microsoft AutoModel framework. - - The MSAutoModelForNextSentencePrediction class inherits from the _BaseAutoModelClass, - which provides basic functionality for automatic model loading and inference. - - Next sentence prediction is a task in natural language processing that involves predicting - whether two given sentences are logically connected, such as being consecutive or having a cause-effect relationship. - - This class encapsulates the architecture and weights of a pre-trained model specifically designed for next sentence prediction. - It provides methods for loading the model, encoding input sentences, and making predictions. - - To use this class, first initialize an instance by providing the necessary model configuration. - Then, load the pre-trained weights using the 'load_weights' method. - After loading, you can encode input sentences using the 'encode' method, - which converts the sentences into numerical representations suitable for model input. - Finally, use the 'predict' method to predict the connection between pairs of sentences. - - Note that this class assumes the pre-trained weights have been downloaded and stored in a specific format compatible - with the Microsoft AutoModel framework. 
- If the weights are not available, they can be obtained from the official Microsoft website or other trusted sources. - - For more details on how to use this class and examples of its functionality, - refer to the documentation and code examples provided in the Microsoft AutoModel repository. - - Attributes: - model_config (dict): A dictionary containing the configuration of the pre-trained model. - model_weights (str): The file path or URL to the pre-trained weights of the model. - - Methods: - load_weights: - Loads the pre-trained weights of the model from the specified path. - - encode: - Encodes a list of input sentences into numerical representations suitable for model input. - Returns a list of encoded representations, where each representation is a list of floats. - - predict: - Predicts the connection between pairs of input sentences. - Returns a list of probabilities, where each probability represents the likelihood of the two sentences being logically connected. - """ - _model_mapping = FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING - - -class MSAutoModelForImageClassification(_BaseAutoModelClass): - - """ - This class represents an auto model for image classification using the Microsoft Azure cognitive services. - - The 'MSAutoModelForImageClassification' class is a Python class that inherits from the '_BaseAutoModelClass'. - It provides an interface for training and deploying machine learning models specifically - designed for image classification tasks using the Microsoft Azure cognitive services. - - Features: - > - Data preprocessing: The class supports various data preprocessing techniques such as resizing, cropping, - and normalization to prepare the image data for training and prediction. - > - Model training: The class allows users to train the image classification model using their own labeled dataset. - It supports popular deep learning architectures like CNN (Convolutional Neural Networks) and transfer learning techniques. - > - Model evaluation: Users can evaluate the performance of the trained model using standard evaluation metrics - such as accuracy, precision, recall, and F1-score. - > - Model deployment: Once the model is trained, it can be deployed in production environments to - perform real-time image classification tasks. - > - Integration with Microsoft Azure cognitive services: The class seamlessly integrates with the - Microsoft Azure cognitive services, allowing users to leverage powerful cloud-based functionalities such as - automatic scaling, high availability, and advanced analytics. - - Usage: - > 1. Instantiate an object of the 'MSAutoModelForImageClassification' class. - > 2. Configure the model parameters and hyperparameters. - > 3. Preprocess the input image data using the provided data preprocessing methods. - > 4. Train the model using the labeled dataset. - > 5. Evaluate the model's performance using the evaluation metrics. - > 6. Deploy the trained model in a production environment. - > 7. Utilize the model for real-time image classification tasks. - - Note: - The 'MSAutoModelForImageClassification' class requires a valid Microsoft Azure cognitive services subscription - and the necessary API keys for authentication and authorization. - - For detailed implementation instructions and code examples, refer to the official documentation and examples - provided by Microsoft Azure cognitive services. 
- """ - _model_mapping = FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - - -class MSAutoModelForVision2Seq(_BaseAutoModelClass): - - """ - The MSAutoModelForVision2Seq class is a Python class that represents a vision-to-sequence auto model - for multi-modal tasks. This class inherits from the _BaseAutoModelClass and provides functionalities for - vision to sequence transformation in multi-modal tasks. - """ - _model_mapping = FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING - - -class MSAutoModelForSpeechSeq2Seq(_BaseAutoModelClass): - - """ - Represents a speech sequence-to-sequence model for multi-source automatic speech recognition (ASR) and natural language generation (NLG). - - This class inherits from _BaseAutoModelClass and provides a pre-trained model for processing speech input - and generating sequence-to-sequence outputs. It supports multi-source ASR and NLG tasks, making it suitable - for a wide range of speech-related applications. - - The MSAutoModelForSpeechSeq2Seq class encapsulates the functionality for loading the pre-trained model, - processing input speech data, and generating corresponding sequence-to-sequence outputs. - It also provides methods for fine-tuning the model and evaluating its performance on speech-related tasks. - - Users can instantiate an object of this class to leverage the pre-trained speech sequence-to-sequence model - for ASR and NLG tasks, enabling efficient and accurate processing of speech data with multi-source support. - """ - _model_mapping = FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING diff --git a/mindnlp/transformers/models/cohere/modeling_cohere.py b/mindnlp/transformers/models/cohere/modeling_cohere.py index ffec3f7a2..613fb9b92 100644 --- a/mindnlp/transformers/models/cohere/modeling_cohere.py +++ b/mindnlp/transformers/models/cohere/modeling_cohere.py @@ -733,7 +733,6 @@ def forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() logits = logits * self.logit_scale diff --git a/mindnlp/transformers/models/gemma/modeling_gemma.py b/mindnlp/transformers/models/gemma/modeling_gemma.py index 422532721..ee827e81a 100644 --- a/mindnlp/transformers/models/gemma/modeling_gemma.py +++ b/mindnlp/transformers/models/gemma/modeling_gemma.py @@ -538,8 +538,8 @@ def forward( return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed. 
" + "Please use an appropriate `Cache` class" ) # decoder layers diff --git a/mindnlp/transformers/models/gemma2/modeling_gemma2.py b/mindnlp/transformers/models/gemma2/modeling_gemma2.py index c569fd983..af29d1198 100644 --- a/mindnlp/transformers/models/gemma2/modeling_gemma2.py +++ b/mindnlp/transformers/models/gemma2/modeling_gemma2.py @@ -440,8 +440,8 @@ def forward( min_dtype = float(ops.finfo(hidden_states.dtype).min) min_dtype = mindspore.tensor(min_dtype) sliding_window_mask = ops.tril( - ops.ones_like(attention_mask, dtype=mindspore.bool_), diagonal=-self.sliding_window - ) + ops.ones_like(attention_mask), diagonal=-self.sliding_window + ).to(mindspore.bool_) attention_mask = ops.where(sliding_window_mask, min_dtype, attention_mask) if attention_mask.shape[-1] <= 1: # when decoding attention_mask = attention_mask[:, :, :, -self.sliding_window :] @@ -493,11 +493,11 @@ class Gemma2PreTrainedModel(PreTrainedModel): def _init_weights(self, module): std = self.config.initializer_range if isinstance(module, nn.Linear): - nn.init.normal_(module.weight,mean=0.0, std=std) + nn.init.normal_(module.weight, mean=0.0, std=std) if module.bias is not None: nn.init.zeros_(module.bias) elif isinstance(module, nn.Embedding): - nn.init.normal_(module.weight,mean=0.0, std=std) + nn.init.normal_(module.weight, mean=0.0, std=std) if module.padding_idx is not None: module.weight[module.padding_idx] = 0 @@ -841,8 +841,8 @@ def prepare_inputs_for_generation( input_ids = input_ids[:, cache_position] if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = attention_mask.int().cumsum(-1) - 1 + position_ids = position_ids.masked_fill(attention_mask == 0, 1) if past_key_values: position_ids = position_ids[:, -input_ids.shape[1] :] # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s diff --git a/mindnlp/transformers/models/git/modeling_git.py b/mindnlp/transformers/models/git/modeling_git.py index b9bd22218..8c540639e 100644 --- a/mindnlp/transformers/models/git/modeling_git.py +++ b/mindnlp/transformers/models/git/modeling_git.py @@ -419,8 +419,8 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed. " + "Please use an appropriate `Cache` class" ) all_hidden_states = () if output_hidden_states else None diff --git a/mindnlp/transformers/models/gpt_neox/modeling_gpt_neox.py b/mindnlp/transformers/models/gpt_neox/modeling_gpt_neox.py index dfa873266..a93f61f89 100644 --- a/mindnlp/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/mindnlp/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -624,8 +624,8 @@ def forward( past_key_values = DynamicCache.from_legacy_cache(past_key_values) if not self.training: logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. 
" - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed. " + "Please use an appropriate `Cache` class" ) seq_length = inputs_embeds.shape[1] diff --git a/mindnlp/transformers/models/gptj/modeling_gptj.py b/mindnlp/transformers/models/gptj/modeling_gptj.py index f10724aea..e79f7fd39 100644 --- a/mindnlp/transformers/models/gptj/modeling_gptj.py +++ b/mindnlp/transformers/models/gptj/modeling_gptj.py @@ -408,7 +408,7 @@ def forward( past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " - "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "will be removed. Please convert your cache or use an appropriate `Cache` class " "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" ) diff --git a/mindnlp/transformers/models/instructblip/modeling_instructblip.py b/mindnlp/transformers/models/instructblip/modeling_instructblip.py index 7104feb1d..b62361070 100644 --- a/mindnlp/transformers/models/instructblip/modeling_instructblip.py +++ b/mindnlp/transformers/models/instructblip/modeling_instructblip.py @@ -1319,7 +1319,7 @@ def forward( logger.warning_once( "Expanding inputs for image tokens in InstructBLIP should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error." ) inputs_embeds = ops.cat([language_model_inputs, inputs_embeds], dim=1) attention_mask = ops.cat( @@ -1453,7 +1453,7 @@ def generate( logger.warning_once( "Expanding inputs for image tokens in InstructBLIP should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error." ) inputs_embeds = ops.cat([language_model_inputs, inputs_embeds], dim=1) attention_mask = ops.cat( diff --git a/mindnlp/transformers/models/llama/modeling_llama.py b/mindnlp/transformers/models/llama/modeling_llama.py index 13769a490..7d25fea73 100644 --- a/mindnlp/transformers/models/llama/modeling_llama.py +++ b/mindnlp/transformers/models/llama/modeling_llama.py @@ -150,7 +150,7 @@ def __init__( if config is None: logger.warning_once( "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " - "`config` argument. All other arguments will be removed in v4.45" + "`config` argument. All other arguments will be removed" ) self.rope_kwargs = { "rope_type": rope_type, @@ -217,31 +217,6 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, *args, **kwargs): - logger.warning_once( - "`LlamaLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use " - "`LlamaRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)." - ) - kwargs["rope_type"] = "linear" - super().__init__(*args, **kwargs) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, *args, **kwargs): - logger.warning_once( - "`LlamaDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use " - "`LlamaRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to " - "__init__)." - ) - kwargs["rope_type"] = "dynamic" - super().__init__(*args, **kwargs) - - def rotate_half(x): """Rotates half the hidden dims of the input.""" # x1 = x[..., : x.shape[-1] // 2] @@ -359,7 +334,6 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) - # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers) self.rotary_emb = LlamaRotaryEmbedding(config=self.config) def forward( @@ -371,7 +345,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[mindspore.Tensor] = None, - position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, # will become mandatory in v4.45 + position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, **kwargs, ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]: bsz, q_len, _ = hidden_states.shape @@ -407,8 +381,7 @@ def forward( logger.warning_once( "The attention layers in this model are transitioning from computing the RoPE embeddings internally " "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." + "`position_embeddings` (Tuple of tensors, containing cos and sin)." 
) cos, sin = self.rotary_emb(value_states, position_ids) else: @@ -483,7 +456,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[mindspore.Tensor] = None, - position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, # will become mandatory in v4.45 + position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, **kwargs, ) -> Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]: """ diff --git a/mindnlp/transformers/models/llava/modeling_llava.py b/mindnlp/transformers/models/llava/modeling_llava.py index 9417f4457..7f7c17aad 100644 --- a/mindnlp/transformers/models/llava/modeling_llava.py +++ b/mindnlp/transformers/models/llava/modeling_llava.py @@ -392,7 +392,6 @@ def forward( -target_length: ] - # TODO: @raushan retain only the new behavior after v4.47 else: special_image_mask = ( (input_ids == self.config.image_token_index) diff --git a/mindnlp/transformers/models/llava_next/modeling_llava_next.py b/mindnlp/transformers/models/llava_next/modeling_llava_next.py index 40c703492..716182133 100644 --- a/mindnlp/transformers/models/llava_next/modeling_llava_next.py +++ b/mindnlp/transformers/models/llava_next/modeling_llava_next.py @@ -702,7 +702,7 @@ def forward( "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error." ) if input_ids.shape[1] != 1: inputs_embeds = inputs_embeds.to(image_features.dtype) @@ -744,7 +744,6 @@ def forward( attention_mask = ops.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) position_ids = ops.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - # TODO: @raushan retain only the new behavior after v4.47 else: special_image_mask = ( (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) diff --git a/mindnlp/transformers/models/mistral/modeling_mistral.py b/mindnlp/transformers/models/mistral/modeling_mistral.py index c033ec9fc..9e699ee07 100644 --- a/mindnlp/transformers/models/mistral/modeling_mistral.py +++ b/mindnlp/transformers/models/mistral/modeling_mistral.py @@ -428,7 +428,7 @@ def forward( past_key_values = DynamicCache.from_legacy_cache(past_key_values) return_legacy_cache = True logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed. 
" "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/mindnlp/transformers/models/mllama/modeling_mllama.py b/mindnlp/transformers/models/mllama/modeling_mllama.py index 8be5e3433..1d7e32515 100644 --- a/mindnlp/transformers/models/mllama/modeling_mllama.py +++ b/mindnlp/transformers/models/mllama/modeling_mllama.py @@ -701,7 +701,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[mindspore.Tensor] = None, - position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, # will become mandatory in v4.45 + position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, ) -> Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]: """ Args: diff --git a/mindnlp/transformers/models/mpt/configuration_mpt.py b/mindnlp/transformers/models/mpt/configuration_mpt.py index 2b2dabfe6..b42f1d27a 100644 --- a/mindnlp/transformers/models/mpt/configuration_mpt.py +++ b/mindnlp/transformers/models/mpt/configuration_mpt.py @@ -29,8 +29,8 @@ class DeprecatedList(list): Represents a list class that issues a warning about deprecated features when accessed. This class inherits from the built-in list class and overrides the __getitem__ method to issue a warning message - when accessing elements. The warning message alerts users that archive maps are deprecated and will be removed in - version v4.40.0 as they are no longer relevant. It also provides a recommendation for an alternative method to + when accessing elements. The warning message alerts users that archive maps are deprecated and will be removed + as they are no longer relevant. It also provides a recommendation for an alternative method to retrieve all checkpoints for a given architecture using the `huggingface_hub` library with the `list_models` method. """ def __getitem__(self, item): diff --git a/mindnlp/transformers/models/olmo/modeling_olmo.py b/mindnlp/transformers/models/olmo/modeling_olmo.py index 4b6b2304b..5f34146bc 100644 --- a/mindnlp/transformers/models/olmo/modeling_olmo.py +++ b/mindnlp/transformers/models/olmo/modeling_olmo.py @@ -546,7 +546,7 @@ def forward( past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( "We detected that you are passing `past_key_values` as a tuple and this is deprecated.43. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" + "Please use an appropriate `Cache` class" ) if cache_position is None: diff --git a/mindnlp/transformers/models/persimmon/modeling_persimmon.py b/mindnlp/transformers/models/persimmon/modeling_persimmon.py index 54cd8127e..ee70d7b4e 100644 --- a/mindnlp/transformers/models/persimmon/modeling_persimmon.py +++ b/mindnlp/transformers/models/persimmon/modeling_persimmon.py @@ -576,8 +576,8 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed. 
" + "Please use an appropriate `Cache` class" ) if inputs_embeds is None: diff --git a/mindnlp/transformers/models/phi/modeling_phi.py b/mindnlp/transformers/models/phi/modeling_phi.py index 6fc1e156a..219eb09e3 100644 --- a/mindnlp/transformers/models/phi/modeling_phi.py +++ b/mindnlp/transformers/models/phi/modeling_phi.py @@ -586,8 +586,8 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed. " + "Please use an appropriate `Cache` class" ) if inputs_embeds is None: diff --git a/mindnlp/transformers/models/qwen2/modeling_qwen2.py b/mindnlp/transformers/models/qwen2/modeling_qwen2.py index 66d6a1aaa..0dfab72e3 100644 --- a/mindnlp/transformers/models/qwen2/modeling_qwen2.py +++ b/mindnlp/transformers/models/qwen2/modeling_qwen2.py @@ -523,7 +523,7 @@ def forward( past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( "We detected that you are passing `past_key_values` as a tuple and this is deprecated.43. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" + "Please use an appropriate `Cache` class" ) if inputs_embeds is None: diff --git a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 5623dc00b..b8a504afc 100644 --- a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -104,7 +104,7 @@ def __init__( if config is None: logger.warning_once( "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the " - "`config` argument. All other arguments will be removed in v4.46" + "`config` argument. All other arguments will be removed" ) self.rope_kwargs = { "rope_type": rope_type, @@ -515,7 +515,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[mindspore.Tensor] = None, - position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, # will become mandatory in v4.46 + position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]: bsz, q_len, _ = hidden_states.shape @@ -535,8 +535,7 @@ def forward( logger.warning_once( "The attention layers in this model are transitioning from computing the RoPE embeddings internally " "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." + "`position_embeddings` (Tuple of tensors, containing cos and sin)." 
) cos, sin = self.rotary_emb(value_states, position_ids) else: @@ -611,7 +610,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[mindspore.Tensor] = None, - position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, # will become mandatory in v4.46 + position_embeddings: Optional[Tuple[mindspore.Tensor, mindspore.Tensor]] = None, **kwargs, ) -> Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]: """ diff --git a/mindnlp/transformers/models/speech_to_text/__init__.py b/mindnlp/transformers/models/speech_to_text/__init__.py index 30bd78b36..f5701d09a 100644 --- a/mindnlp/transformers/models/speech_to_text/__init__.py +++ b/mindnlp/transformers/models/speech_to_text/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Speech2Text model init""" -from typing import TYPE_CHECKING from . import configuration_speech_to_text, modeling_speech_to_text, tokenization_speech_to_text,processing_speech_to_text,feature_extraction_speech_to_text from .configuration_speech_to_text import * @@ -20,19 +19,10 @@ from .tokenization_speech_to_text import * from .processing_speech_to_text import * from .feature_extraction_speech_to_text import * + __all__ = [] __all__.extend(configuration_speech_to_text.__all__) __all__.extend(modeling_speech_to_text.__all__) __all__.extend(tokenization_speech_to_text.__all__) __all__.extend(processing_speech_to_text.__all__) __all__.extend(feature_extraction_speech_to_text.__all__) - -_import_structure = { - "configuration_speech_to_text": ["Speech2TextConfig"], - "modeling_speech_to_text": [ - "Speech2TextForConditionalGeneration", - "Speech2TextModel", - "Speech2TextPreTrainedModel", - ], - "tokenization_speech_to_text": ["Speech2TextTokenizer"], -} diff --git a/mindnlp/transformers/models/speech_to_text_2/__init__.py b/mindnlp/transformers/models/speech_to_text_2/__init__.py new file mode 100644 index 000000000..55dafcbe0 --- /dev/null +++ b/mindnlp/transformers/models/speech_to_text_2/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Speech2Text2 model init""" +from . 
import configuration_speech_to_text_2, modeling_speech_to_text_2, tokenization_speech_to_text_2, \ + processing_speech_to_text_2 +from .configuration_speech_to_text_2 import * +from .modeling_speech_to_text_2 import * +from .tokenization_speech_to_text_2 import * +from .processing_speech_to_text_2 import * + +__all__ = [] +__all__.extend(configuration_speech_to_text_2.__all__) +__all__.extend(modeling_speech_to_text_2.__all__) +__all__.extend(tokenization_speech_to_text_2.__all__) +__all__.extend(processing_speech_to_text_2.__all__) diff --git a/mindnlp/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py b/mindnlp/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py new file mode 100644 index 000000000..dc40368f7 --- /dev/null +++ b/mindnlp/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py @@ -0,0 +1,133 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Speech2Text model configuration""" + +from ...configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class Speech2Text2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Speech2Text2ForCausalLM`]. It is used to + instantiate an Speech2Text2 model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Speech2Text2 + [facebook/s2t-wav2vec2-large-en-de](https://huggingface.co/facebook/s2t-wav2vec2-large-en-de) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the Speech2Text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Speech2TextModel`] + d_model (`int`, *optional*, defaults to 1024): + Dimensionality of the layers and the pooler layer. + decoder_layers (`int`, *optional*, defaults to 12): + Number of decoder layers. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the pooler. If string, `"gelu"`, `"relu"`, + `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + https://arxiv.org/abs/1909.11556>`__ for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + max_target_positions (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + + Example: + + ```python + >>> from transformers import Speech2Text2Config, Speech2Text2ForCausalLM + + >>> # Initializing a Speech2Text2 s2t_transformer_s style configuration + >>> configuration = Speech2Text2Config() + + >>> # Initializing a model (with random weights) from the s2t_transformer_s style configuration + >>> model = Speech2Text2ForCausalLM(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "speech_to_text_2" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=10000, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=4, + decoder_layerdrop=0.0, + use_cache=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=2, + scale_embedding=True, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + max_target_positions=1024, + **kwargs, + ): + self.vocab_size = vocab_size + self.d_model = d_model + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.decoder_layerdrop = decoder_layerdrop + self.use_cache = use_cache + self.num_hidden_layers = decoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.max_target_positions = max_target_positions + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + +__all__ = ['Speech2Text2Config'] diff --git a/mindnlp/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/mindnlp/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py new file mode 100644 index 000000000..bb94345ef --- /dev/null +++ b/mindnlp/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -0,0 +1,898 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MindSpore Speech2Text2 model.""" + +import copy +import math +from typing import Optional, Tuple, Union + +import mindspore +from mindnlp.core import nn, ops, no_grad, get_default_dtype +from mindnlp.core.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask +from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions +from ...modeling_utils import PreTrainedModel +from ....utils import logging +from .configuration_speech_to_text_2 import Speech2Text2Config + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "Speech2Text2Config" +_CHECKPOINT_FOR_DOC = "facebook/s2t-wav2vec2-large-en-de" + + +class Speech2Text2SinusoidalPositionalEmbedding(nn.Module): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__() + self.offset = 2 + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.make_weights(num_positions + self.offset, embedding_dim, padding_idx) + + def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx) + if hasattr(self, "weights"): + # in forward put the weights on the correct dtype and device of the param + emb_weights = emb_weights.to(dtype=self.weights.dtype) # pylint: disable=access-member-before-definition + + self.weights = nn.Parameter(emb_weights) + self.weights.requires_grad = False + + @staticmethod + def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + """ + Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the + description in Section 3.5 of "Attention Is All You Need". + """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = ops.exp(ops.arange(half_dim, dtype=mindspore.int64).float() * -emb) + emb = ops.arange(num_embeddings, dtype=mindspore.int64).float().unsqueeze(1) * emb.unsqueeze(0) + emb = ops.cat([ops.sin(emb), ops.cos(emb)], dim=1).view(num_embeddings, -1) + if embedding_dim % 2 == 1: + # zero pad + emb = ops.cat([emb, ops.zeros(num_embeddings, 1)], dim=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + return emb.to(get_default_dtype()) + + @no_grad() + def forward(self, input_ids: mindspore.Tensor, past_key_values_length: int = 0): + bsz, seq_len = input_ids.shape + # Create the position ids from the input token ids. Any padded tokens remain padded. 
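The padding-aware position numbering used here (implemented by `create_position_ids_from_input_ids` just below) starts counting at `padding_idx + 1` and leaves padded slots at `padding_idx`. A minimal NumPy re-implementation, with made-up token ids and the default `pad_token_id=1`, shows the expected output:

```python
import numpy as np

def make_position_ids(input_ids, padding_idx, past_len=0):
    # same rule as create_position_ids_from_input_ids: real tokens are numbered
    # padding_idx + 1, padding_idx + 2, ... while padded slots stay at padding_idx
    mask = (input_ids != padding_idx).astype(np.int64)
    incremental = (np.cumsum(mask, axis=1) + past_len) * mask
    return incremental + padding_idx

batch = np.array([[0, 5, 7, 2, 1, 1],
                  [0, 9, 2, 1, 1, 1]])   # 1 is the pad id, other ids are arbitrary
print(make_position_ids(batch, padding_idx=1))
# [[2 3 4 5 1 1]
#  [2 3 4 1 1 1]]
```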
+ position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + + # expand embeddings if needed + max_pos = self.padding_idx + 1 + seq_len + if max_pos > self.weights.shape[0]: + self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) + + return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1) + + def create_position_ids_from_input_ids( + self, input_ids: mindspore.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0 + ): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding + symbols are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: mindspore.Tensor x: + Returns: mindspore.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (ops.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +class Speech2Text2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[Speech2Text2Config] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: mindspore.Tensor, seq_len: int, bsz: int): + return ops.transpose(tensor.view(bsz, seq_len, self.num_heads, self.head_dim), 1, 2) + + def forward( + self, + hidden_states: mindspore.Tensor, + key_value_states: Optional[mindspore.Tensor] = None, + past_key_value: Optional[Tuple[mindspore.Tensor]] = None, + attention_mask: Optional[mindspore.Tensor] = None, + layer_head_mask: Optional[mindspore.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.shape + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = ops.cat([past_key_value[0], key_states], dim=2) + value_states = ops.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(mindspore.Tensor, mindspore.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(mindspore.Tensor, mindspore.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.shape[1] + attn_weights = ops.bmm(query_states, ops.transpose(key_states, 1, 2)) + + if attn_weights.shape != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.shape}" + ) + + if attention_mask is not None: + if attention_mask.shape != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.shape}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.shape != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.shape}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = ops.bmm(attn_probs, value_states) + + if attn_output.shape != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.shape}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = ops.transpose(attn_output, 1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. 
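The `_shape` / `view(*proj_shape)` / `bmm` round trip above is pure shape bookkeeping; the final `reshape` back to `(bsz, tgt_len, embed_dim)` simply undoes the head split. A NumPy sketch with toy sizes matching the config defaults (dummy zero tensors, no scaling, masking, or softmax) traces the shapes:

```python
import numpy as np

bsz, tgt_len, embed_dim, num_heads = 2, 5, 256, 4
head_dim = embed_dim // num_heads                          # 64

x = np.zeros((bsz, tgt_len, embed_dim))
# _shape: (bsz, seq, embed) -> (bsz, heads, seq, head_dim)
q = x.reshape(bsz, tgt_len, num_heads, head_dim).transpose(0, 2, 1, 3)
# proj_shape folds batch and heads together so bmm sees 3-D tensors
q = q.reshape(bsz * num_heads, tgt_len, head_dim)          # (8, 5, 64)
scores = np.einsum("bij,bkj->bik", q, q)                   # (8, 5, 5) attention scores
out = np.einsum("bik,bkj->bij", scores, q)                 # (8, 5, 64) weighted values
# undo the folding, then merge the heads back into embed_dim
out = out.reshape(bsz, num_heads, tgt_len, head_dim).transpose(0, 2, 1, 3)
out = out.reshape(bsz, tgt_len, embed_dim)                 # (2, 5, 256)
```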
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class Speech2Text2DecoderLayer(nn.Module): + def __init__(self, config: Speech2Text2Config): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = Speech2Text2Attention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + if config.is_decoder: + self.encoder_attn = Speech2Text2Attention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: mindspore.Tensor, + attention_mask: Optional[mindspore.Tensor] = None, + encoder_hidden_states: Optional[mindspore.Tensor] = None, + encoder_attention_mask: Optional[mindspore.Tensor] = None, + layer_head_mask: Optional[mindspore.Tensor] = None, + cross_attn_layer_head_mask: Optional[mindspore.Tensor] = None, + past_key_value: Optional[Tuple[mindspore.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (`mindspore.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`mindspore.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`mindspore.Tensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`mindspore.Tensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`mindspore.Tensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`mindspore.Tensor`): mask for cross-attention heads in a given layer of + size *(decoder_attention_heads,)*. + past_key_value (`Tuple(mindspore.Tensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class Speech2Text2PreTrainedModel(PreTrainedModel): + config_class = Speech2Text2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, (nn.Linear, nn.Conv1d)): + nn.init.normal_(module.weight.data, mean=0.0, std=std) + if module.bias is not None: + nn.init.zeros_(module.bias.data) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight.data, mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx] = 0 + + +class Speech2Text2Decoder(Speech2Text2PreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a [`Speech2Text2DecoderLayer`] + + Args: + config: Speech2Text2Config + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: Speech2Text2Config): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_target_positions + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = Speech2Text2SinusoidalPositionalEmbedding( + self.max_target_positions, + config.d_model, + self.padding_idx, + ) + + self.layers = nn.ModuleList([Speech2Text2DecoderLayer(config) for _ in range(config.decoder_layers)]) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`mindspore.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`mindspore.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention + on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + past_key_values (`tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if 
self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.shape[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.shape[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = ops.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. 
+ """ + + def __init__(self, config): + super().__init__(config) + self.decoder = Speech2Text2Decoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = Speech2Text2DecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + def forward( + self, + input_ids: Optional[mindspore.Tensor] = None, + attention_mask: Optional[mindspore.Tensor] = None, + encoder_hidden_states: Optional[mindspore.Tensor] = None, + encoder_attention_mask: Optional[mindspore.Tensor] = None, + head_mask: Optional[mindspore.Tensor] = None, + cross_attn_head_mask: Optional[mindspore.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None, + inputs_embeds: Optional[mindspore.Tensor] = None, + labels: Optional[mindspore.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[mindspore.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + Args: + input_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + cross_attn_head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import ( + ... SpeechEncoderDecoderModel, + ... Speech2Text2ForCausalLM, + ... Wav2Vec2Model, + ... Speech2Text2Config, + ... Wav2Vec2Config, + ... Wav2Vec2FeatureExtractor, + ... Speech2Text2Tokenizer, + ... ) + >>> from datasets import load_dataset + + >>> feature_extractor = Wav2Vec2FeatureExtractor() + >>> tokenizer = Speech2Text2Tokenizer.from_pretrained("facebook/s2t-wav2vec2-large-en-de") + + >>> encoder = Wav2Vec2Model(Wav2Vec2Config()) + >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config()) + >>> # init random speech2text model + + >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder) + >>> model.config.pad_token_id = tokenizer.pad_token_id + >>> model.config.decoder_start_token_id = tokenizer.bos_token_id + >>> # pre-process inputs and labels + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> inputs = feature_extractor( + ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" + ... 
) + >>> input_values = inputs.input_values + >>> decoder_input_ids = tokenizer(ds[0]["text"], return_tensors="pt").input_ids + >>> # compute loss + + >>> loss = model(inputs=input_values, labels=decoder_input_ids).loss + >>> # backprop loss + + >>> loss.backward() # doctest: +IGNORE_RESULT + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + if past_key_values: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + # first step, decoder_cached_states are empty + return { + "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "use_cache": use_cache, + } + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past), + ) + return reordered_past + +__all__ = [ + "Speech2Text2ForCausalLM", + "Speech2Text2PreTrainedModel", +] diff --git a/mindnlp/transformers/models/speech_to_text_2/processing_speech_to_text_2.py b/mindnlp/transformers/models/speech_to_text_2/processing_speech_to_text_2.py new file mode 100644 index 000000000..21762efd1 --- /dev/null +++ b/mindnlp/transformers/models/speech_to_text_2/processing_speech_to_text_2.py @@ -0,0 +1,118 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Speech processor class for Speech2Text2
+"""
+
+import warnings
+from contextlib import contextmanager
+
+from ...processing_utils import ProcessorMixin
+
+
+class Speech2Text2Processor(ProcessorMixin):
+    r"""
+    Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
+    a single processor.
+
+    [`Speech2Text2Processor`] offers all the functionalities of [`AutoFeatureExtractor`] and [`Speech2Text2Tokenizer`].
+    See the [`~Speech2Text2Processor.__call__`] and [`~Speech2Text2Processor.decode`] for more information.
+
+    Args:
+        feature_extractor (`AutoFeatureExtractor`):
+            An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer (`Speech2Text2Tokenizer`):
+            An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
+    """
+
+    feature_extractor_class = "AutoFeatureExtractor"
+    tokenizer_class = "Speech2Text2Tokenizer"
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+    def __call__(self, *args, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
+        [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to
+        Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the docstring of the above two
+        methods for more information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Speech2Text2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Speech2Text2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
+        to the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @contextmanager
+    def as_target_processor(self):
+        """
+        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
+        Speech2Text2.
+        """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call)."
+        )
+        self._in_target_context_manager = True
+        self.current_processor = self.tokenizer
+        yield
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+__all__ = ['Speech2Text2Processor']
diff --git a/mindnlp/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py b/mindnlp/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
new file mode 100644
index 000000000..abbbae586
--- /dev/null
+++ b/mindnlp/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
@@ -0,0 +1,251 @@
+# coding=utf-8
+# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for Speech2Text2."""
+
+import json
+import os
+from typing import Dict, List, Optional, Tuple
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "tokenizer_config_file": "tokenizer_config.json",
+    "merges_file": "merges.txt",
+}
+
+
+BPE_TOKEN_MERGES = "</w>"
+BPE_TOKEN_VOCAB = "@@ "
+
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
+    strings)
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+# Speech2Text2 has no max input length
+
+
+class Speech2Text2Tokenizer(PreTrainedTokenizer):
+    """
+    Constructs a Speech2Text2Tokenizer.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
+    the superclass for more information regarding such methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sentence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sentence token.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+
+        **kwargs
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        pad_token="<pad>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        do_lower_case=False,
+        merges_file=None,
+        **kwargs,
+    ):
+        self.do_lower_case = do_lower_case
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+        if merges_file is None:
+            logger.info(f"No merges files provided. {self.__class__.__name__} can only be used for decoding.")
+
+            self.bpe_ranks = None
+            self.cache = None
+        else:
+            with open(merges_file, encoding="utf-8") as merges_handle:
+                merges = merges_handle.read().split("\n")[:-1]
+
+            merges = [tuple(merge.split()[:2]) for merge in merges]
+            self.bpe_ranks = dict(zip(merges, range(len(merges))))
+            self.cache = {}
+        super().__init__(
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            do_lower_case=do_lower_case,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.decoder)
+
+    def get_vocab(self) -> Dict:
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    def bpe(self, token):
+        word = tuple(token[:-1]) + (token[-1] + BPE_TOKEN_MERGES,)
+        if token in self.cache:
+            return self.cache[token]
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        if word == "\n  " + BPE_TOKEN_MERGES:
+            word = "\n" + BPE_TOKEN_MERGES
+
+        if word.endswith(BPE_TOKEN_MERGES):
+            word = word.replace(BPE_TOKEN_MERGES, "")
+
+        word = word.replace(" ", BPE_TOKEN_VOCAB)
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text):
+        """Tokenize a string."""
+
+        if self.bpe_ranks is None:
+            raise ValueError(
+                "This tokenizer was instantiated without a `merges.txt` file, so"
+                " that it can only be used for decoding, not for encoding. "
+                "Make sure to provide `merges.txt` file at instantiation to enable "
+                "encoding."
+            )
+
+        if self.do_lower_case:
+            text = text.lower()
+
+        text = text.split()
+
+        split_tokens = []
+        for token in text:
+            if token:
+                split_tokens.extend(list(self.bpe(token).split(" ")))
+
+        return split_tokens
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """Converts a token (str) in an index (integer) using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) in a token (str) using the vocab."""
+        result = self.decoder.get(index, self.unk_token)
+        return result
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """
+        Converts a list of output tokens into a single string.
+ """ + # combine tokens + string = " ".join(tokens) + + # make sure @@ tokens are concatenated + string = "".join(string.split(BPE_TOKEN_VOCAB)) + + return string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merges_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + if self.bpe_ranks is None: + return (vocab_file,) + + with open(merges_file, "w", encoding="utf-8") as writer: + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return (vocab_file, merges_file) + +__all__ = ['Speech2Text2Tokenizer'] diff --git a/mindnlp/transformers/models/stablelm/modeling_stablelm.py b/mindnlp/transformers/models/stablelm/modeling_stablelm.py index aa61469f9..a09bcc2c2 100644 --- a/mindnlp/transformers/models/stablelm/modeling_stablelm.py +++ b/mindnlp/transformers/models/stablelm/modeling_stablelm.py @@ -615,7 +615,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed. " "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/mindnlp/transformers/models/starcoder2/modeling_starcoder2.py b/mindnlp/transformers/models/starcoder2/modeling_starcoder2.py index 71101131f..9562d764b 100644 --- a/mindnlp/transformers/models/starcoder2/modeling_starcoder2.py +++ b/mindnlp/transformers/models/starcoder2/modeling_starcoder2.py @@ -489,7 +489,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed. 
" "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) @@ -725,7 +725,6 @@ def forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() loss = None diff --git a/mindnlp/transformers/models/vipllava/modeling_vipllava.py b/mindnlp/transformers/models/vipllava/modeling_vipllava.py index fad5ac62d..51195aa01 100644 --- a/mindnlp/transformers/models/vipllava/modeling_vipllava.py +++ b/mindnlp/transformers/models/vipllava/modeling_vipllava.py @@ -342,7 +342,7 @@ def forward( logger.warning_once( "Expanding inputs for image tokens in VipLLaVa should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error." ) # prefill stage vs decoding stage (legacy behavior copied) if input_ids.shape[1] != 1: @@ -378,7 +378,6 @@ def forward( attention_mask = ops.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) position_ids = ops.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - # TODO: @raushan retain only the new behavior after v4.47 else: special_image_mask = ( (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) diff --git a/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py b/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py index bc08c93b1..4cfc650d6 100644 --- a/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1976,7 +1976,7 @@ def forward( hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + hidden_states = ops.sum(hidden_states * norm_weights.view(-1, 1, 1), dim=1) else: hidden_states = outputs[0] diff --git a/mindnlp/transformers/models/whisper/generation_whisper.py b/mindnlp/transformers/models/whisper/generation_whisper.py index 7a0cd0fac..49a43a427 100644 --- a/mindnlp/transformers/models/whisper/generation_whisper.py +++ b/mindnlp/transformers/models/whisper/generation_whisper.py @@ -249,7 +249,7 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec weights = _median_filter(weights, self.config.median_filter_width) # Average the different cross-attention heads. - weights = weights.mean(dim=1) + weights = ops.mean(weights, dim=1) # Perform dynamic time warping on each element of the batch. for batch_idx in range(batch_size): @@ -263,11 +263,11 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec matrix = _median_filter(matrix, self.config.median_filter_width) # Average the different cross-attention heads. 
- matrix = matrix.mean(dim=0) + matrix = ops.mean(matrix, dim=0) else: matrix = weights[batch_idx] - text_indices, time_indices = _dynamic_time_warping(-matrix.double().asnumpy()) + text_indices, time_indices = _dynamic_time_warping(-matrix.astype(mindspore.float64).asnumpy()) jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool) jump_times = time_indices[jumps] * time_precision timestamps[batch_idx, 1:] = mindspore.tensor(jump_times) diff --git a/mindnlp/transformers/models/whisper/modeling_whisper.py b/mindnlp/transformers/models/whisper/modeling_whisper.py index 352ace028..6c45bacad 100644 --- a/mindnlp/transformers/models/whisper/modeling_whisper.py +++ b/mindnlp/transformers/models/whisper/modeling_whisper.py @@ -917,7 +917,7 @@ def forward( elif not isinstance(past_key_values, EncoderDecoderCache): return_legacy_cache = True logger.warning_once( - "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. " + "Passing a tuple of `past_key_values` is deprecated and will be removed. " "You should pass an instance of `EncoderDecoderCache` instead, e.g. " "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." ) diff --git a/mindnlp/transformers/pipelines/__init__.py b/mindnlp/transformers/pipelines/__init__.py index e9ddf7e26..511cbb2fb 100644 --- a/mindnlp/transformers/pipelines/__init__.py +++ b/mindnlp/transformers/pipelines/__init__.py @@ -42,7 +42,7 @@ is_mindspore_available, logging, ) -# from .audio_classification import AudioClassificationPipeline +from .audio_classification import AudioClassificationPipeline from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline from .base import ( ArgumentHandler, @@ -135,12 +135,12 @@ "text-to-speech": "text-to-audio", } SUPPORTED_TASKS = { - # "audio-classification": { - # "impl": AudioClassificationPipeline, - # "ms": (AutoModelForAudioClassification,) if is_mindspore_available() else (), - # "default": {"model": {"ms": ("superb/wav2vec2-base-superb-ks", "372e048")}}, - # "type": "audio", - # }, + "audio-classification": { + "impl": AudioClassificationPipeline, + "ms": (AutoModelForAudioClassification,) if is_mindspore_available() else (), + "default": {"model": {"ms": ("superb/wav2vec2-base-superb-ks", "372e048")}}, + "type": "audio", + }, "automatic-speech-recognition": { "impl": AutomaticSpeechRecognitionPipeline, "ms": (AutoModelForCTC, AutoModelForSpeechSeq2Seq) if is_mindspore_available() else (), diff --git a/mindnlp/transformers/pipelines/audio_classification.py b/mindnlp/transformers/pipelines/audio_classification.py new file mode 100644 index 000000000..b88fc245c --- /dev/null +++ b/mindnlp/transformers/pipelines/audio_classification.py @@ -0,0 +1,220 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""audio classification""" +import subprocess +from typing import Union + +import numpy as np +import requests + +from ...utils import logging +from .base import Pipeline + +from ..models.auto.modeling_auto import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array: + """ + Helper function to read an audio file through ffmpeg. + """ + ar = f"{sampling_rate}" + ac = "1" + format_for_conversion = "f32le" + ffmpeg_command = [ + "ffmpeg", + "-i", + "pipe:0", + "-ac", + ac, + "-ar", + ar, + "-f", + format_for_conversion, + "-hide_banner", + "-loglevel", + "quiet", + "pipe:1", + ] + + try: + ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + except FileNotFoundError: + raise ValueError("ffmpeg was not found but is required to load audio files from filename") + output_stream = ffmpeg_process.communicate(bpayload) + out_bytes = output_stream[0] + + audio = np.frombuffer(out_bytes, np.float32) + if audio.shape[0] == 0: + raise ValueError("Malformed soundfile") + return audio + + +class AudioClassificationPipeline(Pipeline): + """ + Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of a + raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio + formats. + + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="superb/wav2vec2-base-superb-ks") + >>> classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac") + [{'score': 0.997, 'label': '_unknown_'}, {'score': 0.002, 'label': 'left'}, {'score': 0.0, 'label': 'yes'}, {'score': 0.0, 'label': 'down'}, {'score': 0.0, 'label': 'stop'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + + This pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"audio-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=audio-classification). + """ + + def __init__(self, *args, **kwargs): + # Default, might be overriden by the model.config. + kwargs["top_k"] = 5 + super().__init__(*args, **kwargs) + + self.check_model_type(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES) + + def __call__( + self, + inputs: Union[np.ndarray, bytes, str], + **kwargs, + ): + """ + Classify the sequence(s) given as inputs. See the [`AutomaticSpeechRecognitionPipeline`] documentation for more + information. + + Args: + inputs (`np.ndarray` or `bytes` or `str` or `dict`): + The inputs is either : + - `str` that is the filename of the audio file, the file will be read at the correct sampling rate + to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system. + - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the + same way. + - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`) + Raw audio at the correct sampling rate (no further check will be done) + - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this + pipeline do the resampling. The dict must be either be in the format `{"sampling_rate": int, + "raw": np.array}`, or `{"sampling_rate": int, "array": np.array}`, where the key `"raw"` or + `"array"` is used to denote the raw audio waveform. 
+ top_k (`int`, *optional*, defaults to None): + The number of top labels that will be returned by the pipeline. If the provided number is `None` or + higher than the number of labels available in the model configuration, it will default to the number of + labels. + function_to_apply(`str`, *optional*, defaults to "softmax"): + The function to apply to the model output. By default, the pipeline will apply the softmax function to + the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's + built-in `None` will default to "softmax", so you need to pass the string "none" to disable any + post-processing. + + Return: + A list of `dict` with the following keys: + + - **label** (`str`) -- The label predicted. + - **score** (`float`) -- The corresponding probability. + """ + return super().__call__(inputs, **kwargs) + + def _sanitize_parameters(self, top_k=None, function_to_apply=None, **kwargs): + # No parameters on this pipeline right now + postprocess_params = {} + if top_k is not None: + if top_k > self.model.config.num_labels: + top_k = self.model.config.num_labels + postprocess_params["top_k"] = top_k + if function_to_apply is not None: + if function_to_apply not in ["softmax", "sigmoid", "none"]: + raise ValueError( + f"Invalid value for `function_to_apply`: {function_to_apply}. " + "Valid options are ['softmax', 'sigmoid', 'none']" + ) + postprocess_params["function_to_apply"] = function_to_apply + else: + postprocess_params["function_to_apply"] = "softmax" + return {}, {}, postprocess_params + + def preprocess(self, inputs): + if isinstance(inputs, str): + if inputs.startswith("http://") or inputs.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + inputs = requests.get(inputs, timeout=10).content + else: + with open(inputs, "rb") as f: + inputs = f.read() + + if isinstance(inputs, bytes): + inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate) + + if isinstance(inputs, dict): + # Accepting `"array"` which is the key defined in `datasets` for + # better integration + if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)): + raise ValueError( + "When passing a dictionary to AudioClassificationPipeline, the dict needs to contain a " + '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, ' + "containing the sampling_rate associated with that array" + ) + + _inputs = inputs.pop("raw", None) + if _inputs is None: + # Remove path which will not be used from `datasets`. 
+                inputs.pop("path", None)
+                _inputs = inputs.pop("array", None)
+            in_sampling_rate = inputs.pop("sampling_rate")
+            inputs = _inputs
+            if in_sampling_rate != self.feature_extractor.sampling_rate:
+                from mindspore.dataset.audio import Resample
+
+                resample = Resample(in_sampling_rate, self.feature_extractor.sampling_rate)
+                inputs = resample(inputs)
+
+        if not isinstance(inputs, np.ndarray):
+            raise TypeError("We expect a numpy ndarray as input")
+        if len(inputs.shape) != 1:
+            raise ValueError("We expect a single channel audio input for AudioClassificationPipeline")
+
+        processed = self.feature_extractor(
+            inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="ms"
+        )
+        return processed
+
+    def _forward(self, model_inputs):
+        model_outputs = self.model(**model_inputs)
+        return model_outputs
+
+    def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"):
+        if function_to_apply == "softmax":
+            probs = model_outputs.logits[0].softmax(-1)
+        elif function_to_apply == "sigmoid":
+            probs = model_outputs.logits[0].sigmoid()
+        else:
+            probs = model_outputs.logits[0]
+        scores, ids = probs.topk(top_k)
+
+        scores = scores.tolist()
+        ids = ids.tolist()
+
+        labels = [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
+
+        return labels
diff --git a/mindnlp/transformers/pipelines/audio_utils.py b/mindnlp/transformers/pipelines/audio_utils.py
index 07541a69e..6ca33ae09 100644
--- a/mindnlp/transformers/pipelines/audio_utils.py
+++ b/mindnlp/transformers/pipelines/audio_utils.py
@@ -64,9 +64,35 @@ def ffmpeg_microphone(
     sampling_rate: int,
     chunk_length_s: float,
     format_for_conversion: str = "f32le",
+    ffmpeg_input_device: Optional[str] = None,
+    ffmpeg_additional_args: Optional[list[str]] = None,
 ):
     """
-    Helper function to read raw microphone data.
+    Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
+    input device is specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and
+    'dshow' on Windows.
+
+    Arguments:
+        sampling_rate (`int`):
+            The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
+            avoid resampling later.
+        chunk_length_s (`float` or `int`):
+            The length of the maximum chunk of audio to be returned.
+        format_for_conversion (`str`, defaults to `f32le`):
+            The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
+            could also be used.
+        ffmpeg_input_device (`str`, *optional*):
+            The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
+            the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
+            for how to specify and list input devices.
+        ffmpeg_additional_args (`list[str]`, *optional*):
+            Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
+            process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
+            with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]).
+
+    Returns:
+        A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
+        `int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
     """
     ar = f"{sampling_rate}"
     ac = "1"
@@ -78,15 +104,18 @@ def ffmpeg_microphone(
         raise ValueError(f"Unhandled format `{format_for_conversion}`. 
Please use `s16le` or `f32le`") system = platform.system() + if system == "Linux": format_ = "alsa" - input_ = "default" + input_ = ffmpeg_input_device or "default" elif system == "Darwin": format_ = "avfoundation" - input_ = ":0" + input_ = ffmpeg_input_device or ":default" elif system == "Windows": format_ = "dshow" - input_ = _get_microphone_name() + input_ = ffmpeg_input_device or _get_microphone_name() + + ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args ffmpeg_command = [ "ffmpeg", @@ -107,9 +136,13 @@ def ffmpeg_microphone( "quiet", "pipe:1", ] + + ffmpeg_command.extend(ffmpeg_additional_args) + chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample iterator = _ffmpeg_stream(ffmpeg_command, chunk_len) - yield from iterator + for item in iterator: + yield item def ffmpeg_microphone_live( @@ -118,11 +151,14 @@ def ffmpeg_microphone_live( stream_chunk_s: Optional[int] = None, stride_length_s: Optional[Union[Tuple[float, float], float]] = None, format_for_conversion: str = "f32le", + ffmpeg_input_device: Optional[str] = None, + ffmpeg_additional_args: Optional[list[str]] = None, ): """ - Helper function to read audio from the microphone file through ffmpeg. This will output `partial` overlapping - chunks starting from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of - striding to avoid errors on the "sides" of the various chunks. + Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting + from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of striding to avoid + errors on the "sides" of the various chunks. The default input device will be used unless another input device is + specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and 'dshow' on Windows. Arguments: sampling_rate (`int`): @@ -130,33 +166,46 @@ def ffmpeg_microphone_live( avoid resampling later. chunk_length_s (`float` or `int`): The length of the maximum chunk of audio to be sent returned. This includes the eventual striding. - stream_chunk_s (`float` or `int`) + stream_chunk_s (`float` or `int`): The length of the minimal temporary audio to be returned. - stride_length_s (`float` or `int` or `(float, float)`, *optional*, defaults to `None`) + stride_length_s (`float` or `int` or `(float, float)`, *optional*): The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of an audio sample but without using that part to actually make the prediction. Setting this does not change the length of the chunk. - format_for_conversion (`str`, defalts to `f32le`) + format_for_conversion (`str`, *optional*, defaults to `f32le`): The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le` could also be used. + ffmpeg_input_device (`str`, *optional*): + The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset, + the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices` + for how to specify and list input devices. + ffmpeg_additional_args (`list[str]`, *optional*): + Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background + process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. 
If passing in flags + with multiple arguments, use the following convention (eg ["flag", "arg1", "arg2]). - Returns: + Return: A generator yielding dictionaries of the following form - `{"sampling_rate": int, "raw": np.array(), "partial" bool}` With optionnally a `"stride" (int, int)` key if + `{"sampling_rate": int, "raw": np.array(), "partial" bool}` With optionally a `"stride" (int, int)` key if `stride_length_s` is defined. `stride` and `raw` are all expressed in `samples`, and `partial` is a boolean saying if the current yield item is a whole chunk, or a partial temporary result to be later replaced by another larger chunk. - - """ if stream_chunk_s is not None: chunk_s = stream_chunk_s else: chunk_s = chunk_length_s - microphone = ffmpeg_microphone(sampling_rate, chunk_s, format_for_conversion=format_for_conversion) + microphone = ffmpeg_microphone( + sampling_rate, + chunk_s, + format_for_conversion=format_for_conversion, + ffmpeg_input_device=ffmpeg_input_device, + ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args, + ) + if format_for_conversion == "s16le": dtype = np.int16 size_of_sample = 2 @@ -249,7 +298,7 @@ def _get_microphone_name(): command = ["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", ""] try: - ffmpeg_devices = subprocess.run(command, text=True, stderr=subprocess.PIPE, encoding="utf-8", check=False) + ffmpeg_devices = subprocess.run(command, text=True, stderr=subprocess.PIPE, encoding="utf-8") # pylint: disable=subprocess-run-check microphone_lines = [line for line in ffmpeg_devices.stderr.splitlines() if "(audio)" in line] if microphone_lines: diff --git a/mindnlp/transformers/pipelines/automatic_speech_recognition.py b/mindnlp/transformers/pipelines/automatic_speech_recognition.py index 6e8a2fe64..8147d3aea 100644 --- a/mindnlp/transformers/pipelines/automatic_speech_recognition.py +++ b/mindnlp/transformers/pipelines/automatic_speech_recognition.py @@ -13,20 +13,17 @@ # limitations under the License. # ============================================================================ """ASR pipeline""" +import warnings from collections import defaultdict from typing import TYPE_CHECKING, Dict, Optional, Union import numpy as np import requests -from mindspore import ops -from mindspore.dataset.audio import Resample - -from mindnlp.utils import logging +from ..tokenization_utils import PreTrainedTokenizer +from ...utils import is_mindspore_available, logging from .audio_utils import ffmpeg_read from .base import ChunkPipeline -from ..tokenization_utils import PreTrainedTokenizer -from ..models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES if TYPE_CHECKING: @@ -37,6 +34,12 @@ logger = logging.get_logger(__name__) +if is_mindspore_available(): + import mindspore + from mindnlp.core import ops + + from ..models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES + def rescale_stride(stride, ratio): """ @@ -59,24 +62,6 @@ def rescale_stride(stride, ratio): def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, dtype=None): - """ - Chunks the input data and processes each chunk using a specified feature extractor. - - Args: - inputs (ndarray): The input data to be chunked and processed. - feature_extractor (callable): The function used to extract features from each chunk. - chunk_len (int): The length of each chunk to be extracted. - stride_left (int): The amount of overlap on the left side of each chunk. 
- stride_right (int): The amount of overlap on the right side of each chunk. - dtype (dtype, optional): The data type to convert the processed data to. - - Returns: - None - - Raises: - ValueError: If the input data is not in the expected format or if there are issues with processing the chunks. - AttributeError: If the feature extractor does not have the required attributes or methods. - """ inputs_len = inputs.shape[0] step = chunk_len - stride_left - stride_right for chunk_start_idx in range(0, inputs_len, step): @@ -86,8 +71,7 @@ def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, if dtype is not None: processed = processed.to(dtype=dtype) _stride_left = 0 if chunk_start_idx == 0 else stride_left - # all right strides must be full, otherwise it is the last item - is_last = chunk_end_idx > inputs_len if stride_right > 0 else chunk_end_idx >= inputs_len + is_last = chunk_end_idx >= inputs_len _stride_right = 0 if is_last else stride_right chunk_len = chunk.shape[0] @@ -99,22 +83,6 @@ def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, def _fast_find_longest_common_sequence(sequence_left, sequence_right): - """ - Finds the longest common sequence between two given sequences. - - Args: - sequence_left (list): The first sequence to compare. - sequence_right (list): The second sequence to compare. - - Returns: - tuple: A tuple containing the index of the starting element of the longest common sequence in 'sequence_left', - the index of the starting element of the longest common sequence in 'sequence_right', - and the length of the longest common sequence. - - Raises: - None. - - """ seq_len_left = len(sequence_left) seq_len_right = len(sequence_right) counter = [[0] * (seq_len_right + 1) for _ in range(seq_len_left + 1)] @@ -124,7 +92,8 @@ def _fast_find_longest_common_sequence(sequence_left, sequence_right): if sequence_left[i] == sequence_right[j]: previous_counter = counter[i][j] + 1 counter[i + 1][j + 1] = previous_counter - longest = max(longest, previous_counter) + if previous_counter > longest: + longest = previous_counter counter = np.array(counter) # we return the idx of the first element of the longest common sequence in the left sequence @@ -134,30 +103,6 @@ def _fast_find_longest_common_sequence(sequence_left, sequence_right): def _find_longest_common_sequence(sequences, tokenizer): - """ - Finds the longest common sequence among multiple sequences of tokens. - - Args: - sequences (List[Tuple[np.ndarray, Any]]): - A list of tuples, where each tuple contains a sequence of tokens as a numpy array and any additional - information associated with the sequence. The sequences are expected to be preprocessed and tokenized. - tokenizer (Any): - The tokenizer object used for tokenization. It should have an attribute 'all_special_ids' which contains - a list of special token IDs to be excluded from the sequences. - - Returns: - np.ndarray: A numpy array representing the longest common sequence found among the input sequences. - The array contains the token IDs of the common sequence. - - Raises: - None - - Note: - The function uses a sliding window approach to find the longest common sequence. - The sequences are compared token by token, excluding any special tokens defined by the tokenizer. - The function returns the longest common sequence found among all input sequences. - - """ # TODO Use a faster algorithm this can probably be done in O(n) # using suffix array. # It might be tedious to do because of fault tolerance. 
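The hunks above trim the explanatory docstrings around `chunk_iter` and the longest-common-sequence helpers, so a short reminder of the chunking rule they rely on may help readers of this patch: each window advances by `chunk_len - stride_left - stride_right` samples, the overlap is recorded as a `(chunk_len, left, right)` stride triple, and the last window is flagged once its end reaches the input length. The sketch below illustrates that rule in plain NumPy; the helper name `iter_chunks` and the sizes used are made up for the example and are not part of this patch.

```python
import numpy as np

def iter_chunks(inputs: np.ndarray, chunk_len: int, stride_left: int, stride_right: int):
    # Windows overlap by the stride amounts, so each step advances by
    # chunk_len - stride_left - stride_right samples, mirroring chunk_iter above.
    step = chunk_len - stride_left - stride_right
    for start in range(0, inputs.shape[0], step):
        chunk = inputs[start : start + chunk_len]
        is_last = start + chunk_len >= inputs.shape[0]
        left = 0 if start == 0 else stride_left
        right = 0 if is_last else stride_right
        yield {"stride": (chunk.shape[0], left, right), "is_last": is_last}
        if is_last:
            break

# Illustrative sizes: 10 s of 16 kHz audio, 5 s windows, 1 s of context on each side.
audio = np.zeros(160_000, dtype=np.float32)
for item in iter_chunks(audio, chunk_len=80_000, stride_left=16_000, stride_right=16_000):
    print(item)
```

Downstream, `rescale_stride` converts these sample-space stride triples into the model's output frame space, so the overlapping context regions can be discarded when the decoded chunks are stitched back together.
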
@@ -190,13 +135,14 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): to support multiple audio formats Example: - ```python - >>> from transformers import pipeline - ... - >>> transcriber = pipeline(model="openai/whisper-base") - >>> transcriber("https://hf-mirror.com/datasets/Narsil/asr_dummy/resolve/main/1.flac") - {'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'} - ``` + + ```python + >>> from transformers import pipeline + + >>> transcriber = pipeline(model="openai/whisper-base") + >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac") + {'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'} + ``` Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) @@ -219,7 +165,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): For more information on how to effectively use `chunk_length_s`, please have a look at the [ASR chunking - blog post](https://hf-mirror.com/blog/asr-chunking). + blog post](https://huggingface.co/blog/asr-chunking). @@ -231,7 +177,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): For more information on how to effectively use `stride_length_s`, please have a look at the [ASR chunking - blog post](https://hf-mirror.com/blog/asr-chunking). + blog post](https://huggingface.co/blog/asr-chunking). @@ -240,14 +186,12 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is provided. - device (Union[`int`, `torch.device`], *optional*): - Device ordinal for CPU/GPU supports. Setting this to `None` will leverage CPU, a positive will run the - model on the associated CUDA device id. - ms_dtype (Union[`int`, `torch.dtype`], *optional*): + ms_dtype (Union[`int`, `mindspore.dtype`], *optional*): The data-type (dtype) of the computation. Setting this to `None` will use float32 precision. Set to - `torch.float16` or `torch.bfloat16` to use half-precision in the respective dtypes. + `mindspore.float16` or `mindspore.bfloat16` to use half-precision in the respective dtypes. """ + def __init__( self, model: "PreTrainedModel", @@ -257,25 +201,6 @@ def __init__( ms_dtype: Optional[str] = None, **kwargs, ): - """ - This method initializes an instance of AutomaticSpeechRecognitionPipeline. - - Args: - self: The instance of the class. - model (PreTrainedModel): The pre-trained model used for speech recognition. - feature_extractor (Union[SequenceFeatureExtractor, str]): The feature extractor used for processing - input data. It can be an instance of SequenceFeatureExtractor class or a string. - tokenizer (Optional[PreTrainedTokenizer]): The tokenizer used for tokenizing input data. - decoder (Optional[Union[BeamSearchDecoderCTC, str]]): The decoder used for decoding the model predictions. - It can be an instance of BeamSearchDecoderCTC class or a string. - ms_dtype (Optional[str]): The data type used for processing input data. - - Returns: - None. 
- - Raises: - None - """ # set the model type so we can check we have the right pre- and post-processing parameters if model.config.model_type == "whisper": self.type = "seq2seq_whisper" @@ -304,57 +229,55 @@ def __call__( Args: inputs (`np.ndarray` or `bytes` or `str` or `dict`): - - `str` that is either the filename of a local audio file, or a public URL address to download the - audio file. The file will be read at the correct sampling rate to get the waveform using - *ffmpeg*. This requires *ffmpeg* to be installed on the system. - - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the same way. - - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`) - Raw audio at the correct sampling rate (no further check will be done) - - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this - pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "raw": - np.array}` with optionally a `"stride": (left: int, right: int)` than can ask the pipeline to - treat the first `left` samples and last `right` samples to be ignored in decoding (but used at - inference to provide more context to the model). Only use `stride` with CTC models. + The inputs is either : + - `str` that is either the filename of a local audio file, or a public URL address to download the + audio file. The file will be read at the correct sampling rate to get the waveform using + *ffmpeg*. This requires *ffmpeg* to be installed on the system. + - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the + same way. + - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`) + Raw audio at the correct sampling rate (no further check will be done) + - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this + pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "raw": + np.array}` with optionally a `"stride": (left: int, right: int)` than can ask the pipeline to + treat the first `left` samples and last `right` samples to be ignored in decoding (but used at + inference to provide more context to the model). Only use `stride` with CTC models. return_timestamps (*optional*, `str` or `bool`): - - Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for + Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for other sequence-to-sequence models. - - For CTC models, timestamps can take one of two formats: + For CTC models, timestamps can take one of two formats: - `"char"`: the pipeline will return timestamps along the text for every character in the text. For - instance, if you get `[{"text": "h", "timestamp": (0.5, 0.6)}, {"text": "i", "timestamp": (0.7, - 0.9)}]`, then it means the model predicts that the letter "h" was spoken after `0.5` and before - `0.6` seconds. + instance, if you get `[{"text": "h", "timestamp": (0.5, 0.6)}, {"text": "i", "timestamp": (0.7, + 0.9)}]`, then it means the model predicts that the letter "h" was spoken after `0.5` and before + `0.6` seconds. - `"word"`: the pipeline will return timestamps along the text for every word in the text. For - instance, if you get `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text": "there", "timestamp": - (1.0, 1.5)}]`, then it means the model predicts that the word "hi" was spoken after `0.5` and - before `0.9` seconds. 
- - For the Whisper model, timestamps can take one of two formats: + instance, if you get `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text": "there", "timestamp": + (1.0, 1.5)}]`, then it means the model predicts that the word "hi" was spoken after `0.5` and + before `0.9` seconds. + For the Whisper model, timestamps can take one of two formats: - `"word"`: same as above for word-level CTC timestamps. Word-level timestamps are predicted - through the *dynamic-time warping (DTW)* algorithm, an approximation to word-level timestamps - by inspecting the cross-attention weights. + through the *dynamic-time warping (DTW)* algorithm, an approximation to word-level timestamps + by inspecting the cross-attention weights. - `True`: the pipeline will return timestamps along the text for *segments* of words in the text. - For instance, if you get `[{"text": " Hi there!", "timestamp": (0.5, 1.5)}]`, then it means the - model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. - Note that a segment of text refers to a sequence of one or more words, rather than individual - words as with word-level timestamps. + For instance, if you get `[{"text": " Hi there!", "timestamp": (0.5, 1.5)}]`, then it means the + model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. + Note that a segment of text refers to a sequence of one or more words, rather than individual + words as with word-level timestamps. generate_kwargs (`dict`, *optional*): The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a complete overview of generate, check the [following - guide](https://hf-mirror.com/docs/transformers/en/main_classes/text_generation). - max_new_tokens (`int`, *optional*): - The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. - - Returns: - `Dict`: - A dictionary with the following keys: + guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). + Return: + `Dict`: A dictionary with the following keys: - **text** (`str`): The recognized text. - **chunks** (*optional(, `List[Dict]`) - When using `return_timestamps`, the `chunks` will become a list containing all the various text - chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text": - "there", "timestamp": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing - `"".join(chunk["text"] for chunk in output["chunks"])`. + When using `return_timestamps`, the `chunks` will become a list containing all the various text + chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text": + "there", "timestamp": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing + `"".join(chunk["text"] for chunk in output["chunks"])`. """ return super().__call__(inputs, **kwargs) @@ -369,38 +292,6 @@ def _sanitize_parameters( generate_kwargs=None, max_new_tokens=None, ): - """ - This method '_sanitize_parameters' in the class 'AutomaticSpeechRecognitionPipeline' is responsible for - sanitizing and validating input parameters for the Automatic Speech Recognition pipeline. - - Args: - self (object): The instance of the class. - chunk_length_s (float, optional): The length of each audio chunk in seconds. If provided, it is stored in - the preprocess_params dictionary. Note: Experimental with 'seq2seq' models. - stride_length_s (float, optional): The stride length between consecutive audio chunks in seconds. 
- Stored in preprocess_params. - ignore_warning (bool, optional): If True, ignores experimental warning when using 'chunk_length_s' - with 'seq2seq' models. - decoder_kwargs (dict, optional): Additional keyword arguments for the decoder. Stored in postprocess_params. - return_timestamps (str or bool, optional): Specifies the type of timestamps to return. Restrictions - based on the model type. - return_language (str, optional): Specifies whether to return language information. - Only available for 'seq2seq_whisper' models. - generate_kwargs (dict, optional): Additional keyword arguments for model generation. - If 'max_new_tokens' is defined here, it should not be repeated in the argument list. - max_new_tokens (int, optional): Maximum number of new tokens to generate. Stored in forward_params. - - Returns: - tuple: - A tuple containing three dictionaries - preprocess_params, forward_params, and postprocess_params. - These dictionaries hold sanitized parameters for different stages of the ASR pipeline. - - Raises: - ValueError: If 'max_new_tokens' is defined both as an argument and inside 'generate_kwargs'. - ValueError: If attempting to return timestamps not supported by the model type. - ValueError: If language information is requested for a model other than 'seq2seq_whisper'. - Warning: Experimental warning message when using 'chunk_length_s' with 'seq2seq' models. - """ # No parameters on this pipeline right now preprocess_params = {} if chunk_length_s is not None: @@ -417,6 +308,10 @@ def _sanitize_parameters( forward_params = defaultdict(dict) if max_new_tokens is not None: + warnings.warn( + "`max_new_tokens` is deprecated and will be removed in version 4.49 of Transformers. To remove this warning, pass `max_new_tokens` as a key inside `generate_kwargs` instead.", + FutureWarning, + ) forward_params["max_new_tokens"] = max_new_tokens if generate_kwargs is not None: if max_new_tokens is not None and "max_new_tokens" in generate_kwargs: @@ -455,38 +350,11 @@ def _sanitize_parameters( return preprocess_params, forward_params, postprocess_params def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): - """ - This method preprocesses the input data for the AutomaticSpeechRecognitionPipeline. - - Args: - self (object): The instance of the AutomaticSpeechRecognitionPipeline class. - inputs (str, bytes, dict, or np.ndarray): - The input data, which can be in the form of a file path (str), binary data (bytes), - a dictionary containing audio data and its properties, or a numpy array representing the audio. - chunk_length_s (float): - The length of chunks into which the audio data should be divided for processing, in seconds. - Defaults to 0. - stride_length_s (float or list): - The length of stride for chunking the audio data, in seconds. - - - If a single value is provided, it is applied to both the left and right strides. - - If a list is provided, the first value represents the left stride and the second value represents - the right stride. - - If not provided, it defaults to chunk_length_s / 6. - - Returns: - None: This method yields processed chunks of the input audio data and does not return a single value. - - Raises: - ValueError: If the input data does not meet the expected format or requirements, - such as missing keys in the dictionary input, incorrect stride length, or invalid chunk length. - TypeError: If the type of the input does not match the expected type. 
- """ if isinstance(inputs, str): if inputs.startswith("http://") or inputs.startswith("https://"): # We need to actually check for a real protocol, otherwise it's impossible to use a local file - # like http_hf-mirror.com.png - inputs = requests.get(inputs, timeout=3).content + # like http_huggingface_co.png + inputs = requests.get(inputs, timeout=10).content else: with open(inputs, "rb") as f: inputs = f.read() @@ -516,8 +384,9 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): extra = inputs inputs = _inputs if in_sampling_rate != self.feature_extractor.sampling_rate: - transform = Resample(orig_freq=in_sampling_rate, new_freq=self.feature_extractor.sampling_rate) - inputs = transform(inputs) + from mindspore.dataset.audio import Resample + resample = Resample(in_sampling_rate, self.feature_extractor.sampling_rate) + inputs = resample(inputs) ratio = self.feature_extractor.sampling_rate / in_sampling_rate else: ratio = 1 @@ -531,7 +400,7 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): # of the original length in the stride so we can cut properly. stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio))) if not isinstance(inputs, np.ndarray): - raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`") + raise TypeError(f"We expect a numpy ndarray as input, got `{type(inputs)}`") if len(inputs.shape) != 1: raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline") @@ -553,9 +422,10 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): if chunk_len < stride_left + stride_right: raise ValueError("Chunk length must be superior to stride length") - yield from chunk_iter( + for item in chunk_iter( inputs, self.feature_extractor, chunk_len, stride_left, stride_right, self.ms_dtype - ) + ): + yield item else: if self.type == "seq2seq_whisper" and inputs.shape[0] > self.feature_extractor.n_samples: processed = self.feature_extractor( @@ -564,12 +434,25 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): truncation=False, padding="longest", return_tensors="ms", + return_attention_mask=True, ) else: - processed = self.feature_extractor( - inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="ms" - ) - + if self.type == "seq2seq_whisper" and stride is None: + processed = self.feature_extractor( + inputs, + sampling_rate=self.feature_extractor.sampling_rate, + return_tensors="ms", + return_token_timestamps=True, + return_attention_mask=True, + ) + extra["num_frames"] = processed.pop("num_frames") + else: + processed = self.feature_extractor( + inputs, + sampling_rate=self.feature_extractor.sampling_rate, + return_tensors="ms", + return_attention_mask=True, + ) if self.ms_dtype is not None: processed = processed.to(dtype=self.ms_dtype) if stride is not None: @@ -580,33 +463,15 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): yield {"is_last": True, **processed, **extra} def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): - """ - Performs the forward pass for Automatic Speech Recognition (ASR) in the AutomaticSpeechRecognitionPipeline class. - - Args: - self (AutomaticSpeechRecognitionPipeline): The instance of the AutomaticSpeechRecognitionPipeline class. - model_inputs (dict): A dictionary containing the model inputs. - return_timestamps (bool, optional): Indicates whether to return token timestamps. Defaults to False. 
- - Returns: - dict: A dictionary containing the output of the forward pass. - The structure of the dictionary depends on the ASR model type. - - Raises: - ValueError: - If the model_inputs dictionary does not contain either 'input_features' or 'input_values' key, - when using a seq2seq or seq2seq_whisper model. - - Note: - Other exceptions may be raised depending on the underlying ASR model used. - - """ attention_mask = model_inputs.pop("attention_mask", None) stride = model_inputs.pop("stride", None) + num_frames = model_inputs.pop("num_frames", None) is_last = model_inputs.pop("is_last") + if stride is not None and num_frames is not None: + raise ValueError("num_frames must be used only when stride is None") + if self.type in {"seq2seq", "seq2seq_whisper"}: - encoder = self.model.get_encoder() # Consume values so we can let extra information flow freely through # the pipeline (important for `partial` in microphone) if "input_features" in model_inputs: @@ -631,13 +496,15 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length else: generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride] + else: + generate_kwargs["num_frames"] = num_frames - if self.type == "seq2seq_whisper" and inputs.shape[-1] > self.feature_extractor.nb_max_frames: - generate_kwargs["input_features"] = inputs - else: - generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask) + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config tokens = self.model.generate( + inputs=inputs, attention_mask=attention_mask, **generate_kwargs, ) @@ -668,7 +535,7 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): if self.type == "ctc_with_lm": out = {"logits": logits} else: - out = {"tokens": logits.argmax(axis=-1)} + out = {"tokens": ops.argmax(logits, dim=-1)} if stride is not None: # Send stride to `postprocess`. # it needs to be handled there where @@ -685,28 +552,6 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): def postprocess( self, model_outputs, decoder_kwargs: Optional[Dict] = None, return_timestamps=None, return_language=None ): - """ - Method postprocess in the class AutomaticSpeechRecognitionPipeline. - - Args: - self: Object instance of the class AutomaticSpeechRecognitionPipeline. - model_outputs: List of dictionaries representing the outputs from the model. - Each dictionary contains 'logits' or 'tokens' key with corresponding values. - decoder_kwargs: Optional dictionary containing keyword arguments for the decoder. Defaults to None. - return_timestamps: Optional parameter indicating whether to return timestamps. - Can be None, 'word', or 'char'. - return_language: Optional parameter specifying the language to return. - Can be None or a specific language identifier. - - Returns: - None: The method modifies the model_outputs and decoder_kwargs in place. - - Raises: - ValueError: If the provided 'model_outputs' format is incorrect. - AttributeError: If the 'stride' key is missing or improperly defined in the model_outputs dictionary. - KeyError: If required keys are missing in the model_outputs dictionary. - TypeError: If the input parameters are of incorrect types or incompatible values. 
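Editor's note — a worked illustration of the `num_frames` bookkeeping in `_forward` above; the hop length is Whisper's usual 160 samples and the stride tuple is invented:

# Whisper's feature extractor emits one frame every `hop_length` samples,
# so a chunk's usable frame count is its sample count // hop_length.
hop_length = 160                      # 10 ms hops at 16 kHz
stride = (480_000, 8_000, 8_000)      # (chunk samples, left stride, right stride)

generate_kwargs = {"num_frames": stride[0] // hop_length}  # -> 3000 frames

# The pipeline's own (deep-copied) generation config is injected unless the
# caller supplied one explicitly:
# if "generation_config" not in generate_kwargs:
#     generate_kwargs["generation_config"] = self.generation_config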
- """ # Optional return types optional = {} @@ -714,13 +559,16 @@ def postprocess( key = "logits" if self.type == "ctc_with_lm" else "tokens" stride = None for outputs in model_outputs: - items = outputs[key].numpy() + if outputs[key].dtype in (mindspore.bfloat16, mindspore.float16): + items = outputs[key].to(mindspore.float32).asnumpy() + else: + items = outputs[key].asnumpy() stride = outputs.get("stride", None) if stride is not None and self.type in {"ctc", "ctc_with_lm"}: total_n, left, right = stride # Total_n might be < logits.shape[1] # because of padding, that's why - # we need to reforward this information + # we need to reconstruct this information # This won't work with left padding (which doesn't exist right now) right_n = total_n - right items = items[:, left:right_n] diff --git a/mindnlp/transformers/pipelines/base.py b/mindnlp/transformers/pipelines/base.py index d78f7703d..c004af275 100644 --- a/mindnlp/transformers/pipelines/base.py +++ b/mindnlp/transformers/pipelines/base.py @@ -24,6 +24,7 @@ import traceback import types import warnings +import copy from abc import ABC, abstractmethod from os.path import abspath, exists from typing import Any, Dict, List, Optional, Tuple, Union @@ -38,7 +39,7 @@ logging, ) from ..feature_extraction_utils import PreTrainedFeatureExtractor -# from ..image_processing_utils import BaseImageProcessor +from ..image_processing_utils import BaseImageProcessor from ..models.auto.configuration_auto import AutoConfig from ..tokenization_utils import PreTrainedTokenizer @@ -937,24 +938,38 @@ def __init__( self.ms_dtype = ms_dtype self.binary_output = binary_output - # Update config and generation_config with task specific parameters - task_specific_params = self.model.config.task_specific_params - if task_specific_params is not None and task in task_specific_params: - self.model.config.update(task_specific_params.get(task)) - if self.model.can_generate(): - self.model.generation_config.update(**task_specific_params.get(task)) + # If the model can generate, create a local generation config. This is done to avoid side-effects on the model + # as we apply local tweaks to the generation config. + if self.model.can_generate(): + self.prefix = self.model.config.prefix if hasattr(self.model.config, "prefix") else None + self.generation_config = copy.deepcopy(self.model.generation_config) + # Update the generation config with task specific params if they exist + # NOTE: `prefix` is pipeline-specific and doesn't exist in the generation config. + task_specific_params = self.model.config.task_specific_params + if task_specific_params is not None and task in task_specific_params: + this_task_params = task_specific_params.get(task) + if "prefix" in this_task_params: + self.prefix = this_task_params.pop("prefix") + self.generation_config.update(**this_task_params) + # If the tokenizer has a pad token but the model doesn't, set it so that `generate` is aware of it. 
+ if ( + self.tokenizer is not None + and self.tokenizer.pad_token_id is not None + and self.generation_config.pad_token_id is None + ): + self.generation_config.pad_token_id = self.tokenizer.pad_token_id self.call_count = 0 self._batch_size = kwargs.pop("batch_size", None) self._num_workers = kwargs.pop("num_workers", None) self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs) - # if self.image_processor is None and self.feature_extractor is not None: - # if isinstance(self.feature_extractor, BaseImageProcessor): - # # Backward compatible change, if users called - # # ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor()) - # # then we should keep working - # self.image_processor = self.feature_extractor + if self.image_processor is None and self.feature_extractor is not None: + if isinstance(self.feature_extractor, BaseImageProcessor): + # Backward compatible change, if users called + # ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor()) + # then we should keep working + self.image_processor = self.feature_extractor def save_pretrained(self, save_directory: str, safe_serialization: bool = True): """ diff --git a/mindnlp/transformers/pipelines/depth_estimation.py b/mindnlp/transformers/pipelines/depth_estimation.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/feature_extraction.py b/mindnlp/transformers/pipelines/feature_extraction.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/image_classification.py b/mindnlp/transformers/pipelines/image_classification.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/image_feature_extraction.py b/mindnlp/transformers/pipelines/image_feature_extraction.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/image_segmentation.py b/mindnlp/transformers/pipelines/image_segmentation.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/image_to_image.py b/mindnlp/transformers/pipelines/image_to_image.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/image_to_text.py b/mindnlp/transformers/pipelines/image_to_text.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/mask_generation.py b/mindnlp/transformers/pipelines/mask_generation.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/object_detection.py b/mindnlp/transformers/pipelines/object_detection.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/text_to_audio.py b/mindnlp/transformers/pipelines/text_to_audio.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/token_classification.py b/mindnlp/transformers/pipelines/token_classification.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/video_classification.py b/mindnlp/transformers/pipelines/video_classification.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/visual_question_answering.py b/mindnlp/transformers/pipelines/visual_question_answering.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/zero_shot_audio_classification.py b/mindnlp/transformers/pipelines/zero_shot_audio_classification.py new file 
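Editor's note — the per-pipeline generation-config setup added to `Pipeline.__init__` above, condensed into a free function for illustration; the helper name is hypothetical:

import copy

def build_pipeline_generation_config(model, tokenizer, task):
    """Sketch (hypothetical helper) of the logic now run in Pipeline.__init__."""
    prefix = getattr(model.config, "prefix", None)
    # Local copy so pipeline-level tweaks never leak back onto the model.
    generation_config = copy.deepcopy(model.generation_config)

    task_specific_params = model.config.task_specific_params
    if task_specific_params is not None and task in task_specific_params:
        this_task_params = dict(task_specific_params.get(task))
        # `prefix` is pipeline-specific and must not be forwarded to generate().
        prefix = this_task_params.pop("prefix", prefix)
        generation_config.update(**this_task_params)

    # Make generate() aware of the tokenizer's pad token if the model lacks one.
    if (
        tokenizer is not None
        and tokenizer.pad_token_id is not None
        and generation_config.pad_token_id is None
    ):
        generation_config.pad_token_id = tokenizer.pad_token_id

    return prefix, generation_config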
mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/zero_shot_image_classification.py b/mindnlp/transformers/pipelines/zero_shot_image_classification.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/pipelines/zero_shot_object_detection.py b/mindnlp/transformers/pipelines/zero_shot_object_detection.py new file mode 100644 index 000000000..e69de29bb diff --git a/mindnlp/transformers/processing_utils.py b/mindnlp/transformers/processing_utils.py index 2d6c722db..d68cb398f 100644 --- a/mindnlp/transformers/processing_utils.py +++ b/mindnlp/transformers/processing_utils.py @@ -1058,7 +1058,7 @@ def _is_valid(input, validator): if (images is None and text_is_images) or (text is None and images_is_text) or (images_is_text and text_is_images): logger.warning_once( "You may have used the wrong order for inputs. `images` should be passed before `text`. " - "The `images` and `text` inputs will be swapped. This behavior will be deprecated in transformers v4.47." + "The `images` and `text` inputs will be swapped. This behavior will be deprecated." ) return text, images diff --git a/mindnlp/utils/download.py b/mindnlp/utils/download.py index 2bda35b24..13fe939da 100644 --- a/mindnlp/utils/download.py +++ b/mindnlp/utils/download.py @@ -190,8 +190,10 @@ def http_get(url, path=None, md5sum=None, download_file_name=None, proxies=None, while not (os.path.exists(file_path) and check_md5(file_path, md5sum)): # get downloaded size tmp_file_path = file_path + "_tmp" - if os.path.exists(tmp_file_path) and retry_cnt != 0: + if os.path.exists(tmp_file_path): file_size = os.path.getsize(tmp_file_path) + if file_size % chunk_size != 0: + file_size = 0 headers['Range'] = f'bytes={file_size}-' else: file_size = 0 diff --git a/mindnlp/utils/testing_utils.py b/mindnlp/utils/testing_utils.py index a662a8420..3bd00464f 100644 --- a/mindnlp/utils/testing_utils.py +++ b/mindnlp/utils/testing_utils.py @@ -44,6 +44,7 @@ import mindspore from mindnlp.utils import logging as mindnlp_logging +from mindnlp.configs import SUPPORT_BF16 from .import_utils import ( is_pytest_available, @@ -242,6 +243,10 @@ def require_mindspore(test_case): """ return unittest.skipUnless(is_mindspore_available(), "test requires MindSpore")(test_case) +def require_bfloat16(test_case): + """require_bfloat16""" + return unittest.skipUnless(SUPPORT_BF16, "test need bfloat16")(test_case) + def require_mindspore_gpu(test_case): """Decorator marking a test that requires CUDA and MindSpore.""" return unittest.skipUnless(mindspore.get_context('device_target') == "GPU", "test requires CUDA")(test_case) diff --git a/tests/ut/transformers/pipelines/test_pipelines_audio_classification.py b/tests/ut/transformers/pipelines/test_pipelines_audio_classification.py new file mode 100644 index 000000000..3c0257493 --- /dev/null +++ b/tests/ut/transformers/pipelines/test_pipelines_audio_classification.py @@ -0,0 +1,139 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
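Editor's note — a minimal sketch of the resumption rule `http_get` now applies before setting the `Range` header; the chunk size and file name are illustrative:

import os

def resume_offset(tmp_file_path: str, chunk_size: int = 1024) -> int:
    """Sketch (hypothetical helper): only resume from a chunk-aligned offset."""
    if not os.path.exists(tmp_file_path):
        return 0
    file_size = os.path.getsize(tmp_file_path)
    if file_size % chunk_size != 0:
        # A partially written chunk cannot be trusted, so restart from byte 0.
        return 0
    return file_size

headers = {"Range": f"bytes={resume_offset('checkpoint.bin_tmp')}-"}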
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from huggingface_hub import AudioClassificationOutputElement + +from mindnlp.transformers import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING +from mindnlp.transformers.pipelines import AudioClassificationPipeline, pipeline +from mindnlp.utils.testing_utils import ( + is_pipeline_test, + nested_simplify, + slow, +) + +from .test_pipelines_common import ANY + + +@is_pipeline_test +class AudioClassificationPipelineTests(unittest.TestCase): + model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING + + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + audio_classifier = AudioClassificationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) + + # test with a raw waveform + audio = np.zeros((34000,)) + audio2 = np.zeros((14000,)) + return audio_classifier, [audio2, audio] + + def run_pipeline_test(self, audio_classifier, examples): + audio2, audio = examples + output = audio_classifier(audio) + # by default a model is initialized with num_labels=2 + self.assertEqual( + output, + [ + {"score": ANY(float), "label": ANY(str)}, + {"score": ANY(float), "label": ANY(str)}, + ], + ) + output = audio_classifier(audio, top_k=1) + self.assertEqual( + output, + [ + {"score": ANY(float), "label": ANY(str)}, + ], + ) + + self.run_msaudio(audio_classifier) + + def run_msaudio(self, audio_classifier): + import datasets + + # test with a local file + dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + audio = dataset[0]["audio"]["array"] + output = audio_classifier(audio) + self.assertEqual( + output, + [ + {"score": ANY(float), "label": ANY(str)}, + {"score": ANY(float), "label": ANY(str)}, + ], + ) + + def test_small_model_ms(self): + model = "anton-l/wav2vec2-random-tiny-classifier" + + audio_classifier = pipeline("audio-classification", model=model) + + audio = np.ones((8000,)) + output = audio_classifier(audio, top_k=4) + + EXPECTED_OUTPUT = [ + {"score": 0.0842, "label": "no"}, + {"score": 0.0838, "label": "up"}, + {"score": 0.0837, "label": "go"}, + {"score": 0.0834, "label": "right"}, + ] + EXPECTED_OUTPUT_PT_2 = [ + {"score": 0.0845, "label": "stop"}, + {"score": 0.0844, "label": "on"}, + {"score": 0.0841, "label": "right"}, + {"score": 0.0834, "label": "left"}, + ] + self.assertIn(nested_simplify(output, decimals=4), [EXPECTED_OUTPUT, EXPECTED_OUTPUT_PT_2]) + + audio_dict = {"array": np.ones((8000,)), "sampling_rate": audio_classifier.feature_extractor.sampling_rate} + output = audio_classifier(audio_dict, top_k=4) + self.assertIn(nested_simplify(output, decimals=4), [EXPECTED_OUTPUT, EXPECTED_OUTPUT_PT_2]) + + @slow + def test_large_model_ms(self): + import datasets + + model = "superb/wav2vec2-base-superb-ks" + + audio_classifier = pipeline("audio-classification", model=model) + dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test", trust_remote_code=True) + + audio = np.array(dataset[3]["speech"], dtype=np.float32) + output = audio_classifier(audio, top_k=4) + self.assertEqual( + nested_simplify(output, decimals=3), + [ + {"score": 0.981, "label": "go"}, + {"score": 0.007, "label": "up"}, + {"score": 0.006, "label": "_unknown_"}, + {"score": 0.001, "label": 
"down"}, + ], + ) diff --git a/tests/ut/transformers/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/ut/transformers/pipelines/test_pipelines_automatic_speech_recognition.py new file mode 100644 index 000000000..c9c766a6a --- /dev/null +++ b/tests/ut/transformers/pipelines/test_pipelines_automatic_speech_recognition.py @@ -0,0 +1,1954 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import unittest + +import numpy as np +import pytest +from huggingface_hub import AutomaticSpeechRecognitionOutput, hf_hub_download, snapshot_download +from datasets import Audio, load_dataset + +from mindnlp.transformers import ( + MODEL_FOR_CTC_MAPPING, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, + AutoFeatureExtractor, + AutoModelForCausalLM, + AutoModelForSpeechSeq2Seq, + AutoProcessor, + AutoTokenizer, + Speech2TextForConditionalGeneration, + Wav2Vec2ForCTC, + WhisperForConditionalGeneration, +) +from mindnlp.transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline +from mindnlp.transformers.pipelines.audio_utils import chunk_bytes_iter, ffmpeg_microphone_live +from mindnlp.transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter +from mindnlp.utils.testing_utils import ( + is_pipeline_test, + is_mindspore_available, + nested_simplify, + require_pyctcdecode, + require_mindspore, + require_bfloat16, + slow, +) + +from .test_pipelines_common import ANY + + +if is_mindspore_available(): + import mindspore + from mindnlp.core import ops + + +# We can't use this mixin because it assumes TF support. 
+# from .test_pipelines_common import CustomInputPipelineCommonMixin + + +@is_pipeline_test +class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): + model_mapping = dict( + (list(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.items()) if MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING else []) + + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else []) + ) + + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + ms_dtype="float32", + ): + if tokenizer is None: + # Side effect of no Fast Tokenizer class for these model, so skipping + # But the slow tokenizer test should still run as they're quite small + self.skipTest(reason="No tokenizer available") + + speech_recognizer = AutomaticSpeechRecognitionPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + ms_dtype=ms_dtype, + ) + + # test with a raw waveform + audio = np.zeros((34000,)) + audio2 = np.zeros((14000,)) + return speech_recognizer, [audio, audio2] + + def run_pipeline_test(self, speech_recognizer, examples): + audio = np.zeros((34000,)) + outputs = speech_recognizer(audio) + self.assertEqual(outputs, {"text": ANY(str)}) + + # Striding + audio = {"raw": audio, "stride": (0, 4000), "sampling_rate": speech_recognizer.feature_extractor.sampling_rate} + if speech_recognizer.type == "ctc": + outputs = speech_recognizer(audio) + self.assertEqual(outputs, {"text": ANY(str)}) + elif "Whisper" in speech_recognizer.model.__class__.__name__: + outputs = speech_recognizer(audio) + self.assertEqual(outputs, {"text": ANY(str)}) + else: + # Non CTC models cannot use striding. + with self.assertRaises(ValueError): + outputs = speech_recognizer(audio) + + # Timestamps + audio = np.zeros((34000,)) + if speech_recognizer.type == "ctc": + outputs = speech_recognizer(audio, return_timestamps="char") + self.assertIsInstance(outputs["chunks"], list) + n = len(outputs["chunks"]) + self.assertEqual( + outputs, + { + "text": ANY(str), + "chunks": [{"text": ANY(str), "timestamp": (ANY(float), ANY(float))} for i in range(n)], + }, + ) + + outputs = speech_recognizer(audio, return_timestamps="word") + self.assertIsInstance(outputs["chunks"], list) + n = len(outputs["chunks"]) + self.assertEqual( + outputs, + { + "text": ANY(str), + "chunks": [{"text": ANY(str), "timestamp": (ANY(float), ANY(float))} for i in range(n)], + }, + ) + elif "Whisper" in speech_recognizer.model.__class__.__name__: + outputs = speech_recognizer(audio, return_timestamps=True) + self.assertIsInstance(outputs["chunks"], list) + nb_chunks = len(outputs["chunks"]) + self.assertGreater(nb_chunks, 0) + self.assertEqual( + outputs, + { + "text": ANY(str), + "chunks": [{"text": ANY(str), "timestamp": (ANY(float), ANY(float))} for i in range(nb_chunks)], + }, + ) + else: + # Non CTC models cannot use return_timestamps + with self.assertRaisesRegex( + ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" + ): + outputs = speech_recognizer(audio, return_timestamps="char") + + @require_mindspore + @slow + def test_ms_defaults(self): + pipeline("automatic-speech-recognition") + + @require_mindspore + def test_small_model_ms(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="facebook/s2t-small-mustc-en-fr-st", + tokenizer="facebook/s2t-small-mustc-en-fr-st", + ) + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + output = speech_recognizer(waveform) + 
self.assertEqual(output, {"text": "(Applaudissements)"}) + output = speech_recognizer(waveform, chunk_length_s=10) + self.assertEqual(output, {"text": "(Applaudissements)"}) + + # Non CTC models cannot use return_timestamps + with self.assertRaisesRegex( + ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" + ): + _ = speech_recognizer(waveform, return_timestamps="char") + + @require_mindspore + def test_small_model_ms_fp16(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="facebook/s2t-small-mustc-en-fr-st", + tokenizer="facebook/s2t-small-mustc-en-fr-st", + ms_dtype=mindspore.float16, + ) + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + output = speech_recognizer(waveform) + self.assertEqual(output, {"text": "(Applaudissements)"}) + output = speech_recognizer(waveform, chunk_length_s=10) + self.assertEqual(output, {"text": "(Applaudissements)"}) + + # Non CTC models cannot use return_timestamps + with self.assertRaisesRegex( + ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" + ): + _ = speech_recognizer(waveform, return_timestamps="char") + + @require_mindspore + @require_bfloat16 + def test_small_model_ms_bf16(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="facebook/s2t-small-mustc-en-fr-st", + tokenizer="facebook/s2t-small-mustc-en-fr-st", + ms_dtype=mindspore.bfloat16, + ) + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + output = speech_recognizer(waveform) + self.assertEqual(output, {"text": "(Applaudissements)"}) + output = speech_recognizer(waveform, chunk_length_s=10) + self.assertEqual(output, {"text": "(Applaudissements)"}) + + # Non CTC models cannot use return_timestamps + with self.assertRaisesRegex( + ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" + ): + _ = speech_recognizer(waveform, return_timestamps="char") + + @slow + def test_whisper_fp16(self): + speech_recognizer = pipeline( + model="openai/whisper-base", + ms_dtype=mindspore.float16, + ) + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + speech_recognizer(waveform) + + @require_mindspore + def test_small_model_ms_seq2seq(self): + speech_recognizer = pipeline( + model="hf-internal-testing/tiny-random-speech-encoder-decoder", + ) + + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + output = speech_recognizer(waveform) + self.assertEqual(output, {"text": "あл ش 湯 清 ه ܬ া लᆨしث ल eか u w 全 u"}) + + @require_mindspore + def test_small_model_ms_seq2seq_gen_kwargs(self): + speech_recognizer = pipeline( + model="hf-internal-testing/tiny-random-speech-encoder-decoder", + ) + + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + output = speech_recognizer(waveform, max_new_tokens=10, generate_kwargs={"num_beams": 2}) + self.assertEqual(output, {"text": "あл † γ ت ב オ 束 泣 足"}) + + @slow + @require_mindspore + @require_pyctcdecode + def test_large_model_ms_with_lm(self): + dataset = load_dataset("Narsil/asr_dummy", streaming=True, trust_remote_code=True) + third_item = next(iter(dataset["test"].skip(3))) + filename = third_item["file"] + + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm", + ) + self.assertEqual(speech_recognizer.type, "ctc_with_lm") + + output = speech_recognizer(filename) + self.assertEqual( + output, + {"text": "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y 
legendario plumaje"}, + ) + + # Override back to pure CTC + speech_recognizer.type = "ctc" + output = speech_recognizer(filename) + # plumajre != plumaje + self.assertEqual( + output, + { + "text": ( + "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajre" + ) + }, + ) + + speech_recognizer.type = "ctc_with_lm" + # Simple test with CTC with LM, chunking + timestamps + output = speech_recognizer(filename, chunk_length_s=2.0, return_timestamps="word") + self.assertEqual( + output, + { + "text": ( + "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajcri" + ), + "chunks": [ + {"text": "y", "timestamp": (0.52, 0.54)}, + {"text": "en", "timestamp": (0.6, 0.68)}, + {"text": "las", "timestamp": (0.74, 0.84)}, + {"text": "ramas", "timestamp": (0.94, 1.24)}, + {"text": "medio", "timestamp": (1.32, 1.52)}, + {"text": "sumergidas", "timestamp": (1.56, 2.22)}, + {"text": "revoloteaban", "timestamp": (2.36, 3.0)}, + {"text": "algunos", "timestamp": (3.06, 3.38)}, + {"text": "pájaros", "timestamp": (3.46, 3.86)}, + {"text": "de", "timestamp": (3.92, 4.0)}, + {"text": "quimérico", "timestamp": (4.08, 4.6)}, + {"text": "y", "timestamp": (4.66, 4.68)}, + {"text": "legendario", "timestamp": (4.74, 5.26)}, + {"text": "plumajcri", "timestamp": (5.34, 5.74)}, + ], + }, + ) + # CTC + LM models cannot use return_timestamps="char" + with self.assertRaisesRegex( + ValueError, "^CTC with LM can only predict word level timestamps, set `return_timestamps='word'`$" + ): + _ = speech_recognizer(filename, return_timestamps="char") + + @require_mindspore + def test_ms_small_no_tokenizer_files(self): + # test that model without tokenizer file cannot be loaded + with pytest.raises(OSError): + pipeline( + task="automatic-speech-recognition", + model="patrickvonplaten/tiny-wav2vec2-no-tokenizer", + ) + + @require_mindspore + @slow + def test_ms_large(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="facebook/wav2vec2-base-960h", + tokenizer="facebook/wav2vec2-base-960h", + ) + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + output = speech_recognizer(waveform) + self.assertEqual(output, {"text": ""}) + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"] + output = speech_recognizer(audio) + self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) + + @require_mindspore + @slow + def test_ms_large_with_input_features(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="hf-audio/wav2vec2-bert-CV16-en", + ) + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + output = speech_recognizer(waveform) + self.assertEqual(output, {"text": ""}) + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"] + output = speech_recognizer(audio) + self.assertEqual(output, {"text": "a man said to the universe sir i exist"}) + + @slow + @require_mindspore + def test_return_timestamps_in_preprocess(self): + pipe = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-tiny", + chunk_length_s=8, + stride_length_s=1, + ) + data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) + sample = next(iter(data)) + + res = pipe(sample["audio"]["array"]) + self.assertEqual(res, {"text": " Conquered returned to its place amidst the tents."}) + + 
res = pipe(sample["audio"]["array"], return_timestamps=True) + self.assertEqual( + res, + { + "text": " Conquered returned to its place amidst the tents.", + "chunks": [{"timestamp": (0.0, 3.36), "text": " Conquered returned to its place amidst the tents."}], + }, + ) + + res = pipe(sample["audio"]["array"], return_timestamps="word") + # fmt: off + self.assertEqual( + res, + { + 'text': ' Conquered returned to its place amidst the tents.', + 'chunks': [ + {'text': ' Conquered', 'timestamp': (0.5, 1.2)}, + {'text': ' returned', 'timestamp': (1.2, 1.64)}, + {'text': ' to', 'timestamp': (1.64, 1.84)}, + {'text': ' its', 'timestamp': (1.84, 2.02)}, + {'text': ' place', 'timestamp': (2.02, 2.28)}, + {'text': ' amidst', 'timestamp': (2.28, 2.8)}, + {'text': ' the', 'timestamp': (2.8, 2.98)}, + {'text': ' tents.', 'timestamp': (2.98, 3.48)}, + ], + }, + ) + # fmt: on + + @slow + @require_mindspore + def test_return_timestamps_and_language_in_preprocess(self): + pipe = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-tiny", + chunk_length_s=8, + stride_length_s=1, + return_language=True, + ) + data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) + sample = next(iter(data)) + + res = pipe(sample["audio"]["array"]) + self.assertEqual( + res, + { + "text": " Conquered returned to its place amidst the tents.", + "chunks": [{"language": "english", "text": " Conquered returned to its place amidst the tents."}], + }, + ) + + res = pipe(sample["audio"]["array"], return_timestamps=True) + self.assertEqual( + res, + { + "text": " Conquered returned to its place amidst the tents.", + "chunks": [ + { + "timestamp": (0.0, 3.36), + "language": "english", + "text": " Conquered returned to its place amidst the tents.", + } + ], + }, + ) + + res = pipe(sample["audio"]["array"], return_timestamps="word") + # fmt: off + self.assertEqual( + res, + { + 'text': ' Conquered returned to its place amidst the tents.', + 'chunks': [ + {"language": "english",'text': ' Conquered', 'timestamp': (0.5, 1.2)}, + {"language": "english", 'text': ' returned', 'timestamp': (1.2, 1.64)}, + {"language": "english",'text': ' to', 'timestamp': (1.64, 1.84)}, + {"language": "english",'text': ' its', 'timestamp': (1.84, 2.02)}, + {"language": "english",'text': ' place', 'timestamp': (2.02, 2.28)}, + {"language": "english",'text': ' amidst', 'timestamp': (2.28, 2.8)}, + {"language": "english",'text': ' the', 'timestamp': (2.8, 2.98)}, + {"language": "english",'text': ' tents.', 'timestamp': (2.98, 3.48)}, + ], + }, + ) + # fmt: on + + @slow + @require_mindspore + def test_return_timestamps_in_preprocess_longform(self): + pipe = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-tiny.en", + ) + data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) + samples = [next(iter(data)) for _ in range(8)] + audio = np.concatenate([sample["audio"]["array"] for sample in samples]) + + res = pipe(audio) + expected_output = { + "text": " Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst " + "the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst " + "the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst " + "the tents. 
Concord returned to its place amidst the tents." + } + self.assertEqual(res, expected_output) + res = pipe(audio, return_timestamps=True) + self.assertEqual( + res, + { + "text": " Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents.", + "chunks": [ + {"timestamp": (0.0, 3.22), "text": " Concord returned to its place amidst the tents."}, + {"timestamp": (3.22, 6.74), "text": " Concord returned to its place amidst the tents."}, + {"timestamp": (6.74, 10.26), "text": " Concord returned to its place amidst the tents."}, + {"timestamp": (10.26, 13.78), "text": " Concord returned to its place amidst the tents."}, + {"timestamp": (13.78, 17.3), "text": " Concord returned to its place amidst the tents."}, + {"timestamp": (17.3, 20.82), "text": " Concord returned to its place amidst the tents."}, + {"timestamp": (20.82, 24.34), "text": " Concord returned to its place amidst the tents."}, + {"timestamp": (24.34, 27.86), "text": " Concord returned to its place amidst the tents."}, + ], + }, + ) + pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]] + res = pipe(audio, return_timestamps="word") + + # fmt: off + self.assertEqual( + res["chunks"][:15], + [ + {"text": " Concord", "timestamp": (0.5, 0.94)}, + {"text": " returned", "timestamp": (0.94, 1.52)}, + {"text": " to", "timestamp": (1.52, 1.78)}, + {"text": " its", "timestamp": (1.78, 1.98)}, + {"text": " place", "timestamp": (1.98, 2.16)}, + {"text": " amidst", "timestamp": (2.16, 2.5)}, + {"text": " the", "timestamp": (2.5, 2.9)}, + {"text": " tents.", "timestamp": (2.9, 4.2)}, + {"text": " Concord", "timestamp": (4.2, 4.5)}, + {"text": " returned", "timestamp": (4.5, 5.0)}, + {"text": " to", "timestamp": (5.0, 5.28)}, + {"text": " its", "timestamp": (5.28, 5.48)}, + {"text": " place", "timestamp": (5.48, 5.7)}, + {"text": " amidst", "timestamp": (5.7, 6.02)}, + {"text": " the", "timestamp": (6.02, 6.4)} + + + ], + ) + # fmt: on + + @require_mindspore + def test_return_timestamps_in_init(self): + # segment-level timestamps are accepted + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") + feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") + + dummy_speech = np.ones(100) + + pipe = pipeline( + task="automatic-speech-recognition", + model=model, + feature_extractor=feature_extractor, + tokenizer=tokenizer, + chunk_length_s=8, + stride_length_s=1, + return_timestamps=True, + ) + + _ = pipe(dummy_speech) + + # word-level timestamps are accepted + pipe = pipeline( + task="automatic-speech-recognition", + model=model, + feature_extractor=feature_extractor, + tokenizer=tokenizer, + chunk_length_s=8, + stride_length_s=1, + return_timestamps="word", + ) + + _ = pipe(dummy_speech) + + # char-level timestamps are not accepted + with self.assertRaisesRegex( + ValueError, + "^Whisper cannot return `char` timestamps, only word level or segment level timestamps. 
" + "Use `return_timestamps='word'` or `return_timestamps=True` respectively.$", + ): + pipe = pipeline( + task="automatic-speech-recognition", + model=model, + feature_extractor=feature_extractor, + tokenizer=tokenizer, + chunk_length_s=8, + stride_length_s=1, + return_timestamps="char", + ) + + _ = pipe(dummy_speech) + + @require_mindspore + @slow + def test_ms_whisper(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-tiny", + ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"] + output = speech_recognizer(audio) + self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) + + output = speech_recognizer([ds[40]["audio"]], chunk_length_s=5, batch_size=4) + self.assertEqual(output, [{"text": " A man said to the universe, Sir, I exist."}]) + + @require_mindspore + @slow + def test_ms_whisper_batched(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-tiny", + ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:2]") + EXPECTED_OUTPUT = [ + {"text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."}, + {"text": " Nor is Mr. Quilters' manner less interesting than his matter."}, + ] + + output = speech_recognizer(ds["audio"], batch_size=2) + self.assertEqual(output, EXPECTED_OUTPUT) + + @slow + def test_find_longest_common_subsequence(self): + max_source_positions = 1500 + processor = AutoProcessor.from_pretrained("openai/whisper-tiny") + + previous_sequence = [[51492, 406, 3163, 1953, 466, 13, 51612, 51612]] + self.assertEqual( + processor.decode(previous_sequence[0], output_offsets=True), + { + "text": " not worth thinking about.", + "offsets": [{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}], + }, + ) + + # Merge when the previous sequence is a suffix of the next sequence + # fmt: off + next_sequences_1 = [ + [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 50614, 50614, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257] + ] + # fmt: on + self.assertEqual( + processor.decode(next_sequences_1[0], output_offsets=True), + { + "text": ( + " of spectators, retrievality is not worth thinking about. His instant panic was followed by a" + " small, sharp blow high on his chest.<|endoftext|>" + ), + "offsets": [ + {"text": " of spectators, retrievality is not worth thinking about.", "timestamp": (0.0, 5.0)}, + { + "text": " His instant panic was followed by a small, sharp blow high on his chest.", + "timestamp": (5.0, 9.4), + }, + ], + }, + ) + merge = _find_timestamp_sequence( + [[previous_sequence, (480_000, 0, 0)], [next_sequences_1, (480_000, 120_000, 0)]], + processor.tokenizer, + processor.feature_extractor, + max_source_positions, + ) + + # fmt: off + self.assertEqual( + merge, + [51492, 406, 3163, 1953, 466, 13, 51739, 51739, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959], + ) + # fmt: on + self.assertEqual( + processor.decode(merge, output_offsets=True), + { + "text": ( + " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" + " chest." 
+ ), + "offsets": [ + {"text": " not worth thinking about.", "timestamp": (22.56, 27.5)}, + { + "text": " His instant panic was followed by a small, sharp blow high on his chest.", + "timestamp": (27.5, 31.900000000000002), + }, + ], + }, + ) + + # Merge when the sequence is in the middle of the 1st next sequence + # fmt: off + next_sequences_2 = [ + [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257] + ] + # fmt: on + # {'text': ' of spectators, retrievality is not worth thinking about. His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)} + merge = _find_timestamp_sequence( + [[previous_sequence, (480_000, 0, 0)], [next_sequences_2, (480_000, 120_000, 0)]], + processor.tokenizer, + processor.feature_extractor, + max_source_positions, + ) + # fmt: off + self.assertEqual( + merge, + [51492, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959], + ) + # fmt: on + self.assertEqual( + processor.decode(merge, output_offsets=True), + { + "text": ( + " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" + " chest." + ), + "offsets": [ + { + "text": ( + " not worth thinking about. His instant panic was followed by a small, sharp blow high on" + " his chest." + ), + "timestamp": (22.56, 31.900000000000002), + }, + ], + }, + ) + + # Merge when the previous sequence is not included in the current sequence + next_sequences_3 = [[50364, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50584, 50257]] # fmt: skip + # {'text': ' His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)} + merge = _find_timestamp_sequence( + [[previous_sequence, (480_000, 0, 0)], [next_sequences_3, (480_000, 120_000, 0)]], + processor.tokenizer, + processor.feature_extractor, + max_source_positions, + ) + self.assertEqual( + merge, + [51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51832], + ) # fmt: skip + self.assertEqual( + processor.decode(merge, output_offsets=True), + { + "text": ( + " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" + " chest." + ), + "offsets": [ + {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}, + { + "text": " His instant panic was followed by a small, sharp blow high on his chest.", + "timestamp": (24.96, 29.36), + }, + ], + }, + ) + # last case is when the sequence is not in the first next predicted start and end of timestamp + next_sequences_3 = [ + [50364, 2812, 9836, 14783, 390, 406, 3163, 1953, 466, 13, 50634, 50634, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50934] + ] # fmt: skip + merge = _find_timestamp_sequence( + [[previous_sequence, (480_000, 0, 0)], [next_sequences_3, (480_000, 167_000, 0)]], + processor.tokenizer, + processor.feature_extractor, + max_source_positions, + ) + self.assertEqual( + merge, + [51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51912] + ) # fmt: skip + self.assertEqual( + processor.decode(merge, output_offsets=True), + { + "text": ( + " not worth thinking about. 
His instant panic was followed by a small, sharp blow high on his" + " chest." + ), + "offsets": [ + {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}, + { + "text": " His instant panic was followed by a small, sharp blow high on his chest.", + "timestamp": (24.96, 30.96), + }, + ], + }, + ) + + @slow + @require_mindspore + def test_whisper_timestamp_prediction(self): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + array = np.concatenate( + [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] + ) + pipe = pipeline( + model="openai/whisper-small", + return_timestamps=True, + ) + + output = pipe(ds[40]["audio"]) + self.assertDictEqual( + output, + { + "text": " A man said to the universe, Sir, I exist.", + "chunks": [{"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 4.26)}], + }, + ) + + output = pipe(array, chunk_length_s=10) + self.assertDictEqual( + nested_simplify(output), + { + "chunks": [ + {"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 5.5)}, + { + "text": ( + " Sweat covered Brion's body, trickling into the " + "tight-loan cloth that was the only garment he wore, the " + "cut" + ), + "timestamp": (5.5, 11.95), + }, + { + "text": ( + " on his chest still dripping blood, the ache of his " + "overstrained eyes, even the soaring arena around him " + "with" + ), + "timestamp": (11.95, 19.61), + }, + { + "text": " the thousands of spectators, retrievality is not worth thinking about.", + "timestamp": (19.61, 25.0), + }, + { + "text": " His instant panic was followed by a small, sharp blow high on his chest.", + "timestamp": (25.0, 29.4), + }, + ], + "text": ( + " A man said to the universe, Sir, I exist. Sweat covered Brion's " + "body, trickling into the tight-loan cloth that was the only garment " + "he wore, the cut on his chest still dripping blood, the ache of his " + "overstrained eyes, even the soaring arena around him with the " + "thousands of spectators, retrievality is not worth thinking about. " + "His instant panic was followed by a small, sharp blow high on his " + "chest." + ), + }, + ) + + output = pipe(array) + self.assertDictEqual( + output, + { + "chunks": [ + {"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 5.5)}, + { + "text": ( + " Sweat covered Brion's body, trickling into the " + "tight-loan cloth that was the only garment" + ), + "timestamp": (5.5, 10.18), + }, + {"text": " he wore.", "timestamp": (10.18, 11.68)}, + {"text": " The cut on his chest still dripping blood.", "timestamp": (11.68, 14.92)}, + {"text": " The ache of his overstrained eyes.", "timestamp": (14.92, 17.6)}, + { + "text": ( + " Even the soaring arena around him with the thousands of spectators were trivialities" + ), + "timestamp": (17.6, 22.56), + }, + {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}, + ], + "text": ( + " A man said to the universe, Sir, I exist. Sweat covered Brion's " + "body, trickling into the tight-loan cloth that was the only garment " + "he wore. The cut on his chest still dripping blood. The ache of his " + "overstrained eyes. Even the soaring arena around him with the " + "thousands of spectators were trivialities not worth thinking about." 
+ ), + }, + ) + + @slow + @require_mindspore + def test_whisper_large_timestamp_prediction(self): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + array = np.concatenate( + [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] + ) + pipe = pipeline(model="openai/whisper-large-v3", return_timestamps=True) + + output = pipe(ds[40]["audio"]) + self.assertDictEqual( + output, + { + "text": " A man said to the universe, Sir, I exist.", + "chunks": [{"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 4.08)}], + }, + ) + + output = pipe(array, chunk_length_s=10) + + self.assertDictEqual( + nested_simplify(output), + { + "chunks": [ + {"timestamp": (0.0, 2.0), "text": (" A man said to the universe,")}, + {"timestamp": (2.0, 4.1), "text": (" Sir, I exist.")}, + {"timestamp": (5.14, 5.96), "text": (" Sweat covered")}, + {"timestamp": (5.96, 8.02), "text": (" Breon's body, trickling into")}, + {"timestamp": (8.02, 10.67), "text": (" the tight loincloth that was the only garment he wore,")}, + {"timestamp": (10.67, 13.67), "text": (" the cut on his chest still dripping blood,")}, + {"timestamp": (13.67, 17.61), "text": (" the ache of his overstrained eyes.")}, + { + "timestamp": (17.61, 24.0), + "text": ( + " Even the soaring arena around him with thousands of spectators were trivialities not worth thinking about." + ), + }, + { + "timestamp": (24.0, 29.94), + "text": (" His instant of panic was followed by a small, sharp blow high on his chest."), + }, + ], + "text": ( + " A man said to the universe, Sir, I exist. Sweat covered Breon's" + " body, trickling into the tight loincloth that was the only garment" + " he wore, the cut on his chest still dripping blood, the ache of his" + " overstrained eyes. Even the soaring arena around him with thousands" + " of spectators were trivialities not worth thinking about. His " + "instant of panic was followed by a small, sharp blow high on his chest." + ), + }, + ) + + output = pipe(array) + self.assertDictEqual( + output, + { + "chunks": [ + {"timestamp": (0.0, 1.96), "text": " A man said to the universe,"}, + {"timestamp": (2.7, 4.1), "text": " Sir, I exist."}, + {"timestamp": (5.14, 6.84), "text": " Sweat covered Brion's body,"}, + { + "timestamp": (7.4, 10.68), + "text": " trickling into the tight loincloth that was the only garment he wore,", + }, + {"timestamp": (11.6, 13.94), "text": " the cut on his chest still dripping blood,"}, + {"timestamp": (14.78, 16.72), "text": " the ache of his overstrained eyes,"}, + { + "timestamp": (17.32, 21.16), + "text": " even the soaring arena around him with the thousands of spectators", + }, + {"timestamp": (21.16, 23.94), "text": " were trivialities not worth thinking about."}, + { + "timestamp": (24.42, 29.94), + "text": " His instant panic was followed by a small sharp blow high on his chest.", + }, + ], + "text": ( + " A man said to the universe, Sir, I exist. Sweat covered Brion's body," + " trickling into the tight loincloth that was the only garment he wore, " + "the cut on his chest still dripping blood, the ache of his overstrained " + "eyes, even the soaring arena around him with the thousands of spectators " + "were trivialities not worth thinking about. His instant panic was followed " + "by a small sharp blow high on his chest." 
+ ), + }, + ) + + @slow + @require_mindspore + def test_whisper_word_timestamps_batched(self): + pipe = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-tiny", + chunk_length_s=3, + return_timestamps="word", + ) + data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + sample = data[0]["audio"] + + # not the same output as test_simple_whisper_asr because of chunking + EXPECTED_OUTPUT = { + "text": " Mr. Quilder is the apostle of the middle classes and we are glad to welcome his gospel.", + "chunks": [ + {"text": " Mr.", "timestamp": (0.48, 0.96)}, + {"text": " Quilder", "timestamp": (0.96, 1.24)}, + {"text": " is", "timestamp": (1.24, 1.5)}, + {"text": " the", "timestamp": (1.5, 1.72)}, + {"text": " apostle", "timestamp": (1.72, 1.98)}, + {"text": " of", "timestamp": (1.98, 2.32)}, + {"text": " the", "timestamp": (2.32, 2.5)}, + {"text": " middle", "timestamp": (2.5, 2.68)}, + {"text": " classes", "timestamp": (2.68, 3.2)}, + {"text": " and", "timestamp": (3.2, 3.56)}, + {"text": " we", "timestamp": (3.56, 3.68)}, + {"text": " are", "timestamp": (3.68, 3.8)}, + {"text": " glad", "timestamp": (3.8, 4.1)}, + {"text": " to", "timestamp": (4.1, 4.34)}, + {"text": " welcome", "timestamp": (4.3, 4.6)}, + {"text": " his", "timestamp": (4.6, 4.94)}, + {"text": " gospel.", "timestamp": (4.94, 5.82)}, + ], + } + + # batch size 1: copy the audio sample since pipeline consumes it + output = pipe(sample.copy(), batch_size=1) + self.assertDictEqual(output, EXPECTED_OUTPUT) + + # batch size 2: input audio is chunked into smaller pieces so it's testing batching + output = pipe(sample, batch_size=2) + self.assertDictEqual(output, EXPECTED_OUTPUT) + + @slow + @require_mindspore + def test_whisper_large_word_timestamps_batched(self): + pipe = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-large-v3", + return_timestamps="word", + ) + data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + sample = data[0]["audio"] + + # not the same output as test_simple_whisper_asr because of chunking + EXPECTED_OUTPUT = { + "text": " Mr. 
Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.", + "chunks": [ + {"text": " Mr.", "timestamp": (0.0, 0.74)}, + {"text": " Quilter", "timestamp": (0.74, 1.04)}, + {"text": " is", "timestamp": (1.04, 1.3)}, + {"text": " the", "timestamp": (1.3, 1.44)}, + {"text": " apostle", "timestamp": (1.44, 1.74)}, + {"text": " of", "timestamp": (1.74, 2.18)}, + {"text": " the", "timestamp": (2.18, 2.28)}, + {"text": " middle", "timestamp": (2.28, 2.5)}, + {"text": " classes,", "timestamp": (2.5, 3.0)}, + {"text": " and", "timestamp": (3.0, 3.4)}, + {"text": " we", "timestamp": (3.4, 3.5)}, + {"text": " are", "timestamp": (3.5, 3.6)}, + {"text": " glad", "timestamp": (3.6, 3.84)}, + {"text": " to", "timestamp": (3.84, 4.1)}, + {"text": " welcome", "timestamp": (4.1, 4.4)}, + {"text": " his", "timestamp": (4.4, 4.7)}, + {"text": " gospel.", "timestamp": (4.7, 5.34)}, + ], + } + + # batch size 1: copy the audio sample since pipeline consumes it + output = pipe(sample.copy(), batch_size=1) + self.assertDictEqual(output, EXPECTED_OUTPUT) + + # batch size 2: input audio is chunked into smaller pieces so it's testing batching + output = pipe(sample, batch_size=2) + self.assertDictEqual(output, EXPECTED_OUTPUT) + + @require_mindspore + @slow + def test_ms_speech_encoder_decoder(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="facebook/s2t-wav2vec2-large-en-de", + feature_extractor="facebook/s2t-wav2vec2-large-en-de", + ) + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"] + output = speech_recognizer(audio) + self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'}) + + @slow + @require_mindspore + def test_simple_wav2vec2(self): + model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h") + feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") + + asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor) + + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + output = asr(waveform) + self.assertEqual(output, {"text": ""}) + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"] + output = asr(audio) + self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) + + data = Audio().encode_example(ds[40]["audio"])["bytes"] + output = asr(data) + self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) + + @slow + @require_mindspore + def test_simple_s2t(self): + model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-it-st") + tokenizer = AutoTokenizer.from_pretrained("facebook/s2t-small-mustc-en-it-st") + feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-mustc-en-it-st") + + asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor) + + waveform = np.tile(np.arange(1000, dtype=np.float32), 34) + + output = asr(waveform) + self.assertEqual(output, {"text": "(Applausi)"}) + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"] + output = asr(audio) + self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) + + data = 
+        output = asr(data)
+        self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
+
+    @slow
+    @require_mindspore
+    def test_simple_whisper_asr(self):
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="openai/whisper-tiny.en",
+        )
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        audio = ds[0]["audio"]
+        output = speech_recognizer(audio)
+        self.assertEqual(
+            output,
+            {"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."},
+        )
+        output = speech_recognizer(ds[0]["audio"], return_timestamps=True)
+        self.assertEqual(
+            output,
+            {
+                "text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.",
+                "chunks": [
+                    {
+                        "text": (
+                            " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."
+                        ),
+                        "timestamp": (0.0, 5.44),
+                    }
+                ],
+            },
+        )
+        speech_recognizer.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
+        output = speech_recognizer(ds[0]["audio"], return_timestamps="word")
+        # fmt: off
+        self.assertEqual(
+            output,
+            {
+                'text': ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.',
+                'chunks': [
+                    {'text': ' Mr.', 'timestamp': (0.38, 1.04)},
+                    {'text': ' Quilter', 'timestamp': (1.04, 1.18)},
+                    {'text': ' is', 'timestamp': (1.18, 1.44)},
+                    {'text': ' the', 'timestamp': (1.44, 1.58)},
+                    {'text': ' apostle', 'timestamp': (1.58, 1.98)},
+                    {'text': ' of', 'timestamp': (1.98, 2.32)},
+                    {'text': ' the', 'timestamp': (2.32, 2.46)},
+                    {'text': ' middle', 'timestamp': (2.46, 2.56)},
+                    {'text': ' classes,', 'timestamp': (2.56, 3.4)},
+                    {'text': ' and', 'timestamp': (3.4, 3.54)},
+                    {'text': ' we', 'timestamp': (3.54, 3.62)},
+                    {'text': ' are', 'timestamp': (3.62, 3.72)},
+                    {'text': ' glad', 'timestamp': (3.72, 4.0)},
+                    {'text': ' to', 'timestamp': (4.0, 4.26)},
+                    {'text': ' welcome', 'timestamp': (4.26, 4.56)},
+                    {'text': ' his', 'timestamp': (4.56, 4.92)},
+                    {'text': ' gospel.', 'timestamp': (4.92, 5.84)}
+                ]
+            }
+        )
+        # fmt: on
+
+        # Whisper can only predict segment level timestamps or word level, not character level
+        with self.assertRaisesRegex(
+            ValueError,
+            "^Whisper cannot return `char` timestamps, only word level or segment level timestamps. "
" + "Use `return_timestamps='word'` or `return_timestamps=True` respectively.$", + ): + _ = speech_recognizer(audio, return_timestamps="char") + + @slow + @require_mindspore + def test_simple_whisper_translation(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-large", + ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"] + output = speech_recognizer(audio) + self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) + + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") + tokenizer = AutoTokenizer.from_pretrained("openai/whisper-large") + feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large") + + speech_recognizer_2 = AutomaticSpeechRecognitionPipeline( + model=model, tokenizer=tokenizer, feature_extractor=feature_extractor + ) + output_2 = speech_recognizer_2(ds[40]["audio"]) + self.assertEqual(output, output_2) + + # either use generate_kwargs or set the model's generation_config + # model.generation_config.task = "transcribe" + # model.generation_config.lang = "<|it|>" + speech_translator = AutomaticSpeechRecognitionPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + generate_kwargs={"task": "transcribe", "language": "<|it|>"}, + ) + output_3 = speech_translator(ds[40]["audio"]) + self.assertEqual(output_3, {"text": " Un uomo ha detto all'universo, Sir, esiste."}) + + @slow + @require_mindspore + def test_whisper_language(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-tiny.en", + ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + audio = ds[0]["audio"] + + # 1. English-only model compatible with no language argument + output = speech_recognizer(audio) + self.assertEqual( + output, + {"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."}, + ) + + # 2. English-only Whisper does not accept the language argument + with self.assertRaisesRegex( + ValueError, + "Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, " + "pass `is_multilingual=True` to generate, or update the generation config.", + ): + _ = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"}) + + # 3. Multilingual model accepts language argument + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-tiny", + ) + output = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"}) + self.assertEqual( + output, + {"text": " Mr. 
+        )
+
+    @slow
+    def test_speculative_decoding_whisper_non_distil(self):
+        # Load data:
+        dataset = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
+        )
+        sample = dataset[0]["audio"]
+
+        # Load model:
+        model_id = "openai/whisper-large-v2"
+        processor = AutoProcessor.from_pretrained(model_id)
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id,
+            use_safetensors=True,
+        )
+
+        # Load assistant:
+        assistant_model_id = "openai/whisper-tiny"
+        assistant_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            assistant_model_id,
+            use_safetensors=True,
+        )
+
+        # Load pipeline:
+        pipe = AutomaticSpeechRecognitionPipeline(
+            model=model,
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            generate_kwargs={"language": "en"},
+        )
+
+        start_time = time.time()
+        transcription_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
+        total_time_assist = time.time() - start_time
+
+        start_time = time.time()
+        transcription_non_ass = pipe(sample)["text"]
+        total_time_non_assist = time.time() - start_time
+
+        self.assertEqual(transcription_ass, transcription_non_ass)
+        self.assertEqual(
+            transcription_ass,
+            " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.",
+        )
+        self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster")
+
+    @slow
+    def test_speculative_decoding_whisper_distil(self):
+        # Load data:
+        dataset = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
+        )
+        sample = dataset[0]["audio"]
+
+        # Load model:
+        model_id = "openai/whisper-large-v2"
+        processor = AutoProcessor.from_pretrained(model_id)
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id,
+            use_safetensors=True,
+        )
+
+        # Load assistant:
+        assistant_model_id = "distil-whisper/distil-large-v2"
+        assistant_model = AutoModelForCausalLM.from_pretrained(
+            assistant_model_id,
+            use_safetensors=True,
+        )
+
+        # Load pipeline:
+        pipe = AutomaticSpeechRecognitionPipeline(
+            model=model,
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            generate_kwargs={"language": "en"},
+        )
+
+        start_time = time.time()
+        transcription_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
+        total_time_assist = time.time() - start_time
+
+        start_time = time.time()
+        transcription_non_ass = pipe(sample)["text"]
+        total_time_non_assist = time.time() - start_time
+
+        self.assertEqual(transcription_ass, transcription_non_ass)
+        self.assertEqual(
+            transcription_ass,
+            " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.",
+        )
+        self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster")
+
+    @slow
+    @require_mindspore
+    def test_xls_r_to_en(self):
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="facebook/wav2vec2-xls-r-1b-21-to-en",
+            feature_extractor="facebook/wav2vec2-xls-r-1b-21-to-en",
+        )
+
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
+        self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."})
+
+    @slow
+    @require_mindspore
+    def test_xls_r_from_en(self):
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="facebook/wav2vec2-xls-r-1b-en-to-15",
+            feature_extractor="facebook/wav2vec2-xls-r-1b-en-to-15",
+        )
+
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
+        self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."})
+
+    @slow
+    @require_mindspore
+    def test_speech_to_text_leveraged(self):
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="patrickvonplaten/wav2vec2-2-bart-base",
+            feature_extractor="patrickvonplaten/wav2vec2-2-bart-base",
+            tokenizer=AutoTokenizer.from_pretrained("patrickvonplaten/wav2vec2-2-bart-base"),
+        )
+
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
+        self.assertEqual(output, {"text": "a man said to the universe sir i exist"})
+
+    @slow
+    def test_wav2vec2_conformer_float16(self):
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="facebook/wav2vec2-conformer-rope-large-960h-ft",
+            ms_dtype=mindspore.float16,
+        )
+
+        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        sample = dataset[0]["audio"]
+
+        output = speech_recognizer(sample)
+        self.assertEqual(
+            output,
+            {"text": "MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL"},
+        )
+
+    @require_mindspore
+    def test_chunking_fast(self):
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="hf-internal-testing/tiny-random-wav2vec2",
+            chunk_length_s=10.0,
+        )
+
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        audio = ds[40]["audio"]["array"]
+
+        n_repeats = 2
+        audio_tiled = np.tile(audio, n_repeats)
+        output = speech_recognizer([audio_tiled], batch_size=2)
+        self.assertEqual(output, [{"text": ANY(str)}])
+        self.assertEqual(output[0]["text"][:6], "ZBT ZC")
+
+    @require_mindspore
+    def test_return_timestamps_ctc_fast(self):
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="hf-internal-testing/tiny-random-wav2vec2",
+        )
+
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        # Take short audio to keep the test readable
+        audio = ds[40]["audio"]["array"][:800]
+
+        output = speech_recognizer(audio, return_timestamps="char")
+        self.assertEqual(
+            output,
+            {
+                "text": "ZBT ZX G",
+                "chunks": [
+                    {"text": " ", "timestamp": (0.0, 0.012)},
+                    {"text": "Z", "timestamp": (0.012, 0.016)},
+                    {"text": "B", "timestamp": (0.016, 0.02)},
+                    {"text": "T", "timestamp": (0.02, 0.024)},
"T", "timestamp": (0.02, 0.024)}, + {"text": " ", "timestamp": (0.024, 0.028)}, + {"text": "Z", "timestamp": (0.028, 0.032)}, + {"text": "X", "timestamp": (0.032, 0.036)}, + {"text": " ", "timestamp": (0.036, 0.04)}, + {"text": "G", "timestamp": (0.04, 0.044)}, + ], + }, + ) + + output = speech_recognizer(audio, return_timestamps="word") + self.assertEqual( + output, + { + "text": "ZBT ZX G", + "chunks": [ + {"text": "ZBT", "timestamp": (0.012, 0.024)}, + {"text": "ZX", "timestamp": (0.028, 0.036)}, + {"text": "G", "timestamp": (0.04, 0.044)}, + ], + }, + ) + + @require_mindspore + @require_pyctcdecode + def test_chunking_fast_with_lm(self): + speech_recognizer = pipeline( + model="hf-internal-testing/processor_with_lm", + chunk_length_s=10.0, + ) + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"]["array"] + + n_repeats = 2 + audio_tiled = np.tile(audio, n_repeats) + # Batch_size = 1 + output1 = speech_recognizer([audio_tiled], batch_size=1) + self.assertEqual(output1, [{"text": ANY(str)}]) + self.assertEqual(output1[0]["text"][:6], "