diff --git a/ChangeLog b/ChangeLog index bdd1e5e342422..201193fee8c0a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,35 @@ +2024-06-19 YunQiang Su + + Revert: + 2024-06-19 Collin Funk + + * configure.ac: Quote variable result of AC_SEARCH_LIBS. + * configure: Regenerate. + +2024-06-19 YunQiang Su + + Revert: + 2024-06-19 YunQiang Su + + PR bootstrap/115453 + * configure.ac: Fix typo ac_cv_search_pthread_crate. + * configure: Regnerate. + +2024-06-19 YunQiang Su + + PR bootstrap/115453 + * configure.ac: Fix typo ac_cv_search_pthread_crate. + * configure: Regnerate. + +2024-06-19 Collin Funk + + * configure.ac: Quote variable result of AC_SEARCH_LIBS. + * configure: Regenerate. + +2024-06-19 Ramana Radhakrishnan + + * MAINTAINERS: Update my email address. + 2024-06-18 Kyrylo Tkachov * MAINTAINERS (aarch64 port): Update my email address. diff --git a/configure b/configure index 1469cd735392a..51bf1d1add185 100755 --- a/configure +++ b/configure @@ -19746,7 +19746,7 @@ config.status configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" -Copyright (C) Free Software Foundation, Inc. +Copyright (C) 2012 Free Software Foundation, Inc. This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d64a751a55ea5..8610e76b07b30 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,143 @@ +2024-06-19 YunQiang Su + + Revert: + 2024-06-19 Collin Funk + + * configure.ac: Add missing quotation of variable + gcc_cv_as_mips_explicit_relocs. + * configure: Regenerate. + +2024-06-19 demin.han + + * config/riscv/riscv-vector-builtins-bases.cc: Remove eqne cond + * config/riscv/vector.md (@pred_eqne_scalar): Remove patterns + (*pred_eqne_scalar_merge_tie_mask): Ditto + (*pred_eqne_scalar): Ditto + (*pred_eqne_scalar_narrow): Ditto + +2024-06-19 Patrick O'Neill + + * common/config/riscv/riscv-common.cc: Add 'a' extension to + riscv_combine_info. + +2024-06-19 Jakub Jelinek + + PR tree-optimization/115544 + * gimple-lower-bitint.cc (gimple_lower_bitint): Disable optimizing + loads used by COMPLEX_EXPR operands. + +2024-06-19 mayshao + + * common/config/i386/cpuinfo.h (get_zhaoxin_cpu): Recognize shijidadao. + * common/config/i386/i386-common.cc: Add shijidadao. + * common/config/i386/i386-cpuinfo.h (enum processor_subtypes): + Add ZHAOXIN_FAM7H_SHIJIDADAO. + * config.gcc: Add shijidadao. + * config/i386/driver-i386.cc (host_detect_local_cpu): + Let -march=native recognize shijidadao processors. + * config/i386/i386-c.cc (ix86_target_macros_internal): Add shijidadao. + * config/i386/i386-options.cc (m_ZHAOXIN): Add m_SHIJIDADAO. + (m_SHIJIDADAO): New definition. + * config/i386/i386.h (enum processor_type): Add PROCESSOR_SHIJIDADAO. + * config/i386/x86-tune-costs.h (struct processor_costs): + Add shijidadao_cost. + * config/i386/x86-tune-sched.cc (ix86_issue_rate): Add shijidadao. + (ix86_adjust_cost): Ditto. + * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Add m_SHIJIDADAO. + (X86_TUNE_USE_GATHER_4PARTS): Ditto. + (X86_TUNE_USE_GATHER_8PARTS): Ditto. + (X86_TUNE_AVOID_128FMA_CHAINS): Ditto. + * doc/extend.texi: Add details about shijidadao. + * doc/invoke.texi: Ditto. + +2024-06-19 Takayuki 'January June' Suwa + + * config/xtensa/xtensa.cc (print_operand): + When outputting MEMW before the instruction, check if the previous + instruction is already that. 
+ +2024-06-19 Andre Vieira + Stam Markianos-Wright + + * config/arm/arm-protos.h (arm_target_bb_ok_for_lob): Change + declaration to pass basic_block. + (arm_attempt_dlstp_transform): New declaration. + * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): Define targethook. + (TARGET_PREDICT_DOLOOP_P): Likewise. + (arm_target_bb_ok_for_lob): Adapt condition. + (arm_mve_get_vctp_lanes): New function. + (arm_dl_usage_type): New internal enum. + (arm_get_required_vpr_reg): New function. + (arm_get_required_vpr_reg_param): New function. + (arm_get_required_vpr_reg_ret_val): New function. + (arm_mve_get_loop_vctp): New function. + (arm_mve_insn_predicated_by): New function. + (arm_mve_across_lane_insn_p): New function. + (arm_mve_load_store_insn_p): New function. + (arm_mve_impl_pred_on_outputs_p): New function. + (arm_mve_impl_pred_on_inputs_p): New function. + (arm_last_vect_def_insn): New function. + (arm_mve_impl_predicated_p): New function. + (arm_mve_check_reg_origin_is_num_elems): New function. + (arm_mve_dlstp_check_inc_counter): New function. + (arm_mve_dlstp_check_dec_counter): New function. + (arm_mve_loop_valid_for_dlstp): New function. + (arm_predict_doloop_p): New function. + (arm_loop_unroll_adjust): New function. + (arm_emit_mve_unpredicated_insn_to_seq): New function. + (arm_attempt_dlstp_transform): New function. + * config/arm/arm.opt (mdlstp): New option. + * config/arm/iterators.md (dlstp_elemsize, letp_num_lanes, + letp_num_lanes_neg, letp_num_lanes_minus_1): New attributes. + (DLSTP, LETP): New iterators. + * config/arm/mve.md (predicated_doloop_end_internal, + dlstp_insn): New insn patterns. + * config/arm/thumb2.md (doloop_end): Adapt to support tail-predicated + loops. + (doloop_begin): Likewise. + * config/arm/types.md (mve_misc): New mve type to represent + predicated_loop_end insn sequences. + * config/arm/unspecs.md: + (DLSTP8, DLSTP16, DLSTP32, DSLTP64, + LETP8, LETP16, LETP32, LETP64): New unspecs for DLSTP and LETP. + +2024-06-19 Andre Vieira + Stam Markianos-Wright + + * df-core.cc (df_bb_regno_only_def_find): New helper function. + * df.h (df_bb_regno_only_def_find): Declare new function. + * loop-doloop.cc (doloop_condition_get): Add support for detecting + predicated vectorized hardware loops. + (doloop_modify): Add support for GTU condition checks. + (doloop_optimize): Update costing computation to support alterations to + desc->niter_expr by the backend. + +2024-06-19 Collin Funk + + * configure.ac: Add missing quotation of variable + gcc_cv_as_mips_explicit_relocs. + * configure: Regenerate. + +2024-06-19 Takayuki 'January June' Suwa + + * config/xtensa/xtensa-protos.h (xtensa_constantsynth): + Change the second argument from HOST_WIDE_INT to rtx. + * config/xtensa/xtensa.cc (#include): + Add "context.h" and "pass_manager.h". + (machine_function): Add a new hash_map field "litpool_usage". + (xtensa_constantsynth): Make "src" (the second operand) accept + RTX literal instead of its value, and treat both bare and pooled + SI/SFmode literals equally by bit-exact canonicalization into + CONST_INT RTX internally. And then, make avoid synthesis if + such multiple identical canonicalized literals are found in same + function when optimizing for size. Finally, for literals where + synthesis is not possible or has been avoided, re-emit "move" + RTXes with canonicalized ones to increase the chances of sharing + literal pool entries. 
+ * config/xtensa/xtensa.md (split patterns for constant synthesis): + Change to simply invoke xtensa_constantsynth() as mentioned above, + and add new patterns for when TARGET_AUTO_LITPOOLS is enabled. + 2024-06-18 Edwin Lu Robin Dapp diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index 6fe37f7c38677..9df1831b6e340 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20240619 +20240620 diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 4610bf6d6a458..936039725ab6c 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -667,12 +667,18 @@ get_zhaoxin_cpu (struct __processor_model *cpu_model, reset_cpu_feature (cpu_model, cpu_features2, FEATURE_F16C); cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_LUJIAZUI; } - else if (model >= 0x5b) + else if (model == 0x5b) { cpu = "yongfeng"; CHECK___builtin_cpu_is ("yongfeng"); cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_YONGFENG; } + else if (model >= 0x6b) + { + cpu = "shijidadao"; + CHECK___builtin_cpu_is ("shijidadao"); + cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_SHIJIDADAO; + } break; default: break; diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 5d9c188c9c7db..e38b1b22ffb10 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -2066,6 +2066,7 @@ const char *const processor_names[] = "intel", "lujiazui", "yongfeng", + "shijidadao", "geode", "k6", "athlon", @@ -2271,10 +2272,13 @@ const pta processor_alias_table[] = | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR, 0, P_NONE}, {"lujiazui", PROCESSOR_LUJIAZUI, CPU_LUJIAZUI, PTA_LUJIAZUI, - M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_NONE}, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_PROC_BMI}, {"yongfeng", PROCESSOR_YONGFENG, CPU_YONGFENG, PTA_YONGFENG, - M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_NONE}, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_PROC_AVX2}, + {"shijidadao", PROCESSOR_SHIJIDADAO, CPU_YONGFENG, + PTA_YONGFENG, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_SHIJIDADAO), P_PROC_AVX2}, {"k8", PROCESSOR_K8, CPU_K8, PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR, 0, P_NONE}, diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h index 3ec9e005a6ad5..ccc6deb63853e 100644 --- a/gcc/common/config/i386/i386-cpuinfo.h +++ b/gcc/common/config/i386/i386-cpuinfo.h @@ -104,6 +104,7 @@ enum processor_subtypes INTEL_COREI7_PANTHERLAKE, ZHAOXIN_FAM7H_YONGFENG, AMDFAM1AH_ZNVER5, + ZHAOXIN_FAM7H_SHIJIDADAO, CPU_SUBTYPE_MAX }; diff --git a/gcc/common/config/riscv/riscv-common.cc b/gcc/common/config/riscv/riscv-common.cc index 1dc1d9904c7bd..410e673f5e017 100644 --- a/gcc/common/config/riscv/riscv-common.cc +++ b/gcc/common/config/riscv/riscv-common.cc @@ -401,6 +401,7 @@ static const struct riscv_ext_version riscv_ext_version_table[] = /* Combine extensions defined in this table */ static const struct riscv_ext_version riscv_combine_info[] = { + {"a", ISA_SPEC_CLASS_20191213, 2, 1}, {"zk", ISA_SPEC_CLASS_NONE, 1, 0}, {"zkn", ISA_SPEC_CLASS_NONE, 1, 0}, {"zks", ISA_SPEC_CLASS_NONE, 1, 0}, diff --git a/gcc/config.gcc b/gcc/config.gcc index e500ba63e3222..644c456290dc1 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -711,9 +711,9 @@ atom slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \ silvermont skylake-avx512 cannonlake icelake-client icelake-server \ skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \ sapphirerapids alderlake rocketlake eden-x2 nano 
nano-1000 nano-2000 nano-3000 \
-nano-x2 eden-x4 nano-x4 lujiazui yongfeng x86-64 x86-64-v2 x86-64-v3 x86-64-v4 \
-sierraforest graniterapids graniterapids-d grandridge arrowlake arrowlake-s \
-clearwaterforest pantherlake native"
+nano-x2 eden-x4 nano-x4 lujiazui yongfeng shijidadao x86-64 x86-64-v2 \
+x86-64-v3 x86-64-v4 sierraforest graniterapids graniterapids-d grandridge \
+arrowlake arrowlake-s clearwaterforest pantherlake native"
 
 # Additional x86 processors supported by --with-cpu=.  Each processor
 # MUST be separated by exactly one space.
@@ -3855,6 +3855,10 @@ case ${target} in
 			arch=yongfeng
 			cpu=yongfeng
 			;;
+		shijidadao-*)
+			arch=shijidadao
+			cpu=shijidadao
+			;;
 		pentium2-*)
 			arch=pentium2
 			cpu=pentium2
 			;;
@@ -3980,6 +3984,10 @@ case ${target} in
 			arch=yongfeng
 			cpu=yongfeng
 			;;
+		shijidadao-*)
+			arch=shijidadao
+			cpu=shijidadao
+			;;
 		nocona-*)
 			arch=nocona
 			cpu=nocona
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2cd560c99254b..34d6be76e94ac 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
 extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern int arm_attempt_dlstp_transform (rtx);
 #ifdef RTX_CODE
 enum reg_class arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index b8c32db0a1d7f..7d67d2cfee9f4 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] =
 #undef TARGET_HAVE_CONDITIONAL_EXECUTION
 #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
 
@@ -34659,19 +34665,1236 @@ arm_invalid_within_doloop (const rtx_insn *insn)
 }
 
 bool
-arm_target_insn_ok_for_lob (rtx insn)
-{
-  basic_block bb = BLOCK_FOR_INSN (insn);
-  /* Make sure the basic block of the target insn is a simple latch
-     having as single predecessor and successor the body of the loop
-     itself.  Only simple loops with a single basic block as body are
-     supported for 'low over head loop' making sure that LE target is
-     above LE itself in the generated code.  */
-
-  return single_succ_p (bb)
-    && single_pred_p (bb)
-    && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-    && contains_no_active_insn_p (bb);
+arm_target_bb_ok_for_lob (basic_block bb)
+{
+  /* Make sure the basic block is a simple latch having as the single
+     predecessor and successor the body of the loop itself.
+     Only simple loops with a single basic block as body are supported for
+     low-overhead loops, making sure that the LE target is above the LE
+     instruction in the generated code.  */
+  return (single_succ_p (bb)
+	  && single_pred_p (bb)
+	  && single_succ_edge (bb)->dest == single_pred_edge (bb)->src);
+}
+
+/* Utility function: Given a VCTP or a VCTP_M insn, return the number of MVE
+   lanes based on the machine mode being used.
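+
+   For example, MVE vectors are 128 bits wide, so a vctp.16 insn produces a
+   V8BImode predicate and this function returns 8 lanes, while a vctp.32
+   insn (V4BImode) yields 4; an insn whose mode is not a valid MVE predicate
+   mode yields 0.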
*/
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+  rtx insn_set = single_set (insn);
+  if (insn_set
+      && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+      && (XINT (SET_SRC (insn_set), 1) == VCTP
+	  || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+    {
+      machine_mode mode = GET_MODE (SET_SRC (insn_set));
+      return ((VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+	      ? GET_MODE_NUNITS (mode) : 0);
+    }
+  return 0;
+}
+
+enum arm_dl_usage_type { DL_USAGE_ANY = 0,
+			 DL_USAGE_READ = 1,
+			 DL_USAGE_WRITE = 2 };
+
+/* Check if INSN requires the use of the VPR reg; if it does, return the
+   sub-rtx of the VPR reg.  The TYPE argument controls whether
+   this function should:
+   * For TYPE == DL_USAGE_ANY, check all operands, including the OUT operands,
+     and return the first occurrence of the VPR reg.
+   * For TYPE == DL_USAGE_READ, only check the input operands.
+   * For TYPE == DL_USAGE_WRITE, only check the output operands.
+   (INOUT operands are considered both as input and output operands)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn,
+			  arm_dl_usage_type type = DL_USAGE_ANY)
+{
+  gcc_assert (type < 3);
+  if (!NONJUMP_INSN_P (insn))
+    return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+    return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+     this insn.  */
+  preprocess_constraints (insn);
+
+  for (int op = 0; op < n_operands; op++)
+    {
+      requires_vpr = true;
+      if (type == DL_USAGE_READ
+	  && recog_data.operand_type[op] == OP_OUT)
+	continue;
+      else if (type == DL_USAGE_WRITE
+	       && recog_data.operand_type[op] == OP_IN)
+	continue;
+
+      /* Iterate through alternatives of operand "op" in recog_op_alt and
+	 identify if the operand is required to be the VPR.  */
+      for (int alt = 0; alt < recog_data.n_alternatives; alt++)
+	{
+	  const operand_alternative *op_alt
+	    = &recog_op_alt[alt * n_operands];
+	  /* Fetch the reg_class for each entry and check it against the
+	     VPR_REG reg_class.  */
+	  if (alternative_class (op_alt, op) != VPR_REG)
+	    requires_vpr = false;
+	}
+      /* If all alternatives of the insn require the VPR reg for this operand,
+	 it means that either this is a VPR-generating instruction, like a
+	 vctp, vcmp, etc., or it is a VPT-predicated instruction.  Return the
+	 subrtx of the VPR reg operand.  */
+      if (requires_vpr)
+	return recog_data.operand[op];
+    }
+  return NULL_RTX;
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_READ,
+   so return the VPR only if it is an input operand to the insn.  */
+
+static rtx
+arm_get_required_vpr_reg_param (rtx_insn *insn)
+{
+  return arm_get_required_vpr_reg (insn, DL_USAGE_READ);
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_WRITE,
+   so return the VPR only if it is the return value, an output of, or is
+   clobbered by the insn.  */
+
+static rtx
+arm_get_required_vpr_reg_ret_val (rtx_insn *insn)
+{
+  return arm_get_required_vpr_reg (insn, DL_USAGE_WRITE);
+}
+
+/* Return the first VCTP instruction in BB, if it exists, or NULL otherwise.  */
+
+static rtx_insn *
+arm_mve_get_loop_vctp (basic_block bb)
+{
+  rtx_insn *insn = BB_HEAD (bb);
+
+  /* Now scan through all the instruction patterns and pick out the VCTP
+     instruction.  We require arm_get_required_vpr_reg_param to be false
+     to make sure we pick up a VCTP, rather than a VCTP_M.
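+     (A VCTP_M is the variant of VCTP that executes inside a VPT block and
+     therefore takes an existing predicate as an input parameter as well as
+     producing one, so arm_get_required_vpr_reg_param is non-null for it and
+     it is rejected here.)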
*/
+  FOR_BB_INSNS (bb, insn)
+    if (NONDEBUG_INSN_P (insn))
+      if (arm_get_required_vpr_reg_ret_val (insn)
+	  && (arm_mve_get_vctp_lanes (insn) != 0)
+	  && !arm_get_required_vpr_reg_param (insn))
+	return insn;
+  return NULL;
+}
+
+/* Return true if INSN is an MVE instruction that is VPT-predicable and is
+   predicated on VPR_REG.  */
+
+static bool
+arm_mve_insn_predicated_by (rtx_insn *insn, rtx vpr_reg)
+{
+  rtx insn_vpr_reg_operand = (MVE_VPT_PREDICATED_INSN_P (insn)
+			      ? arm_get_required_vpr_reg_param (insn)
+			      : NULL_RTX);
+  return (insn_vpr_reg_operand
+	  && rtx_equal_p (vpr_reg, insn_vpr_reg_operand));
+}
+
+/* Utility function to identify if INSN is an MVE instruction that performs
+   some across lane operation (and as a result does not align with normal
+   lane predication rules).  All such instructions give only one scalar
+   output, except for vshlcq which gives a PARALLEL of a vector and a scalar
+   (one vector result and one carry output).  */
+
+static bool
+arm_mve_across_lane_insn_p (rtx_insn* insn)
+{
+  df_ref insn_defs = NULL;
+  if (!MVE_VPT_PREDICABLE_INSN_P (insn))
+    return false;
+
+  FOR_EACH_INSN_DEF (insn_defs, insn)
+    if (!VALID_MVE_MODE (GET_MODE (DF_REF_REG (insn_defs)))
+	&& !arm_get_required_vpr_reg_ret_val (insn))
+      return true;
+
+  return false;
+}
+
+/* Utility function to identify if INSN is an MVE load or store instruction.
+   * For TYPE == DL_USAGE_ANY, check all operands.  If the function returns
+     true, INSN is a load or a store insn.
+   * For TYPE == DL_USAGE_READ, only check the input operands.  If the
+     function returns true, INSN is a load insn.
+   * For TYPE == DL_USAGE_WRITE, only check the output operands.  If the
+     function returns true, INSN is a store insn.  */
+
+static bool
+arm_mve_load_store_insn_p (rtx_insn* insn,
+			   arm_dl_usage_type type = DL_USAGE_ANY)
+{
+  gcc_assert (type < 3);
+  int n_operands = recog_data.n_operands;
+  extract_insn (insn);
+
+  for (int op = 0; op < n_operands; op++)
+    {
+      if (type == DL_USAGE_READ && recog_data.operand_type[op] == OP_OUT)
+	continue;
+      else if (type == DL_USAGE_WRITE && recog_data.operand_type[op] == OP_IN)
+	continue;
+      if (mve_memory_operand (recog_data.operand[op],
+			      GET_MODE (recog_data.operand[op])))
+	return true;
+    }
+  return false;
+}
+
+/* Return TRUE if INSN is validated for implicit predication by how its
+   outputs are used.
+
+   If INSN is an MVE operation across lanes that is not predicated by
+   VCTP_VPR_GENERATED it cannot be validated by the use of its outputs.
+
+   Any other INSN is safe to implicitly predicate if we don't use its outputs
+   outside the loop.  The instructions that use this INSN's outputs will be
+   validated as we go through the analysis.  */
+
+static bool
+arm_mve_impl_pred_on_outputs_p (rtx_insn *insn, rtx vctp_vpr_generated)
+{
+  /* Reject any unpredicated across lane operation.  */
+  if (!arm_mve_insn_predicated_by (insn, vctp_vpr_generated)
+      && arm_mve_across_lane_insn_p (insn))
+    return false;
+
+  /* Next, scan forward to the various USEs of the DEFs in this insn.
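+
+     As an illustration, a vldrw whose destination is only read by other
+     insns in this same loop body may safely become implicitly predicated,
+     whereas if its destination were also read after the loop the implicit
+     predication could alter that later use, so any use outside this basic
+     block rejects the insn.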
*/
+  df_ref insn_def = NULL;
+  basic_block insn_bb = BLOCK_FOR_INSN (insn);
+  FOR_EACH_INSN_DEF (insn_def, insn)
+    {
+      for (df_ref use = DF_REG_USE_CHAIN (DF_REF_REGNO (insn_def));
+	   use;
+	   use = DF_REF_NEXT_REG (use))
+	{
+	  rtx_insn *next_use_insn = DF_REF_INSN (use);
+	  if (!INSN_P (next_use_insn) || DEBUG_INSN_P (next_use_insn))
+	    continue;
+
+	  if (insn_bb != BLOCK_FOR_INSN (next_use_insn))
+	    return false;
+	}
+    }
+  return true;
+}
+
+
+/* Returns the prevailing definition of OP before CUR_INSN in the same
+   basic block as CUR_INSN, if one exists; returns NULL otherwise.  */
+
+static rtx_insn*
+arm_last_vect_def_insn (rtx op, rtx_insn *cur_insn)
+{
+  if (!REG_P (op)
+      || !BLOCK_FOR_INSN (cur_insn))
+    return NULL;
+
+  df_ref def_insns;
+  rtx_insn *last_def = NULL;
+  for (def_insns = DF_REG_DEF_CHAIN (REGNO (op));
+       def_insns;
+       def_insns = DF_REF_NEXT_REG (def_insns))
+    {
+      rtx_insn *def_insn = DF_REF_INSN (def_insns);
+      /* Definition not in the loop body or after the current insn.  */
+      if (DF_REF_BB (def_insns) != BLOCK_FOR_INSN (cur_insn)
+	  || INSN_UID (def_insn) >= INSN_UID (cur_insn))
+	continue;
+
+      if (!last_def || INSN_UID (def_insn) > INSN_UID (last_def))
+	last_def = def_insn;
+    }
+  return last_def;
+}
+
+
+/* This function returns TRUE if we can validate the implicit predication of
+   INSN_IN with VCTP_VPR_GENERATED based on the definition of the
+   instruction's input operands.
+
+   If INSN_IN is an MVE operation across lanes then all of its MVE vector
+   operands must have their tail-predicated lanes be zeroes.  We keep track
+   of any instructions that define vector operands for which this is true in
+   PROPS_ZERO_SET.
+
+   For any other INSN_IN, the definition of all its operands must be defined
+   inside the loop body by an instruction that comes before INSN_IN and not
+   be an MVE load predicated by a different VPR.  These instructions have all
+   been validated for explicit or implicit predication.
+ */
+
+static bool
+arm_mve_impl_pred_on_inputs_p (vec<rtx_insn *> *props_zero_set,
+			       rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+  /* If all inputs come from instructions that are explicitly or
+     implicitly predicated by the same predicate then it is safe to
+     implicitly predicate this instruction.  */
+  df_ref insn_uses = NULL;
+  bool across_lane = arm_mve_across_lane_insn_p (insn_in);
+  FOR_EACH_INSN_USE (insn_uses, insn_in)
+    {
+      rtx op = DF_REF_REG (insn_uses);
+      rtx_insn *def_insn = arm_last_vect_def_insn (op, insn_in);
+      if (across_lane)
+	{
+	  if (!VALID_MVE_MODE (GET_MODE (op)))
+	    continue;
+	  if (!def_insn || !props_zero_set->contains (def_insn))
+	    return false;
+
+	  continue;
+	}
+
+      if (!def_insn
+	  || (!arm_mve_insn_predicated_by (def_insn, vctp_vpr_generated)
+	      && arm_mve_load_store_insn_p (def_insn, DL_USAGE_READ)))
+	return false;
+    }
+
+  return true;
+}
+
+
+/* Determine whether INSN_IN is safe to implicitly predicate based on the
+   type of instruction and, where needed, the definition of its inputs and
+   the uses of its outputs.
+   Return TRUE if it is safe to implicitly predicate and FALSE otherwise.
+
+   * If INSN_IN is a store, then it is always unsafe to implicitly predicate
+     it.
+   * If INSN_IN is a load, only reject implicit predication if its uses
+     directly invalidate it.
+   * If INSN_IN operates across vector lanes and does not have the
+     "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to
+     implicitly predicate.
+   * If INSN_IN operates on Floating Point elements and we are not compiling
+     with -Ofast, then it is unsafe to implicitly predicate it as we may be
+     changing exception and cumulative bits behaviour.
+   * If INSN_IN is a VCTP instruction, then it is safe to implicitly
+     predicate, but instructions that use this predicate will need to be
+     checked just like any other UNPREDICATED MVE instruction.
+   * Otherwise check if INSN_IN's inputs or uses of outputs can validate its
+     implicit predication.
+
+   * If all inputs come from instructions that are explicitly or implicitly
+     predicated by the same predicate then it is safe to implicitly predicate
+     this instruction.
+   * If INSN_IN is an operation across lanes with the
+     "mve_safe_imp_xlane_pred" attribute, then all its operands must have
+     zeroed falsely-predicated tail lanes.
+
+   * Otherwise, check if the implicit predication of INSN_IN can be validated
+     based on its inputs, and if not check whether it can be validated based
+     on how its outputs are used.  */
+
+static bool
+arm_mve_impl_predicated_p (vec<rtx_insn *> *props_zero_set,
+			   rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+
+  /* If INSN_IN is a store, then it is always unsafe to implicitly
+     predicate it.  */
+  if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_WRITE))
+    return false;
+
+  /* If INSN_IN is a load, only reject implicit predication if its uses
+     directly invalidate it.  */
+  if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_READ))
+    {
+      if (!arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated))
+	return false;
+      return true;
+    }
+
+  /* If INSN_IN operates across vector lanes and does not have the
+     "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to
+     implicitly predicate.  */
+  if (arm_mve_across_lane_insn_p (insn_in)
+      && (get_attr_mve_safe_imp_xlane_pred (insn_in)
+	  != MVE_SAFE_IMP_XLANE_PRED_YES))
+    return false;
+
+  /* If INSN_IN operates on Floating Point elements and we are not compiling
+     with -Ofast, then it is unsafe to implicitly predicate it as we may be
+     changing exception and cumulative bits behaviour.  */
+  if (!flag_unsafe_math_optimizations
+      && flag_trapping_math
+      && MVE_VPT_UNPREDICATED_INSN_P (insn_in))
+    {
+      df_ref def;
+      FOR_EACH_INSN_DEF (def, insn_in)
+	if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+	    && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+	  return false;
+      FOR_EACH_INSN_USE (def, insn_in)
+	if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+	    && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+	  return false;
+    }
+
+  /* If INSN_IN is a VCTP instruction, then it is safe to implicitly
+     predicate, but instructions that use this predicate will need to be
+     checked just like any other UNPREDICATED MVE instruction.  */
+  if (arm_get_required_vpr_reg_ret_val (insn_in)
+      && (arm_mve_get_vctp_lanes (insn_in) != 0))
+    return true;
+
+  /* Otherwise, check if the implicit predication of INSN_IN can be validated
+     based on its inputs, and if not check whether it can be validated based
+     on how its outputs are used.  */
+  return (arm_mve_impl_pred_on_inputs_p (props_zero_set, insn_in,
+					 vctp_vpr_generated)
+	  || arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated));
+}
+
+/* Helper function to `arm_mve_dlstp_check_inc_counter` and to
+   `arm_mve_dlstp_check_dec_counter`.  In the situations where the loop
+   counter is incrementing by 1 or decrementing by 1 in each iteration,
+   ensure that the number of iterations, the value of REG, going into the
+   loop, was calculated as:
+     REG = (N + [1, VCTP_STEP - 1]) / VCTP_STEP
+
+   where N is equivalent to the VCTP_REG.
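+
+   For example, with VCTP_STEP == 4 (a vctp.32 loop) this accepts the usual
+   rounded-up iteration count REG = (N + 3) >> 2, i.e. a PLUS of N and any
+   constant in [1, 3] followed by a logical right shift by log2 (VCTP_STEP).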
+*/
+
+static bool
+arm_mve_check_reg_origin_is_num_elems (loop *loop, rtx reg, rtx vctp_step,
+				       rtx vctp_reg)
+{
+  df_ref counter_max_last_def = NULL;
+
+  /* More than one reaching definition.  */
+  if (DF_REG_DEF_COUNT (REGNO (reg)) > 2)
+    return false;
+
+  /* Look for a single definition of REG going into the loop.  The DEF_CHAIN
+     will have at least two values, as this is a loop induction variable that
+     is defined outside the loop.  */
+  for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
+       def;
+       def = DF_REF_NEXT_REG (def))
+    {
+      /* Skip the update inside the loop; this has already been checked by
+	 the iv_analyze call earlier.  */
+      if (DF_REF_BB (def) == loop->header)
+	continue;
+
+      counter_max_last_def = def;
+      break;
+    }
+
+  if (!counter_max_last_def)
+    return false;
+
+  rtx counter_max_last_set = single_set (DF_REF_INSN (counter_max_last_def));
+
+  if (!counter_max_last_set)
+    return false;
+
+  /* If we encounter a simple SET from a REG, follow it through.  */
+  if (REG_P (SET_SRC (counter_max_last_set)))
+    {
+      if (DF_REG_DEF_COUNT (REGNO (SET_SRC (counter_max_last_set))) != 1)
+	return false;
+
+      counter_max_last_def
+	= DF_REG_DEF_CHAIN (REGNO (SET_SRC (counter_max_last_set)));
+      counter_max_last_set
+	= single_set (DF_REF_INSN (counter_max_last_def));
+
+      if (!counter_max_last_set)
+	return false;
+    }
+
+  /* We are looking for:
+     COUNTER_MAX_LAST_SET = (N + VCTP_STEP - 1) / VCTP_STEP.
+     We currently only support the unsigned VCTP_OP case.  */
+  rtx division = SET_SRC (counter_max_last_set);
+  if (GET_CODE (division) != LSHIFTRT)
+    return false;
+
+  /* Now check that we are dividing by VCTP_STEP, i.e. the number of lanes.  */
+  rtx divisor = XEXP (division, 1);
+  unsigned vctp_step_cst = abs_hwi (INTVAL (vctp_step));
+  if (!CONST_INT_P (divisor)
+      || (1U << INTVAL (divisor) != vctp_step_cst))
+    return false;
+
+  rtx dividend = XEXP (division, 0);
+  if (!REG_P (dividend))
+    /* Subreg?  */
+    return false;
+
+  /* For now only support the simple case; this only works for unsigned N,
+     as any signed N will have further computations to deal with overflow.  */
+  if (DF_REG_DEF_COUNT (REGNO (dividend)) != 1)
+    return false;
+
+  rtx_insn *dividend_insn = DF_REF_INSN (DF_REG_DEF_CHAIN (REGNO (dividend)));
+  rtx dividend_op = single_set (dividend_insn);
+  if (!dividend_op
+      || GET_CODE (SET_SRC (dividend_op)) != PLUS)
+    return false;
+
+  /* Check if PLUS_OP is (VCTP_OP + VAL), where VAL = [1, VCTP_STEP - 1].  */
+  rtx plus_op = SET_SRC (dividend_op);
+  if (!REG_P (XEXP (plus_op, 0))
+      || !CONST_INT_P (XEXP (plus_op, 1))
+      || !IN_RANGE (INTVAL (XEXP (plus_op, 1)), 1, vctp_step_cst - 1))
+    return false;
+
+  /* VCTP_REG may have been copied before entering the loop; let's see if we
+     can trace such a copy back.  If we have more than one reaching
+     definition then bail out as analysis will be too difficult.  */
+  if (DF_REG_DEF_COUNT (REGNO (vctp_reg)) > 2)
+    return false;
+
+  /* Look for the definition of N.
*/
+  for (df_ref def = DF_REG_DEF_CHAIN (REGNO (vctp_reg));
+       def;
+       def = DF_REF_NEXT_REG (def))
+    {
+      if (DF_REF_BB (def) == loop->header)
+	continue;
+      rtx set = single_set (DF_REF_INSN (def));
+      if (set
+	  && REG_P (SET_SRC (set))
+	  && !HARD_REGISTER_P (SET_SRC (set)))
+	vctp_reg = SET_SRC (set);
+    }
+
+  return rtx_equal_p (vctp_reg, XEXP (plus_op, 0));
+}
+
+/* If we have identified the loop to have an incrementing counter, we need to
+   make sure that it increments by 1 and that the loop is structured
+   correctly:
+   * The counter starts from 0
+   * The counter terminates at (num_of_elem + num_of_lanes - 1) / num_of_lanes
+   * The vctp insn uses a reg that decrements appropriately in each iteration.
+*/
+
+static rtx_insn*
+arm_mve_dlstp_check_inc_counter (loop *loop, rtx_insn* vctp_insn,
+				 rtx condconst, rtx condcount)
+{
+  rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0);
+  /* The loop latch has to be empty.  When compiling all the known MVE LoLs
+     in user applications, none of those with incrementing counters had any
+     real insns in the loop latch.  As such, this function has only been
+     tested with an empty latch and may misbehave or ICE if we somehow get
+     here with an increment in the latch, so, for correctness, error out
+     early.  */
+  if (!empty_block_p (loop->latch))
+    return NULL;
+
+  class rtx_iv vctp_reg_iv;
+  /* For loops of DLSTP_TYPE_B, the loop counter is independent of the
+     decrement of the reg used in the vctp_insn.  So run iv analysis on that
+     reg.  This has to succeed for such loops to be supported.  */
+  if (!iv_analyze (vctp_insn, as_a <scalar_int_mode> (GET_MODE (vctp_reg)),
+		   vctp_reg, &vctp_reg_iv))
+    return NULL;
+
+  /* Extract the decrementnum of the vctp reg from the iv.  This decrementnum
+     is the number of lanes/elements it decrements from the remaining number
+     of lanes/elements to process in the loop; for this reason it is always a
+     negative number, but to simplify later checks we use its absolute value.  */
+  HOST_WIDE_INT decrementnum = INTVAL (vctp_reg_iv.step);
+  if (decrementnum >= 0)
+    return NULL;
+  decrementnum = abs_hwi (decrementnum);
+
+  /* Find where both of those are modified in the loop header bb.  */
+  df_ref condcount_reg_set_df = df_bb_regno_only_def_find (loop->header,
+							   REGNO (condcount));
+  df_ref vctp_reg_set_df = df_bb_regno_only_def_find (loop->header,
+						      REGNO (vctp_reg));
+  if (!condcount_reg_set_df || !vctp_reg_set_df)
+    return NULL;
+  rtx condcount_reg_set = single_set (DF_REF_INSN (condcount_reg_set_df));
+  rtx vctp_reg_set = single_set (DF_REF_INSN (vctp_reg_set_df));
+  if (!condcount_reg_set || !vctp_reg_set)
+    return NULL;
+
+  /* Ensure the modification of the vctp reg from df is consistent with
+     the iv and the number of lanes on the vctp insn.  */
+  if (GET_CODE (SET_SRC (vctp_reg_set)) != PLUS
+      || !REG_P (SET_DEST (vctp_reg_set))
+      || !REG_P (XEXP (SET_SRC (vctp_reg_set), 0))
+      || REGNO (SET_DEST (vctp_reg_set))
+	 != REGNO (XEXP (SET_SRC (vctp_reg_set), 0))
+      || !CONST_INT_P (XEXP (SET_SRC (vctp_reg_set), 1))
+      || INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)) >= 0
+      || decrementnum != abs_hwi (INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)))
+      || decrementnum != arm_mve_get_vctp_lanes (vctp_insn))
+    return NULL;
+
+  if (REG_P (condcount) && REG_P (condconst))
+    {
+      /* First we need to prove that the loop is going 0..condconst with an
+	 inc of 1 in each iteration.
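+
+	 For example (illustrative), a DLSTP_TYPE_B loop compiled from
+	   for (i = 0; i < (n + 3) / 4; i++)
+	 satisfies these checks: the header increments i by 1, the only other
+	 definition of i sets it to 0, and the bound is the rounded-up
+	 element count checked by arm_mve_check_reg_origin_is_num_elems.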
*/ + if (GET_CODE (SET_SRC (condcount_reg_set)) == PLUS + && CONST_INT_P (XEXP (SET_SRC (condcount_reg_set), 1)) + && INTVAL (XEXP (SET_SRC (condcount_reg_set), 1)) == 1) + { + rtx counter_reg = SET_DEST (condcount_reg_set); + /* Check that the counter did indeed start from zero. */ + df_ref this_set = DF_REG_DEF_CHAIN (REGNO (counter_reg)); + if (!this_set) + return NULL; + df_ref last_set_def = DF_REF_NEXT_REG (this_set); + if (!last_set_def) + return NULL; + rtx_insn* last_set_insn = DF_REF_INSN (last_set_def); + rtx last_set = single_set (last_set_insn); + if (!last_set) + return NULL; + rtx counter_orig_set; + counter_orig_set = SET_SRC (last_set); + if (!CONST_INT_P (counter_orig_set) + || (INTVAL (counter_orig_set) != 0)) + return NULL; + /* And finally check that the target value of the counter, + condconst, is of the correct shape. */ + if (!arm_mve_check_reg_origin_is_num_elems (loop, condconst, + vctp_reg_iv.step, + vctp_reg)) + return NULL; + } + else + return NULL; + } + else + return NULL; + + /* Everything looks valid. */ + return vctp_insn; +} + +/* Helper function to `arm_mve_loop_valid_for_dlstp`. In the case of a + counter that is decrementing, ensure that it is decrementing by the + right amount in each iteration and that the target condition is what + we expect. */ + +static rtx_insn* +arm_mve_dlstp_check_dec_counter (loop *loop, rtx_insn* vctp_insn, + rtx condconst, rtx condcount) +{ + rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0); + class rtx_iv vctp_reg_iv; + HOST_WIDE_INT decrementnum; + /* For decrementing loops of DLSTP_TYPE_A, the counter is usually present in the + loop latch. Here we simply need to verify that this counter is the same + reg that is also used in the vctp_insn and that it is not otherwise + modified. */ + rtx_insn *dec_insn = BB_END (loop->latch); + /* If not in the loop latch, try to find the decrement in the loop header. */ + if (!NONDEBUG_INSN_P (dec_insn)) + { + df_ref temp = df_bb_regno_only_def_find (loop->header, REGNO (condcount)); + /* If we haven't been able to find the decrement, bail out. */ + if (!temp) + return NULL; + dec_insn = DF_REF_INSN (temp); + } + + rtx dec_set = single_set (dec_insn); + + /* Next, ensure that it is a PLUS of the form: + (set (reg a) (plus (reg a) (const_int))) + where (reg a) is the same as condcount. */ + if (!dec_set + || !REG_P (SET_DEST (dec_set)) + || !REG_P (XEXP (SET_SRC (dec_set), 0)) + || !CONST_INT_P (XEXP (SET_SRC (dec_set), 1)) + || REGNO (SET_DEST (dec_set)) + != REGNO (XEXP (SET_SRC (dec_set), 0)) + || REGNO (SET_DEST (dec_set)) != REGNO (condcount)) + return NULL; + + decrementnum = INTVAL (XEXP (SET_SRC (dec_set), 1)); + + /* This decrementnum is the number of lanes/elements it decrements from the + remaining number of lanes/elements to process in the loop, for this reason + this is always a negative number, but to simplify later checks we use its + absolute value. */ + if (decrementnum >= 0) + return NULL; + decrementnum = -decrementnum; + + /* If the decrementnum is a 1, then we need to look at the loop vctp_reg and + verify that it also decrements correctly. + Then, we need to establish that the starting value of the loop decrement + originates from the starting value of the vctp decrement. */ + if (decrementnum == 1) + { + class rtx_iv vctp_reg_iv, condcount_reg_iv; + /* The loop counter is found to be independent of the decrement + of the reg used in the vctp_insn, again. Ensure that IV analysis + succeeds and check the step. 
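+	 E.g. for a vctp.16 loop the step of the vctp reg must be -8,
+	 matching the 8 lanes processed per iteration.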
*/
+      if (!iv_analyze (vctp_insn, as_a <scalar_int_mode> (GET_MODE (vctp_reg)),
+		       vctp_reg, &vctp_reg_iv))
+	return NULL;
+      /* Ensure it matches the number of lanes of the vctp instruction.  */
+      if (abs (INTVAL (vctp_reg_iv.step))
+	  != arm_mve_get_vctp_lanes (vctp_insn))
+	return NULL;
+
+      if (!arm_mve_check_reg_origin_is_num_elems (loop, condcount,
+						  vctp_reg_iv.step,
+						  vctp_reg))
+	return NULL;
+    }
+  /* If the decrements are the same, then the situation is simple: either
+     they are also the same reg, which is safe, or they are different
+     registers, in which case make sure that there is only a simple SET from
+     one to the other inside the loop.  */
+  else if (decrementnum == arm_mve_get_vctp_lanes (vctp_insn))
+    {
+      if (REGNO (condcount) != REGNO (vctp_reg))
+	{
+	  /* It wasn't the same reg, but it could be behind a
+	     (set (vctp_reg) (condcount)), so instead find where
+	     the VCTP insn is DEF'd inside the loop.  */
+	  rtx_insn *vctp_reg_insn
+	    = DF_REF_INSN (df_bb_regno_only_def_find (loop->header,
+						      REGNO (vctp_reg)));
+	  rtx vctp_reg_set = single_set (vctp_reg_insn);
+	  /* This must just be a simple SET from the condcount.  */
+	  if (!vctp_reg_set
+	      || !REG_P (SET_DEST (vctp_reg_set))
+	      || !REG_P (SET_SRC (vctp_reg_set))
+	      || REGNO (SET_SRC (vctp_reg_set)) != REGNO (condcount))
+	    return NULL;
+	}
+    }
+  else
+    return NULL;
+
+  /* We now only need to find out that the loop terminates with a LE
+     zero condition.  If condconst is a const_int, then this is easy.
+     If it's a REG, look at the last condition+jump in a bb before
+     the loop, because that usually will have a branch jumping over
+     the loop header.  */
+  rtx_insn *jump_insn = BB_END (loop->header);
+  if (CONST_INT_P (condconst)
+      && !(INTVAL (condconst) == 0 && JUMP_P (jump_insn)
+	   && GET_CODE (XEXP (PATTERN (jump_insn), 1)) == IF_THEN_ELSE
+	   && (GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == NE
+	       || GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == GT)))
+    return NULL;
+  else if (REG_P (condconst))
+    {
+      basic_block pre_loop_bb = single_pred (loop_preheader_edge (loop)->src);
+      if (!pre_loop_bb)
+	return NULL;
+
+      rtx initial_compare = NULL_RTX;
+      if (!(prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb))
+	    && INSN_P (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)))))
+	return NULL;
+      else
+	initial_compare
+	  = single_set (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)));
+      if (!(initial_compare
+	    && cc_register (SET_DEST (initial_compare), VOIDmode)
+	    && GET_CODE (SET_SRC (initial_compare)) == COMPARE
+	    && CONST_INT_P (XEXP (SET_SRC (initial_compare), 1))
+	    && INTVAL (XEXP (SET_SRC (initial_compare), 1)) == 0))
+	return NULL;
+
+      /* Usually this is a LE condition, but it can also just be a GT or an
+	 EQ condition (if the value is unsigned or the compiler knows it's
+	 not negative).  */
+      rtx_insn *loop_jumpover = BB_END (pre_loop_bb);
+      if (!(JUMP_P (loop_jumpover)
+	    && GET_CODE (XEXP (PATTERN (loop_jumpover), 1)) == IF_THEN_ELSE
+	    && (GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == LE
+		|| GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == GT
+		|| GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == EQ)))
+	return NULL;
+    }
+
+  /* Everything looks valid.  */
+  return vctp_insn;
+}
+
+/* Function to check a loop's structure to see if it is a valid candidate for
+   an MVE Tail Predicated Low-Overhead Loop.  Returns the loop's VCTP_INSN if
+   it is valid, or NULL if it isn't.
*/
+
+static rtx_insn*
+arm_mve_loop_valid_for_dlstp (loop *loop)
+{
+  /* Doloop can only be done "elementwise" with predicated dlstp/letp if it
+     contains a VCTP on the number of elements processed by the loop.
+     Find the VCTP predicate generation inside the loop body BB.  */
+  rtx_insn *vctp_insn = arm_mve_get_loop_vctp (loop->header);
+  if (!vctp_insn)
+    return NULL;
+
+  /* We only support two loop forms for tail predication:
+     DLSTP_TYPE_A) Loops of the form:
+	int num_of_lanes = 128 / elem_size;
+	while (num_of_elem > 0)
+	  {
+	    p = vctp<size> (num_of_elem);
+	    num_of_elem -= num_of_lanes;
+	  }
+     DLSTP_TYPE_B) Loops of the form:
+	int num_of_lanes = 128 / elem_size;
+	int num_of_iters = (num_of_elem + num_of_lanes - 1) / num_of_lanes;
+	for (i = 0; i < num_of_iters; i++)
+	  {
+	    p = vctp<size> (num_of_elem);
+	    num_of_elem -= num_of_lanes;
+	  }
+
+     Then, depending on the type of loop above, we will need to do different
+     sets of checks.  */
+  iv_analysis_loop_init (loop);
+
+  /* In order to find out if the loop is of DLSTP_TYPE_A or DLSTP_TYPE_B
+     above, look for the loop counter: it will either be incrementing by one
+     per iteration or it will be decrementing by num_of_lanes.  We can find
+     the loop counter in the condition at the end of the loop.  */
+  rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (loop->header));
+  if (!(cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode)
+	&& GET_CODE (XEXP (PATTERN (loop_cond), 1)) == COMPARE))
+    return NULL;
+
+  /* The operands in the condition:  Try to identify which one is the
+     constant and which is the counter and run IV analysis on the latter.  */
+  rtx cond_arg_1 = XEXP (XEXP (PATTERN (loop_cond), 1), 0);
+  rtx cond_arg_2 = XEXP (XEXP (PATTERN (loop_cond), 1), 1);
+
+  rtx loop_cond_constant;
+  rtx loop_counter;
+  class rtx_iv cond_counter_iv, cond_temp_iv;
+
+  if (CONST_INT_P (cond_arg_1))
+    {
+      /* cond_arg_1 is the constant and cond_arg_2 is the counter.  */
+      loop_cond_constant = cond_arg_1;
+      loop_counter = cond_arg_2;
+      iv_analyze (loop_cond, as_a <scalar_int_mode> (GET_MODE (cond_arg_2)),
+		  cond_arg_2, &cond_counter_iv);
+    }
+  else if (CONST_INT_P (cond_arg_2))
+    {
+      /* cond_arg_2 is the constant and cond_arg_1 is the counter.  */
+      loop_cond_constant = cond_arg_2;
+      loop_counter = cond_arg_1;
+      iv_analyze (loop_cond, as_a <scalar_int_mode> (GET_MODE (cond_arg_1)),
+		  cond_arg_1, &cond_counter_iv);
+    }
+  else if (REG_P (cond_arg_1) && REG_P (cond_arg_2))
+    {
+      /* If both operands to the compare are REGs, we can safely
+	 run IV analysis on both and then determine which is the
+	 constant by looking at the step.
+	 First assume cond_arg_1 is the counter.  */
+      loop_counter = cond_arg_1;
+      loop_cond_constant = cond_arg_2;
+      iv_analyze (loop_cond, as_a <scalar_int_mode> (GET_MODE (cond_arg_1)),
+		  cond_arg_1, &cond_counter_iv);
+      iv_analyze (loop_cond, as_a <scalar_int_mode> (GET_MODE (cond_arg_2)),
+		  cond_arg_2, &cond_temp_iv);
+
+      /* Look at the steps and swap around the rtx's if needed.  Error out if
+	 one of them cannot be identified as constant.
*/
+      if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step))
+	return NULL;
+      if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0)
+	return NULL;
+      if (INTVAL (cond_counter_iv.step) == 0 && INTVAL (cond_temp_iv.step) != 0)
+	{
+	  loop_counter = cond_arg_2;
+	  loop_cond_constant = cond_arg_1;
+	  cond_counter_iv = cond_temp_iv;
+	}
+    }
+  else
+    return NULL;
+
+  if (!REG_P (loop_counter))
+    return NULL;
+  if (!(REG_P (loop_cond_constant) || CONST_INT_P (loop_cond_constant)))
+    return NULL;
+
+  /* Now we have extracted the IV step of the loop counter, call the
+     appropriate checking function.  */
+  if (INTVAL (cond_counter_iv.step) > 0)
+    return arm_mve_dlstp_check_inc_counter (loop, vctp_insn,
+					    loop_cond_constant, loop_counter);
+  else if (INTVAL (cond_counter_iv.step) < 0)
+    return arm_mve_dlstp_check_dec_counter (loop, vctp_insn,
+					    loop_cond_constant, loop_counter);
+  else
+    return NULL;
+}
+
+/* Predict whether the given loop in gimple will be transformed in the RTL
+   doloop_optimize pass.  It could be argued that turning large enough loops
+   into low-overhead loops would not show a significant performance boost.
+   However, in the case of tail predication we would still avoid using
+   VPT/VPST instructions inside the loop, and in either case using
+   low-overhead loops would not be detrimental, so we decided to not consider
+   size, avoiding the need of a heuristic to determine what an appropriate
+   size boundary is.  */
+
+static bool
+arm_predict_doloop_p (struct loop *loop)
+{
+  gcc_assert (loop);
+  /* On arm, targetm.can_use_doloop_p is actually
+     can_use_doloop_if_innermost.  Ensure the loop is innermost, that it is
+     valid as per arm_target_bb_ok_for_lob, and that the correct architecture
+     flags are enabled.  */
+  if (!(TARGET_HAVE_LOB && optimize > 0))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " target architecture or optimisation flags.\n");
+      return false;
+    }
+  else if (loop->inner != NULL)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " loop nesting.\n");
+      return false;
+    }
+  else if (!arm_target_bb_ok_for_lob (loop->header->next_bb))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " loop bb complexity.\n");
+      return false;
+    }
+
+  return true;
+}
+
+/* Implement targetm.loop_unroll_adjust.  Use this to block unrolling of
+   loops that may later be turned into MVE Tail Predicated Low Overhead
+   Loops.  The performance benefit of an MVE LoL is likely to be much higher
+   than that of the unrolling.  */
+
+unsigned
+arm_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  if (TARGET_HAVE_MVE
+      && arm_target_bb_ok_for_lob (loop->latch)
+      && arm_mve_loop_valid_for_dlstp (loop))
+    return 0;
+  else
+    return nunroll;
+}
+
+/* Function to handle emitting a VPT-unpredicated version of a VPT-predicated
+   insn to a sequence.
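+
+   For example (illustrative), a loop-body insn matching the predicated
+   pattern behind
+     vaddt.i32 q0, q1, q2
+   is re-emitted as its unpredicated equivalent
+     vadd.i32 q0, q1, q2
+   since the enclosing dlstp/letp loop will apply the same predication
+   implicitly.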
*/
+
+static bool
+arm_emit_mve_unpredicated_insn_to_seq (rtx_insn* insn)
+{
+  rtx insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn);
+  int new_icode = get_attr_mve_unpredicated_insn (insn);
+  if (!in_sequence_p ()
+      || !MVE_VPT_PREDICATED_INSN_P (insn)
+      || (!insn_vpr_reg_operand)
+      || (!new_icode))
+    return false;
+
+  extract_insn (insn);
+  rtx arr[8];
+  int j = 0;
+
+  /* When transforming a VPT-predicated instruction into its unpredicated
+     equivalent we need to drop the VPR operand and we may need to also drop
+     a merge "vuninit" input operand, depending on the instruction pattern.
+     Here ensure that we have at most a two-operand difference between the
+     two instructions.  */
+  int n_operands_diff
+    = recog_data.n_operands - insn_data[new_icode].n_operands;
+  if (!(n_operands_diff > 0 && n_operands_diff <= 2))
+    return false;
+
+  rtx move = NULL_RTX;
+  /* Then, loop through the operands of the predicated
+     instruction, and retain the ones that map to the
+     unpredicated instruction.  */
+  for (int i = 0; i < recog_data.n_operands; i++)
+    {
+      /* Ignore the VPR and, if needed, the vuninit
+	 operand.  */
+      if (insn_vpr_reg_operand == recog_data.operand[i])
+	continue;
+      if (n_operands_diff == 2
+	  && !strcmp (recog_data.constraints[i], "0"))
+	{
+	  move = gen_rtx_SET (arr[0], recog_data.operand[i]);
+	  arr[0] = recog_data.operand[i];
+	}
+      else
+	arr[j++] = recog_data.operand[i];
+    }
+
+  /* Finally, emit the unpredicated instruction.  */
+  rtx_insn *new_insn;
+  switch (j)
+    {
+      case 1:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0]));
+	break;
+      case 2:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1]));
+	break;
+      case 3:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2]));
+	break;
+      case 4:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						   arr[3]));
+	break;
+      case 5:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						   arr[3], arr[4]));
+	break;
+      case 6:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						   arr[3], arr[4], arr[5]));
+	break;
+      case 7:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						   arr[3], arr[4], arr[5],
+						   arr[6]));
+	break;
+      default:
+	gcc_unreachable ();
+    }
+  INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+  if (move)
+    {
+      new_insn = emit_insn (move);
+      INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+    }
+  return true;
+}
+
+/* Return TRUE if INSN defines an MVE vector operand that has zeroed
+   tail-predicated lanes.  This is either true if:
+   * INSN is predicated by VCTP_VPR_GENERATED and the 'invalid lanes' operand
+     is in the PROPS_ZERO_SET,
+   * all MVE vector operands are in the PROPS_ZERO_SET
+*/
+
+static bool
+arm_mve_propagate_zero_pred_p (vec<rtx_insn *> *props_zero_set,
+			       rtx_insn *insn, rtx vctp_vpr_generated)
+{
+  if (arm_mve_load_store_insn_p (insn, DL_USAGE_READ))
+    return true;
+  if (arm_mve_load_store_insn_p (insn, DL_USAGE_WRITE))
+    return false;
+
+  int inactive_idx = -1;
+
+  extract_insn (insn);
+  /* If INSN is predicated by VCTP_VPR_GENERATED, then all tail-predicated
+     lanes will keep the value that is in the 'invalid lanes' register, which
+     we identify by the "0" constraint, to ensure it is the same as the
+     'result' register of this instruction.
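+
+     E.g. (illustrative) for a vaddt.i32 q0, q0, q1 predicated on the VCTP
+     VPR, the inactive lanes of the result pass through from q0, so the
+     result has zeroed tail lanes only if the definition of q0 is already in
+     PROPS_ZERO_SET.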
*/
+  if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+    {
+      for (int i = 0; i < recog_data.n_operands; i++)
+	{
+	  if (strcmp (recog_data.constraints[i], "0") == 0
+	      && VALID_MVE_MODE (GET_MODE (recog_data.operand[i])))
+	    {
+	      inactive_idx = i;
+	      break;
+	    }
+	}
+    }
+
+  if (inactive_idx > 0)
+    {
+      rtx op = recog_data.operand[inactive_idx];
+      rtx_insn *def_insn = arm_last_vect_def_insn (op, insn);
+      return def_insn != NULL_RTX && props_zero_set->contains (def_insn);
+    }
+
+  /* If this instruction is not predicated by VCTP_VPR_GENERATED, then we
+     must check that all vector operands have zeroed tail-predicated lanes,
+     and that it has at least one vector operand.  */
+  bool at_least_one_vector = false;
+  df_ref insn_uses;
+  FOR_EACH_INSN_USE (insn_uses, insn)
+    {
+      rtx reg = DF_REF_REG (insn_uses);
+      if (!VALID_MVE_MODE (GET_MODE (reg)))
+	continue;
+
+      rtx_insn *def_insn = arm_last_vect_def_insn (reg, insn);
+      if (def_insn && props_zero_set->contains (def_insn))
+	at_least_one_vector |= true;
+      else
+	return false;
+
+    }
+  return at_least_one_vector;
+}
+
+
+/* Attempt to transform the loop contents of the loop basic block from VPT
+   predicated insns into unpredicated insns for a dlstp/letp loop.  Returns
+   the number to decrement from the total number of elements each iteration.
+   Returns 1 if tail predication cannot be performed, in which case we fall
+   back to scalar low-overhead loops.  */
+
+int
+arm_attempt_dlstp_transform (rtx label)
+{
+  if (!dlstp_enabled)
+    return 1;
+
+  basic_block body = single_succ (BLOCK_FOR_INSN (label));
+
+  /* Ensure that the bb is within a loop that has all required metadata.  */
+  if (!body->loop_father || !body->loop_father->header
+      || !body->loop_father->simple_loop_desc)
+    return 1;
+
+  loop *loop = body->loop_father;
+  /* Instruction that sets the predicate mask depending on how many elements
+     are left to process.  */
+  rtx_insn *vctp_insn = arm_mve_loop_valid_for_dlstp (loop);
+  if (!vctp_insn)
+    return 1;
+
+  gcc_assert (single_set (vctp_insn));
+
+  rtx vctp_vpr_generated = single_set (vctp_insn);
+  if (!vctp_vpr_generated)
+    return 1;
+
+  vctp_vpr_generated = SET_DEST (vctp_vpr_generated);
+
+  if (!vctp_vpr_generated || !REG_P (vctp_vpr_generated)
+      || !VALID_MVE_PRED_MODE (GET_MODE (vctp_vpr_generated)))
+    return 1;
+
+  /* decrementnum is already known to be valid at this point.  */
+  int decrementnum = arm_mve_get_vctp_lanes (vctp_insn);
+
+  rtx_insn *insn = 0;
+  rtx_insn *cur_insn = 0;
+  rtx_insn *seq;
+  auto_vec<rtx_insn *> props_zero_set;
+
+  /* Scan through the insns in the loop bb and emit the transformed bb
+     insns to a sequence.  */
+  start_sequence ();
+  FOR_BB_INSNS (body, insn)
+    {
+      if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))
+	continue;
+      else if (NOTE_P (insn))
+	emit_note ((enum insn_note)NOTE_KIND (insn));
+      else if (DEBUG_INSN_P (insn))
+	emit_debug_insn (PATTERN (insn));
+      else if (!INSN_P (insn))
+	{
+	  end_sequence ();
+	  return 1;
+	}
+      /* If the transformation is successful we no longer need the vctp
+	 instruction.  */
+      else if (insn == vctp_insn)
+	continue;
+      /* If the insn pattern requires the use of the VPR value from the
+	 vctp as an input parameter for predication.  */
+      else if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+	{
+	  /* Check whether this INSN propagates the zeroed tail-predication
+	     lanes.
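+	     (E.g. a predicated load zeroes its inactive lanes, so it
+	     propagates; a predicated add only does so if its "0"-tied
+	     input already did.)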
*/ + if (arm_mve_propagate_zero_pred_p (&props_zero_set, insn, + vctp_vpr_generated)) + props_zero_set.safe_push (insn); + bool success = arm_emit_mve_unpredicated_insn_to_seq (insn); + if (!success) + { + end_sequence (); + return 1; + } + } + /* If the insn isn't VPT predicated on vctp_vpr_generated, we need to + make sure that it is still valid within the dlstp/letp loop. */ + else + { + /* If this instruction USE-s the vctp_vpr_generated other than for + predication, this blocks the transformation as we are not allowed + to optimise the VPR value away. */ + df_ref insn_uses = NULL; + FOR_EACH_INSN_USE (insn_uses, insn) + { + if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses))) + { + end_sequence (); + return 1; + } + } + /* If within the loop we have an MVE vector instruction that is + unpredicated, the dlstp/letp looping will add implicit + predication to it. This will result in a change in behaviour + of the instruction, so we need to find out if any instructions + that feed into the current instruction were implicitly + predicated. */ + if (MVE_VPT_PREDICABLE_INSN_P (insn) + && !arm_mve_impl_predicated_p (&props_zero_set, insn, + vctp_vpr_generated)) + { + end_sequence (); + return 1; + } + emit_insn (PATTERN (insn)); + } + } + seq = get_insns (); + end_sequence (); + + /* Re-write the entire BB contents with the transformed + sequence. */ + FOR_BB_INSNS_SAFE (body, insn, cur_insn) + if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))) + delete_insn (insn); + + emit_insn_after (seq, BB_END (body)); + + /* The transformation has succeeded, so now modify the "count" + (a.k.a. niter_expr) for the middle-end. Also set noloop_assumptions + to NULL to stop the middle-end from making assumptions about the + number of iterations. */ + simple_loop_desc (body->loop_father)->niter_expr + = XVECEXP (SET_SRC (PATTERN (vctp_insn)), 0, 0); + simple_loop_desc (body->loop_father)->noloop_assumptions = NULL_RTX; + return decrementnum; } #if CHECKING_P diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index 0cd3fc2cd0cc6..d88c7a52e7591 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -363,5 +363,8 @@ Target Joined RejectNegative String Var(arm_stack_protector_guard_offset_str) Use an immediate to offset from the TLS register. This option is for use with fstack-protector-guard=tls and not for use in user-land code. 
+mdlstp +Target Var(dlstp_enabled) Init(1) Undocumented + TargetVariable long arm_stack_protector_guard_offset = 0 diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 8d066fcf05df6..987602da1bf5f 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -2686,6 +2686,17 @@ (define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")]) (define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")]) +(define_int_attr dlstp_elemsize [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32") + (DLSTP64 "64")]) + +(define_int_attr letp_num_lanes [(LETP8 "16") (LETP16 "8") (LETP32 "4") + (LETP64 "2")]) +(define_int_attr letp_num_lanes_neg [(LETP8 "-16") (LETP16 "-8") (LETP32 "-4") + (LETP64 "-2")]) + +(define_int_attr letp_num_lanes_minus_1 [(LETP8 "15") (LETP16 "7") (LETP32 "3") + (LETP64 "1")]) + (define_int_attr opsuffix [(UNSPEC_DOT_S "s8") (UNSPEC_DOT_U "u8") (UNSPEC_DOT_US "s8") @@ -2926,6 +2937,10 @@ (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) (define_int_iterator VQSHLUQ_M_N [VQSHLUQ_M_N_S]) (define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S]) +(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32 + DLSTP64]) +(define_int_iterator LETP [LETP8 LETP16 LETP32 + LETP64]) ;; Define iterators for VCMLA operations (define_int_iterator VCMLA_OP [UNSPEC_VCMLA diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 9fe51298cdc2b..4b4d6298ffb18 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -6930,3 +6930,53 @@ } } ) + +;; Originally expanded by 'predicated_doloop_end'. +;; In the rare situation where the branch is too far, we do also need to +;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration. +(define_insn "predicated_doloop_end_internal" + [(set (pc) + (if_then_else + (gtu (plus:SI (reg:SI LR_REGNUM) + (const_int )) + (const_int )) + (match_operand 0 "" "") + (pc))) + (set (reg:SI LR_REGNUM) + (plus:SI (reg:SI LR_REGNUM) (const_int ))) + ;; We use UNSPEC here to guarantee this pattern can not be + ;; generated by a RTL optimization and be matched by other + ;; patterns, since this pattern is also responsible for turning off + ;; the tail predication machinery if we were to exit the loop. + ;; This is done by either the LETP or the LCTP instructions that + ;; this pattern generates. + (use (unspec:SI [(const_int 0)] LETP)) + (clobber (reg:CC CC_REGNUM))] + "TARGET_HAVE_MVE" + { + if (get_attr_length (insn) == 4) + return "letp\t%|lr, %l0"; + else + return "subs\t%|lr, #\n\tbhi\t%l0\n\tlctp"; + } + [(set (attr "length") + (if_then_else + (ltu (minus (pc) (match_dup 0)) (const_int 1024)) + (const_int 4) + (const_int 12))) + (set_attr "type" "branch") + (set_attr "conds" "unconditional")]) + +(define_insn "dlstp_insn" + [ + (set (reg:SI LR_REGNUM) +;; Similar to the previous pattern, we use UNSPEC here to make sure this +;; rtx construct is not matched by other patterns, as this pattern is also +;; responsible for setting the element size of the tail predication machinery +;; using the dlsp. instruction. 
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 84c9c3dfe8009..66b3ae6040c0c 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1613,7 +1613,7 @@ (use (match_operand 1 "" ""))] ; label "TARGET_32BIT" " - { +{ /* Currently SMS relies on the do-loop pattern to recognize loops where (1) the control part consists of all insns defining and/or using a certain 'count' register and (2) the loop count can be @@ -1623,41 +1623,75 @@ Also used to implement the low over head loops feature, which is part of the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */ - if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) - { - rtx s0; - rtx bcomp; - rtx loc_ref; - rtx cc_reg; - rtx insn; - rtx cmp; - - if (GET_MODE (operands[0]) != SImode) - FAIL; - - s0 = operands [0]; - - /* Low over head loop instructions require the first operand to be LR. */ - if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1])) - s0 = gen_rtx_REG (SImode, LR_REGNUM); - - if (TARGET_THUMB2) - insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1))); - else - insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); - - cmp = XVECEXP (PATTERN (insn), 0, 0); - cc_reg = SET_DEST (cmp); - bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); - loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]); - emit_jump_insn (gen_rtx_SET (pc_rtx, - gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, - loc_ref, pc_rtx))); - DONE; - } - else - FAIL; - }") + if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) + { + rtx s0; + rtx bcomp; + rtx loc_ref; + rtx cc_reg; + rtx insn; + rtx cmp; + int decrement_num; + + if (GET_MODE (operands[0]) != SImode) + FAIL; + + s0 = operands[0]; + + if (TARGET_HAVE_LOB + && arm_target_bb_ok_for_lob (BLOCK_FOR_INSN (operands[1]))) + { + /* If we have a compatible MVE target, try and analyse the loop + contents to determine if we can use predicated dlstp/letp + looping. These patterns implicitly use LR as the loop counter. */ + if (TARGET_HAVE_MVE + && ((decrement_num = arm_attempt_dlstp_transform (operands[1])) + != 1)) + { + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + switch (decrement_num) + { + case 2: + insn = gen_predicated_doloop_end_internal2 (loc_ref); + break; + case 4: + insn = gen_predicated_doloop_end_internal4 (loc_ref); + break; + case 8: + insn = gen_predicated_doloop_end_internal8 (loc_ref); + break; + case 16: + insn = gen_predicated_doloop_end_internal16 (loc_ref); + break; + default: + gcc_unreachable (); + } + emit_jump_insn (insn); + DONE; + } + /* Remaining LOB cases need to explicitly use LR. */ + s0 = gen_rtx_REG (SImode, LR_REGNUM); + } + + /* Otherwise, try standard decrement-by-one dls/le looping.
*/ + if (TARGET_THUMB2) + insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, + GEN_INT (-1))); + else + insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); + + cmp = XVECEXP (PATTERN (insn), 0, 0); + cc_reg = SET_DEST (cmp); + bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + emit_jump_insn (gen_rtx_SET (pc_rtx, + gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, + loc_ref, pc_rtx))); + DONE; + } + else + FAIL; +}") (define_insn "*clear_apsr" [(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR) @@ -1755,7 +1789,37 @@ { if (REGNO (operands[0]) == LR_REGNUM) { - emit_insn (gen_dls_insn (operands[0])); + /* Pick out the number by which we are decrementing the loop counter + in every iteration. If it's > 1, then use dlstp. */ + int const_int_dec_num + = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1), + 1), + 1))); + switch (const_int_dec_num) + { + case 16: + emit_insn (gen_dlstp8_insn (operands[0])); + break; + + case 8: + emit_insn (gen_dlstp16_insn (operands[0])); + break; + + case 4: + emit_insn (gen_dlstp32_insn (operands[0])); + break; + + case 2: + emit_insn (gen_dlstp64_insn (operands[0])); + break; + + case 1: + emit_insn (gen_dls_insn (operands[0])); + break; + + default: + gcc_unreachable (); + } DONE; } else diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md index e2b70da1001f2..9527bdb9e87e7 100644 --- a/gcc/config/arm/types.md +++ b/gcc/config/arm/types.md @@ -574,6 +574,7 @@ ; mve_move ; mve_store ; mve_load +; mve_misc (define_attr "type" "adc_imm,\ @@ -1126,7 +1127,8 @@ ls64,\ mve_move,\ mve_store,\ - mve_load" + mve_load, \ + mve_misc" (cond [(eq_attr "autodetect_type" "alu_shift_lsr_op2,alu_shift_asr_op2") (const_string "alu_shift_imm_other") (eq_attr "autodetect_type" "alu_shift_lsl_op2") @@ -1292,7 +1294,7 @@ ;; No otherwise. (define_attr "is_mve_type" "yes,no" (if_then_else (eq_attr "type" - "mve_move, mve_load, mve_store, mrs") + "mve_move, mve_load, mve_store, mrs, mve_misc") (const_string "yes") (const_string "no"))) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 46ac8b3715740..f5f4d1543645b 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -591,6 +591,10 @@ VADDLVQ_U VCTP VCTP_M + LETP8 + LETP16 + LETP32 + LETP64 VPNOT VCREATEQ_F VCVTQ_N_TO_F_S @@ -1259,6 +1263,14 @@ UQRSHLL_48 SQRSHRL_64 SQRSHRL_48 - VSHLCQ_M_ REINTERPRET ]) + +; DLSTP unspecs must be volatile to guarantee the scheduler does not reschedule +; these instructions within the loop preheader. +(define_c_enum "unspecv" [ + DLSTP8 + DLSTP16 + DLSTP32 + DLSTP64 +]) diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc index 0176d8b6cd296..11470eaea1254 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc @@ -558,10 +558,12 @@ const char *host_detect_local_cpu (int argc, const char **argv) switch (family) { case 7: - if (model == 0x3b) - processor = PROCESSOR_LUJIAZUI; - else if (model >= 0x5b) + if (model >= 0x6b) + processor = PROCESSOR_SHIJIDADAO; + else if (model == 0x5b) processor = PROCESSOR_YONGFENG; + else if (model == 0x3b) + processor = PROCESSOR_LUJIAZUI; break; default: break; @@ -853,6 +855,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) case PROCESSOR_YONGFENG: cpu = "yongfeng"; break; + case PROCESSOR_SHIJIDADAO: + cpu = "shijidadao"; + break; default: /* Use something reasonable. 
*/ diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index 7b0ad9e9181ee..403475d5b6bb2 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -156,6 +156,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__yongfeng"); def_or_undef (parse_in, "__yongfeng__"); break; + case PROCESSOR_SHIJIDADAO: + def_or_undef (parse_in, "__shijidadao"); + def_or_undef (parse_in, "__shijidadao__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__pentium4"); def_or_undef (parse_in, "__pentium4__"); @@ -386,6 +390,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_YONGFENG: def_or_undef (parse_in, "__tune_yongfeng__"); break; + case PROCESSOR_SHIJIDADAO: + def_or_undef (parse_in, "__tune_shijidadao__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__tune_pentium4__"); break; diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index f2cecc0e2545b..65c5bad9c285e 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -155,7 +155,8 @@ along with GCC; see the file COPYING3. If not see #define m_LUJIAZUI (HOST_WIDE_INT_1U<<PROCESSOR_LUJIAZUI) [...] +struct processor_costs shijidadao_cost = { + { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 8, /* cost for loading QImode using movzbl. */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer registers. */ + 2, /* cost of reg,reg fld/fst. */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode. */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode. */ + 2, /* cost of moving MMX register. */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode. */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + {8, 8, 8, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit. */ + {8, 8, 8, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit. */ + 8, 8, /* SSE->integer and integer->SSE moves. */ + 8, 8, /* mask->integer and integer->mask moves. */ + {8, 8, 8}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {8, 8, 8}, /* cost of storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ + /* End of register allocator costs. */ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction. */ + COSTS_N_INSNS (1), /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (2), /* SI. */ + COSTS_N_INSNS (2), /* DI. */ + COSTS_N_INSNS (3)}, /* other. */ + 0, /* cost of multiply per each bit set. */ + {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (10), /* HI. */ + COSTS_N_INSNS (9), /* SI. */ + COSTS_N_INSNS (50), /* DI. */ + COSTS_N_INSNS (50)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 17, /* MOVE_RATIO. */ + 6, /* CLEAR_RATIO. */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer registers. */ + {8, 8, 8, 12, 15}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {8, 8, 8, 12, 15}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {8, 8, 8, 12, 15}, /* cost of unaligned loads. */ + {8, 8, 8, 12, 15}, /* cost of unaligned stores. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + 8, /* cost of moving SSE register to integer. */ + 18, 6, /* Gather load static, per_elt. */ + 18, 6, /* Gather store static, per_elt. */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block. */ + 12, /* number of parallel prefetches. */ + 3, /* Branch cost. */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (3), /* cost of FMUL instruction. */ + COSTS_N_INSNS (13), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (3), /* cost of MULSS instruction. */ + COSTS_N_INSNS (3), /* cost of MULSD instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ + 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ + shijidadao_memcpy, + shijidadao_memset, + COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ + "16:11:8", /* Loop alignment. */ + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ + 4, /* Small unroll limit. */ + 2, /* Small unroll factor. */ +}; + + /* Generic should produce code tuned for Core-i7 (and newer chips) and btver1 (and newer chips). */
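Since PROCESSOR_SHIJIDADAO is wired into the generic x86 CPU-detection tables, the usual runtime-dispatch builtins work with the new processor name. A minimal sketch (the printed strings are placeholders; the name string is the one the mv32.C test below relies on):

    #include <stdio.h>

    int
    main (void)
    {
      __builtin_cpu_init ();
      /* "shijidadao" is the CPU name added by this patch.  */
      if (__builtin_cpu_is ("shijidadao"))
        puts ("running on ZHAOXIN shijidadao");
      else
        puts ("running on something else");
      return 0;
    }

The same spelling is accepted by -march=shijidadao and by __attribute__ ((target ("arch=shijidadao"))), per the driver-i386.cc, i386-c.cc, and invoke.texi hunks.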
diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index f70846e628e57..d77298b0e34dc 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -79,6 +79,7 @@ ix86_issue_rate (void) case PROCESSOR_CANNONLAKE: case PROCESSOR_ALDERLAKE: case PROCESSOR_YONGFENG: + case PROCESSOR_SHIJIDADAO: case PROCESSOR_GENERIC: return 4; @@ -446,6 +447,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, break; case PROCESSOR_YONGFENG: + case PROCESSOR_SHIJIDADAO: /* Stack engine allows to execute push&pop instructions in parallel. */ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 66512992b7b5b..343c32c291fa8 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -477,7 +477,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID - | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2 elements. */ @@ -488,7 +488,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts", elements.
*/ DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID - | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4 elements. */ @@ -499,7 +499,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM - | m_YONGFENG | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more elements. */ @@ -509,7 +509,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or smaller FMA chain. */ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 - | m_YONGFENG | m_GENERIC) + | m_YONGFENG | m_SHIJIDADAO | m_GENERIC) /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or smaller FMA chain. */ diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc index b6f6e4ff37e78..596b88cc8a3cd 100644 --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc @@ -1420,12 +1420,8 @@ class fcmp : public function_base switch (e.op_info->op) { case OP_TYPE_vf: { - if (CODE == EQ || CODE == NE) - return e.use_compare_insn (CODE, code_for_pred_eqne_scalar ( - e.vector_mode ())); - else - return e.use_compare_insn (CODE, code_for_pred_cmp_scalar ( - e.vector_mode ())); + return e.use_compare_insn (CODE, code_for_pred_cmp_scalar ( + e.vector_mode ())); } case OP_TYPE_vv: { return e.use_compare_insn (CODE, diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index fbcdf96f038ba..f8fae6557d935 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -7545,92 +7545,6 @@ (set_attr "mode" "<MODE>") (set_attr "spec_restriction" "none,thv,thv,none,none")]) -(define_expand "@pred_eqne<mode>_scalar" - [(set (match_operand:<VM> 0 "register_operand") - (if_then_else:<VM> - (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand") - (match_operand 6 "vector_length_operand") - (match_operand 7 "const_int_operand") - (match_operand 8 "const_int_operand") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator:<VM> 3 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand:<VEL> 5 "register_operand")) - (match_operand:V_VLSF 4 "register_operand")]) - (match_operand:<VM> 2 "vector_merge_operand")))] - "TARGET_VECTOR" - {}) - -(define_insn "*pred_eqne<mode>_scalar_merge_tie_mask" - [(set (match_operand:<VM> 0 "register_operand" "=vm") - (if_then_else:<VM> - (unspec:<VM> - [(match_operand:<VM> 1 "register_operand" " 0") - (match_operand 5 "vector_length_operand" " rK") - (match_operand 6 "const_int_operand" " i") - (match_operand 7 "const_int_operand" " i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator:<VM> 2 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand:<VEL> 4 "register_operand" " f")) - (match_operand:V_VLSF 3 "register_operand" " vr")]) - (match_dup 1)))] - "TARGET_VECTOR" - "vmf%B2.vf\t%0,%3,%4,v0.t" - [(set_attr "type" "vfcmp") - (set_attr "mode" "<MODE>") - (set_attr "merge_op_idx" "1") - (set_attr "vl_op_idx" "5") - (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[6])")) - (set (attr "avl_type_idx") (const_int 7))]) - -;; We don't use early-clobber for LMUL <= 1 to get better codegen. -(define_insn "*pred_eqne<mode>_scalar" - [(set (match_operand:<VM> 0 "register_operand" "=vr, vr, &vr, &vr") - (if_then_else:<VM> - (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1,vmWc1,vmWc1,vmWc1") - (match_operand 6 "vector_length_operand" " rK, rK, rK, rK") - (match_operand 7 "const_int_operand" " i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator:<VM> 3 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand:<VEL> 5 "register_operand" " f, f, f, f")) - (match_operand:V_VLSF 4 "register_operand" " vr, vr, vr, vr")]) - (match_operand:<VM> 2 "vector_merge_operand" " vu, 0, vu, 0")))] - "TARGET_VECTOR && riscv_vector::cmp_lmul_le_one (<MODE>mode)" - "vmf%B3.vf\t%0,%4,%5%p1" - [(set_attr "type" "vfcmp") - (set_attr "mode" "<MODE>") - (set_attr "spec_restriction" "thv,thv,rvv,rvv")]) - -;; We use early-clobber for source LMUL > dest LMUL. -(define_insn "*pred_eqne<mode>_scalar_narrow" - [(set (match_operand:<VM> 0 "register_operand" "=vm, vr, vr, &vr, &vr") - (if_then_else:<VM> - (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand" " 0,vmWc1,vmWc1,vmWc1,vmWc1") - (match_operand 6 "vector_length_operand" " rK, rK, rK, rK, rK") - (match_operand 7 "const_int_operand" " i, i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i, i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator:<VM> 3 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand:<VEL> 5 "register_operand" " f, f, f, f, f")) - (match_operand:V_VLSF 4 "register_operand" " vr, 0, 0, vr, vr")]) - (match_operand:<VM> 2 "vector_merge_operand" " vu, vu, 0, vu, 0")))] - "TARGET_VECTOR && riscv_vector::cmp_lmul_gt_one (<MODE>mode)" - "vmf%B3.vf\t%0,%4,%5%p1" - [(set_attr "type" "vfcmp") - (set_attr "mode" "<MODE>") - (set_attr "spec_restriction" "none,thv,thv,none,none")]) - ;; ------------------------------------------------------------------------------- ;; ---- Predicated floating-point merge ;; ------------------------------------------------------------------------------- diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index bc127997ac6cb..e2549de5df050 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -3078,7 +3078,17 @@ print_operand (FILE *file, rtx x, int letter) /* For a volatile memory reference, emit a MEMW before the load or store. */ if (MEM_VOLATILE_P (x) && TARGET_SERIALIZE_VOLATILE) - fprintf (file, "memw\n\t"); + { + rtx_insn *prev_insn + = prev_nonnote_nondebug_insn (current_output_insn); + rtx pat, src; + + if (!
(prev_insn && NONJUMP_INSN_P (prev_insn) + && GET_CODE (pat = PATTERN (prev_insn)) == SET + && GET_CODE (src = SET_SRC (pat)) == UNSPEC + && XINT (src, 1) == UNSPEC_MEMW)) + fprintf (file, "memw\n\t"); + } } else output_operand_lossage ("invalid %%v value"); diff --git a/gcc/configure b/gcc/configure index b536af664d3de..9dc0b65dfaace 100755 --- a/gcc/configure +++ b/gcc/configure @@ -30239,7 +30239,7 @@ else fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_as_mips_explicit_relocs_pcrel" >&5 $as_echo "$gcc_cv_as_mips_explicit_relocs_pcrel" >&6; } -if test "x$gcc_cv_as_mips_explicit_relocs_pcrel" = "xyes"; then +if test $gcc_cv_as_mips_explicit_relocs_pcrel = yes; then $as_echo "#define MIPS_EXPLICIT_RELOCS MIPS_EXPLICIT_RELOCS_PCREL" >>confdefs.h diff --git a/gcc/configure.ac b/gcc/configure.ac index 1501bf89c89da..b2243e9954aac 100644 --- a/gcc/configure.ac +++ b/gcc/configure.ac @@ -5317,7 +5317,7 @@ x: AC_MSG_CHECKING(assembler and linker for explicit JALR relocation) gcc_cv_as_ld_jalr_reloc=no - if test "x$gcc_cv_as_mips_explicit_relocs" = "xyes"; then + if test $gcc_cv_as_mips_explicit_relocs = yes; then if test $in_tree_ld = yes ; then if test "$gcc_cv_gld_major_version" -eq 2 -a "$gcc_cv_gld_minor_version" -ge 20 -o "$gcc_cv_gld_major_version" -gt 2 \ && test $in_tree_ld_is_elf = yes; then diff --git a/gcc/df-core.cc b/gcc/df-core.cc index f0eb4c93957ff..b0e8a88d433bb 100644 --- a/gcc/df-core.cc +++ b/gcc/df-core.cc @@ -1964,6 +1964,21 @@ df_bb_regno_last_def_find (basic_block bb, unsigned int regno) return NULL; } +/* Return the one and only def of REGNO within BB. If there is no def or + there are multiple defs, return NULL. */ + +df_ref +df_bb_regno_only_def_find (basic_block bb, unsigned int regno) +{ + df_ref temp = df_bb_regno_first_def_find (bb, regno); + if (!temp) + return NULL; + else if (temp == df_bb_regno_last_def_find (bb, regno)) + return temp; + else + return NULL; +} + /* Finds the reference corresponding to the definition of REG in INSN. DF is the dataflow object. */ diff --git a/gcc/df.h b/gcc/df.h index 84e5aa8b524df..c4e690b40cf21 100644 --- a/gcc/df.h +++ b/gcc/df.h @@ -987,6 +987,7 @@ extern void df_check_cfg_clean (void); #endif extern df_ref df_bb_regno_first_def_find (basic_block, unsigned int); extern df_ref df_bb_regno_last_def_find (basic_block, unsigned int); +extern df_ref df_bb_regno_only_def_find (basic_block, unsigned int); extern df_ref df_find_def (rtx_insn *, rtx); extern bool df_reg_defined (rtx_insn *, rtx); extern df_ref df_find_use (rtx_insn *, rtx); diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 173cdef013160..b2e41a581dd1c 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -26245,6 +26245,9 @@ ZHAOXIN lujiazui CPU. @item yongfeng ZHAOXIN yongfeng CPU. +@item shijidadao +ZHAOXIN shijidadao CPU. + @item amdfam10h AMD Family 10h CPU. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 5d7a87fde86c4..c790e2f35184c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -34873,6 +34873,12 @@ SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16, ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT instruction set support. +@item shijidadao +ZHAOXIN shijidadao CPU with x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, +SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16, +ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT +instruction set support. + @item geode AMD Geode embedded processor with MMX and 3DNow!@: instruction set support. 
@end table diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index a2275728a46b9..8fcd6d40c956c 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,19 @@ +2024-06-19 Harald Anlauf + + PR fortran/115390 + * trans-decl.cc (gfc_conv_cfi_to_gfc): Move derivation of type sizes + for character via gfc_trans_vla_type_sizes to after character length + has been set. + +2024-06-19 Andre Vehreschild + + PR fortran/90076 + * trans-decl.cc (gfc_generate_function_code): Set vptr for + results to declared class type. + * trans-expr.cc (gfc_reset_vptr): Allow to provide the typespec + instead of the expression. + * trans.h (gfc_reset_vptr): Same. + 2024-06-17 Andre Vehreschild * trans.cc (gfc_deallocate_with_status): Check that object to deref diff --git a/gcc/fortran/trans-decl.cc b/gcc/fortran/trans-decl.cc index 88538713a02b4..f7fb6eec336a8 100644 --- a/gcc/fortran/trans-decl.cc +++ b/gcc/fortran/trans-decl.cc @@ -7063,8 +7063,8 @@ gfc_conv_cfi_to_gfc (stmtblock_t *init, stmtblock_t *finally, if (sym->ts.type == BT_CHARACTER && !INTEGER_CST_P (sym->ts.u.cl->backend_decl)) { - gfc_conv_string_length (sym->ts.u.cl, NULL, init); - gfc_trans_vla_type_sizes (sym, init); + gfc_conv_string_length (sym->ts.u.cl, NULL, &block); + gfc_trans_vla_type_sizes (sym, &block); } /* gfc->data = cfi->base_addr - or for scalars: gfc = cfi->base_addr. diff --git a/gcc/gimple-lower-bitint.cc b/gcc/gimple-lower-bitint.cc index 56e5f826a8d9f..f955f3eabd9b6 100644 --- a/gcc/gimple-lower-bitint.cc +++ b/gcc/gimple-lower-bitint.cc @@ -6630,7 +6630,10 @@ gimple_lower_bitint (void) continue; if (gimple_code (use_stmt) == GIMPLE_PHI || is_gimple_call (use_stmt) - || gimple_code (use_stmt) == GIMPLE_ASM) + || gimple_code (use_stmt) == GIMPLE_ASM + || (is_gimple_assign (use_stmt) + && (gimple_assign_rhs_code (use_stmt) + == COMPLEX_EXPR))) { optimizable_load = false; break; diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc index 529e810e530c2..8953e1de96094 100644 --- a/gcc/loop-doloop.cc +++ b/gcc/loop-doloop.cc @@ -85,10 +85,10 @@ doloop_condition_get (rtx_insn *doloop_pat) forms: 1) (parallel [(set (pc) (if_then_else (condition) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) The branch must be the first entry of the parallel (also required by jump.cc), and the second entry of the parallel must be a set of @@ -96,19 +96,33 @@ doloop_condition_get (rtx_insn *doloop_pat) the loop counter in an if_then_else too. 2) (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). - Some targets (ARM) do the comparison before the branch, as in the + 3) Some targets (Arm) do the comparison before the branch, as in the following form: - 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0))) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) */ - + (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0)) + (set (reg) (plus (reg) (const_int -1)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) + + 4) This form supports a construct that is used to represent a vectorized + do loop with predication, however we do not need to care about the + details of the predication here. + Arm uses this construct to support MVE tail predication. 
+ + (parallel + [(set (pc) + (if_then_else (gtu (plus (reg) (const_int -n)) + (const_int n-1)) + (label_ref) + (pc))) + (set (reg) (plus (reg) (const_int -n))) + (additional clobbers and uses)]) + */ pattern = PATTERN (doloop_pat); if (GET_CODE (pattern) != PARALLEL) @@ -173,15 +187,17 @@ doloop_condition_get (rtx_insn *doloop_pat) if (! REG_P (reg)) return 0; - /* Check if something = (plus (reg) (const_int -1)). + /* Check if something = (plus (reg) (const_int -n)). On IA-64, this decrement is wrapped in an if_then_else. */ inc_src = SET_SRC (inc); if (GET_CODE (inc_src) == IF_THEN_ELSE) inc_src = XEXP (inc_src, 1); if (GET_CODE (inc_src) != PLUS - || XEXP (inc_src, 0) != reg - || XEXP (inc_src, 1) != constm1_rtx) + || !rtx_equal_p (XEXP (inc_src, 0), reg) + || !CONST_INT_P (XEXP (inc_src, 1)) + || INTVAL (XEXP (inc_src, 1)) >= 0) return 0; + int dec_num = -INTVAL (XEXP (inc_src, 1)); /* Check for (set (pc) (if_then_else (condition) (label_ref (label)) @@ -196,60 +212,63 @@ doloop_condition_get (rtx_insn *doloop_pat) /* Extract loop termination condition. */ condition = XEXP (SET_SRC (cmp), 0); - /* We expect a GE or NE comparison with 0 or 1. */ - if ((GET_CODE (condition) != GE - && GET_CODE (condition) != NE) - || (XEXP (condition, 1) != const0_rtx - && XEXP (condition, 1) != const1_rtx)) + /* We expect a GE or NE comparison with 0 or 1, or a GTU comparison with + dec_num - 1. */ + if (!((GET_CODE (condition) == GE + || GET_CODE (condition) == NE) + && (XEXP (condition, 1) == const0_rtx + || XEXP (condition, 1) == const1_rtx)) + && !(GET_CODE (condition) == GTU + && ((INTVAL (XEXP (condition, 1))) == (dec_num - 1)))) return 0; - if ((XEXP (condition, 0) == reg) + if (rtx_equal_p (XEXP (condition, 0), reg) /* For the third case: */ || ((cc_reg != NULL_RTX) && (XEXP (condition, 0) == cc_reg) - && (reg_orig == reg)) + && (rtx_equal_p (reg_orig, reg))) || (GET_CODE (XEXP (condition, 0)) == PLUS - && XEXP (XEXP (condition, 0), 0) == reg)) - { - if (GET_CODE (pattern) != PARALLEL) - /* For the second form we expect: + && rtx_equal_p (XEXP (XEXP (condition, 0), 0), reg))) + { + if (GET_CODE (pattern) != PARALLEL) + /* For the second form we expect: - (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + (set (reg) (plus (reg) (const_int -1)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))).
- is equivalent to the following: + is equivalent to the following: - (parallel [(set (pc) (if_then_else (reg != 1) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (parallel [(set (pc) (if_then_else (reg != 1) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) - For the third form we expect: + For the third form we expect: - (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0)) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) + (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0)) + (set (reg) (plus (reg) (const_int -1)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) - which is equivalent to the following: + which is equivalent to the following: - (parallel [(set (cc) (compare (reg, 1)) - (set (reg) (plus (reg) (const_int -1))) - (set (pc) (if_then_else (NE == cc) - (label_ref (label)) - (pc))))]) + (parallel [(set (cc) (compare (reg, 1)) + (set (reg) (plus (reg) (const_int -1))) + (set (pc) (if_then_else (NE == cc) + (label_ref (label)) + (pc))))]) - So we return the second form instead for the two cases. + So we return the second form instead for the two cases. */ - condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); + condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); return condition; - } + } /* ??? If a machine uses a funny comparison, we could return a canonicalized form here. */ @@ -507,6 +526,11 @@ doloop_modify (class loop *loop, class niter_desc *desc, nonneg = 1; break; + case GTU: + /* The iteration count does not need incrementing for a GTU test. */ + increment_count = false; + break; + /* Abort if an invalid doloop pattern has been generated. */ default: gcc_unreachable (); @@ -529,6 +553,10 @@ doloop_modify (class loop *loop, class niter_desc *desc, if (desc->noloop_assumptions) { + /* The GTU case has only been implemented for Arm, where + noloop_assumptions gets explicitly set to NULL for that case, so + assert here for safety. */ + gcc_assert (GET_CODE (condition) != GTU); rtx ass = copy_rtx (desc->noloop_assumptions); basic_block preheader = loop_preheader_edge (loop)->src; basic_block set_zero = split_edge (loop_preheader_edge (loop)); @@ -642,7 +670,7 @@ doloop_optimize (class loop *loop) { scalar_int_mode mode; rtx doloop_reg; - rtx count; + rtx count = NULL_RTX; widest_int iterations, iterations_max; rtx_code_label *start_label; rtx condition; @@ -685,17 +713,6 @@ doloop_optimize (class loop *loop) return false; } - max_cost - = COSTS_N_INSNS (param_max_iterations_computation_cost); - if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop)) - > max_cost) - { - if (dump_file) - fprintf (dump_file, - "Doloop: number of iterations too costly to compute.\n"); - return false; - } - if (desc->const_iter) iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode), UNSIGNED); @@ -716,12 +733,25 @@ doloop_optimize (class loop *loop) /* Generate looping insn. If the pattern FAILs then give up trying to modify the loop since there is some aspect the back-end does - not like. */ - count = copy_rtx (desc->niter_expr); + not like. If this succeeds, there is a chance that the loop + desc->niter_expr has been altered by the backend, so only extract + that data after the gen_doloop_end. 
*/ start_label = block_label (desc->in_edge->dest); doloop_reg = gen_reg_rtx (mode); rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label); + max_cost + = COSTS_N_INSNS (param_max_iterations_computation_cost); + if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop)) + > max_cost) + { + if (dump_file) + fprintf (dump_file, + "Doloop: number of iterations too costly to compute.\n"); + return false; + } + + count = copy_rtx (desc->niter_expr); word_mode_size = GET_MODE_PRECISION (word_mode); word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1; if (! doloop_seq diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2ae5731931d92..69e269330d9f1 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,264 @@ +2024-06-19 demin.han + + * gcc.target/riscv/rvv/base/float-point-cmp-eqne.c: New test. + +2024-06-19 Jakub Jelinek + + PR tree-optimization/115544 + * gcc.dg/bitint-107.c: New test. + +2024-06-19 mayshao + + * g++.target/i386/mv32.C: Handle new -march + * gcc.target/i386/funcspec-56.inc: Ditto. + +2024-06-19 Harald Anlauf + + PR fortran/115390 + * gfortran.dg/bind_c_char_11.f90: New test. + +2024-06-19 Andre Vieira + Stam Markianos-Wright + + * gcc.target/arm/lob.h: Add new helpers. + * gcc.target/arm/lob1.c: Use new helpers. + * gcc.target/arm/lob6.c: Likewise. + * gcc.target/arm/mve/dlstp-compile-asm-1.c: New test. + * gcc.target/arm/mve/dlstp-compile-asm-2.c: New test. + * gcc.target/arm/mve/dlstp-compile-asm-3.c: New test. + * gcc.target/arm/mve/dlstp-int8x16.c: New test. + * gcc.target/arm/mve/dlstp-int8x16-run.c: New test. + * gcc.target/arm/mve/dlstp-int16x8.c: New test. + * gcc.target/arm/mve/dlstp-int16x8-run.c: New test. + * gcc.target/arm/mve/dlstp-int32x4.c: New test. + * gcc.target/arm/mve/dlstp-int32x4-run.c: New test. + * gcc.target/arm/mve/dlstp-int64x2.c: New test. + * gcc.target/arm/mve/dlstp-int64x2-run.c: New test. + * gcc.target/arm/mve/dlstp-invalid-asm.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-37.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-38.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-39.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-40.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-37.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-38.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-39.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-40.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-33.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-34.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-35.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-36.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-33.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-34.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-35.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-36.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-29.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-32.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-32.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-28.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-28.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-24.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-24.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-20.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-20.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-16.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-16.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-9.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-9.c: New test. + +2024-06-19 Richard Biener + + * gcc.dg/vect/bb-slp-32.c: Add check for correctness. + +2024-06-19 Andre Vehreschild + + PR fortran/90076 + * gfortran.dg/class_76.f90: Add declared vtab occurrence. + * gfortran.dg/class_78.f90: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-32.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-32.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-28.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-28.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-24.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-24.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-20.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-20.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-14.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-16.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-16.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-9.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-9.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-5.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-6.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-7.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-8.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-5.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-6.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-7.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-8.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/sat_arith.h: Add helper macro for + testing. + * gcc.target/riscv/sat_u_sub-45.c: New test. + * gcc.target/riscv/sat_u_sub-46.c: New test. + * gcc.target/riscv/sat_u_sub-47.c: New test. + * gcc.target/riscv/sat_u_sub-48.c: New test. + * gcc.target/riscv/sat_u_sub-run-45.c: New test. + * gcc.target/riscv/sat_u_sub-run-46.c: New test. + * gcc.target/riscv/sat_u_sub-run-47.c: New test. + * gcc.target/riscv/sat_u_sub-run-48.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/sat_u_sub-41.c: New test. + * gcc.target/riscv/sat_u_sub-42.c: New test. + * gcc.target/riscv/sat_u_sub-43.c: New test. + * gcc.target/riscv/sat_u_sub-44.c: New test. + * gcc.target/riscv/sat_u_sub-run-41.c: New test. + * gcc.target/riscv/sat_u_sub-run-42.c: New test. + * gcc.target/riscv/sat_u_sub-run-43.c: New test. + * gcc.target/riscv/sat_u_sub-run-44.c: New test. + 2024-06-18 Jeff Law * gcc.target/riscv/zbs-ext-2.c: Do not run for -Os. 
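The vec_sat_u_add/vec_sat_u_sub batches listed above all instantiate one test-macro per type width and check that GCC recognizes branchless saturating arithmetic. For orientation, a scalar sketch of the unsigned saturating-subtract shape involved (function name and width are illustrative, not taken from the testsuite headers):

    #include <stdint.h>

    /* Unsigned saturating subtract: clamp at 0 instead of wrapping.
       The mask is all-ones when a >= b (no underflow), zero otherwise.
       Illustrative only; the testsuite generates these via macros.  */
    static inline uint64_t
    sat_u_sub_64 (uint64_t a, uint64_t b)
    {
      return (a - b) & (uint64_t) -(uint64_t) (a >= b);
    }

The scalar gcc.target/riscv/sat_u_sub-*.c tests check the branchless lowering of this shape, and the rvv/autovec variants check that the equivalent loop form vectorizes.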
diff --git a/gcc/testsuite/g++.target/aarch64/afmv/Makefile b/gcc/testsuite/g++.target/aarch64/afmv/Makefile new file mode 100644 index 0000000000000..ebaf89cba1b5b --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/Makefile @@ -0,0 +1 @@ +dg-runtest $(srcdir)/gcc.target/aarch64/afmv/*.c diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c new file mode 100644 index 0000000000000..a24e8e8805d60 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c @@ -0,0 +1,11 @@ +#include <stdio.h> + +__attribute__((target_clones("default", "sse4.2", "avx2"))) +void foo() { + printf("Function foo\n"); +} + +int main() { + foo(); + return 0; +} diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp new file mode 100644 index 0000000000000..4f98486cc7537 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp @@ -0,0 +1,15 @@ +# Load the DejaGnu framework +load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_1" +set srcfile "${srcdir}/afmv_test_1.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c new file mode 100644 index 0000000000000..19133d253f104 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c @@ -0,0 +1,11 @@ +#include <stdio.h> + +__attribute__((target_clones("default", "sse4.2", "avx2"))) +void bar() { + printf("Function bar\n"); +} + +int main() { + bar(); + return 0; +} diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp new file mode 100644 index 0000000000000..782baf0b5bb69 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp @@ -0,0 +1,15 @@ +# Load the DejaGnu framework +load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_2" +set srcfile "${srcdir}/afmv_test_2.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c new file mode 100644 index 0000000000000..b20356ba86123 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c @@ -0,0 +1,11 @@ +#include <stdio.h> + +__attribute__((target_clones("default", "sse4.2", "avx2"))) +inline void baz() { + printf("Function baz\n"); +} + +int main() { + baz(); + return 0; +} diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp new file mode 100644 index 0000000000000..9fa6b66ce6182 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp @@ -0,0 +1,15 @@ +# Load the DejaGnu framework
load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_3" +set srcfile "${srcdir}/afmv_test_3.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish
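These afmv files exercise the target_clones attribute, which makes the compiler emit one specialized body per listed target plus a resolver that binds the call at load time through an ifunc. A rough hand-written equivalent of that dispatch, for orientation (all names hypothetical; ELF ifunc support assumed; x86 feature names as in the tests above):

    #include <stdio.h>

    static void foo_default (void) { printf ("default foo\n"); }
    static void foo_avx2 (void) { printf ("avx2 foo\n"); }

    /* Approximately what the compiler-generated resolver does: pick a
       clone once, based on the running CPU's features.  */
    static void (*resolve_foo (void)) (void)
    {
      __builtin_cpu_init ();
      return __builtin_cpu_supports ("avx2") ? foo_avx2 : foo_default;
    }

    void foo (void) __attribute__ ((ifunc ("resolve_foo")));

    int
    main (void)
    {
      foo ();
      return 0;
    }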
diff --git a/gcc/testsuite/g++.target/i386/mv32.C b/gcc/testsuite/g++.target/i386/mv32.C index 6c993218d01c3..b311c35baa3d9 100644 --- a/gcc/testsuite/g++.target/i386/mv32.C +++ b/gcc/testsuite/g++.target/i386/mv32.C @@ -21,6 +21,10 @@ int __attribute__ ((target("arch=yongfeng"))) foo () { return 2; } +int __attribute__ ((target("arch=shijidadao"))) foo () { + return 3; +} + int main () { int val = foo (); @@ -29,6 +33,8 @@ int main () assert (val == 1); else if (__builtin_cpu_is ("yongfeng")) assert (val == 2); + else if (__builtin_cpu_is ("shijidadao")) + assert (val == 3); else assert (val == 0); diff --git a/gcc/testsuite/gcc.dg/bitint-107.c b/gcc/testsuite/gcc.dg/bitint-107.c new file mode 100644 index 0000000000000..a3f5f534088f3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/bitint-107.c @@ -0,0 +1,16 @@ +/* PR tree-optimization/115544 */ +/* { dg-do compile { target bitint } } */ +/* { dg-options "-O -fno-tree-fre -fno-tree-ccp -fno-tree-forwprop" } */ + +#if __BITINT_MAXWIDTH__ >= 129 +typedef _BitInt(129) B; +#else +typedef _BitInt(63) B; +#endif +B a, b; + +int +foo (void) +{ + return __builtin_mul_overflow (a, 1, &b); +} diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h index feaae7cc89959..3941fe7a8b620 100644 --- a/gcc/testsuite/gcc.target/arm/lob.h +++ b/gcc/testsuite/gcc.target/arm/lob.h @@ -1,15 +1,131 @@ #include <string.h> - +#include <stdint.h> /* Common code for lob tests. */ #define NO_LOB asm volatile ("@ clobber lr" : : : "lr" ) -#define N 10000 +#define N 100 + +static void +reset_data (int *a, int *b, int *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data64 (int64_t *a, int64_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (c, 0, x * sizeof (*c)); +} + +static void +check_plus (int *a, int *b, int *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} static void -reset_data (int *a, int *b, int *c) +check_memcpy64 (int64_t *a, int64_t *c, int x) { - memset (a, -1, N * sizeof (*a)); - memset (b, -1, N * sizeof (*b)); - memset (c, -1, N * sizeof (*c)); + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != a[i]) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } }
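The reworked helpers now take the element count as an explicit argument, so a test can zero-initialize the destination, run a predicated loop, and then verify both the computed prefix and the untouched tail in one pass. A typical shape, sketched against the dlstp run-tests added in this patch (loop_int8 is a hypothetical stand-in for the function under test):

    int8_t a[N], b[N], c[N];

    int
    main (void)
    {
      reset_data8 (a, b, c, N);   /* a[i] = b[i] = -1, c[i] = 0.  */
      loop_int8 (a, b, c);        /* hypothetical predicated c[i] = a[i] + b[i] loop.  */
      check_plus8 (a, b, c, N);   /* aborts on any wrong sum.  */
      return 0;
    }

Calling check_plus8 with x < N additionally asserts that c[x..N-1] stayed zero, which is exactly the property an over-wide (badly predicated) vector store would violate.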
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c index ba5c82cd55c58..c8ce653a5c39f 100644 --- a/gcc/testsuite/gcc.target/arm/lob1.c +++ b/gcc/testsuite/gcc.target/arm/lob1.c @@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c) } while (i < N); } -void -check (int *a, int *b, int *c) -{ - for (int i = 0; i < N; i++) - { - NO_LOB; - if (c[i] != a[i] + b[i]) - abort (); - } -} - int main (void) { - reset_data (a, b, c); + reset_data (a, b, c, N); loop1 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop2 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop3 (a, b ,c); - check (a, b ,c); + check_plus (a, b, c, N); return 0; } diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c index 17b6124295e8a..4fe116e2c2be3 100644 --- a/gcc/testsuite/gcc.target/arm/lob6.c +++ b/gcc/testsuite/gcc.target/arm/lob6.c @@ -79,14 +79,14 @@ check (void) int main (void) { - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop1 (a1, b1, c1); ref1 (a2, b2, c2); check (); - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop2 (a1, b1, c1); ref2 (a2, b2, c2); check (); diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c new file mode 100644 index 0000000000000..6e6da3d3d596b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c @@ -0,0 +1,146 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <arm_mve.h> + +#define IMM 5 + +#define TEST_COMPILE_IN_DLSTP_TERNARY(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \ + TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (va, vb, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + b += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (32, 4, w, NAME, PRED) + + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vaddq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vmulq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vsubq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vhaddq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vorrq, _x) + + +#define TEST_COMPILE_IN_DLSTP_TERNARY_M(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \ +
TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (__inactive, va, vb, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + b += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (32, 4, w, NAME, PRED) + + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vaddq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vmulq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vsubq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vhaddq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vorrq, _m) + +#define TEST_COMPILE_IN_DLSTP_TERNARY_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (va, IMM, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (32, 4, w, NAME, PRED) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vaddq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vmulq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vsubq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vhaddq, _x) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vbrsrq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshlq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshrq, _x) + +#define TEST_COMPILE_IN_DLSTP_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (__inactive, va, IMM, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define 
TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (32, 4, w, NAME, PRED) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vaddq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vmulq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vsubq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vhaddq, _m) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vbrsrq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshlq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshrq, _m) + +/* The final number of DLSTPs is the number of + `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macro invocations + above (24) multiplied by 6, since each invocation expands to three element + sizes in both signed and unsigned form. */ +/* { dg-final { scan-assembler-times {\tdlstp} 144 } } */ +/* { dg-final { scan-assembler-times {\tletp} 144 } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c new file mode 100644 index 0000000000000..84f4a2fc4f9bc --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c @@ -0,0 +1,749 @@ + +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps -fno-schedule-insns2 " } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-additional-options "-mtune=cortex-m55" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include <arm_mve.h> +/* Using a >=1 condition. */ +void test1 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n >= 1) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} +/* +** test1: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Test a for loop format of decrementing to zero */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test2 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i > 0; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} +/* +** test2: +**... +** dlstp.32 lr, r1 +**... +** vldrw.32 (q[0-9]+), \[r3\], #-16 +** vstrw.32 \1, \[r0\], #-16 +** letp lr, .* +**... +*/ + +/* Iteration counter counting up to num_iter. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} + +/* +** test3: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrb.8 \3, \[(r[0-9]+|ip)\] +**... +** letp lr, .* +**... +*/ + +/* Iteration counter counting down from num_iter. 
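As in test3, num_iter = (n + 15)/16 is ceil(n/16), so the loop runs once
   per 16-lane block and the vctp8q (n) predicate masks off the final
   partial block.  As an illustrative sketch (not part of the test), each
   predicated iteration computes the scalar equivalent of:

     for (int lane = 0; lane < 16 && lane < n; lane++)
       c[lane] = a[lane] + b[lane];

   which is what lets the iteration bookkeeping collapse into dlstp/letp.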
*/ +void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = num_iter; i > 0; i--) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} +/* +** test4: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrb.8 \3, \[(r[0-9]+|ip)\] +**... +** letp lr, .* +**... +*/ + +/* Using an unpredicated arithmetic instruction within the loop. */ +void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + /* Is affected by implicit predication, because vb also + came from an unpredicated load, but there is no functional + problem, because the result is used in a predicated store. */ + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + vstrbq_p_u8 (d, vd, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} + +/* +** test5: +**... +** dlstp.8 lr, r[0-9]+ +**... +** vldrb.8 q[0-9]+, \[r1\] +** vldrb.8 q[0-9]+, \[r2\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +**... +** vstrb.8 \1, \[r2\] +** vstrb.8 \1, \[r3\] +** letp lr, .* +**... +*/ + +/* Using a different VPR value for one instruction in the loop. */ +void test6 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test6: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using another VPR value in the loop, with a vctp. + The doloop logic will always try to do the transform on the first + vctp it encounters, so this is still expected to work. */ +void test7 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} +/* +** test7: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vctp, + but this time the p1 will also change in every loop (still fine) */ +void test8 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + g++; + } +} + +/* +** test8: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vctp.32 r4 +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +**... 
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vctp_m + that is independent of the loop vctp VPR. */ +void test9 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p2 = vctp32q_m (n, p1); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test9: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vmsr p0, (r[0-9]+) @ movhi +** vpst +** vctpt.32 r3 +** vmrs (r[0-9]+), p0 @ movhi +** vmsr p0, \1 @ movhi +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vmsr p0, \2 @ movhi +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +**... +** vstrw.32 \3, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, + with a vctp_m that is tied to the base vctp VPR. This + is still fine, because the vctp_m will be transformed + into a vctp and be implicitly predicated. */ +void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q_m (n, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} +/* + We don't need that extra vctp in the loop, but we currently do not optimize + it away, however, it is not wrong to use it... +*/ +/* +** test10: +**... +** dlstp.32 lr, r3 +** vctp.32 r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +**... +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vcmp. */ +void test11 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p1 = vcmpeqq_s32 (va, vb); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test11: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vcmp.i32 eq, q[0-9]+, q[0-9]+ +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vcmp_m. */ +void test12 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test12: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vmsr p0, (r[0-9]+|ip) @ movhi +** vpst +** vcmpt.i32 eq, q[0-9]+, q[0-9]+ +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \2, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vcmp_m + that is tied to the base vctp VPR (same as above, this will be turned + into a vcmp and be implicitly predicated). 
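This is safe because both compare inputs come from vldrwq_z loads under
   the loop predicate: lanes at or beyond n compare zeroed data, and under
   tail predication those lanes are never written back by the predicated
   store, so dropping the explicit _m makes no observable difference.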
*/ +void test13 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test13: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vcmp.i32 eq, q[0-9]+, q[0-9]+ +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Similar to test27 in dlstp-invalid-asm.c, but use a predicated load to make + it safe to implicitly predicate the vaddv. */ +void test14 (int32_t *a, int32_t *c, int n) +{ + int32_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + res += vaddvq_s32 (va); + int32x4_t vc = vdupq_n_s32 (res); + vstrwq_p_s32 (c, vc, p); + a += 4; + n -= 4; + } +} + +/* +** test14: +**... +** dlstp.32 lr, r2 +** vldrw.32 (q[0-9]+), \[r0\], #16 +** vaddv.s32 (r[0-9]+|ip), \1 +** add (r[0-9]+|ip), \3, \2 +** vdup.32 (q[0-9]+), \3 +** vstrw.32 \4, \[r1\] +** letp lr, .* +**... +*/ + +uint8_t test15 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq_m (vc, va, vc, p); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +/* +** test15: +**... +** dlstp.8 lr, r2 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +**... +** letp lr, .* +** vmov.u8 r[0-9]+, \2\[5\] +**... +*/ + +uint8_t test16 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq (va, vc); + vc = vaddq_m (vc, va, vc, p); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +/* +** test16: +**... +** dlstp.8 lr, r2 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +** vadd.i8 \2, q[0-9]+, q[0-9]+ +** letp lr, .* +** vmov.u8 r[0-9]+, \2\[5\] +**... +*/ + + + +/* Using an across-vector unpredicated instruction in a valid way. + This tests that "vc" has correctly masked the risky "vb". */ +uint16_t test18 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + uint16x8_t vc = vaddq_m_u16 (va, va, vb, p); + res += vaddvq_u16 (vc); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* +** test18: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** vadd.i16 \1, q[0-9]+, q[0-9]+ +** vaddv.u16 (r[0-9]+|ip), \1 +** add (r[0-9]+|ip), \3, \2 +** uxth \3, \3 +** letp lr, .* +**... +*/ + +/* Using an across-vector unpredicated instruction with implicit scalar adding from outside the loop. 
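This stays valid because vc comes from a vaddq_m whose inactive lanes
   fall back to va, and va is produced by a zeroing vldrhq_z load.  As a
   sketch of the lane-wise behaviour:

     lane < n  : vc[lane] = a[lane] + b[lane]
     lane >= n : vc[lane] = 0, contributing nothing to the vaddva

   so the running scalar res is unaffected by the masked-off tail.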
*/ +uint16_t test19 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + uint16x8_t vc = vaddq_m_u16 (va, va, vb, p); + res = vaddvaq_u16 (res, vc); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* +** test19: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** vadd.i16 \1, q[0-9]+, q[0-9]+ +** vaddva.u16 (r[0-9]+|ip), \1 +** uxth \2, \2 +** letp lr, .* +**... +*/ + + +/* Using an across-vector predicated instruction in a valid way. */ +uint16_t test20 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_u16 (a); + res = vaddvaq_p_u16 (res, va, p); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* The uxth could be moved outside the loop. */ +/* +** test20: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** vaddva.u16 (r[0-9]+|ip), \1 +** uxth \2, \2 +** letp lr, .* +**... +*/ + +/* Using an across-vector predicated instruction in a valid way. */ +uint16_t test21 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_u16 (a); + res++; + res = vaddvaq_p_u16 (res, va, p); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* Also think it'd be safe to move uxth outside of the loop here. */ +/* +** test21: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** adds (r[0-9]+|ip), \2, #1 +** uxth \2, \2 +** vaddva.u16 \2, \1 +** uxth \2, \2 +** letp lr, .* +**... +*/ + +int test22 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + res = vmaxvq (res, va); + n-=16; + a+=16; + } + return res; +} + +/* +** test22: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 (q[0-9]+), \[r[0-9]+\] +**... +** vmaxv.u8 (r[0-9]+|ip), \1 +** uxtb \2, \2 +** letp lr, .* +**... +*/ + +int test23 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vmaxavq (res, va); + n-=16; + a+=16; + } + return res; +} + +/* +** test23: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 (q[0-9]+), \[r3\] +**... +** vmaxav.s8 (r[0-9]+|ip), \1 +** uxtb \2, \2 +** letp lr, .* +**... +*/ + +/* Like test1, but update n before vctp, meaning we should only iterate for n-4 + elements. */ +void test24 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n >= 1) + { + n-=4; + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + } +} +/* +** test24: +**... +** subs r3, r3, #4 +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... 
+*/ + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c new file mode 100644 index 0000000000000..c784f54013177 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c @@ -0,0 +1,46 @@ + +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <arm_mve.h> + +/* We don't support pattern recognition of signed N values when computing num_iter. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} + +/* Using a predicated vcmp to generate a new predicate value in the + loop and then using it in a predicated store insn. */ +void test17 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_s32 (va, vb); + mve_pred16_t p1 = vcmpeqq_m_s32 (va, vc, p); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} +/* This is an example of a loop that we could tail predicate but currently don't. */ +/* { dg-final { scan-assembler "letp" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c new file mode 100644 index 0000000000000..6966a3966046f --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c @@ -0,0 +1,44 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ +#include "dlstp-int16x8.c" + +int main () +{ + int i; + int16_t temp1[N]; + int16_t temp2[N]; + int16_t temp3[N]; + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus16 (temp1, temp2, temp3, 0); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus16 (temp1, temp2, temp3, 1); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 7); + check_plus16 (temp1, temp2, temp3, 7); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus16 (temp1, temp2, temp3, 8); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus16 (temp1, temp2, temp3, 9); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus16 (temp1, temp2, temp3, 16); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus16 (temp1, temp2, temp3, 17); + + reset_data16 (temp1, temp2, temp3, N); +} + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c new file mode 100644 index 0000000000000..33632c5f14dc6 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <stdlib.h> +#include <arm_mve.h> +#include "../lob.h" + +void __attribute__ ((noinline)) test (int16_t *a, 
int16_t *b, int16_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + int16x8_t va = vldrhq_z_s16 (a, p); + int16x8_t vb = vldrhq_z_s16 (b, p); + int16x8_t vc = vaddq_x_s16 (va, vb, p); + vstrhq_p_s16 (c, vc, p); + c+=8; + a+=8; + b+=8; + n-=8; + } +} + +/* { dg-final { scan-assembler-times {\tdlstp.16} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c new file mode 100644 index 0000000000000..6833dddde92b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c @@ -0,0 +1,45 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include "dlstp-int32x4.c" + +int main () +{ + int i; + int32_t temp1[N]; + int32_t temp2[N]; + int32_t temp3[N]; + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus32 (temp1, temp2, temp3, 0); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus32 (temp1, temp2, temp3, 1); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 3); + check_plus32 (temp1, temp2, temp3, 3); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 4); + check_plus32 (temp1, temp2, temp3, 4); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 5); + check_plus32 (temp1, temp2, temp3, 5); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus32 (temp1, temp2, temp3, 8); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus32 (temp1, temp2, temp3, 9); + + reset_data32 (temp1, temp2, temp3, N); +} + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c new file mode 100644 index 0000000000000..5d09f784b7716 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <stdlib.h> +#include <arm_mve.h> +#include "../lob.h" + +void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +/* { dg-final { scan-assembler-times {\tdlstp.32} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c new file mode 100644 index 0000000000000..cc0b9ce7ee9a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c @@ -0,0 +1,48 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 
-save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include "dlstp-int64x2.c" + +int main () +{ + int i; + int64_t temp1[N]; + int64_t temp3[N]; + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 0); + check_memcpy64 (temp1, temp3, 0); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 1); + check_memcpy64 (temp1, temp3, 1); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 2); + check_memcpy64 (temp1, temp3, 2); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 3); + check_memcpy64 (temp1, temp3, 3); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 4); + check_memcpy64 (temp1, temp3, 4); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 5); + check_memcpy64 (temp1, temp3, 5); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 6); + check_memcpy64 (temp1, temp3, 6); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 7); + check_memcpy64 (temp1, temp3, 7); + + reset_data64 (temp1, temp3, N); +} + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c new file mode 100644 index 0000000000000..21e882424ec3b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c @@ -0,0 +1,28 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <stdlib.h> +#include <arm_mve.h> +#include "../lob.h" + +void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp64q (n); + int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (0, 8), p); + vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (0, 8), va, p); + c+=2; + a+=2; + n-=2; + } +} + +/* { dg-final { scan-assembler-times {\tdlstp.64} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c new file mode 100644 index 0000000000000..d46571f329cf3 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c @@ -0,0 +1,44 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include "dlstp-int8x16.c" + +int main () +{ + int i; + int8_t temp1[N]; + int8_t temp2[N]; + int8_t temp3[N]; + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus8 (temp1, temp2, temp3, 0); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus8 (temp1, temp2, temp3, 1); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 15); + check_plus8 (temp1, temp2, temp3, 15); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus8 (temp1, temp2, temp3, 16); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus8 (temp1, temp2, temp3, 17); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 32); + check_plus8 (temp1, temp2, temp3, 32); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 33); + check_plus8 (temp1, temp2, temp3, 33); + + reset_data8 (temp1, temp2, temp3, N); +} diff --git 
a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c new file mode 100644 index 0000000000000..d5f22b5026259 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c @@ -0,0 +1,32 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <stdlib.h> +#include <arm_mve.h> +#include "../lob.h" + +void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + int8x16_t vb = vldrbq_z_s8 (b, p); + int8x16_t vc = vaddq_x_s8 (va, vb, p); + vstrbq_p_s8 (c, vc, p); + c+=16; + a+=16; + b+=16; + n-=16; + } +} + + +/* { dg-final { scan-assembler-times {\tdlstp.8} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c new file mode 100644 index 0000000000000..26df2d30523ce --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c @@ -0,0 +1,521 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <arm_mve.h> + +/* Terminating on a non-zero number of elements. */ +void test0 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + while (n > 1) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Terminating on n >= 0. */ +void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + while (n >= 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Similar, terminating on a non-zero number of elements, but in a for loop + format. */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test2 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i >= 2; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} + +/* Iteration counter counting up to num_iter, with a non-zero starting num. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 1; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Iteration counter counting up to num_iter, with a larger increment */ +void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i+=2) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Using an unpredicated store instruction within the loop. 
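An unpredicated vstrbq_u8 writes all 16 lanes on every iteration,
   including lanes past the last valid element in the final iteration.
   Implicitly predicating it would suppress those trailing writes and so
   change the function's observable behaviour, which is why no dlstp/letp
   may be generated here.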
*/ +void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_u8 (d, vd); + n -= 16; + } +} + +/* Using an unpredicated store outside the loop. */ +void test6 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + vx = vaddq_u8 (vx, vc); + a += 16; + b += 16; + n -= 16; + } + vstrbq_u8 (c, vx); +} + +/* Using a VPR that gets modified within the loop. */ +void test9 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + p++; + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using a VPR that gets re-generated within the loop. */ +void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + mve_pred16_t p = vctp32q (n); + while (n > 0) + { + int32x4_t va = vldrwq_z_s32 (a, p); + p = vctp32q (n); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using vctp32q_m instead of vctp32q. */ +void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q_m (n, p0); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an unpredicated op with a scalar output, where the result is valid + outside the bb. This is invalid, because one of the inputs to the + unpredicated op is also unpredicated. */ +uint8_t test12 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + uint8x16_t vc = vaddq_u8 (va, vb); + sum += vaddvq_u8 (vc); + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +/* Using an unpredicated vcmp to generate a new predicate value in the + loop and then using that VPR to predicate a store insn. */ +void test13 (int32_t *a, int32_t *b, int32x4_t vc, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_s32 (a); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_s32 (va, vb); + mve_pred16_t p1 = vcmpeqq_s32 (va, vc); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an across-vector unpredicated instruction. "vb" is the risk. */ +uint16_t test14 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + vb = vaddq_u16 (va, vb); + res = vaddvq_u16 (vb); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* Using an across-vector unpredicated instruction. "vc" is the risk. 
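Concretely: vc = vaddq_u16 (va, vb) is unpredicated, so for lanes at or
   beyond n it holds 0 + vb[lane], i.e. live data from the unpredicated vb
   load, and the unpredicated vaddvaq then folds those lanes into res.
   Tail-predicating the loop would drop them and change the accumulated
   value.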
*/ +uint16_t test15 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + uint16x8_t vc = vaddq_u16 (va, vb); + res = vaddvaq_u16 (res, vc); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +uint16_t test16 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16_t res =0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t vb = vldrhq_u16 (b); + uint16x8_t va = vldrhq_z_u16 (a, p); + res = vaddvaq_u16 (res, vb); + res = vaddvaq_p_u16 (res, va, p); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +int test17 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vmaxvq (res, va); + n-=16; + a+=16; + } + return res; +} + + + +int test18 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vminvq (res, va); + n-=16; + a+=16; + } + return res; +} + +int test19 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vminavq (res, va); + n-=16; + a+=16; + } + return res; +} + +int test20 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + res = vminvq (res, va); + n-=16; + a+=16; + } + return res; +} + +uint8x16_t test21 (uint8_t *a, uint32_t *b, int n, uint8x16_t res) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + res = vshlcq_u8 (va, b, 1); + n-=16; + a+=16; + } + return res; +} + +int8x16_t test22 (int8_t *a, int32_t *b, int n, int8x16_t res) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vshlcq_s8 (va, b, 1); + n-=16; + a+=16; + } + return res; +} + +/* Using an unsigned number of elements to count down from, with a >0*/ +void test23 (int32_t *a, int32_t *b, int32_t *c, unsigned int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +/* Using an unsigned number of elements to count up to, with a = 1) + { + n-=4; + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +void test27 (int32_t *a, int32_t *c, int n) +{ + int32_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_s32 (a); + res += vaddvq_s32 (va); + int32x4_t vc = vdupq_n_s32 (res); + vstrwq_p_s32 (c, vc, p); + a += 4; + n -= 4; + } +} + +/* Using an unpredicated vcmp to generate a new predicate value in the + loop and then using it in a predicated store insn. 
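Because the vcmp is unpredicated, p1 is also computed for the lanes at
   or beyond n, so it can be true there and the vstrwq_p may legitimately
   write those lanes.  An implicitly predicated loop would force p1 false
   beyond n and suppress those writes, so the transform must be rejected.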
*/ +void test28 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_s32 (b); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + mve_pred16_t p1 = vcmpeqq_s32 (va, vc); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an unpredicated op with a scalar output, where the result is valid + outside the bb. The unpredicated lanes are not guaranteed zero, so would + affect the vaddv in the non-tail predicated case. */ +uint8_t test29 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + sum += vaddvq_u8 (vc); + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +/* Same as above, but with another scalar op between the unpredicated op and + the scalar op outside the loop. */ +uint8_t test30 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx, int g) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + sum += vaddvq_u8 (vc); + sum += g; + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +uint8_t test31 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq (vb, vc); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +uint8_t test32 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq_m (vc, va, vc, p); + vc = vaddq (vb, vc); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +/* { dg-final { scan-assembler-not "\tdlstp" } } */ +/* { dg-final { scan-assembler-not "\tletp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc index 2a50f5bf67c86..c4dc89367ef58 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc @@ -208,6 +208,7 @@ extern void test_arch_arrowlake_s (void) __attribute__((__target__("arch=arrowla extern void test_arch_pantherlake (void) __attribute__((__target__("arch=pantherlake"))); extern void test_arch_lujiazui (void) __attribute__((__target__("arch=lujiazui"))); extern void test_arch_yongfeng (void) __attribute__((__target__("arch=yongfeng"))); +extern void test_arch_shijidadao (void) __attribute__((__target__("arch=shijidadao"))); extern void test_arch_k8 (void) __attribute__((__target__("arch=k8"))); extern void test_arch_k8_sse3 (void) __attribute__((__target__("arch=k8-sse3"))); extern void test_arch_opteron (void) __attribute__((__target__("arch=opteron"))); @@ -233,6 +234,7 @@ extern void test_tune_corei7_avx (void) __attribute__((__target__("tune=corei7- extern void test_tune_core_avx2 (void) __attribute__((__target__("tune=core-avx2"))); extern void test_tune_lujiazui (void) __attribute__((__target__("tune=lujiazui"))); extern void test_tune_yongfeng (void) __attribute__((__target__("tune=yongfeng"))); +extern void test_tune_shijidadao (void) 
__attribute__((__target__("tune=shijidadao"))); extern void test_tune_k8 (void) __attribute__((__target__("tune=k8"))); extern void test_tune_k8_sse3 (void) __attribute__((__target__("tune=k8-sse3"))); extern void test_tune_opteron (void) __attribute__((__target__("tune=opteron"))); diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c new file mode 100644 index 0000000000000..572bcb8f291be --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64 -O3" } */ + +#include "riscv_vector.h" + +#define CMP_FLOAT_VF_1(ID, S, OP, IMM) \ + vbool##S##_t test_float_1_##ID##_##S (vfloat##S##m1_t op1, size_t vl) \ + { \ + return __riscv_vmf##OP##_vf_f##S##m1_b##S (op1, IMM, vl); \ + } + +CMP_FLOAT_VF_1 (0, 32, eq, 0.0) +CMP_FLOAT_VF_1 (1, 32, eq, 1.0) +CMP_FLOAT_VF_1 (2, 32, eq, __builtin_nanf ("123")) +CMP_FLOAT_VF_1 (3, 32, ne, 0.0) +CMP_FLOAT_VF_1 (4, 32, ne, 1.0) +CMP_FLOAT_VF_1 (5, 32, ne, __builtin_nanf ("123")) + +CMP_FLOAT_VF_1 (0, 64, eq, 0.0) +CMP_FLOAT_VF_1 (1, 64, eq, 1.0) +CMP_FLOAT_VF_1 (2, 64, eq, __builtin_nan ("123")) +CMP_FLOAT_VF_1 (3, 64, ne, 0.0) +CMP_FLOAT_VF_1 (4, 64, ne, 1.0) +CMP_FLOAT_VF_1 (5, 64, ne, __builtin_nan ("123")) + +#define CMP_FLOAT_VF_2(ID, S, OP, IMM) \ + vfloat##S##m1_t test_float_2_##ID##_##S (vfloat##S##m1_t op1, \ + vfloat##S##m1_t op2, size_t vl) \ + { \ + vfloat##S##m1_t op3 = __riscv_vfmv_s_f_f##S##m1 (IMM, vl); \ + vbool##S##_t mask1 = __riscv_vmf##OP##_vf_f##S##m1_b##S (op1, IMM, vl); \ + vbool##S##_t mask2 = __riscv_vmf##OP##_vv_f##S##m1_b##S (op1, op3, vl); \ + vbool##S##_t mask3 = __riscv_vmor (mask1, mask2, vl); \ + return __riscv_vmerge_vvm_f##S##m1_tu (op1, op1, op2, mask3, vl); \ + } + +CMP_FLOAT_VF_2 (0, 32, eq, 0.0) +CMP_FLOAT_VF_2 (1, 32, eq, 1.0) +CMP_FLOAT_VF_2 (2, 32, eq, __builtin_nanf ("123")) +CMP_FLOAT_VF_2 (3, 32, ne, 0.0) +CMP_FLOAT_VF_2 (4, 32, ne, 1.0) +CMP_FLOAT_VF_2 (5, 32, ne, __builtin_nanf ("123")) + +CMP_FLOAT_VF_2 (0, 64, eq, 0.0) +CMP_FLOAT_VF_2 (1, 64, eq, 1.0) +CMP_FLOAT_VF_2 (2, 64, eq, __builtin_nan ("123")) +CMP_FLOAT_VF_2 (3, 64, ne, 0.0) +CMP_FLOAT_VF_2 (4, 64, ne, 1.0) +CMP_FLOAT_VF_2 (5, 64, ne, __builtin_nan ("123")) + +/* { dg-final { scan-assembler-times {vmfeq\.vf} 12 } } */ +/* { dg-final { scan-assembler-times {vmfne\.vf} 12 } } */ +/* { dg-final { scan-assembler-times {vmfeq\.vv} 6 } } */ +/* { dg-final { scan-assembler-times {vmfne\.vv} 6 } } */ diff --git a/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 b/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 new file mode 100644 index 0000000000000..5ed8e82853bf0 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 @@ -0,0 +1,45 @@ +! { dg-do compile } +! { dg-additional-options "-Wuninitialized" } +! +! 
PR fortran/115390 - fixes for CHARACTER(len=*) dummies with bind(C) + +module test + implicit none +contains + subroutine bar(s,t) bind(c) + character(*), intent(in) :: s,t + optional :: t + call foo(s,t) + end + subroutine bar1(s,t) bind(c) + character(*), intent(in) :: s(:),t(:) + optional :: t + call foo1(s,t) + end + subroutine bar4(s,t) bind(c) + character(len=*,kind=4), intent(in) :: s,t + optional :: t + call foo4(s,t) + end + subroutine bar5(s,t) bind(c) + character(len=*,kind=4), intent(in) :: s(:),t(:) + optional :: t + call foo5(s,t) + end + subroutine foo(s,t) + character(*), intent(in) :: s,t + optional :: t + end + subroutine foo1(s,t) + character(*), intent(in) :: s(:),t(:) + optional :: t + end + subroutine foo4(s,t) + character(len=*,kind=4), intent(in) :: s,t + optional :: t + end + subroutine foo5(s,t) + character(len=*,kind=4), intent(in) :: s(:),t(:) + optional :: t + end +end diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index eeb75c09e91aa..347dac97e497e 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -7815,11 +7815,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "use not simple.\n"); return false; } - if (i == STMT_VINFO_REDUC_IDX (stmt_info)) - continue; - /* For an IFN_COND_OP we might hit the reduction definition operand - twice (once as definition, once as else). */ + /* Skip reduction operands, and for an IFN_COND_OP we might hit the + reduction operand twice (once as definition, once as else). */ if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)]) continue; @@ -7870,10 +7868,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (lane_reducing) STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in; - enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); - STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; + enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info); + STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type; /* If we have a condition reduction, see if we can simplify it further. */ - if (v_reduc_type == COND_REDUCTION) + if (reduction_type == COND_REDUCTION) { if (slp_node && SLP_TREE_LANES (slp_node) != 1) return false; @@ -8040,7 +8038,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; - vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); + reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); if (reduction_type == TREE_CODE_REDUCTION) { /* Check whether it's ok to change the order of the computation. @@ -8582,9 +8580,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, /* Transform. */ tree new_temp = NULL_TREE; - auto_vec<tree> vec_oprnds0; - auto_vec<tree> vec_oprnds1; - auto_vec<tree> vec_oprnds2; + auto_vec<tree> vec_oprnds[3]; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); @@ -8622,7 +8618,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, } bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); - gcc_assert (single_defuse_cycle || lane_reducing_op_p (code)); + bool lane_reducing = lane_reducing_op_p (code); + gcc_assert (single_defuse_cycle || lane_reducing); /* Create the destination vector */ tree scalar_dest = gimple_get_lhs (stmt_info->stmt); @@ -8632,14 +8629,15 @@ definition. */ if (!cond_fn_p) { + gcc_assert (reduc_index >= 0 && reduc_index <= 2); vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, single_defuse_cycle && reduc_index == 0 - ? 
NULL_TREE : op.ops[0], &vec_oprnds0, + ? NULL_TREE : op.ops[0], &vec_oprnds[0], single_defuse_cycle && reduc_index == 1 - ? NULL_TREE : op.ops[1], &vec_oprnds1, + ? NULL_TREE : op.ops[1], &vec_oprnds[1], op.num_ops == 3 && !(single_defuse_cycle && reduc_index == 2) - ? op.ops[2] : NULL_TREE, &vec_oprnds2); + ? op.ops[2] : NULL_TREE, &vec_oprnds[2]); } else { @@ -8647,12 +8645,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo, vectype. */ gcc_assert (single_defuse_cycle && (reduc_index == 1 || reduc_index == 2)); - vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, - op.ops[0], truth_type_for (vectype_in), &vec_oprnds0, + vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, op.ops[0], + truth_type_for (vectype_in), &vec_oprnds[0], reduc_index == 1 ? NULL_TREE : op.ops[1], - NULL_TREE, &vec_oprnds1, + NULL_TREE, &vec_oprnds[1], reduc_index == 2 ? NULL_TREE : op.ops[2], - NULL_TREE, &vec_oprnds2); + NULL_TREE, &vec_oprnds[2]); } /* For single def-use cycles get one copy of the vectorized reduction @@ -8660,24 +8658,26 @@ vect_transform_reduction (loop_vec_info loop_vinfo, if (single_defuse_cycle) { vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1, - reduc_index == 0 ? op.ops[0] : NULL_TREE, &vec_oprnds0, - reduc_index == 1 ? op.ops[1] : NULL_TREE, &vec_oprnds1, + reduc_index == 0 ? op.ops[0] : NULL_TREE, + &vec_oprnds[0], + reduc_index == 1 ? op.ops[1] : NULL_TREE, + &vec_oprnds[1], reduc_index == 2 ? op.ops[2] : NULL_TREE, - &vec_oprnds2); + &vec_oprnds[2]); } bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); + unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length (); - unsigned num = (reduc_index == 0 - ? vec_oprnds1.length () : vec_oprnds0.length ()); for (unsigned i = 0; i < num; ++i) { gimple *new_stmt; - tree vop[3] = { vec_oprnds0[i], vec_oprnds1[i], NULL_TREE }; + tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE }; if (masked_loop_p && !mask_by_cond_expr) { - /* No conditional ifns have been defined for dot-product yet. */ - gcc_assert (code != DOT_PROD_EXPR); + /* No conditional ifns have been defined for lane-reducing op + yet. */ + gcc_assert (!lane_reducing); /* Make sure that the reduction accumulator is vop[0]. 
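(When reduc_index == 1, the accumulator arrived as the second operand
     and the code below swaps vop[0] and vop[1]; this relies on the
     operation being commutative in that case.)  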
*/ if (reduc_index == 1) @@ -8698,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, else { if (op.num_ops >= 3) - vop[2] = vec_oprnds2[i]; + vop[2] = vec_oprnds[2][i]; if (masked_loop_p && mask_by_cond_expr) { @@ -8729,14 +8729,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, } if (single_defuse_cycle && i < num - 1) - { - if (reduc_index == 0) - vec_oprnds0.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 1) - vec_oprnds1.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 2) - vec_oprnds2.safe_push (gimple_get_lhs (new_stmt)); - } + vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt)); else if (slp_node) slp_node->push_vec_def (new_stmt); else diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 7d18b5bfee5d2..a5665946a4ebc 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3919,7 +3919,6 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) scalar_stmts.create (loop_vinfo->reductions.length ()); for (auto next_info : loop_vinfo->reductions) { - gassign *g; next_info = vect_stmt_to_vectorize (next_info); if ((STMT_VINFO_RELEVANT_P (next_info) || STMT_VINFO_LIVE_P (next_info)) @@ -3931,8 +3930,7 @@ { /* Do not discover SLP reductions combining lane-reducing ops, that will fail later. */ - if (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info))) - || !lane_reducing_op_p (gimple_assign_rhs_code (g))) + if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info))) scalar_stmts.quick_push (next_info); else { diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 6bb0f5c3a56f6..60224f4e28472 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2169,12 +2169,24 @@ vect_apply_runtime_profitability_check_p (loop_vec_info loop_vinfo) && th >= vect_vf_for_cost (loop_vinfo)); } +/* Return true if CODE is a lane-reducing opcode. */ + inline bool lane_reducing_op_p (code_helper code) { return code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR; } +/* Return true if STMT is a lane-reducing statement. */ + +inline bool +lane_reducing_stmt_p (gimple *stmt) +{ + if (auto *assign = dyn_cast <gassign *> (stmt)) + return lane_reducing_op_p (gimple_assign_rhs_code (assign)); + return false; +} + /* Source location + hotness information. */ extern dump_user_location_t vect_location; diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 907f6cfb0e83c..94a5ce9a1329c 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,27 @@ +2024-06-19 Jonathan Wakely + + * include/std/future: Adjust whitespace to use tabs for + indentation. + +2024-06-19 Jonathan Wakely + + * include/std/future (_State_baseV2::_Setter<_Res, void>): Add + noexcept to call operator. + (_State_baseV2::_Setter<_Res, __exception_ptr_tag>): Likewise. + +2024-06-19 Jonathan Wakely + + * include/bits/stl_pair.h [__cpp_lib_concepts] (pair()): Add + conditional noexcept. + +2024-06-19 Jonathan Wakely + + * include/bits/stl_tempbuf.h (__get_temporary_buffer): Cast + argument to size_t to handle negative values and suppress + -Wsign-compare warning. + (_Temporary_buffer): Move diagnostic pragmas to new location of + call to std::get_temporary_buffer. 
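A minimal sketch of the sign-compare pattern the stl_tempbuf.h entry above
describes, assuming the usual shape of a temporary-buffer request (this is an
illustration, not the actual library code):

    ptrdiff_t len = -1;                    /* a signed, possibly negative request */
    size_t max = (size_t) -1 / sizeof (int);
    if ((size_t) len > max)                /* negative len wraps to a huge unsigned
                                              value and is rejected; the explicit
                                              cast also silences -Wsign-compare */
      len = 0;

The cast thus serves both purposes named in the entry: it rejects negative
sizes and removes the mixed signed/unsigned comparison.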
+ 2024-06-18 Jonathan Wakely * include/bits/cpp_type_traits.h: Fix outdated comment about the diff --git a/libstdc++-v3/include/bits/stl_pair.h b/libstdc++-v3/include/bits/stl_pair.h index 0c1e5719a1a3b..0d60eaba1941e 100644 --- a/libstdc++-v3/include/bits/stl_pair.h +++ b/libstdc++-v3/include/bits/stl_pair.h @@ -344,6 +344,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION explicit(__not_<__and_<__is_implicitly_default_constructible<_T1>, __is_implicitly_default_constructible<_T2>>>()) pair() + noexcept(is_nothrow_default_constructible_v<_T1> + && is_nothrow_default_constructible_v<_T2>) requires is_default_constructible_v<_T1> && is_default_constructible_v<_T2> : first(), second() diff --git a/libstdc++-v3/include/std/future b/libstdc++-v3/include/std/future index 9e75ae98b13d2..6ce7d89ca3ffe 100644 --- a/libstdc++-v3/include/std/future +++ b/libstdc++-v3/include/std/future @@ -292,7 +292,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { using __allocator_type = __alloc_rebind<_Alloc, _Result_alloc>; - explicit + explicit _Result_alloc(const _Alloc& __a) : _Result<_Res>(), _Alloc(__a) { } @@ -362,9 +362,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - future_status - wait_for(const chrono::duration<_Rep, _Period>& __rel) - { + future_status + wait_for(const chrono::duration<_Rep, _Period>& __rel) + { // First, check if the future has been made ready. Use acquire MO // to synchronize with the thread that made it ready. if (_M_status._M_load(memory_order_acquire) == _Status::__ready) @@ -396,9 +396,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - future_status - wait_until(const chrono::time_point<_Clock, _Duration>& __abs) - { + future_status + wait_until(const chrono::time_point<_Clock, _Duration>& __abs) + { #if __cplusplus > 201703L static_assert(chrono::is_clock_v<_Clock>); #endif @@ -430,8 +430,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_set_result(function<_Ptr_type()> __res, bool __ignore_failure = false) { bool __did_set = false; - // all calls to this function are serialized, - // side-effects of invoking __res only happen once + // all calls to this function are serialized, + // side-effects of invoking __res only happen once call_once(_M_once, &_State_baseV2::_M_do_set, this, std::__addressof(__res), std::__addressof(__did_set)); if (__did_set) @@ -439,7 +439,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_status._M_store_notify_all(_Status::__ready, memory_order_release); else if (!__ignore_failure) - __throw_future_error(int(future_errc::promise_already_satisfied)); + __throw_future_error(int(future_errc::promise_already_satisfied)); } // Provide a result to the shared state but delay making it ready @@ -451,12 +451,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { bool __did_set = false; unique_ptr<_Make_ready> __mr{new _Make_ready}; - // all calls to this function are serialized, - // side-effects of invoking __res only happen once + // all calls to this function are serialized, + // side-effects of invoking __res only happen once call_once(_M_once, &_State_baseV2::_M_do_set, this, std::__addressof(__res), std::__addressof(__did_set)); if (!__did_set) - __throw_future_error(int(future_errc::promise_already_satisfied)); + __throw_future_error(int(future_errc::promise_already_satisfied)); __mr->_M_shared_state = std::move(__self); __mr->_M_set(); __mr.release(); @@ -490,41 +490,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - struct _Setter; + struct _Setter; // set lvalues template - struct _Setter<_Res, _Arg&> - { - // check this is only used by promise::set_value(const R&) - // or promise::set_value(R&) - 
static_assert(is_same<_Res, _Arg&>::value // promise - || is_same::value, // promise - "Invalid specialisation"); + struct _Setter<_Res, _Arg&> + { + // check this is only used by promise::set_value(const R&) + // or promise::set_value(R&) + static_assert(is_same<_Res, _Arg&>::value // promise + || is_same::value, // promise + "Invalid specialisation"); // Used by std::promise to copy construct the result. - typename promise<_Res>::_Ptr_type operator()() const - { - _M_promise->_M_storage->_M_set(*_M_arg); - return std::move(_M_promise->_M_storage); - } - promise<_Res>* _M_promise; - _Arg* _M_arg; - }; + typename promise<_Res>::_Ptr_type operator()() const + { + _M_promise->_M_storage->_M_set(*_M_arg); + return std::move(_M_promise->_M_storage); + } + promise<_Res>* _M_promise; + _Arg* _M_arg; + }; // set rvalues template - struct _Setter<_Res, _Res&&> - { + struct _Setter<_Res, _Res&&> + { // Used by std::promise to move construct the result. - typename promise<_Res>::_Ptr_type operator()() const - { - _M_promise->_M_storage->_M_set(std::move(*_M_arg)); - return std::move(_M_promise->_M_storage); - } - promise<_Res>* _M_promise; - _Res* _M_arg; - }; + typename promise<_Res>::_Ptr_type operator()() const + { + _M_promise->_M_storage->_M_set(std::move(*_M_arg)); + return std::move(_M_promise->_M_storage); + } + promise<_Res>* _M_promise; + _Res* _M_arg; + }; // set void template @@ -532,7 +532,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { static_assert(is_void<_Res>::value, "Only used for promise"); - typename promise<_Res>::_Ptr_type operator()() const + typename promise<_Res>::_Ptr_type operator()() const noexcept { return std::move(_M_promise->_M_storage); } promise<_Res>* _M_promise; @@ -542,35 +542,35 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // set exceptions template - struct _Setter<_Res, __exception_ptr_tag> - { + struct _Setter<_Res, __exception_ptr_tag> + { // Used by std::promise to store an exception as the result. 
@@ -542,35 +542,35 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
      // set exceptions
      template<typename _Res>
-	struct _Setter<_Res, __exception_ptr_tag>
-	{
+	struct _Setter<_Res, __exception_ptr_tag>
+	{
	  // Used by std::promise to store an exception as the result.
-	  typename promise<_Res>::_Ptr_type operator()() const
-	  {
-	    _M_promise->_M_storage->_M_error = *_M_ex;
-	    return std::move(_M_promise->_M_storage);
-	  }
+	  typename promise<_Res>::_Ptr_type operator()() const noexcept
+	  {
+	    _M_promise->_M_storage->_M_error = *_M_ex;
+	    return std::move(_M_promise->_M_storage);
+	  }
 
-	  promise<_Res>* _M_promise;
-	  exception_ptr* _M_ex;
-	};
+	  promise<_Res>* _M_promise;
+	  exception_ptr* _M_ex;
+	};
 
      template<typename _Res, typename _Arg>
	__attribute__((__always_inline__))
-	static _Setter<_Res, _Arg&&>
-	__setter(promise<_Res>* __prom, _Arg&& __arg) noexcept
-	{
-	  return _Setter<_Res, _Arg&&>{ __prom, std::__addressof(__arg) };
-	}
+	static _Setter<_Res, _Arg&&>
+	__setter(promise<_Res>* __prom, _Arg&& __arg) noexcept
+	{
+	  return _Setter<_Res, _Arg&&>{ __prom, std::__addressof(__arg) };
+	}
 
      template<typename _Res>
	__attribute__((__always_inline__))
-	static _Setter<_Res, __exception_ptr_tag>
-	__setter(exception_ptr& __ex, promise<_Res>* __prom) noexcept
-	{
-	  __glibcxx_assert(__ex != nullptr); // LWG 2276
-	  return _Setter<_Res, __exception_ptr_tag>{ __prom, &__ex };
-	}
+	static _Setter<_Res, __exception_ptr_tag>
+	__setter(exception_ptr& __ex, promise<_Res>* __prom) noexcept
+	{
+	  __glibcxx_assert(__ex != nullptr); // LWG 2276
+	  return _Setter<_Res, __exception_ptr_tag>{ __prom, &__ex };
+	}
 
      template<typename _Res>
	__attribute__((__always_inline__))
@@ -581,24 +581,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
	}
 
      template<typename _Tp>
-	static void
-	_S_check(const shared_ptr<_Tp>& __p)
-	{
-	  if (!static_cast<bool>(__p))
-	    __throw_future_error((int)future_errc::no_state);
-	}
+	static void
+	_S_check(const shared_ptr<_Tp>& __p)
+	{
+	  if (!static_cast<bool>(__p))
+	    __throw_future_error((int)future_errc::no_state);
+	}
 
    private:
      // The function invoked with std::call_once(_M_once, ...).
      void
      _M_do_set(function<_Ptr_type()>* __f, bool* __did_set)
      {
-	_Ptr_type __res = (*__f)();
-	// Notify the caller that we did try to set; if we do not throw an
-	// exception, the caller will be aware that it did set (e.g., see
-	// _M_set_result).
+	_Ptr_type __res = (*__f)();
+	// Notify the caller that we did try to set; if we do not throw an
+	// exception, the caller will be aware that it did set (e.g., see
+	// _M_set_result).
	*__did_set = true;
-	_M_result.swap(__res); // nothrow
+	_M_result.swap(__res); // nothrow
      }
 
      // Wait for completion of async function.
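(Aside, not part of the patch: the _Setter specializations and __setter helpers reindented above are small function objects that bind a promise and an argument so the actual store can run later, under call_once. A simplified analogue with invented names:)

#include <memory>
#include <utility>

// Hypothetical stand-in for _Setter<_Res, _Res&&>: defers the move into
// storage until operator() is invoked.
template<typename T>
struct toy_setter
{
  T* storage;
  T* arg;

  T* operator()() const
  {
    *storage = std::move(*arg);
    return storage;
  }
};

template<typename T>
toy_setter<T> make_toy_setter(T* storage, T& arg) noexcept
{ return toy_setter<T>{ storage, std::addressof(arg) }; }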
@@ -719,49 +719,49 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      void
      wait() const
      {
-	_State_base::_S_check(_M_state);
-	_M_state->wait();
+	_State_base::_S_check(_M_state);
+	_M_state->wait();
      }
 
      template<typename _Rep, typename _Period>
-	future_status
-	wait_for(const chrono::duration<_Rep, _Period>& __rel) const
-	{
-	  _State_base::_S_check(_M_state);
-	  return _M_state->wait_for(__rel);
-	}
+	future_status
+	wait_for(const chrono::duration<_Rep, _Period>& __rel) const
+	{
+	  _State_base::_S_check(_M_state);
+	  return _M_state->wait_for(__rel);
+	}
 
      template<typename _Clock, typename _Duration>
-	future_status
-	wait_until(const chrono::time_point<_Clock, _Duration>& __abs) const
-	{
-	  _State_base::_S_check(_M_state);
-	  return _M_state->wait_until(__abs);
-	}
+	future_status
+	wait_until(const chrono::time_point<_Clock, _Duration>& __abs) const
+	{
+	  _State_base::_S_check(_M_state);
+	  return _M_state->wait_until(__abs);
+	}
 
    protected:
      /// Wait for the state to be ready and rethrow any stored exception
      __result_type
      _M_get_result() const
      {
-	_State_base::_S_check(_M_state);
-	_Result_base& __res = _M_state->wait();
-	if (!(__res._M_error == nullptr))
-	  rethrow_exception(__res._M_error);
-	return static_cast<__result_type>(__res);
+	_State_base::_S_check(_M_state);
+	_Result_base& __res = _M_state->wait();
+	if (!(__res._M_error == nullptr))
+	  rethrow_exception(__res._M_error);
+	return static_cast<__result_type>(__res);
      }
 
      void _M_swap(__basic_future& __that) noexcept
      {
-	_M_state.swap(__that._M_state);
+	_M_state.swap(__that._M_state);
      }
 
      // Construction of a future by promise::get_future()
      explicit
      __basic_future(const __state_type& __state) : _M_state(__state)
      {
-	_State_base::_S_check(_M_state);
-	_M_state->_M_set_retrieved_flag();
+	_State_base::_S_check(_M_state);
+	_M_state->_M_set_retrieved_flag();
      }
 
      // Copy construction from a shared_future
@@ -780,9 +780,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
      struct _Reset
      {
-	explicit _Reset(__basic_future& __fut) noexcept : _M_fut(__fut) { }
-	~_Reset() { _M_fut._M_state.reset(); }
-	__basic_future& _M_fut;
+	explicit _Reset(__basic_future& __fut) noexcept : _M_fut(__fut) { }
+	~_Reset() { _M_fut._M_state.reset(); }
+	__basic_future& _M_fut;
      };
    };
 
@@ -801,8 +801,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      friend class promise<_Res>;
      template<typename> friend class packaged_task;
      template<typename _Fn, typename... _Args>
-	friend future<__async_result_of<_Fn, _Args...>>
-	async(launch, _Fn&&, _Args&&...);
+	friend future<__async_result_of<_Fn, _Args...>>
+	async(launch, _Fn&&, _Args&&...);
 
      typedef __basic_future<_Res> _Base_type;
      typedef typename _Base_type::__state_type __state_type;
@@ -822,16 +822,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      future& operator=(future&& __fut) noexcept
      {
-	future(std::move(__fut))._M_swap(*this);
-	return *this;
+	future(std::move(__fut))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
      _Res
      get()
      {
-	typename _Base_type::_Reset __reset(*this);
-	return std::move(this->_M_get_result()._M_value());
+	typename _Base_type::_Reset __reset(*this);
+	return std::move(this->_M_get_result()._M_value());
      }
 
      shared_future<_Res> share() noexcept;
@@ -844,8 +844,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      friend class promise<_Res&>;
      template<typename> friend class packaged_task;
      template<typename _Fn, typename... _Args>
-	friend future<__async_result_of<_Fn, _Args...>>
-	async(launch, _Fn&&, _Args&&...);
+	friend future<__async_result_of<_Fn, _Args...>>
+	async(launch, _Fn&&, _Args&&...);
 
      typedef __basic_future<_Res&> _Base_type;
      typedef typename _Base_type::__state_type __state_type;
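(Aside, not part of the patch: future's move assignment above uses the construct-and-swap idiom. A temporary absorbs the source's shared state, swaps with *this, and its destructor releases the old state, which also makes self-move harmless. The same shape on a toy handle, names invented:)

#include <memory>
#include <utility>

struct toy_handle
{
  std::shared_ptr<int> state;

  toy_handle() = default;
  toy_handle(toy_handle&&) = default;

  void swap_with(toy_handle& that) noexcept { state.swap(that.state); }

  toy_handle& operator=(toy_handle&& h) noexcept
  {
    toy_handle(std::move(h)).swap_with(*this); // temporary takes old state
    return *this;
  }
};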
@@ -865,16 +865,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      future& operator=(future&& __fut) noexcept
      {
-	future(std::move(__fut))._M_swap(*this);
-	return *this;
+	future(std::move(__fut))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
      _Res&
      get()
      {
-	typename _Base_type::_Reset __reset(*this);
-	return this->_M_get_result()._M_get();
+	typename _Base_type::_Reset __reset(*this);
+	return this->_M_get_result()._M_get();
      }
 
      shared_future<_Res&> share() noexcept;
@@ -887,8 +887,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      friend class promise<void>;
      template<typename> friend class packaged_task;
      template<typename _Fn, typename... _Args>
-	friend future<__async_result_of<_Fn, _Args...>>
-	async(launch, _Fn&&, _Args&&...);
+	friend future<__async_result_of<_Fn, _Args...>>
+	async(launch, _Fn&&, _Args&&...);
 
      typedef __basic_future<void> _Base_type;
      typedef typename _Base_type::__state_type __state_type;
@@ -908,16 +908,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      future& operator=(future&& __fut) noexcept
      {
-	future(std::move(__fut))._M_swap(*this);
-	return *this;
+	future(std::move(__fut))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
      void
      get()
      {
-	typename _Base_type::_Reset __reset(*this);
-	this->_M_get_result();
+	typename _Base_type::_Reset __reset(*this);
+	this->_M_get_result();
      }
 
      shared_future<void> share() noexcept;
@@ -955,14 +955,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      shared_future& operator=(const shared_future& __sf) noexcept
      {
-	shared_future(__sf)._M_swap(*this);
-	return *this;
+	shared_future(__sf)._M_swap(*this);
+	return *this;
      }
 
      shared_future& operator=(shared_future&& __sf) noexcept
      {
-	shared_future(std::move(__sf))._M_swap(*this);
-	return *this;
+	shared_future(std::move(__sf))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
@@ -994,14 +994,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      shared_future& operator=(const shared_future& __sf)
      {
-	shared_future(__sf)._M_swap(*this);
-	return *this;
+	shared_future(__sf)._M_swap(*this);
+	return *this;
      }
 
      shared_future& operator=(shared_future&& __sf) noexcept
      {
-	shared_future(std::move(__sf))._M_swap(*this);
-	return *this;
+	shared_future(std::move(__sf))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
@@ -1033,14 +1033,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      shared_future& operator=(const shared_future& __sf)
      {
-	shared_future(__sf)._M_swap(*this);
-	return *this;
+	shared_future(__sf)._M_swap(*this);
+	return *this;
      }
 
      shared_future& operator=(shared_future&& __sf) noexcept
      {
-	shared_future(std::move(__sf))._M_swap(*this);
-	return *this;
+	shared_future(std::move(__sf))._M_swap(*this);
+	return *this;
      }
 
      // Retrieving the value
@@ -1115,31 +1115,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      { }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator& __a)
-	: _M_future(std::allocate_shared<_State>(__a)),
+	promise(allocator_arg_t, const _Allocator& __a)
+	: _M_future(std::allocate_shared<_State>(__a)),
	  _M_storage(__future_base::_S_allocate_result<_Res>(__a))
-	{ }
+	{ }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
-	: _M_future(std::move(__rhs._M_future)),
+	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
+	: _M_future(std::move(__rhs._M_future)),
	  _M_storage(std::move(__rhs._M_storage))
-	{ }
+	{ }
 
      promise(const promise&) = delete;
 
      ~promise()
      {
-	if (static_cast<bool>(_M_future) && !_M_future.unique())
-	  _M_future->_M_break_promise(std::move(_M_storage));
+	if (static_cast<bool>(_M_future) && !_M_future.unique())
+	  _M_future->_M_break_promise(std::move(_M_storage));
      }
 
      // Assignment
      promise& operator=(promise&& __rhs) noexcept
      {
-	promise(std::move(__rhs)).swap(*this);
-	return *this;
+	promise(std::move(__rhs)).swap(*this);
+	return *this;
      }
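(Aside, not part of the patch: the promise(allocator_arg_t, const _Allocator&) constructors above obtain the shared state via std::allocate_shared. The observable use is the standard allocator-aware construction, unchanged by this whitespace cleanup:)

#include <future>
#include <memory>

int main()
{
  std::allocator<int> alloc;
  // Shared state and result storage are allocated through `alloc`.
  std::promise<int> p(std::allocator_arg, alloc);
  std::future<int> f = p.get_future();
  p.set_value(42);
  return f.get() == 42 ? 0 : 1;
}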
 
      promise& operator=(const promise&) = delete;
@@ -1147,8 +1147,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      void
      swap(promise& __rhs) noexcept
      {
-	_M_future.swap(__rhs._M_future);
-	_M_storage.swap(__rhs._M_storage);
+	_M_future.swap(__rhs._M_future);
+	_M_storage.swap(__rhs._M_storage);
      }
 
      // Retrieving the result
@@ -1234,31 +1234,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      { }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator& __a)
-	: _M_future(std::allocate_shared<_State>(__a)),
+	promise(allocator_arg_t, const _Allocator& __a)
+	: _M_future(std::allocate_shared<_State>(__a)),
	  _M_storage(__future_base::_S_allocate_result<_Res&>(__a))
-	{ }
+	{ }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
-	: _M_future(std::move(__rhs._M_future)),
+	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
+	: _M_future(std::move(__rhs._M_future)),
	  _M_storage(std::move(__rhs._M_storage))
-	{ }
+	{ }
 
      promise(const promise&) = delete;
 
      ~promise()
      {
-	if (static_cast<bool>(_M_future) && !_M_future.unique())
-	  _M_future->_M_break_promise(std::move(_M_storage));
+	if (static_cast<bool>(_M_future) && !_M_future.unique())
+	  _M_future->_M_break_promise(std::move(_M_storage));
      }
 
      // Assignment
      promise& operator=(promise&& __rhs) noexcept
      {
-	promise(std::move(__rhs)).swap(*this);
-	return *this;
+	promise(std::move(__rhs)).swap(*this);
+	return *this;
      }
 
      promise& operator=(const promise&) = delete;
@@ -1266,8 +1266,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      void
      swap(promise& __rhs) noexcept
      {
-	_M_future.swap(__rhs._M_future);
-	_M_storage.swap(__rhs._M_storage);
+	_M_future.swap(__rhs._M_future);
+	_M_storage.swap(__rhs._M_storage);
      }
 
      // Retrieving the result
@@ -1332,33 +1332,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      { }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator& __a)
-	: _M_future(std::allocate_shared<_State>(__a)),
+	promise(allocator_arg_t, const _Allocator& __a)
+	: _M_future(std::allocate_shared<_State>(__a)),
	  _M_storage(__future_base::_S_allocate_result<void>(__a))
-	{ }
+	{ }
 
      // _GLIBCXX_RESOLVE_LIB_DEFECTS
      // 2095.  missing constructors needed for uses-allocator construction
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
-	: _M_future(std::move(__rhs._M_future)),
+	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
+	: _M_future(std::move(__rhs._M_future)),
	  _M_storage(std::move(__rhs._M_storage))
-	{ }
+	{ }
 
      promise(const promise&) = delete;
 
      ~promise()
      {
-	if (static_cast<bool>(_M_future) && !_M_future.unique())
-	  _M_future->_M_break_promise(std::move(_M_storage));
+	if (static_cast<bool>(_M_future) && !_M_future.unique())
+	  _M_future->_M_break_promise(std::move(_M_storage));
      }
 
      // Assignment
      promise& operator=(promise&& __rhs) noexcept
      {
-	promise(std::move(__rhs)).swap(*this);
-	return *this;
+	promise(std::move(__rhs)).swap(*this);
+	return *this;
      }
 
      promise& operator=(const promise&) = delete;
@@ -1366,8 +1366,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      void
      swap(promise& __rhs) noexcept
      {
-	_M_future.swap(__rhs._M_future);
-	_M_storage.swap(__rhs._M_storage);
+	_M_future.swap(__rhs._M_future);
+	_M_storage.swap(__rhs._M_storage);
      }
 
      // Retrieving the result
@@ -1596,7 +1596,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      ~packaged_task()
      {
-	if (static_cast<bool>(_M_state) && !_M_state.unique())
+	if (static_cast<bool>(_M_state) && !_M_state.unique())
	  _M_state->_M_break_promise(std::move(_M_state->_M_result));
      }
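(Aside, not part of the patch: the destructors above call _M_break_promise when a promise or packaged_task is abandoned while a future still holds the shared state; on the consumer side this surfaces as future_errc::broken_promise. Standard behavior, shown for reference:)

#include <future>
#include <iostream>

int main()
{
  std::future<int> f;
  {
    std::promise<int> p;
    f = p.get_future();
  } // p destroyed without a result: the shared state is broken

  try {
    f.get();
  } catch (const std::future_error& e) {
    // e.code() == std::future_errc::broken_promise
    std::cout << e.code().message() << '\n';
  }
}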
@@ -1709,7 +1709,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
	// result in _M_result, swaps that with the base _M_result and makes
	// the state ready.  Tell _M_set_result to ignore failure so all later
	// calls do nothing.
-	_M_set_result(_S_task_setter(_M_result, _M_fn), true);
+	_M_set_result(_S_task_setter(_M_result, _M_fn), true);
      }
 
      // Caller should check whether the state is ready first, because this