From 5d0c1b4e0d33c2d1077264636d0a65ce206d0d96 Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Wed, 19 Jun 2024 17:05:45 +0100 Subject: [PATCH 01/22] doloop: Add support for predicated vectorized loops This patch adds support in the target agnostic doloop pass for the detection of predicated vectorized hardware loops. Arm is currently the only target that will make use of this feature. gcc/ChangeLog: * df-core.cc (df_bb_regno_only_def_find): New helper function. * df.h (df_bb_regno_only_def_find): Declare new function. * loop-doloop.cc (doloop_condition_get): Add support for detecting predicated vectorized hardware loops. (doloop_modify): Add support for GTU condition checks. (doloop_optimize): Update costing computation to support alterations to desc->niter_expr by the backend. Co-authored-by: Stam Markianos-Wright --- gcc/df-core.cc | 15 +++++ gcc/df.h | 1 + gcc/loop-doloop.cc | 164 +++++++++++++++++++++++++++------------------ 3 files changed, 113 insertions(+), 67 deletions(-) diff --git a/gcc/df-core.cc b/gcc/df-core.cc index f0eb4c93957ff..b0e8a88d433bb 100644 --- a/gcc/df-core.cc +++ b/gcc/df-core.cc @@ -1964,6 +1964,21 @@ df_bb_regno_last_def_find (basic_block bb, unsigned int regno) return NULL; } +/* Return the one and only def of REGNO within BB. If there is no def or + there are multiple defs, return NULL. */ + +df_ref +df_bb_regno_only_def_find (basic_block bb, unsigned int regno) +{ + df_ref temp = df_bb_regno_first_def_find (bb, regno); + if (!temp) + return NULL; + else if (temp == df_bb_regno_last_def_find (bb, regno)) + return temp; + else + return NULL; +} + /* Finds the reference corresponding to the definition of REG in INSN. DF is the dataflow object. 
*/ diff --git a/gcc/df.h b/gcc/df.h index 84e5aa8b524df..c4e690b40cf21 100644 --- a/gcc/df.h +++ b/gcc/df.h @@ -987,6 +987,7 @@ extern void df_check_cfg_clean (void); #endif extern df_ref df_bb_regno_first_def_find (basic_block, unsigned int); extern df_ref df_bb_regno_last_def_find (basic_block, unsigned int); +extern df_ref df_bb_regno_only_def_find (basic_block, unsigned int); extern df_ref df_find_def (rtx_insn *, rtx); extern bool df_reg_defined (rtx_insn *, rtx); extern df_ref df_find_use (rtx_insn *, rtx); diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc index 529e810e530c2..8953e1de96094 100644 --- a/gcc/loop-doloop.cc +++ b/gcc/loop-doloop.cc @@ -85,10 +85,10 @@ doloop_condition_get (rtx_insn *doloop_pat) forms: 1) (parallel [(set (pc) (if_then_else (condition) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) The branch must be the first entry of the parallel (also required by jump.cc), and the second entry of the parallel must be a set of @@ -96,19 +96,33 @@ doloop_condition_get (rtx_insn *doloop_pat) the loop counter in an if_then_else too. 2) (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). 
- Some targets (ARM) do the comparison before the branch, as in the + 3) Some targets (Arm) do the comparison before the branch, as in the following form: - 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0))) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) */ - + (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0)) + (set (reg) (plus (reg) (const_int -1)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) + + 4) This form supports a construct that is used to represent a vectorized + do loop with predication, however we do not need to care about the + details of the predication here. + Arm uses this construct to support MVE tail predication. + + (parallel + [(set (pc) + (if_then_else (gtu (plus (reg) (const_int -n)) + (const_int n-1)) + (label_ref) + (pc))) + (set (reg) (plus (reg) (const_int -n))) + (additional clobbers and uses)]) + */ pattern = PATTERN (doloop_pat); if (GET_CODE (pattern) != PARALLEL) @@ -173,15 +187,17 @@ doloop_condition_get (rtx_insn *doloop_pat) if (! REG_P (reg)) return 0; - /* Check if something = (plus (reg) (const_int -1)). + /* Check if something = (plus (reg) (const_int -n)). On IA-64, this decrement is wrapped in an if_then_else. */ inc_src = SET_SRC (inc); if (GET_CODE (inc_src) == IF_THEN_ELSE) inc_src = XEXP (inc_src, 1); if (GET_CODE (inc_src) != PLUS - || XEXP (inc_src, 0) != reg - || XEXP (inc_src, 1) != constm1_rtx) + || !rtx_equal_p (XEXP (inc_src, 0), reg) + || !CONST_INT_P (XEXP (inc_src, 1)) + || INTVAL (XEXP (inc_src, 1)) >= 0) return 0; + int dec_num = -INTVAL (XEXP (inc_src, 1)); /* Check for (set (pc) (if_then_else (condition) (label_ref (label)) @@ -196,60 +212,63 @@ doloop_condition_get (rtx_insn *doloop_pat) /* Extract loop termination condition. */ condition = XEXP (SET_SRC (cmp), 0); - /* We expect a GE or NE comparison with 0 or 1. 
*/ - if ((GET_CODE (condition) != GE - && GET_CODE (condition) != NE) - || (XEXP (condition, 1) != const0_rtx - && XEXP (condition, 1) != const1_rtx)) + /* We expect a GE or NE comparison with 0 or 1, or a GTU comparison with + dec_num - 1. */ + if (!((GET_CODE (condition) == GE + || GET_CODE (condition) == NE) + && (XEXP (condition, 1) == const0_rtx + || XEXP (condition, 1) == const1_rtx)) + && !(GET_CODE (condition) == GTU + && ((INTVAL (XEXP (condition, 1))) == (dec_num - 1)))) return 0; - if ((XEXP (condition, 0) == reg) + if (rtx_equal_p (XEXP (condition, 0), reg) /* For the third case: */ || ((cc_reg != NULL_RTX) && (XEXP (condition, 0) == cc_reg) - && (reg_orig == reg)) + && (rtx_equal_p (reg_orig, reg))) || (GET_CODE (XEXP (condition, 0)) == PLUS - && XEXP (XEXP (condition, 0), 0) == reg)) - { - if (GET_CODE (pattern) != PARALLEL) - /* For the second form we expect: + && rtx_equal_p (XEXP (XEXP (condition, 0), 0), reg))) + { + if (GET_CODE (pattern) != PARALLEL) + /* For the second form we expect: - (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + (set (reg) (plus (reg) (const_int -1)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). 
- is equivalent to the following: + is equivalent to the following: - (parallel [(set (pc) (if_then_else (reg != 1) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (parallel [(set (pc) (if_then_else (reg != 1) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) - For the third form we expect: + For the third form we expect: - (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0)) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) + (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0)) + (set (reg) (plus (reg) (const_int -1)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) - which is equivalent to the following: + which is equivalent to the following: - (parallel [(set (cc) (compare (reg, 1)) - (set (reg) (plus (reg) (const_int -1))) - (set (pc) (if_then_else (NE == cc) - (label_ref (label)) - (pc))))]) + (parallel [(set (cc) (compare (reg, 1)) + (set (reg) (plus (reg) (const_int -1))) + (set (pc) (if_then_else (NE == cc) + (label_ref (label)) + (pc))))]) - So we return the second form instead for the two cases. + So we return the second form instead for the two cases. */ - condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); + condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); return condition; - } + } /* ??? If a machine uses a funny comparison, we could return a canonicalized form here. */ @@ -507,6 +526,11 @@ doloop_modify (class loop *loop, class niter_desc *desc, nonneg = 1; break; + case GTU: + /* The iteration count does not need incrementing for a GTU test. */ + increment_count = false; + break; + /* Abort if an invalid doloop pattern has been generated. 
*/ default: gcc_unreachable (); @@ -529,6 +553,10 @@ doloop_modify (class loop *loop, class niter_desc *desc, if (desc->noloop_assumptions) { + /* The GTU case has only been implemented for Arm, where + noloop_assumptions gets explicitly set to NULL for that case, so + assert here for safety. */ + gcc_assert (GET_CODE (condition) != GTU); rtx ass = copy_rtx (desc->noloop_assumptions); basic_block preheader = loop_preheader_edge (loop)->src; basic_block set_zero = split_edge (loop_preheader_edge (loop)); @@ -642,7 +670,7 @@ doloop_optimize (class loop *loop) { scalar_int_mode mode; rtx doloop_reg; - rtx count; + rtx count = NULL_RTX; widest_int iterations, iterations_max; rtx_code_label *start_label; rtx condition; @@ -685,17 +713,6 @@ doloop_optimize (class loop *loop) return false; } - max_cost - = COSTS_N_INSNS (param_max_iterations_computation_cost); - if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop)) - > max_cost) - { - if (dump_file) - fprintf (dump_file, - "Doloop: number of iterations too costly to compute.\n"); - return false; - } - if (desc->const_iter) iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode), UNSIGNED); @@ -716,12 +733,25 @@ doloop_optimize (class loop *loop) /* Generate looping insn. If the pattern FAILs then give up trying to modify the loop since there is some aspect the back-end does - not like. */ - count = copy_rtx (desc->niter_expr); + not like. If this succeeds, there is a chance that the loop + desc->niter_expr has been altered by the backend, so only extract + that data after the gen_doloop_end. 
*/ start_label = block_label (desc->in_edge->dest); doloop_reg = gen_reg_rtx (mode); rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label); + max_cost + = COSTS_N_INSNS (param_max_iterations_computation_cost); + if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop)) + > max_cost) + { + if (dump_file) + fprintf (dump_file, + "Doloop: number of iterations too costly to compute.\n"); + return false; + } + + count = copy_rtx (desc->niter_expr); word_mode_size = GET_MODE_PRECISION (word_mode); word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1; if (! doloop_seq From 3dfc28dbbd21b1d708aa40064380ef4c42c994d7 Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Wed, 19 Jun 2024 17:05:55 +0100 Subject: [PATCH 02/22] arm: Add support for MVE Tail-Predicated Low Overhead Loops This patch adds support for MVE Tail-Predicated Low Overhead Loops by using the doloop functionality added to support predicated vectorized hardware loops. gcc/ChangeLog: * config/arm/arm-protos.h (arm_target_bb_ok_for_lob): Change declaration to pass basic_block. (arm_attempt_dlstp_transform): New declaration. * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): Define targethook. (TARGET_PREDICT_DOLOOP_P): Likewise. (arm_target_bb_ok_for_lob): Adapt condition. (arm_mve_get_vctp_lanes): New function. (arm_dl_usage_type): New internal enum. (arm_get_required_vpr_reg): New function. (arm_get_required_vpr_reg_param): New function. (arm_get_required_vpr_reg_ret_val): New function. (arm_mve_get_loop_vctp): New function. (arm_mve_insn_predicated_by): New function. (arm_mve_across_lane_insn_p): New function. (arm_mve_load_store_insn_p): New function. (arm_mve_impl_pred_on_outputs_p): New function. (arm_mve_impl_pred_on_inputs_p): New function. (arm_last_vect_def_insn): New function. (arm_mve_impl_predicated_p): New function. (arm_mve_check_reg_origin_is_num_elems): New function. (arm_mve_dlstp_check_inc_counter): New function. 
(arm_mve_dlstp_check_dec_counter): New function. (arm_mve_loop_valid_for_dlstp): New function. (arm_predict_doloop_p): New function. (arm_loop_unroll_adjust): New function. (arm_emit_mve_unpredicated_insn_to_seq): New function. (arm_attempt_dlstp_transform): New function. * config/arm/arm.opt (mdlstp): New option. * config/arm/iterators.md (dlstp_elemsize, letp_num_lanes, letp_num_lanes_neg, letp_num_lanes_minus_1): New attributes. (DLSTP, LETP): New iterators. * config/arm/mve.md (predicated_doloop_end_internal, dlstp_insn): New insn patterns. * config/arm/thumb2.md (doloop_end): Adapt to support tail-predicated loops. (doloop_begin): Likewise. * config/arm/types.md (mve_misc): New mve type to represent predicated_loop_end insn sequences. * config/arm/unspecs.md: (DLSTP8, DLSTP16, DLSTP32, DSLTP64, LETP8, LETP16, LETP32, LETP64): New unspecs for DLSTP and LETP. gcc/testsuite/ChangeLog: * gcc.target/arm/lob.h: Add new helpers. * gcc.target/arm/lob1.c: Use new helpers. * gcc.target/arm/lob6.c: Likewise. * gcc.target/arm/mve/dlstp-compile-asm-1.c: New test. * gcc.target/arm/mve/dlstp-compile-asm-2.c: New test. * gcc.target/arm/mve/dlstp-compile-asm-3.c: New test. * gcc.target/arm/mve/dlstp-int8x16.c: New test. * gcc.target/arm/mve/dlstp-int8x16-run.c: New test. * gcc.target/arm/mve/dlstp-int16x8.c: New test. * gcc.target/arm/mve/dlstp-int16x8-run.c: New test. * gcc.target/arm/mve/dlstp-int32x4.c: New test. * gcc.target/arm/mve/dlstp-int32x4-run.c: New test. * gcc.target/arm/mve/dlstp-int64x2.c: New test. * gcc.target/arm/mve/dlstp-int64x2-run.c: New test. * gcc.target/arm/mve/dlstp-invalid-asm.c: New test. 
Co-authored-by: Stam Markianos-Wright --- gcc/config/arm/arm-protos.h | 4 +- gcc/config/arm/arm.cc | 1249 ++++++++++++++++- gcc/config/arm/arm.opt | 3 + gcc/config/arm/iterators.md | 15 + gcc/config/arm/mve.md | 50 + gcc/config/arm/thumb2.md | 138 +- gcc/config/arm/types.md | 6 +- gcc/config/arm/unspecs.md | 14 +- gcc/testsuite/gcc.target/arm/lob.h | 128 +- gcc/testsuite/gcc.target/arm/lob1.c | 23 +- gcc/testsuite/gcc.target/arm/lob6.c | 8 +- .../gcc.target/arm/mve/dlstp-compile-asm-1.c | 146 ++ .../gcc.target/arm/mve/dlstp-compile-asm-2.c | 749 ++++++++++ .../gcc.target/arm/mve/dlstp-compile-asm-3.c | 46 + .../gcc.target/arm/mve/dlstp-int16x8-run.c | 44 + .../gcc.target/arm/mve/dlstp-int16x8.c | 31 + .../gcc.target/arm/mve/dlstp-int32x4-run.c | 45 + .../gcc.target/arm/mve/dlstp-int32x4.c | 31 + .../gcc.target/arm/mve/dlstp-int64x2-run.c | 48 + .../gcc.target/arm/mve/dlstp-int64x2.c | 28 + .../gcc.target/arm/mve/dlstp-int8x16-run.c | 44 + .../gcc.target/arm/mve/dlstp-int8x16.c | 32 + .../gcc.target/arm/mve/dlstp-invalid-asm.c | 521 +++++++ 23 files changed, 3321 insertions(+), 82 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c diff --git 
a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 2cd560c99254b..34d6be76e94ac 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void); extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); extern bool arm_q_bit_access (void); extern bool arm_ge_bits_access (void); -extern bool arm_target_insn_ok_for_lob (rtx); - +extern bool arm_target_bb_ok_for_lob (basic_block); +extern int arm_attempt_dlstp_transform (rtx); #ifdef RTX_CODE enum reg_class arm_mode_base_reg_class (machine_mode); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index b8c32db0a1d7f..7d67d2cfee9f4 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] = #undef TARGET_HAVE_CONDITIONAL_EXECUTION #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution +#undef TARGET_LOOP_UNROLL_ADJUST +#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust + +#undef TARGET_PREDICT_DOLOOP_P +#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p + #undef TARGET_LEGITIMATE_CONSTANT_P #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p @@ -34659,19 +34665,1236 @@ arm_invalid_within_doloop (const rtx_insn *insn) } bool -arm_target_insn_ok_for_lob (rtx insn) -{ - basic_block bb = BLOCK_FOR_INSN (insn); - /* Make sure the basic block of the target insn is a simple latch - having as single predecessor and successor the body of the loop - itself. Only simple loops with a single basic block as body are - supported for 'low over head loop' making sure that LE target is - above LE itself in the generated code. 
*/ - - return single_succ_p (bb) - && single_pred_p (bb) - && single_succ_edge (bb)->dest == single_pred_edge (bb)->src - && contains_no_active_insn_p (bb); +arm_target_bb_ok_for_lob (basic_block bb) +{ + /* Make sure the basic block is a simple latch having as the single + predecessor and successor the body of the loop itself. + Only simple loops with a single basic block as body are supported for + low over head loops, making sure that LE target is above LE instruction + in the generated code. */ + return (single_succ_p (bb) + && single_pred_p (bb) + && single_succ_edge (bb)->dest == single_pred_edge (bb)->src); +} + +/* Utility fuction: Given a VCTP or a VCTP_M insn, return the number of MVE + lanes based on the machine mode being used. */ + +static int +arm_mve_get_vctp_lanes (rtx_insn *insn) +{ + rtx insn_set = single_set (insn); + if (insn_set + && GET_CODE (SET_SRC (insn_set)) == UNSPEC + && (XINT (SET_SRC (insn_set), 1) == VCTP + || XINT (SET_SRC (insn_set), 1) == VCTP_M)) + { + machine_mode mode = GET_MODE (SET_SRC (insn_set)); + return ((VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode)) + ? GET_MODE_NUNITS (mode) : 0); + } + return 0; +} + +enum arm_dl_usage_type { DL_USAGE_ANY = 0, + DL_USAGE_READ = 1, + DL_USAGE_WRITE = 2 }; + +/* Check if INSN requires the use of the VPR reg, if it does, return the + sub-rtx of the VPR reg. The TYPE argument controls whether + this function should: + * For TYPE == DL_USAGE_ANY, check all operands, including the OUT operands, + and return the first occurrence of the VPR reg. + * For TYPE == DL_USAGE_READ, only check the input operands. + * For TYPE == DL_USAGE_WRITE, only check the output operands. 
+ (INOUT operands are considered both as input and output operands) +*/ +static rtx +arm_get_required_vpr_reg (rtx_insn *insn, + arm_dl_usage_type type = DL_USAGE_ANY) +{ + gcc_assert (type < 3); + if (!NONJUMP_INSN_P (insn)) + return NULL_RTX; + + bool requires_vpr; + extract_constrain_insn (insn); + int n_operands = recog_data.n_operands; + if (recog_data.n_alternatives == 0) + return NULL_RTX; + + /* Fill in recog_op_alt with information about the constraints of + this insn. */ + preprocess_constraints (insn); + + for (int op = 0; op < n_operands; op++) + { + requires_vpr = true; + if (type == DL_USAGE_READ + && recog_data.operand_type[op] == OP_OUT) + continue; + else if (type == DL_USAGE_WRITE + && recog_data.operand_type[op] == OP_IN) + continue; + + /* Iterate through alternatives of operand "op" in recog_op_alt and + identify if the operand is required to be the VPR. */ + for (int alt = 0; alt < recog_data.n_alternatives; alt++) + { + const operand_alternative *op_alt + = &recog_op_alt[alt * n_operands]; + /* Fetch the reg_class for each entry and check it against the + VPR_REG reg_class. */ + if (alternative_class (op_alt, op) != VPR_REG) + requires_vpr = false; + } + /* If all alternatives of the insn require the VPR reg for this operand, + it means that either this is VPR-generating instruction, like a vctp, + vcmp, etc., or it is a VPT-predicated insruction. Return the subrtx + of the VPR reg operand. */ + if (requires_vpr) + return recog_data.operand[op]; + } + return NULL_RTX; +} + +/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_READ, + so return the VPR only if it is an input operand to the insn. */ + +static rtx +arm_get_required_vpr_reg_param (rtx_insn *insn) +{ + return arm_get_required_vpr_reg (insn, DL_USAGE_READ); +} + +/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_WRITE, + so return the VPR only if it is the return value, an output of, or is + clobbered by the insn. 
*/ + +static rtx +arm_get_required_vpr_reg_ret_val (rtx_insn *insn) +{ + return arm_get_required_vpr_reg (insn, DL_USAGE_WRITE); +} + +/* Return the first VCTP instruction in BB, if it exists, or NULL otherwise. */ + +static rtx_insn * +arm_mve_get_loop_vctp (basic_block bb) +{ + rtx_insn *insn = BB_HEAD (bb); + + /* Now scan through all the instruction patterns and pick out the VCTP + instruction. We require arm_get_required_vpr_reg_param to be false + to make sure we pick up a VCTP, rather than a VCTP_M. */ + FOR_BB_INSNS (bb, insn) + if (NONDEBUG_INSN_P (insn)) + if (arm_get_required_vpr_reg_ret_val (insn) + && (arm_mve_get_vctp_lanes (insn) != 0) + && !arm_get_required_vpr_reg_param (insn)) + return insn; + return NULL; +} + +/* Return true if INSN is a MVE instruction that is VPT-predicable and is + predicated on VPR_REG. */ + +static bool +arm_mve_insn_predicated_by (rtx_insn *insn, rtx vpr_reg) +{ + rtx insn_vpr_reg_operand = (MVE_VPT_PREDICATED_INSN_P (insn) + ? arm_get_required_vpr_reg_param (insn) + : NULL_RTX); + return (insn_vpr_reg_operand + && rtx_equal_p (vpr_reg, insn_vpr_reg_operand)); +} + +/* Utility function to identify if INSN is an MVE instruction that performs + some across lane operation (and as a result does not align with normal + lane predication rules). All such instructions give one only scalar + output, except for vshlcq which gives a PARALLEL of a vector and a scalar + (one vector result and one carry output). */ + +static bool +arm_mve_across_lane_insn_p (rtx_insn* insn) +{ + df_ref insn_defs = NULL; + if (!MVE_VPT_PREDICABLE_INSN_P (insn)) + return false; + + FOR_EACH_INSN_DEF (insn_defs, insn) + if (!VALID_MVE_MODE (GET_MODE (DF_REF_REG (insn_defs))) + && !arm_get_required_vpr_reg_ret_val (insn)) + return true; + + return false; +} + +/* Utility function to identify if INSN is an MVE load or store instruction. + * For TYPE == DL_USAGE_ANY, check all operands. If the function returns + true, INSN is a load or a store insn. 
+ * For TYPE == DL_USAGE_READ, only check the input operands. If the + function returns true, INSN is a load insn. + * For TYPE == DL_USAGE_WRITE, only check the output operands. If the + function returns true, INSN is a store insn. */ + +static bool +arm_mve_load_store_insn_p (rtx_insn* insn, + arm_dl_usage_type type = DL_USAGE_ANY) +{ + gcc_assert (type < 3); + int n_operands = recog_data.n_operands; + extract_insn (insn); + + for (int op = 0; op < n_operands; op++) + { + if (type == DL_USAGE_READ && recog_data.operand_type[op] == OP_OUT) + continue; + else if (type == DL_USAGE_WRITE && recog_data.operand_type[op] == OP_IN) + continue; + if (mve_memory_operand (recog_data.operand[op], + GET_MODE (recog_data.operand[op]))) + return true; + } + return false; +} + +/* Return TRUE if INSN is validated for implicit predication by how its outputs + are used. + + If INSN is a MVE operation across lanes that is not predicated by + VCTP_VPR_GENERATED it can not be validated by the use of its ouputs. + + Any other INSN is safe to implicit predicate if we don't use its outputs + outside the loop. The instructions that use this INSN's outputs will be + validated as we go through the analysis. */ + +static bool +arm_mve_impl_pred_on_outputs_p (rtx_insn *insn, rtx vctp_vpr_generated) +{ + /* Reject any unpredicated across lane operation. */ + if (!arm_mve_insn_predicated_by (insn, vctp_vpr_generated) + && arm_mve_across_lane_insn_p (insn)) + return false; + + /* Next, scan forward to the various USEs of the DEFs in this insn. 
*/ + df_ref insn_def = NULL; + basic_block insn_bb = BLOCK_FOR_INSN (insn); + FOR_EACH_INSN_DEF (insn_def, insn) + { + for (df_ref use = DF_REG_USE_CHAIN (DF_REF_REGNO (insn_def)); + use; + use = DF_REF_NEXT_REG (use)) + { + rtx_insn *next_use_insn = DF_REF_INSN (use); + if (!INSN_P (next_use_insn) || DEBUG_INSN_P (next_use_insn)) + continue; + + if (insn_bb != BLOCK_FOR_INSN (next_use_insn)) + return false; + } + } + return true; +} + + +/* Returns the prevailing definition of OP before CUR_INSN in the same + basic block as CUR_INSN, if one exists, returns NULL otherwise. */ + +static rtx_insn* +arm_last_vect_def_insn (rtx op, rtx_insn *cur_insn) +{ + if (!REG_P (op) + || !BLOCK_FOR_INSN (cur_insn)) + return NULL; + + df_ref def_insns; + rtx_insn *last_def = NULL; + for (def_insns = DF_REG_DEF_CHAIN (REGNO (op)); + def_insns; + def_insns = DF_REF_NEXT_REG (def_insns)) + { + rtx_insn *def_insn = DF_REF_INSN (def_insns); + /* Definition not in the loop body or after the current insn. */ + if (DF_REF_BB (def_insns) != BLOCK_FOR_INSN (cur_insn) + || INSN_UID (def_insn) >= INSN_UID (cur_insn)) + continue; + + if (!last_def || INSN_UID (def_insn) > INSN_UID (last_def)) + last_def = def_insn; + } + return last_def; +} + + +/* This function returns TRUE if we can validate the implicit predication of + INSN_IN with VCTP_VPR_GENERATED based on the definition of the instruction's + input operands. + + If INSN_IN is a MVE operation across lanes then all of its MVE vector + operands must have its tail-predicated lanes be zeroes. We keep track of any + instructions that define vector operands for which this is true in + PROPS_ZERO_SET. + + For any other INSN_IN, the definition of all its operands must be defined + inside the loop body by an instruction that comes before INSN_IN and not be + a MVE load predicated by a different VPR. These instructions have all been + validated for explicit or implicit predication. 
+ */ + +static bool +arm_mve_impl_pred_on_inputs_p (vec *props_zero_set, + rtx_insn *insn_in, rtx vctp_vpr_generated) +{ + /* If all inputs come from instructions that are explicitly or + implicitly predicated by the same predicate then it is safe to + implicitly predicate this instruction. */ + df_ref insn_uses = NULL; + bool across_lane = arm_mve_across_lane_insn_p (insn_in); + FOR_EACH_INSN_USE (insn_uses, insn_in) + { + rtx op = DF_REF_REG (insn_uses); + rtx_insn *def_insn = arm_last_vect_def_insn (op, insn_in); + if (across_lane) + { + if (!VALID_MVE_MODE (GET_MODE (op))) + continue; + if (!def_insn || !props_zero_set->contains (def_insn)) + return false; + + continue; + } + + if (!def_insn + || (!arm_mve_insn_predicated_by (def_insn, vctp_vpr_generated) + && arm_mve_load_store_insn_p (def_insn, DL_USAGE_READ))) + return false; + } + + return true; +} + + +/* Determine whether INSN_IN is safe to implicitly predicate based on the type + of instruction and where needed the definition of its inputs and the uses of + its outputs. + Return TRUE if it is safe to implicitly predicate and FALSE otherwise. + + * If INSN_IN is a store, then it is always unsafe to implicitly predicate it. + * If INSN_IN is a load, only reject implicit predication if its uses + directly invalidate it. + * If INSN_IN operates across vector lanes and does not have the + "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to implicitly + predicate. + * If INSN_IN operates on Floating Point elements and we are not compiling + with -Ofast, then it is unsafe to implicitly predicate it as we may be + changing exception and cumulative bits behaviour. + * If INSN_IN is a VCTP instruction, then it is safe to implicitly predicate, + but instructions that use this predicate will need to be checked + just like any other UNPREDICATED MVE instruction. + * Otherwise check if INSN_IN's inputs or uses of outputs can validate its + implicit predication. 
+ + * If all inputs come from instructions that are explicitly or implicitly + predicated by the same predicate then it is safe to implicitly predicate + this instruction. + * If INSN_IN is an operation across lanes with the "mve_safe_imp_xlane_pred" + attribute, then all it's operands must have zeroed falsely predicated tail + lanes. + + * Otherwise, check if the implicit predication of INSN_IN can be validated + based on its inputs, and if not check whether it can be validated based on + how its outputs are used. */ + +static bool +arm_mve_impl_predicated_p (vec *props_zero_set, + rtx_insn *insn_in, rtx vctp_vpr_generated) +{ + + /* If INSN_IN is a store, then it is always unsafe to implicitly + predicate it. */ + if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_WRITE)) + return false; + + /* If INSN_IN is a load, only reject implicit predication if its uses + directly invalidate it. */ + if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_READ)) + { + if (!arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated)) + return false; + return true; + } + + /* If INSN_IN operates across vector lanes and does not have the + "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to implicitly + predicate. */ + if (arm_mve_across_lane_insn_p (insn_in) + && (get_attr_mve_safe_imp_xlane_pred (insn_in) + != MVE_SAFE_IMP_XLANE_PRED_YES)) + return false; + + /* If INSN_IN operates on Floating Point elements and we are not compiling + with -Ofast, then it is unsafe to implicitly predicate it as we may be + changing exception and cumulative bits behaviour. 
*/ + if (!flag_unsafe_math_optimizations + && flag_trapping_math + && MVE_VPT_UNPREDICATED_INSN_P (insn_in)) + { + df_ref def; + FOR_EACH_INSN_DEF (def, insn_in) + if (DF_REF_TYPE (def) == DF_REF_REG_DEF + && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def)))) + return false; + FOR_EACH_INSN_USE (def, insn_in) + if (DF_REF_TYPE (def) == DF_REF_REG_DEF + && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def)))) + return false; + } + + /* If INSN_IN is a VCTP instruction, then it is safe to implicitly predicate, + but instructions that use this predicate will need to be checked + just like any other UNPREDICATED MVE instruction. */ + if (arm_get_required_vpr_reg_ret_val (insn_in) + && (arm_mve_get_vctp_lanes (insn_in) != 0)) + return true; + + /* Otherwise, check if the implicit predication of INSN_IN can be validated + based on its inputs, and if not check whether it can be validated based on + how its outputs are used. */ + return (arm_mve_impl_pred_on_inputs_p (props_zero_set, insn_in, vctp_vpr_generated) + || arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated)); +} + +/* Helper function to `arm_mve_dlstp_check_inc_counter` and to + `arm_mve_dlstp_check_dec_counter`. In the situations where the loop counter + is incrementing by 1 or decrementing by 1 in each iteration, ensure that the + number of iterations, the value of REG, going into the loop, was calculated + as: + REG = (N + [1, VCTP_STEP - 1]) / VCTP_STEP + + where N is equivalent to the VCTP_REG. +*/ + +static bool +arm_mve_check_reg_origin_is_num_elems (loop *loop, rtx reg, rtx vctp_step, + rtx vctp_reg) +{ + df_ref counter_max_last_def = NULL; + + /* More than one reaching definition. */ + if (DF_REG_DEF_COUNT (REGNO (reg)) > 2) + return false; + + /* Look for a single defition of REG going into the loop. The DEF_CHAIN will + have at least two values, as this is a loop induction variable that is + defined outside the loop. 
*/ + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg)); + def; + def = DF_REF_NEXT_REG (def)) + { + /* Skip the update inside the loop, this has already been checked by the + iv_analyze call earlier. */ + if (DF_REF_BB (def) == loop->header) + continue; + + counter_max_last_def = def; + break; + } + + if (!counter_max_last_def) + return false; + + rtx counter_max_last_set = single_set (DF_REF_INSN (counter_max_last_def)); + + if (!counter_max_last_set) + return false; + + /* If we encounter a simple SET from a REG, follow it through. */ + if (REG_P (SET_SRC (counter_max_last_set))) + { + if (DF_REG_DEF_COUNT (REGNO (SET_SRC (counter_max_last_set))) != 1) + return false; + + counter_max_last_def + = DF_REG_DEF_CHAIN (REGNO (SET_SRC (counter_max_last_set))); + counter_max_last_set + = single_set (DF_REF_INSN (counter_max_last_def)); + + if (!counter_max_last_set) + return false; + } + + /* We are looking for: + COUNTER_MAX_LAST_SET = (N + VCTP_STEP - 1) / VCTP_STEP. + We currently only support the unsigned VCTP_OP case. */ + rtx division = SET_SRC (counter_max_last_set); + if (GET_CODE (division) != LSHIFTRT) + return false; + + /* Now check that we are dividing by VCTP_STEP, i.e. the number of lanes. */ + rtx divisor = XEXP (division, 1); + unsigned vctp_step_cst = abs_hwi (INTVAL (vctp_step)); + if (!CONST_INT_P (divisor) + || (1U << INTVAL (divisor) != vctp_step_cst)) + return false; + + rtx dividend = XEXP (division, 0); + if (!REG_P (dividend)) + /* Subreg? */ + return false; + + /* For now only support the simple case, this only works for unsigned N, any + signed N will have further computations to deal with overflow. 
*/ + if (DF_REG_DEF_COUNT (REGNO (dividend)) != 1) + return false; + + rtx_insn *dividend_insn = DF_REF_INSN (DF_REG_DEF_CHAIN (REGNO (dividend))); + rtx dividend_op = single_set (dividend_insn); + if (!dividend_op + && GET_CODE (SET_SRC (dividend_op)) != PLUS) + return false; + + /* Check if PLUS_OP is (VCTP_OP + VAL), where VAL = [1, VCTP_STEP - 1]. */ + rtx plus_op = SET_SRC (dividend_op); + if (!REG_P (XEXP (plus_op, 0)) + || !CONST_INT_P (XEXP (plus_op, 1)) + || !IN_RANGE (INTVAL (XEXP (plus_op, 1)), 1, vctp_step_cst - 1)) + return false; + + /* VCTP_REG may have been copied before entering the loop, let's see if we can + trace such a copy back. If we have more than one reaching definition then + bail out as analysis will be too difficult. */ + if (DF_REG_DEF_COUNT (REGNO (vctp_reg)) > 2) + return false; + + /* Look for the definition of N. */ + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (vctp_reg)); + def; + def = DF_REF_NEXT_REG (def)) + { + if (DF_REF_BB (def) == loop->header) + continue; + rtx set = single_set (DF_REF_INSN (def)); + if (set + && REG_P (SET_SRC (set)) + && !HARD_REGISTER_P (SET_SRC (set))) + vctp_reg = SET_SRC (set); + } + + return rtx_equal_p (vctp_reg, XEXP (plus_op, 0)); +} + +/* If we have identified the loop to have an incrementing counter, we need to + make sure that it increments by 1 and that the loop is structured correctly: + * The counter starts from 0 + * The counter terminates at (num_of_elem + num_of_lanes - 1) / num_of_lanes + * The vctp insn uses a reg that decrements appropriately in each iteration. +*/ + +static rtx_insn* +arm_mve_dlstp_check_inc_counter (loop *loop, rtx_insn* vctp_insn, + rtx condconst, rtx condcount) +{ + rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0); + /* The loop latch has to be empty. When compiling all the known MVE LoLs in + user applications, none of those with incrementing counters had any real + insns in the loop latch. 
As such, this function has only been tested with
+     an empty latch and may misbehave or ICE if we somehow get here with an
+     increment in the latch, so, for correctness, error out early. */
+  if (!empty_block_p (loop->latch))
+    return NULL;
+
+  class rtx_iv vctp_reg_iv;
+  /* For loops of DLSTP_TYPE_B, the loop counter is independent of the decrement
+     of the reg used in the vctp_insn. So run iv analysis on that reg. This
+     has to succeed for such loops to be supported. */
+  if (!iv_analyze (vctp_insn, as_a <scalar_int_mode> (GET_MODE (vctp_reg)),
+		   vctp_reg, &vctp_reg_iv))
+    return NULL;
+
+  /* Extract the decrementnum of the vctp reg from the iv. This decrementnum
+     is the number of lanes/elements it decrements from the remaining number of
+     lanes/elements to process in the loop, for this reason this is always a
+     negative number, but to simplify later checks we use its absolute value. */
+  HOST_WIDE_INT decrementnum = INTVAL (vctp_reg_iv.step);
+  if (decrementnum >= 0)
+    return NULL;
+  decrementnum = abs_hwi (decrementnum);
+
+  /* Find where both of those are modified in the loop header bb. */
+  df_ref condcount_reg_set_df = df_bb_regno_only_def_find (loop->header,
+							   REGNO (condcount));
+  df_ref vctp_reg_set_df = df_bb_regno_only_def_find (loop->header,
+						      REGNO (vctp_reg));
+  if (!condcount_reg_set_df || !vctp_reg_set_df)
+    return NULL;
+  rtx condcount_reg_set = single_set (DF_REF_INSN (condcount_reg_set_df));
+  rtx vctp_reg_set = single_set (DF_REF_INSN (vctp_reg_set_df));
+  if (!condcount_reg_set || !vctp_reg_set)
+    return NULL;
+
+  /* Ensure the modification of the vctp reg from df is consistent with
+     the iv and the number of lanes on the vctp insn.
*/ + if (GET_CODE (SET_SRC (vctp_reg_set)) != PLUS + || !REG_P (SET_DEST (vctp_reg_set)) + || !REG_P (XEXP (SET_SRC (vctp_reg_set), 0)) + || REGNO (SET_DEST (vctp_reg_set)) + != REGNO (XEXP (SET_SRC (vctp_reg_set), 0)) + || !CONST_INT_P (XEXP (SET_SRC (vctp_reg_set), 1)) + || INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)) >= 0 + || decrementnum != abs_hwi (INTVAL (XEXP (SET_SRC (vctp_reg_set), 1))) + || decrementnum != arm_mve_get_vctp_lanes (vctp_insn)) + return NULL; + + if (REG_P (condcount) && REG_P (condconst)) + { + /* First we need to prove that the loop is going 0..condconst with an + inc of 1 in each iteration. */ + if (GET_CODE (SET_SRC (condcount_reg_set)) == PLUS + && CONST_INT_P (XEXP (SET_SRC (condcount_reg_set), 1)) + && INTVAL (XEXP (SET_SRC (condcount_reg_set), 1)) == 1) + { + rtx counter_reg = SET_DEST (condcount_reg_set); + /* Check that the counter did indeed start from zero. */ + df_ref this_set = DF_REG_DEF_CHAIN (REGNO (counter_reg)); + if (!this_set) + return NULL; + df_ref last_set_def = DF_REF_NEXT_REG (this_set); + if (!last_set_def) + return NULL; + rtx_insn* last_set_insn = DF_REF_INSN (last_set_def); + rtx last_set = single_set (last_set_insn); + if (!last_set) + return NULL; + rtx counter_orig_set; + counter_orig_set = SET_SRC (last_set); + if (!CONST_INT_P (counter_orig_set) + || (INTVAL (counter_orig_set) != 0)) + return NULL; + /* And finally check that the target value of the counter, + condconst, is of the correct shape. */ + if (!arm_mve_check_reg_origin_is_num_elems (loop, condconst, + vctp_reg_iv.step, + vctp_reg)) + return NULL; + } + else + return NULL; + } + else + return NULL; + + /* Everything looks valid. */ + return vctp_insn; +} + +/* Helper function to `arm_mve_loop_valid_for_dlstp`. In the case of a + counter that is decrementing, ensure that it is decrementing by the + right amount in each iteration and that the target condition is what + we expect. 
*/ + +static rtx_insn* +arm_mve_dlstp_check_dec_counter (loop *loop, rtx_insn* vctp_insn, + rtx condconst, rtx condcount) +{ + rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0); + class rtx_iv vctp_reg_iv; + HOST_WIDE_INT decrementnum; + /* For decrementing loops of DLSTP_TYPE_A, the counter is usually present in the + loop latch. Here we simply need to verify that this counter is the same + reg that is also used in the vctp_insn and that it is not otherwise + modified. */ + rtx_insn *dec_insn = BB_END (loop->latch); + /* If not in the loop latch, try to find the decrement in the loop header. */ + if (!NONDEBUG_INSN_P (dec_insn)) + { + df_ref temp = df_bb_regno_only_def_find (loop->header, REGNO (condcount)); + /* If we haven't been able to find the decrement, bail out. */ + if (!temp) + return NULL; + dec_insn = DF_REF_INSN (temp); + } + + rtx dec_set = single_set (dec_insn); + + /* Next, ensure that it is a PLUS of the form: + (set (reg a) (plus (reg a) (const_int))) + where (reg a) is the same as condcount. */ + if (!dec_set + || !REG_P (SET_DEST (dec_set)) + || !REG_P (XEXP (SET_SRC (dec_set), 0)) + || !CONST_INT_P (XEXP (SET_SRC (dec_set), 1)) + || REGNO (SET_DEST (dec_set)) + != REGNO (XEXP (SET_SRC (dec_set), 0)) + || REGNO (SET_DEST (dec_set)) != REGNO (condcount)) + return NULL; + + decrementnum = INTVAL (XEXP (SET_SRC (dec_set), 1)); + + /* This decrementnum is the number of lanes/elements it decrements from the + remaining number of lanes/elements to process in the loop, for this reason + this is always a negative number, but to simplify later checks we use its + absolute value. */ + if (decrementnum >= 0) + return NULL; + decrementnum = -decrementnum; + + /* If the decrementnum is a 1, then we need to look at the loop vctp_reg and + verify that it also decrements correctly. + Then, we need to establish that the starting value of the loop decrement + originates from the starting value of the vctp decrement. 
*/
+  if (decrementnum == 1)
+    {
+      class rtx_iv vctp_reg_iv, condcount_reg_iv;
+      /* The loop counter is found to be independent of the decrement
+	 of the reg used in the vctp_insn, again. Ensure that IV analysis
+	 succeeds and check the step. */
+      if (!iv_analyze (vctp_insn, as_a <scalar_int_mode> (GET_MODE (vctp_reg)),
+		       vctp_reg, &vctp_reg_iv))
+	return NULL;
+      /* Ensure it matches the number of lanes of the vctp instruction. */
+      if (abs (INTVAL (vctp_reg_iv.step))
+	  != arm_mve_get_vctp_lanes (vctp_insn))
+	return NULL;
+
+      if (!arm_mve_check_reg_origin_is_num_elems (loop, condcount,
+						  vctp_reg_iv.step,
+						  vctp_reg))
+	return NULL;
+    }
+  /* If the decrements are the same, then the situation is simple: either they
+     are also the same reg, which is safe, or they are different registers, in
+     which case make sure that there is only a simple SET from one to the
+     other inside the loop. */
+  else if (decrementnum == arm_mve_get_vctp_lanes (vctp_insn))
+    {
+      if (REGNO (condcount) != REGNO (vctp_reg))
+	{
+	  /* It wasn't the same reg, but it could be behind a
+	     (set (vctp_reg) (condcount)), so instead find where
+	     the VCTP insn is DEF'd inside the loop. */
+	  rtx_insn *vctp_reg_insn
+	    = DF_REF_INSN (df_bb_regno_only_def_find (loop->header,
+						      REGNO (vctp_reg)));
+	  rtx vctp_reg_set = single_set (vctp_reg_insn);
+	  /* This must just be a simple SET from the condcount. */
+	  if (!vctp_reg_set
+	      || !REG_P (SET_DEST (vctp_reg_set))
+	      || !REG_P (SET_SRC (vctp_reg_set))
+	      || REGNO (SET_SRC (vctp_reg_set)) != REGNO (condcount))
+	    return NULL;
+	}
+    }
+  else
+    return NULL;
+
+  /* We now only need to find out that the loop terminates with a LE
+     zero condition. If condconst is a const_int, then this is easy.
+     If it's a REG, look at the last condition+jump in a bb before
+     the loop, because that usually will have a branch jumping over
+     the loop header.
*/ + rtx_insn *jump_insn = BB_END (loop->header); + if (CONST_INT_P (condconst) + && !(INTVAL (condconst) == 0 && JUMP_P (jump_insn) + && GET_CODE (XEXP (PATTERN (jump_insn), 1)) == IF_THEN_ELSE + && (GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == NE + ||GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == GT))) + return NULL; + else if (REG_P (condconst)) + { + basic_block pre_loop_bb = single_pred (loop_preheader_edge (loop)->src); + if (!pre_loop_bb) + return NULL; + + rtx initial_compare = NULL_RTX; + if (!(prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)) + && INSN_P (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb))))) + return NULL; + else + initial_compare + = single_set (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb))); + if (!(initial_compare + && cc_register (SET_DEST (initial_compare), VOIDmode) + && GET_CODE (SET_SRC (initial_compare)) == COMPARE + && CONST_INT_P (XEXP (SET_SRC (initial_compare), 1)) + && INTVAL (XEXP (SET_SRC (initial_compare), 1)) == 0)) + return NULL; + + /* Usually this is a LE condition, but it can also just be a GT or an EQ + condition (if the value is unsigned or the compiler knows its not negative) */ + rtx_insn *loop_jumpover = BB_END (pre_loop_bb); + if (!(JUMP_P (loop_jumpover) + && GET_CODE (XEXP (PATTERN (loop_jumpover), 1)) == IF_THEN_ELSE + && (GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == LE + || GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == GT + || GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == EQ))) + return NULL; + } + + /* Everything looks valid. */ + return vctp_insn; +} + +/* Function to check a loop's structure to see if it is a valid candidate for + an MVE Tail Predicated Low-Overhead Loop. Returns the loop's VCTP_INSN if + it is valid, or NULL if it isn't. 
*/ + +static rtx_insn* +arm_mve_loop_valid_for_dlstp (loop *loop) +{ + /* Doloop can only be done "elementwise" with predicated dlstp/letp if it + contains a VCTP on the number of elements processed by the loop. + Find the VCTP predicate generation inside the loop body BB. */ + rtx_insn *vctp_insn = arm_mve_get_loop_vctp (loop->header); + if (!vctp_insn) + return NULL; + + /* We only support two loop forms for tail predication: + DLSTP_TYPE_A) Loops of the form: + int num_of_lanes = 128 / elem_size; + while (num_of_elem > 0) + { + p = vctp (num_of_elem); + num_of_elem -= num_of_lanes; + } + DLSTP_TYPE_B) Loops of the form: + int num_of_lanes = 128 / elem_size; + int num_of_iters = (num_of_elem + num_of_lanes - 1) / num_of_lanes; + for (i = 0; i < num_of_iters; i++) + { + p = vctp (num_of_elem); + num_of_elem -= num_of_lanes; + } + + Then, depending on the type of loop above we need will need to do + different sets of checks. */ + iv_analysis_loop_init (loop); + + /* In order to find out if the loop is of DLSTP_TYPE_A or DLSTP_TYPE_B above + look for the loop counter: it will either be incrementing by one per + iteration or it will be decrementing by num_of_lanes. We can find the + loop counter in the condition at the end of the loop. */ + rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (loop->header)); + if (!(cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode) + && GET_CODE (XEXP (PATTERN (loop_cond), 1)) == COMPARE)) + return NULL; + + /* The operands in the condition: Try to identify which one is the + constant and which is the counter and run IV analysis on the latter. */ + rtx cond_arg_1 = XEXP (XEXP (PATTERN (loop_cond), 1), 0); + rtx cond_arg_2 = XEXP (XEXP (PATTERN (loop_cond), 1), 1); + + rtx loop_cond_constant; + rtx loop_counter; + class rtx_iv cond_counter_iv, cond_temp_iv; + + if (CONST_INT_P (cond_arg_1)) + { + /* cond_arg_1 is the constant and cond_arg_2 is the counter. 
*/ + loop_cond_constant = cond_arg_1; + loop_counter = cond_arg_2; + iv_analyze (loop_cond, as_a (GET_MODE (cond_arg_2)), + cond_arg_2, &cond_counter_iv); + } + else if (CONST_INT_P (cond_arg_2)) + { + /* cond_arg_2 is the constant and cond_arg_1 is the counter. */ + loop_cond_constant = cond_arg_2; + loop_counter = cond_arg_1; + iv_analyze (loop_cond, as_a (GET_MODE (cond_arg_1)), + cond_arg_1, &cond_counter_iv); + } + else if (REG_P (cond_arg_1) && REG_P (cond_arg_2)) + { + /* If both operands to the compare are REGs, we can safely + run IV analysis on both and then determine which is the + constant by looking at the step. + First assume cond_arg_1 is the counter. */ + loop_counter = cond_arg_1; + loop_cond_constant = cond_arg_2; + iv_analyze (loop_cond, as_a (GET_MODE (cond_arg_1)), + cond_arg_1, &cond_counter_iv); + iv_analyze (loop_cond, as_a (GET_MODE (cond_arg_2)), + cond_arg_2, &cond_temp_iv); + + /* Look at the steps and swap around the rtx's if needed. Error out if + one of them cannot be identified as constant. */ + if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step)) + return NULL; + if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0) + return NULL; + if (INTVAL (cond_counter_iv.step) == 0 && INTVAL (cond_temp_iv.step) != 0) + { + loop_counter = cond_arg_2; + loop_cond_constant = cond_arg_1; + cond_counter_iv = cond_temp_iv; + } + } + else + return NULL; + + if (!REG_P (loop_counter)) + return NULL; + if (!(REG_P (loop_cond_constant) || CONST_INT_P (loop_cond_constant))) + return NULL; + + /* Now we have extracted the IV step of the loop counter, call the + appropriate checking function. 
*/
+  if (INTVAL (cond_counter_iv.step) > 0)
+    return arm_mve_dlstp_check_inc_counter (loop, vctp_insn,
+					    loop_cond_constant, loop_counter);
+  else if (INTVAL (cond_counter_iv.step) < 0)
+    return arm_mve_dlstp_check_dec_counter (loop, vctp_insn,
+					    loop_cond_constant, loop_counter);
+  else
+    return NULL;
+}
+
+/* Predict whether the given loop in gimple will be transformed in the RTL
+   doloop_optimize pass. It could be argued that turning large enough loops
+   into low-overhead loops would not show a significant performance boost.
+   However, in the case of tail predication we would still avoid using VPT/VPST
+   instructions inside the loop, and in either case using low-overhead loops
+   would not be detrimental, so we decided to not consider size, avoiding the
+   need of a heuristic to determine what an appropriate size boundary is. */
+
+static bool
+arm_predict_doloop_p (struct loop *loop)
+{
+  gcc_assert (loop);
+  /* On arm, targetm.can_use_doloop_p is actually
+     can_use_doloop_if_innermost. Ensure the loop is innermost,
+     it is valid and as per arm_target_bb_ok_for_lob and the
+     correct architecture flags are enabled. */
+  if (!(TARGET_HAVE_LOB && optimize > 0))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " target architecture or optimisation flags.\n");
+      return false;
+    }
+  else if (loop->inner != NULL)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " loop nesting.\n");
+      return false;
+    }
+  else if (!arm_target_bb_ok_for_lob (loop->header->next_bb))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " loop bb complexity.\n");
+      return false;
+    }
+
+  return true;
+}
+
+/* Implement targetm.loop_unroll_adjust. Use this to block unrolling of loops
+   that may later be turned into MVE Tail Predicated Low Overhead Loops.
The
+   performance benefit of an MVE LoL is likely to be much higher than that of
+   the unrolling. */
+
+unsigned
+arm_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  if (TARGET_HAVE_MVE
+      && arm_target_bb_ok_for_lob (loop->latch)
+      && arm_mve_loop_valid_for_dlstp (loop))
+    return 0;
+  else
+    return nunroll;
+}
+
+/* Function to handle emitting a VPT-unpredicated version of a VPT-predicated
+   insn to a sequence. */
+
+static bool
+arm_emit_mve_unpredicated_insn_to_seq (rtx_insn* insn)
+{
+  rtx insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn);
+  int new_icode = get_attr_mve_unpredicated_insn (insn);
+  if (!in_sequence_p ()
+      || !MVE_VPT_PREDICATED_INSN_P (insn)
+      || (!insn_vpr_reg_operand)
+      || (!new_icode))
+    return false;
+
+  extract_insn (insn);
+  rtx arr[8];
+  int j = 0;
+
+  /* When transforming a VPT-predicated instruction into its unpredicated
+     equivalent we need to drop the VPR operand and we may need to also drop a
+     merge "vuninit" input operand, depending on the instruction pattern. Here
+     ensure that we have at most a two-operand difference between the two
+     instructions. */
+  int n_operands_diff
+    = recog_data.n_operands - insn_data[new_icode].n_operands;
+  if (!(n_operands_diff > 0 && n_operands_diff <= 2))
+    return false;
+
+  rtx move = NULL_RTX;
+  /* Then, loop through the operands of the predicated
+     instruction, and retain the ones that map to the
+     unpredicated instruction. */
+  for (int i = 0; i < recog_data.n_operands; i++)
+    {
+      /* Ignore the VPR and, if needed, the vuninit
+	 operand. */
+      if (insn_vpr_reg_operand == recog_data.operand[i])
+	continue;
+      if (n_operands_diff == 2
+	  && !strcmp (recog_data.constraints[i], "0"))
+	{
+	  move = gen_rtx_SET (arr[0], recog_data.operand[i]);
+	  arr[0] = recog_data.operand[i];
+	}
+      else
+	arr[j++] = recog_data.operand[i];
+    }
+
+  /* Finally, emit the unpredicated instruction.
*/ + rtx_insn *new_insn; + switch (j) + { + case 1: + new_insn = emit_insn (GEN_FCN (new_icode) (arr[0])); + break; + case 2: + new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1])); + break; + case 3: + new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2])); + break; + case 4: + new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], + arr[3])); + break; + case 5: + new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], + arr[3], arr[4])); + break; + case 6: + new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], + arr[3], arr[4], arr[5])); + break; + case 7: + new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], + arr[3], arr[4], arr[5], + arr[6])); + break; + default: + gcc_unreachable (); + } + INSN_LOCATION (new_insn) = INSN_LOCATION (insn); + if (move) + { + new_insn = emit_insn (move); + INSN_LOCATION (new_insn) = INSN_LOCATION (insn); + } + return true; +} + +/* Return TRUE if INSN defines a MVE vector operand that has zeroed + tail-predicated lanes. This is either true if: + * INSN is predicated by VCTP_VPR_GENERATED and the 'invalid lanes' operand + is in the PROPS_ZERO_SET, + * all MVE vector operands are in the PROPS_ZERO_SET +*/ + +static bool +arm_mve_propagate_zero_pred_p (vec *props_zero_set, + rtx_insn *insn, rtx vctp_vpr_generated) +{ + if (arm_mve_load_store_insn_p (insn, DL_USAGE_READ)) + return true; + if (arm_mve_load_store_insn_p (insn, DL_USAGE_WRITE)) + return false; + + int inactive_idx = -1; + + extract_insn (insn); + /* If INSN is predicated by VCTP_VPR_GENERATED, then all tail-predicated + lanes will keep the value that is in the 'invalid lanes' register which we + identify by the "0" constraint, to ensure it is the same as the 'result' + register of this instruction. 
*/ + if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated)) + { + for (int i = 0; i < recog_data.n_operands; i++) + { + if (strcmp (recog_data.constraints[i], "0") == 0 + && VALID_MVE_MODE (GET_MODE (recog_data.operand[i]))) + { + inactive_idx = i; + break; + } + } + } + + if (inactive_idx > 0) + { + rtx op = recog_data.operand[inactive_idx]; + rtx_insn *def_insn = arm_last_vect_def_insn (op, insn); + return def_insn != NULL_RTX && props_zero_set->contains (def_insn); + } + + /* If this instruction is not predicated by VCTP_VPR_GENERATED, then we must + check that all vector operands have zeroed tail-predicated lanes, and that + it has at least one vector operand. */ + bool at_least_one_vector = false; + df_ref insn_uses; + FOR_EACH_INSN_USE (insn_uses, insn) + { + rtx reg = DF_REF_REG (insn_uses); + if (!VALID_MVE_MODE (GET_MODE (reg))) + continue; + + rtx_insn *def_insn = arm_last_vect_def_insn (reg, insn); + if (def_insn && props_zero_set->contains (def_insn)) + at_least_one_vector |= true; + else + return false; + + } + return at_least_one_vector; +} + + +/* Attempt to transform the loop contents of loop basic block from VPT + predicated insns into unpredicated insns for a dlstp/letp loop. Returns + the number to decrement from the total number of elements each iteration. + Returns 1 if tail predication can not be performed and fallback to scalar + low-overhead loops. */ + +int +arm_attempt_dlstp_transform (rtx label) +{ + if (!dlstp_enabled) + return 1; + + basic_block body = single_succ (BLOCK_FOR_INSN (label)); + + /* Ensure that the bb is within a loop that has all required metadata. */ + if (!body->loop_father || !body->loop_father->header + || !body->loop_father->simple_loop_desc) + return 1; + + loop *loop = body->loop_father; + /* Instruction that sets the predicate mask depending on how many elements + are left to process. 
*/
+  rtx_insn *vctp_insn = arm_mve_loop_valid_for_dlstp (loop);
+  if (!vctp_insn)
+    return 1;
+
+  gcc_assert (single_set (vctp_insn));
+
+  rtx vctp_vpr_generated = single_set (vctp_insn);
+  if (!vctp_vpr_generated)
+    return 1;
+
+  vctp_vpr_generated = SET_DEST (vctp_vpr_generated);
+
+  if (!vctp_vpr_generated || !REG_P (vctp_vpr_generated)
+      || !VALID_MVE_PRED_MODE (GET_MODE (vctp_vpr_generated)))
+    return 1;
+
+  /* decrementnum is already known to be valid at this point. */
+  int decrementnum = arm_mve_get_vctp_lanes (vctp_insn);
+
+  rtx_insn *insn = 0;
+  rtx_insn *cur_insn = 0;
+  rtx_insn *seq;
+  auto_vec<rtx_insn *> props_zero_set;
+
+  /* Scan through the insns in the loop bb and emit the transformed bb
+     insns to a sequence. */
+  start_sequence ();
+  FOR_BB_INSNS (body, insn)
+    {
+      if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))
+	continue;
+      else if (NOTE_P (insn))
+	emit_note ((enum insn_note)NOTE_KIND (insn));
+      else if (DEBUG_INSN_P (insn))
+	emit_debug_insn (PATTERN (insn));
+      else if (!INSN_P (insn))
+	{
+	  end_sequence ();
+	  return 1;
+	}
+      /* If the transformation is successful we no longer need the vctp
+	 instruction. */
+      else if (insn == vctp_insn)
+	continue;
+      /* If the insn pattern requires the use of the VPR value from the
+	 vctp as an input parameter for predication. */
+      else if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+	{
+	  /* Check whether this INSN propagates the zeroed tail-predication
+	     lanes. */
+	  if (arm_mve_propagate_zero_pred_p (&props_zero_set, insn,
+					     vctp_vpr_generated))
+	    props_zero_set.safe_push (insn);
+	  bool success = arm_emit_mve_unpredicated_insn_to_seq (insn);
+	  if (!success)
+	    {
+	      end_sequence ();
+	      return 1;
+	    }
+	}
+      /* If the insn isn't VPT predicated on vctp_vpr_generated, we need to
+	 make sure that it is still valid within the dlstp/letp loop.
*/ + else + { + /* If this instruction USE-s the vctp_vpr_generated other than for + predication, this blocks the transformation as we are not allowed + to optimise the VPR value away. */ + df_ref insn_uses = NULL; + FOR_EACH_INSN_USE (insn_uses, insn) + { + if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses))) + { + end_sequence (); + return 1; + } + } + /* If within the loop we have an MVE vector instruction that is + unpredicated, the dlstp/letp looping will add implicit + predication to it. This will result in a change in behaviour + of the instruction, so we need to find out if any instructions + that feed into the current instruction were implicitly + predicated. */ + if (MVE_VPT_PREDICABLE_INSN_P (insn) + && !arm_mve_impl_predicated_p (&props_zero_set, insn, + vctp_vpr_generated)) + { + end_sequence (); + return 1; + } + emit_insn (PATTERN (insn)); + } + } + seq = get_insns (); + end_sequence (); + + /* Re-write the entire BB contents with the transformed + sequence. */ + FOR_BB_INSNS_SAFE (body, insn, cur_insn) + if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))) + delete_insn (insn); + + emit_insn_after (seq, BB_END (body)); + + /* The transformation has succeeded, so now modify the "count" + (a.k.a. niter_expr) for the middle-end. Also set noloop_assumptions + to NULL to stop the middle-end from making assumptions about the + number of iterations. */ + simple_loop_desc (body->loop_father)->niter_expr + = XVECEXP (SET_SRC (PATTERN (vctp_insn)), 0, 0); + simple_loop_desc (body->loop_father)->noloop_assumptions = NULL_RTX; + return decrementnum; } #if CHECKING_P diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index 0cd3fc2cd0cc6..d88c7a52e7591 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -363,5 +363,8 @@ Target Joined RejectNegative String Var(arm_stack_protector_guard_offset_str) Use an immediate to offset from the TLS register. 
This option is for use with fstack-protector-guard=tls and not for use in user-land code. +mdlstp +Target Var(dlstp_enabled) Init(1) Undocumented + TargetVariable long arm_stack_protector_guard_offset = 0 diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 8d066fcf05df6..987602da1bf5f 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -2686,6 +2686,17 @@ (define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")]) (define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")]) +(define_int_attr dlstp_elemsize [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32") + (DLSTP64 "64")]) + +(define_int_attr letp_num_lanes [(LETP8 "16") (LETP16 "8") (LETP32 "4") + (LETP64 "2")]) +(define_int_attr letp_num_lanes_neg [(LETP8 "-16") (LETP16 "-8") (LETP32 "-4") + (LETP64 "-2")]) + +(define_int_attr letp_num_lanes_minus_1 [(LETP8 "15") (LETP16 "7") (LETP32 "3") + (LETP64 "1")]) + (define_int_attr opsuffix [(UNSPEC_DOT_S "s8") (UNSPEC_DOT_U "u8") (UNSPEC_DOT_US "s8") @@ -2926,6 +2937,10 @@ (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) (define_int_iterator VQSHLUQ_M_N [VQSHLUQ_M_N_S]) (define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S]) +(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32 + DLSTP64]) +(define_int_iterator LETP [LETP8 LETP16 LETP32 + LETP64]) ;; Define iterators for VCMLA operations (define_int_iterator VCMLA_OP [UNSPEC_VCMLA diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 9fe51298cdc2b..4b4d6298ffb18 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -6930,3 +6930,53 @@ } } ) + +;; Originally expanded by 'predicated_doloop_end'. +;; In the rare situation where the branch is too far, we do also need to +;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration. 
+(define_insn "predicated_doloop_end_internal" + [(set (pc) + (if_then_else + (gtu (plus:SI (reg:SI LR_REGNUM) + (const_int )) + (const_int )) + (match_operand 0 "" "") + (pc))) + (set (reg:SI LR_REGNUM) + (plus:SI (reg:SI LR_REGNUM) (const_int ))) + ;; We use UNSPEC here to guarantee this pattern can not be + ;; generated by a RTL optimization and be matched by other + ;; patterns, since this pattern is also responsible for turning off + ;; the tail predication machinery if we were to exit the loop. + ;; This is done by either the LETP or the LCTP instructions that + ;; this pattern generates. + (use (unspec:SI [(const_int 0)] LETP)) + (clobber (reg:CC CC_REGNUM))] + "TARGET_HAVE_MVE" + { + if (get_attr_length (insn) == 4) + return "letp\t%|lr, %l0"; + else + return "subs\t%|lr, #\n\tbhi\t%l0\n\tlctp"; + } + [(set (attr "length") + (if_then_else + (ltu (minus (pc) (match_dup 0)) (const_int 1024)) + (const_int 4) + (const_int 12))) + (set_attr "type" "branch") + (set_attr "conds" "unconditional")]) + +(define_insn "dlstp_insn" + [ + (set (reg:SI LR_REGNUM) +;; Similar to the previous pattern, we use UNSPEC here to make sure this +;; rtx construct is not matched by other patterns, as this pattern is also +;; responsible for setting the element size of the tail predication machinery +;; using the dlsp. instruction. 
+ (unspec_volatile:SI [(match_operand:SI 0 "s_register_operand" "r")] + DLSTP)) + ] + "TARGET_HAVE_MVE" + "dlstp.\t%|lr, %0" + [(set_attr "type" "mve_misc")]) diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 84c9c3dfe8009..66b3ae6040c0c 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1613,7 +1613,7 @@ (use (match_operand 1 "" ""))] ; label "TARGET_32BIT" " - { +{ /* Currently SMS relies on the do-loop pattern to recognize loops where (1) the control part consists of all insns defining and/or using a certain 'count' register and (2) the loop count can be @@ -1623,41 +1623,75 @@ Also used to implement the low over head loops feature, which is part of the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */ - if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) - { - rtx s0; - rtx bcomp; - rtx loc_ref; - rtx cc_reg; - rtx insn; - rtx cmp; - - if (GET_MODE (operands[0]) != SImode) - FAIL; - - s0 = operands [0]; - - /* Low over head loop instructions require the first operand to be LR. 
*/ - if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1])) - s0 = gen_rtx_REG (SImode, LR_REGNUM); - - if (TARGET_THUMB2) - insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1))); - else - insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); - - cmp = XVECEXP (PATTERN (insn), 0, 0); - cc_reg = SET_DEST (cmp); - bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); - loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]); - emit_jump_insn (gen_rtx_SET (pc_rtx, - gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, - loc_ref, pc_rtx))); - DONE; - } - else - FAIL; - }") + if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) + { + rtx s0; + rtx bcomp; + rtx loc_ref; + rtx cc_reg; + rtx insn; + rtx cmp; + int decrement_num; + + if (GET_MODE (operands[0]) != SImode) + FAIL; + + s0 = operands[0]; + + if (TARGET_HAVE_LOB + && arm_target_bb_ok_for_lob (BLOCK_FOR_INSN (operands[1]))) + { + /* If we have a compatible MVE target, try and analyse the loop + contents to determine if we can use predicated dlstp/letp + looping. These patterns implicitly use LR as the loop counter. */ + if (TARGET_HAVE_MVE + && ((decrement_num = arm_attempt_dlstp_transform (operands[1])) + != 1)) + { + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + switch (decrement_num) + { + case 2: + insn = gen_predicated_doloop_end_internal2 (loc_ref); + break; + case 4: + insn = gen_predicated_doloop_end_internal4 (loc_ref); + break; + case 8: + insn = gen_predicated_doloop_end_internal8 (loc_ref); + break; + case 16: + insn = gen_predicated_doloop_end_internal16 (loc_ref); + break; + default: + gcc_unreachable (); + } + emit_jump_insn (insn); + DONE; + } + /* Remaining LOB cases need to explicitly use LR. */ + s0 = gen_rtx_REG (SImode, LR_REGNUM); + } + + /* Otherwise, try standard decrement-by-one dls/le looping. 
*/ + if (TARGET_THUMB2) + insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, + GEN_INT (-1))); + else + insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); + + cmp = XVECEXP (PATTERN (insn), 0, 0); + cc_reg = SET_DEST (cmp); + bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + emit_jump_insn (gen_rtx_SET (pc_rtx, + gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, + loc_ref, pc_rtx))); + DONE; + } + else + FAIL; +}") (define_insn "*clear_apsr" [(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR) @@ -1755,7 +1789,37 @@ { if (REGNO (operands[0]) == LR_REGNUM) { - emit_insn (gen_dls_insn (operands[0])); + /* Pick out the number by which we are decrementing the loop counter + in every iteration. If it's > 1, then use dlstp. */ + int const_int_dec_num + = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1), + 1), + 1))); + switch (const_int_dec_num) + { + case 16: + emit_insn (gen_dlstp8_insn (operands[0])); + break; + + case 8: + emit_insn (gen_dlstp16_insn (operands[0])); + break; + + case 4: + emit_insn (gen_dlstp32_insn (operands[0])); + break; + + case 2: + emit_insn (gen_dlstp64_insn (operands[0])); + break; + + case 1: + emit_insn (gen_dls_insn (operands[0])); + break; + + default: + gcc_unreachable (); + } DONE; } else diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md index e2b70da1001f2..9527bdb9e87e7 100644 --- a/gcc/config/arm/types.md +++ b/gcc/config/arm/types.md @@ -574,6 +574,7 @@ ; mve_move ; mve_store ; mve_load +; mve_misc (define_attr "type" "adc_imm,\ @@ -1126,7 +1127,8 @@ ls64,\ mve_move,\ mve_store,\ - mve_load" + mve_load, \ + mve_misc" (cond [(eq_attr "autodetect_type" "alu_shift_lsr_op2,alu_shift_asr_op2") (const_string "alu_shift_imm_other") (eq_attr "autodetect_type" "alu_shift_lsl_op2") @@ -1292,7 +1294,7 @@ ;; No otherwise. 
(define_attr "is_mve_type" "yes,no" (if_then_else (eq_attr "type" - "mve_move, mve_load, mve_store, mrs") + "mve_move, mve_load, mve_store, mrs, mve_misc") (const_string "yes") (const_string "no"))) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 46ac8b3715740..f5f4d1543645b 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -591,6 +591,10 @@ VADDLVQ_U VCTP VCTP_M + LETP8 + LETP16 + LETP32 + LETP64 VPNOT VCREATEQ_F VCVTQ_N_TO_F_S @@ -1259,6 +1263,14 @@ UQRSHLL_48 SQRSHRL_64 SQRSHRL_48 - VSHLCQ_M_ REINTERPRET ]) + +; DLSTP unspecs must be volatile to guarantee the scheduler does not reschedule +; these instructions within the loop preheader. +(define_c_enum "unspecv" [ + DLSTP8 + DLSTP16 + DLSTP32 + DLSTP64 +]) diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h index feaae7cc89959..3941fe7a8b620 100644 --- a/gcc/testsuite/gcc.target/arm/lob.h +++ b/gcc/testsuite/gcc.target/arm/lob.h @@ -1,15 +1,131 @@ #include - +#include /* Common code for lob tests. 
*/ #define NO_LOB asm volatile ("@ clobber lr" : : : "lr" ) -#define N 10000 +#define N 100 + +static void +reset_data (int *a, int *b, int *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data64 (int64_t *a, int64_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (c, 0, x * sizeof (*c)); +} + +static void +check_plus (int *a, int *b, int *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} static void -reset_data (int *a, int *b, int *c) +check_memcpy64 (int64_t *a, int64_t *c, int x) { - memset (a, -1, N * sizeof (*a)); - memset (b, -1, N * 
sizeof (*b)); - memset (c, -1, N * sizeof (*c)); + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != a[i]) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } } diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c index ba5c82cd55c58..c8ce653a5c39f 100644 --- a/gcc/testsuite/gcc.target/arm/lob1.c +++ b/gcc/testsuite/gcc.target/arm/lob1.c @@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c) } while (i < N); } -void -check (int *a, int *b, int *c) -{ - for (int i = 0; i < N; i++) - { - NO_LOB; - if (c[i] != a[i] + b[i]) - abort (); - } -} - int main (void) { - reset_data (a, b, c); + reset_data (a, b, c, N); loop1 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop2 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop3 (a, b ,c); - check (a, b ,c); + check_plus (a, b, c, N); return 0; } diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c index 17b6124295e8a..4fe116e2c2be3 100644 --- a/gcc/testsuite/gcc.target/arm/lob6.c +++ b/gcc/testsuite/gcc.target/arm/lob6.c @@ -79,14 +79,14 @@ check (void) int main (void) { - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop1 (a1, b1, c1); ref1 (a2, b2, c2); check (); - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop2 (a1, b1, c1); ref2 (a2, b2, c2); check (); diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c new file mode 100644 index 0000000000000..6e6da3d3d596b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c @@ -0,0 +1,146 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve 
} */ + +#include + +#define IMM 5 + +#define TEST_COMPILE_IN_DLSTP_TERNARY(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \ + TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (va, vb, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + b += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (32, 4, w, NAME, PRED) + + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vaddq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vmulq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vsubq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vhaddq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vorrq, _x) + + +#define TEST_COMPILE_IN_DLSTP_TERNARY_M(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \ 
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (__inactive, va, vb, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + b += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (32, 4, w, NAME, PRED) + + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vaddq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vmulq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vsubq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vhaddq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vorrq, _m) + +#define TEST_COMPILE_IN_DLSTP_TERNARY_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (va, IMM, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N(NAME, PRED) \ 
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (32, 4, w, NAME, PRED) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vaddq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vmulq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vsubq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vhaddq, _x) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vbrsrq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshlq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshrq, _x) + +#define TEST_COMPILE_IN_DLSTP_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (__inactive, va, IMM, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (32, 4, w, NAME, PRED) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vaddq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vmulq, _m) 
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vsubq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vhaddq, _m) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vbrsrq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshlq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshrq, _m) + +/* The final number of DLSTPs currently is calculated by the number of + `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macros * 6. */ +/* { dg-final { scan-assembler-times {\tdlstp} 144 } } */ +/* { dg-final { scan-assembler-times {\tletp} 144 } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c new file mode 100644 index 0000000000000..84f4a2fc4f9bc --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c @@ -0,0 +1,749 @@ + +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps -fno-schedule-insns2 " } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-additional-options "-mtune=cortex-m55" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include +/* Using a >=1 condition. */ +void test1 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n >= 1) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} +/* +** test1: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... 
+*/ + +/* Test a for loop format of decrementing to zero */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test2 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i > 0; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} +/* +** test2: +**... +** dlstp.32 lr, r1 +**... +** vldrw.32 (q[0-9]+), \[r3\], #-16 +** vstrw.32 \1, \[r0\], #-16 +** letp lr, .* +**... +*/ + +/* Iteration counter counting up to num_iter. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} + +/* +** test3: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrb.8 \3, \[(r[0-9]+|ip)\] +**... +** letp lr, .* +**... +*/ + +/* Iteration counter counting down from num_iter. */ +void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = num_iter; i > 0; i--) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} +/* +** test4: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrb.8 \3, \[(r[0-9]+|ip)\] +**... +** letp lr, .* +**... +*/ + +/* Using an unpredicated arithmetic instruction within the loop. 
*/ +void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + /* Is affected by implicit predication, because vb also + came from an unpredicated load, but there is no functional + problem, because the result is used in a predicated store. */ + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + vstrbq_p_u8 (d, vd, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} + +/* +** test5: +**... +** dlstp.8 lr, r[0-9]+ +**... +** vldrb.8 q[0-9]+, \[r1\] +** vldrb.8 q[0-9]+, \[r2\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +**... +** vstrb.8 \1, \[r2\] +** vstrb.8 \1, \[r3\] +** letp lr, .* +**... +*/ + +/* Using a different VPR value for one instruction in the loop. */ +void test6 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test6: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using another VPR value in the loop, with a vctp. + The doloop logic will always try to do the transform on the first + vctp it encounters, so this is still expected to work. */ +void test7 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} +/* +** test7: +**... 
+** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vctp, + but this time the p1 will also change in every loop (still fine) */ +void test8 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + g++; + } +} + +/* +** test8: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vctp.32 r4 +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +**... +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vctp_m + that is independent of the loop vctp VPR. */ +void test9 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p2 = vctp32q_m (n, p1); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test9: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vmsr p0, (r[0-9]+) @ movhi +** vpst +** vctpt.32 r3 +** vmrs (r[0-9]+), p0 @ movhi +** vmsr p0, \1 @ movhi +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vmsr p0, \2 @ movhi +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +**... +** vstrw.32 \3, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, + with a vctp_m that is tied to the base vctp VPR. 
This + is still fine, because the vctp_m will be transformed + into a vctp and be implicitly predicated. */ +void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q_m (n, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} +/* + We don't need that extra vctp in the loop, but we currently do not optimize + it away, however, it is not wrong to use it... +*/ +/* +** test10: +**... +** dlstp.32 lr, r3 +** vctp.32 r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +**... +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vcmp. */ +void test11 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p1 = vcmpeqq_s32 (va, vb); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test11: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vcmp.i32 eq, q[0-9]+, q[0-9]+ +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vcmp_m. */ +void test12 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test12: +**... 
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vmsr p0, (r[0-9]+|ip) @ movhi
+** vpst
+** vcmpt.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \2, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp_m
+   that is tied to the base vctp VPR (same as above, this will be turned
+   into a vcmp and be implicitly predicated).  */
+void test13 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+  while (n > 0)
+    {
+      mve_pred16_t p = vctp32q (n);
+      int32x4_t va = vldrwq_z_s32 (a, p);
+      int32x4_t vb = vldrwq_z_s32 (b, p);
+      mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p);
+      int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+      vstrwq_p_s32 (c, vc, p);
+      c += 4;
+      a += 4;
+      b += 4;
+      n -= 4;
+    }
+}
+
+/*
+** test13:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vcmp.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Similar to test27 in dlstp-invalid-asm.c, but use a predicated load to make
+   it safe to implicitly predicate the vaddv.  */
+void test14 (int32_t *a, int32_t *c, int n)
+{
+  int32_t res = 0;
+  while (n > 0)
+    {
+      mve_pred16_t p = vctp32q (n);
+      int32x4_t va = vldrwq_z_s32 (a, p);
+      res += vaddvq_s32 (va);
+      int32x4_t vc = vdupq_n_s32 (res);
+      vstrwq_p_s32 (c, vc, p);
+      a += 4;
+      n -= 4;
+    }
+}
+
+/*
+** test14:
+**...
+** dlstp.32 lr, r2
+** vldrw.32 (q[0-9]+), \[r0\], #16
+** vaddv.s32 (r[0-9]+|ip), \1
+** add (r[0-9]+|ip), \3, \2
+** vdup.32 (q[0-9]+), \3
+** vstrw.32 \4, \[r1\]
+** letp lr, .*
+**...
+*/ + +uint8_t test15 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq_m (vc, va, vc, p); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +/* +** test15: +**... +** dlstp.8 lr, r2 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +**... +** letp lr, .* +** vmov.u8 r[0-9]+, \2\[5\] +**... +*/ + +uint8_t test16 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq (va, vc); + vc = vaddq_m (vc, va, vc, p); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +/* +** test16: +**... +** dlstp.8 lr, r2 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +** vadd.i8 \2, q[0-9]+, q[0-9]+ +** letp lr, .* +** vmov.u8 r[0-9]+, \2\[5\] +**... +*/ + + + +/* Using an across-vector unpredicated instruction in a valid way. + This tests that "vc" has correctly masked the risky "vb". */ +uint16_t test18 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + uint16x8_t vc = vaddq_m_u16 (va, va, vb, p); + res += vaddvq_u16 (vc); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* +** test18: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** vadd.i16 \1, q[0-9]+, q[0-9]+ +** vaddv.u16 (r[0-9]+|ip), \1 +** add (r[0-9]+|ip), \3, \2 +** uxth \3, \3 +** letp lr, .* +**... +*/ + +/* Using an across-vector unpredicated instruction with implicit scalar adding from outside the loop. 
*/ +uint16_t test19 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + uint16x8_t vc = vaddq_m_u16 (va, va, vb, p); + res = vaddvaq_u16 (res, vc); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* +** test19: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** vadd.i16 \1, q[0-9]+, q[0-9]+ +** vaddva.u16 (r[0-9]+|ip), \1 +** uxth \2, \2 +** letp lr, .* +**... +*/ + + +/* Using an across-vector predicated instruction in a valid way. */ +uint16_t test20 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_u16 (a); + res = vaddvaq_p_u16 (res, va, p); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* The uxth could be moved outside the loop. */ +/* +** test20: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** vaddva.u16 (r[0-9]+|ip), \1 +** uxth \2, \2 +** letp lr, .* +**... +*/ + +/* Using an across-vector predicated instruction in a valid way. */ +uint16_t test21 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_u16 (a); + res++; + res = vaddvaq_p_u16 (res, va, p); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* Also think it'd be safe to move uxth outside of the loop here. */ +/* +** test21: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** adds (r[0-9]+|ip), \2, #1 +** uxth \2, \2 +** vaddva.u16 \2, \1 +** uxth \2, \2 +** letp lr, .* +**... +*/ + +int test22 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + res = vmaxvq (res, va); + n-=16; + a+=16; + } + return res; +} + +/* +** test22: +**... +** dlstp.8 lr, r3 +**... 
+** vldrb.8 (q[0-9]+), \[r[0-9]+\] +**... +** vmaxv.u8 (r[0-9]+|ip), \1 +** uxtb \2, \2 +** letp lr, .* +**... +*/ + +int test23 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vmaxavq (res, va); + n-=16; + a+=16; + } + return res; +} + +/* +** test23: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 (q[0-9]+), \[r3\] +**... +** vmaxav.s8 (r[0-9]+|ip), \1 +** uxtb \2, \2 +** letp lr, .* +**... +*/ + +/* Like test1, but update n before vctp, meaning we should only iterate for n-4 + elements. */ +void test24 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n >= 1) + { + n-=4; + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + } +} +/* +** test24: +**... +** subs r3, r3, #4 +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c new file mode 100644 index 0000000000000..c784f54013177 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c @@ -0,0 +1,46 @@ + +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include + +/* We don't support pattern recognition of signed N values when computing num_iter. 
*/ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} + +/* Using a predicated vcmp to generate a new predicate value in the + loop and then using it in a predicated store insn. */ +void test17 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_s32 (va, vb); + mve_pred16_t p1 = vcmpeqq_m_s32 (va, vc, p); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} +/* This is an example of a loop that we could tail predicate but currently don't. */ +/* { dg-final { scan-assembler "letp" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c new file mode 100644 index 0000000000000..6966a3966046f --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c @@ -0,0 +1,44 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ +#include "dlstp-int16x8.c" + +int main () +{ + int i; + int16_t temp1[N]; + int16_t temp2[N]; + int16_t temp3[N]; + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus16 (temp1, temp2, temp3, 0); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus16 (temp1, temp2, temp3, 1); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 7); + check_plus16 (temp1, temp2, temp3, 7); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, 
temp2, temp3, 8); + check_plus16 (temp1, temp2, temp3, 8); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus16 (temp1, temp2, temp3, 9); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus16 (temp1, temp2, temp3, 16); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus16 (temp1, temp2, temp3, 17); + + reset_data16 (temp1, temp2, temp3, N); +} + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c new file mode 100644 index 0000000000000..33632c5f14dc6 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include +#include +#include +#include "../lob.h" + +void __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + int16x8_t va = vldrhq_z_s16 (a, p); + int16x8_t vb = vldrhq_z_s16 (b, p); + int16x8_t vc = vaddq_x_s16 (va, vb, p); + vstrhq_p_s16 (c, vc, p); + c+=8; + a+=8; + b+=8; + n-=8; + } +} + +/* { dg-final { scan-assembler-times {\tdlstp.16} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c new file mode 100644 index 0000000000000..6833dddde92b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c @@ -0,0 +1,45 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 
-save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include "dlstp-int32x4.c" + +int main () +{ + int i; + int32_t temp1[N]; + int32_t temp2[N]; + int32_t temp3[N]; + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus32 (temp1, temp2, temp3, 0); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus32 (temp1, temp2, temp3, 1); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 3); + check_plus32 (temp1, temp2, temp3, 3); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 4); + check_plus32 (temp1, temp2, temp3, 4); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 5); + check_plus32 (temp1, temp2, temp3, 5); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus32 (temp1, temp2, temp3, 8); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus32 (temp1, temp2, temp3, 9); + + reset_data32 (temp1, temp2, temp3, N); +} + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c new file mode 100644 index 0000000000000..5d09f784b7716 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include +#include +#include +#include "../lob.h" + +void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +/* { dg-final { scan-assembler-times {\tdlstp.32} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { 
scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c new file mode 100644 index 0000000000000..cc0b9ce7ee9a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c @@ -0,0 +1,48 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include "dlstp-int64x2.c" + +int main () +{ + int i; + int64_t temp1[N]; + int64_t temp3[N]; + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 0); + check_memcpy64 (temp1, temp3, 0); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 1); + check_memcpy64 (temp1, temp3, 1); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 2); + check_memcpy64 (temp1, temp3, 2); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 3); + check_memcpy64 (temp1, temp3, 3); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 4); + check_memcpy64 (temp1, temp3, 4); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 5); + check_memcpy64 (temp1, temp3, 5); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 6); + check_memcpy64 (temp1, temp3, 6); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 7); + check_memcpy64 (temp1, temp3, 7); + + reset_data64 (temp1, temp3, N); +} + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c new file mode 100644 index 0000000000000..21e882424ec3b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c @@ -0,0 +1,28 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + 
+#include +#include +#include +#include "../lob.h" + +void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp64q (n); + int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (0, 8), p); + vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (0, 8), va, p); + c+=2; + a+=2; + n-=2; + } +} + +/* { dg-final { scan-assembler-times {\tdlstp.64} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c new file mode 100644 index 0000000000000..d46571f329cf3 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c @@ -0,0 +1,44 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include "dlstp-int8x16.c" + +int main () +{ + int i; + int8_t temp1[N]; + int8_t temp2[N]; + int8_t temp3[N]; + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus8 (temp1, temp2, temp3, 0); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus8 (temp1, temp2, temp3, 1); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 15); + check_plus8 (temp1, temp2, temp3, 15); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus8 (temp1, temp2, temp3, 16); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus8 (temp1, temp2, temp3, 17); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 32); + check_plus8 (temp1, temp2, temp3, 32); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 33); 
+ check_plus8 (temp1, temp2, temp3, 33); + + reset_data8 (temp1, temp2, temp3, N); +} diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c new file mode 100644 index 0000000000000..d5f22b5026259 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c @@ -0,0 +1,32 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include +#include +#include +#include "../lob.h" + +void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + int8x16_t vb = vldrbq_z_s8 (b, p); + int8x16_t vc = vaddq_x_s8 (va, vb, p); + vstrbq_p_s8 (c, vc, p); + c+=16; + a+=16; + b+=16; + n-=16; + } +} + + +/* { dg-final { scan-assembler-times {\tdlstp.8} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c new file mode 100644 index 0000000000000..26df2d30523ce --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c @@ -0,0 +1,521 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include +#include + +/* Terminating on a non-zero number of elements. */ +void test0 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + while (n > 1) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Terminating on n >= 0. 
*/ +void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + while (n >= 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Similar, terminating on a non-zero number of elements, but in a for loop + format. */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test2 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i >= 2; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} + +/* Iteration counter counting up to num_iter, with a non-zero starting num. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 1; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Iteration counter counting up to num_iter, with a larger increment */ +void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i+=2) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Using an unpredicated store instruction within the loop. */ +void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_u8 (d, vd); + n -= 16; + } +} + +/* Using an unpredicated store outside the loop. 
*/ +void test6 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + vx = vaddq_u8 (vx, vc); + a += 16; + b += 16; + n -= 16; + } + vstrbq_u8 (c, vx); +} + +/* Using a VPR that gets modified within the loop. */ +void test9 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + p++; + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using a VPR that gets re-generated within the loop. */ +void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + mve_pred16_t p = vctp32q (n); + while (n > 0) + { + int32x4_t va = vldrwq_z_s32 (a, p); + p = vctp32q (n); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using vctp32q_m instead of vctp32q. */ +void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q_m (n, p0); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an unpredicated op with a scalar output, where the result is valid + outside the bb. This is invalid, because one of the inputs to the + unpredicated op is also unpredicated. 
*/ +uint8_t test12 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + uint8x16_t vc = vaddq_u8 (va, vb); + sum += vaddvq_u8 (vc); + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +/* Using an unpredicated vcmp to generate a new predicate value in the + loop and then using that VPR to predicate a store insn. */ +void test13 (int32_t *a, int32_t *b, int32x4_t vc, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_s32 (a); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_s32 (va, vb); + mve_pred16_t p1 = vcmpeqq_s32 (va, vc); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an across-vector unpredicated instruction. "vb" is the risk. */ +uint16_t test14 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + vb = vaddq_u16 (va, vb); + res = vaddvq_u16 (vb); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* Using an across-vector unpredicated instruction. "vc" is the risk. 
*/ +uint16_t test15 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + uint16x8_t vc = vaddq_u16 (va, vb); + res = vaddvaq_u16 (res, vc); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +uint16_t test16 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16_t res =0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t vb = vldrhq_u16 (b); + uint16x8_t va = vldrhq_z_u16 (a, p); + res = vaddvaq_u16 (res, vb); + res = vaddvaq_p_u16 (res, va, p); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +int test17 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vmaxvq (res, va); + n-=16; + a+=16; + } + return res; +} + + + +int test18 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vminvq (res, va); + n-=16; + a+=16; + } + return res; +} + +int test19 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vminavq (res, va); + n-=16; + a+=16; + } + return res; +} + +int test20 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + res = vminvq (res, va); + n-=16; + a+=16; + } + return res; +} + +uint8x16_t test21 (uint8_t *a, uint32_t *b, int n, uint8x16_t res) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + res = vshlcq_u8 (va, b, 1); + n-=16; + a+=16; + } + return res; +} + +int8x16_t test22 (int8_t *a, int32_t *b, int n, int8x16_t res) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = 
vshlcq_s8 (va, b, 1); + n-=16; + a+=16; + } + return res; +} + +/* Using an unsigned number of elements to count down from, with a >0*/ +void test23 (int32_t *a, int32_t *b, int32_t *c, unsigned int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +/* Using an unsigned number of elements to count up to, with a = 1) + { + n-=4; + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +void test27 (int32_t *a, int32_t *c, int n) +{ + int32_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_s32 (a); + res += vaddvq_s32 (va); + int32x4_t vc = vdupq_n_s32 (res); + vstrwq_p_s32 (c, vc, p); + a += 4; + n -= 4; + } +} + +/* Using an unpredicated vcmp to generate a new predicate value in the + loop and then using it in a predicated store insn. */ +void test28 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_s32 (b); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + mve_pred16_t p1 = vcmpeqq_s32 (va, vc); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an unpredicated op with a scalar output, where the result is valid + outside the bb. The unpredicated lanes are not guaranteed zero, so would + affect the vaddv in the non-tail predicated case. 
*/ +uint8_t test29 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + sum += vaddvq_u8 (vc); + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +/* Same as above, but with another scalar op between the unpredicated op and + the scalar op outside the loop. */ +uint8_t test30 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx, int g) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + sum += vaddvq_u8 (vc); + sum += g; + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +uint8_t test31 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq (vb, vc); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +uint8_t test32 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq_m (vc, va, vc, p); + vc = vaddq (vb, vc); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +/* { dg-final { scan-assembler-not "\tdlstp" } } */ +/* { dg-final { scan-assembler-not "\tletp" } } */ From 954f9011c4923b72f42cc6ca8460333e7c7aad98 Mon Sep 17 00:00:00 2001 From: Harald Anlauf Date: Tue, 18 Jun 2024 21:57:19 +0200 Subject: [PATCH 03/22] Fortran: fix for CHARACTER(len=*) dummies with bind(C) [PR115390] gcc/fortran/ChangeLog: PR fortran/115390 * trans-decl.cc (gfc_conv_cfi_to_gfc): Move derivation of type sizes for character via 
gfc_trans_vla_type_sizes to after character length has been set. gcc/testsuite/ChangeLog: PR fortran/115390 * gfortran.dg/bind_c_char_11.f90: New test. --- gcc/fortran/trans-decl.cc | 4 +- gcc/testsuite/gfortran.dg/bind_c_char_11.f90 | 45 ++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gfortran.dg/bind_c_char_11.f90 diff --git a/gcc/fortran/trans-decl.cc b/gcc/fortran/trans-decl.cc index 88538713a02b4..f7fb6eec336a8 100644 --- a/gcc/fortran/trans-decl.cc +++ b/gcc/fortran/trans-decl.cc @@ -7063,8 +7063,8 @@ gfc_conv_cfi_to_gfc (stmtblock_t *init, stmtblock_t *finally, if (sym->ts.type == BT_CHARACTER && !INTEGER_CST_P (sym->ts.u.cl->backend_decl)) { - gfc_conv_string_length (sym->ts.u.cl, NULL, init); - gfc_trans_vla_type_sizes (sym, init); + gfc_conv_string_length (sym->ts.u.cl, NULL, &block); + gfc_trans_vla_type_sizes (sym, &block); } /* gfc->data = cfi->base_addr - or for scalars: gfc = cfi->base_addr. diff --git a/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 b/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 new file mode 100644 index 0000000000000..5ed8e82853bf0 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 @@ -0,0 +1,45 @@ +! { dg-do compile } +! { dg-additional-options "-Wuninitialized" } +! +! 
PR fortran/115390 - fixes for CHARACTER(len=*) dummies with bind(C) + +module test + implicit none +contains + subroutine bar(s,t) bind(c) + character(*), intent(in) :: s,t + optional :: t + call foo(s,t) + end + subroutine bar1(s,t) bind(c) + character(*), intent(in) :: s(:),t(:) + optional :: t + call foo1(s,t) + end + subroutine bar4(s,t) bind(c) + character(len=*,kind=4), intent(in) :: s,t + optional :: t + call foo4(s,t) + end + subroutine bar5(s,t) bind(c) + character(len=*,kind=4), intent(in) :: s(:),t(:) + optional :: t + call foo5(s,t) + end + subroutine foo(s,t) + character(*), intent(in) :: s,t + optional :: t + end + subroutine foo1(s,t) + character(*), intent(in) :: s(:),t(:) + optional :: t + end + subroutine foo4(s,t) + character(len=*,kind=4), intent(in) :: s,t + optional :: t + end + subroutine foo5(s,t) + character(len=*,kind=4), intent(in) :: s(:),t(:) + optional :: t + end +end From 9651d6005f9c1ac60aecf7b36d6c0bd1ead8a63b Mon Sep 17 00:00:00 2001 From: Jonathan Wakely Date: Tue, 18 Jun 2024 20:57:24 +0100 Subject: [PATCH 04/22] libstdc++: Add conditional noexcept to std::pair default ctor Most of std::pair constructors implemented using C++20 concepts have a conditional noexcept-specifier, but the default constructor doesn't. This fixes that. libstdc++-v3/ChangeLog: * include/bits/stl_pair.h [__cpp_lib_concepts] (pair()): Add conditional noexcept. 
--- libstdc++-v3/include/bits/stl_pair.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libstdc++-v3/include/bits/stl_pair.h b/libstdc++-v3/include/bits/stl_pair.h index 0c1e5719a1a3b..0d60eaba1941e 100644 --- a/libstdc++-v3/include/bits/stl_pair.h +++ b/libstdc++-v3/include/bits/stl_pair.h @@ -344,6 +344,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION explicit(__not_<__and_<__is_implicitly_default_constructible<_T1>, __is_implicitly_default_constructible<_T2>>>()) pair() + noexcept(is_nothrow_default_constructible_v<_T1> + && is_nothrow_default_constructible_v<_T2>) requires is_default_constructible_v<_T1> && is_default_constructible_v<_T2> : first(), second() From 5d156a91853a7863d674ed35df87562e3a1eba0e Mon Sep 17 00:00:00 2001 From: Jonathan Wakely Date: Tue, 18 Jun 2024 20:59:25 +0100 Subject: [PATCH 05/22] libstdc++: Add noexcept to some std::promise shared state internals Making the state ready for a std::promise only needs to move a unique_ptr, which cannot throw. Make its call operator noexcept. Similarly, making the state ready by storing an exception_ptr also can't throw, so make that call operator noexcept too. libstdc++-v3/ChangeLog: * include/std/future (_State_baseV2::_Setter): Add noexcept to call operator. (_State_baseV2::_Setter): Likewise. 
--- libstdc++-v3/include/std/future | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libstdc++-v3/include/std/future b/libstdc++-v3/include/std/future index 9e75ae98b13d2..d7be205af5061 100644 --- a/libstdc++-v3/include/std/future +++ b/libstdc++-v3/include/std/future @@ -532,7 +532,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { static_assert(is_void<_Res>::value, "Only used for promise"); - typename promise<_Res>::_Ptr_type operator()() const + typename promise<_Res>::_Ptr_type operator()() const noexcept { return std::move(_M_promise->_M_storage); } promise<_Res>* _M_promise; @@ -545,7 +545,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION struct _Setter<_Res, __exception_ptr_tag> { // Used by std::promise to store an exception as the result. - typename promise<_Res>::_Ptr_type operator()() const + typename promise<_Res>::_Ptr_type operator()() const noexcept { _M_promise->_M_storage->_M_error = *_M_ex; return std::move(_M_promise->_M_storage); From bcb9dad9f6123c14ab8b14d2c3d360461dd5ee17 Mon Sep 17 00:00:00 2001 From: Jonathan Wakely Date: Wed, 19 Jun 2024 14:16:27 +0100 Subject: [PATCH 06/22] libstdc++: Consistently indent with tabs libstdc++-v3/ChangeLog: * include/std/future: Adjust whitespace to use tabs for indentation. 
--- libstdc++-v3/include/std/future | 328 ++++++++++++++++---------------- 1 file changed, 164 insertions(+), 164 deletions(-) diff --git a/libstdc++-v3/include/std/future b/libstdc++-v3/include/std/future index d7be205af5061..6ce7d89ca3ffe 100644 --- a/libstdc++-v3/include/std/future +++ b/libstdc++-v3/include/std/future @@ -292,7 +292,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { using __allocator_type = __alloc_rebind<_Alloc, _Result_alloc>; - explicit + explicit _Result_alloc(const _Alloc& __a) : _Result<_Res>(), _Alloc(__a) { } @@ -362,9 +362,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - future_status - wait_for(const chrono::duration<_Rep, _Period>& __rel) - { + future_status + wait_for(const chrono::duration<_Rep, _Period>& __rel) + { // First, check if the future has been made ready. Use acquire MO // to synchronize with the thread that made it ready. if (_M_status._M_load(memory_order_acquire) == _Status::__ready) @@ -396,9 +396,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - future_status - wait_until(const chrono::time_point<_Clock, _Duration>& __abs) - { + future_status + wait_until(const chrono::time_point<_Clock, _Duration>& __abs) + { #if __cplusplus > 201703L static_assert(chrono::is_clock_v<_Clock>); #endif @@ -430,8 +430,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_set_result(function<_Ptr_type()> __res, bool __ignore_failure = false) { bool __did_set = false; - // all calls to this function are serialized, - // side-effects of invoking __res only happen once + // all calls to this function are serialized, + // side-effects of invoking __res only happen once call_once(_M_once, &_State_baseV2::_M_do_set, this, std::__addressof(__res), std::__addressof(__did_set)); if (__did_set) @@ -439,7 +439,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_status._M_store_notify_all(_Status::__ready, memory_order_release); else if (!__ignore_failure) - __throw_future_error(int(future_errc::promise_already_satisfied)); + 
__throw_future_error(int(future_errc::promise_already_satisfied)); } // Provide a result to the shared state but delay making it ready @@ -451,12 +451,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { bool __did_set = false; unique_ptr<_Make_ready> __mr{new _Make_ready}; - // all calls to this function are serialized, - // side-effects of invoking __res only happen once + // all calls to this function are serialized, + // side-effects of invoking __res only happen once call_once(_M_once, &_State_baseV2::_M_do_set, this, std::__addressof(__res), std::__addressof(__did_set)); if (!__did_set) - __throw_future_error(int(future_errc::promise_already_satisfied)); + __throw_future_error(int(future_errc::promise_already_satisfied)); __mr->_M_shared_state = std::move(__self); __mr->_M_set(); __mr.release(); @@ -490,41 +490,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - struct _Setter; + struct _Setter; // set lvalues template - struct _Setter<_Res, _Arg&> - { - // check this is only used by promise::set_value(const R&) - // or promise::set_value(R&) - static_assert(is_same<_Res, _Arg&>::value // promise - || is_same::value, // promise - "Invalid specialisation"); + struct _Setter<_Res, _Arg&> + { + // check this is only used by promise::set_value(const R&) + // or promise::set_value(R&) + static_assert(is_same<_Res, _Arg&>::value // promise + || is_same::value, // promise + "Invalid specialisation"); // Used by std::promise to copy construct the result. 
- typename promise<_Res>::_Ptr_type operator()() const - { - _M_promise->_M_storage->_M_set(*_M_arg); - return std::move(_M_promise->_M_storage); - } - promise<_Res>* _M_promise; - _Arg* _M_arg; - }; + typename promise<_Res>::_Ptr_type operator()() const + { + _M_promise->_M_storage->_M_set(*_M_arg); + return std::move(_M_promise->_M_storage); + } + promise<_Res>* _M_promise; + _Arg* _M_arg; + }; // set rvalues template - struct _Setter<_Res, _Res&&> - { + struct _Setter<_Res, _Res&&> + { // Used by std::promise to move construct the result. - typename promise<_Res>::_Ptr_type operator()() const - { - _M_promise->_M_storage->_M_set(std::move(*_M_arg)); - return std::move(_M_promise->_M_storage); - } - promise<_Res>* _M_promise; - _Res* _M_arg; - }; + typename promise<_Res>::_Ptr_type operator()() const + { + _M_promise->_M_storage->_M_set(std::move(*_M_arg)); + return std::move(_M_promise->_M_storage); + } + promise<_Res>* _M_promise; + _Res* _M_arg; + }; // set void template @@ -542,35 +542,35 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // set exceptions template - struct _Setter<_Res, __exception_ptr_tag> - { + struct _Setter<_Res, __exception_ptr_tag> + { // Used by std::promise to store an exception as the result. 
- typename promise<_Res>::_Ptr_type operator()() const noexcept - { - _M_promise->_M_storage->_M_error = *_M_ex; - return std::move(_M_promise->_M_storage); - } + typename promise<_Res>::_Ptr_type operator()() const noexcept + { + _M_promise->_M_storage->_M_error = *_M_ex; + return std::move(_M_promise->_M_storage); + } - promise<_Res>* _M_promise; - exception_ptr* _M_ex; - }; + promise<_Res>* _M_promise; + exception_ptr* _M_ex; + }; template __attribute__((__always_inline__)) - static _Setter<_Res, _Arg&&> - __setter(promise<_Res>* __prom, _Arg&& __arg) noexcept - { - return _Setter<_Res, _Arg&&>{ __prom, std::__addressof(__arg) }; - } + static _Setter<_Res, _Arg&&> + __setter(promise<_Res>* __prom, _Arg&& __arg) noexcept + { + return _Setter<_Res, _Arg&&>{ __prom, std::__addressof(__arg) }; + } template __attribute__((__always_inline__)) - static _Setter<_Res, __exception_ptr_tag> - __setter(exception_ptr& __ex, promise<_Res>* __prom) noexcept - { - __glibcxx_assert(__ex != nullptr); // LWG 2276 - return _Setter<_Res, __exception_ptr_tag>{ __prom, &__ex }; - } + static _Setter<_Res, __exception_ptr_tag> + __setter(exception_ptr& __ex, promise<_Res>* __prom) noexcept + { + __glibcxx_assert(__ex != nullptr); // LWG 2276 + return _Setter<_Res, __exception_ptr_tag>{ __prom, &__ex }; + } template __attribute__((__always_inline__)) @@ -581,24 +581,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - static void - _S_check(const shared_ptr<_Tp>& __p) - { - if (!static_cast(__p)) - __throw_future_error((int)future_errc::no_state); - } + static void + _S_check(const shared_ptr<_Tp>& __p) + { + if (!static_cast(__p)) + __throw_future_error((int)future_errc::no_state); + } private: // The function invoked with std::call_once(_M_once, ...). 
void _M_do_set(function<_Ptr_type()>* __f, bool* __did_set) { - _Ptr_type __res = (*__f)(); - // Notify the caller that we did try to set; if we do not throw an - // exception, the caller will be aware that it did set (e.g., see - // _M_set_result). + _Ptr_type __res = (*__f)(); + // Notify the caller that we did try to set; if we do not throw an + // exception, the caller will be aware that it did set (e.g., see + // _M_set_result). *__did_set = true; - _M_result.swap(__res); // nothrow + _M_result.swap(__res); // nothrow } // Wait for completion of async function. @@ -719,49 +719,49 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void wait() const { - _State_base::_S_check(_M_state); - _M_state->wait(); + _State_base::_S_check(_M_state); + _M_state->wait(); } template - future_status - wait_for(const chrono::duration<_Rep, _Period>& __rel) const - { - _State_base::_S_check(_M_state); - return _M_state->wait_for(__rel); - } + future_status + wait_for(const chrono::duration<_Rep, _Period>& __rel) const + { + _State_base::_S_check(_M_state); + return _M_state->wait_for(__rel); + } template - future_status - wait_until(const chrono::time_point<_Clock, _Duration>& __abs) const - { - _State_base::_S_check(_M_state); - return _M_state->wait_until(__abs); - } + future_status + wait_until(const chrono::time_point<_Clock, _Duration>& __abs) const + { + _State_base::_S_check(_M_state); + return _M_state->wait_until(__abs); + } protected: /// Wait for the state to be ready and rethrow any stored exception __result_type _M_get_result() const { - _State_base::_S_check(_M_state); - _Result_base& __res = _M_state->wait(); - if (!(__res._M_error == nullptr)) - rethrow_exception(__res._M_error); - return static_cast<__result_type>(__res); + _State_base::_S_check(_M_state); + _Result_base& __res = _M_state->wait(); + if (!(__res._M_error == nullptr)) + rethrow_exception(__res._M_error); + return static_cast<__result_type>(__res); } void _M_swap(__basic_future& __that) noexcept { - 
_M_state.swap(__that._M_state); + _M_state.swap(__that._M_state); } // Construction of a future by promise::get_future() explicit __basic_future(const __state_type& __state) : _M_state(__state) { - _State_base::_S_check(_M_state); - _M_state->_M_set_retrieved_flag(); + _State_base::_S_check(_M_state); + _M_state->_M_set_retrieved_flag(); } // Copy construction from a shared_future @@ -780,9 +780,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION struct _Reset { - explicit _Reset(__basic_future& __fut) noexcept : _M_fut(__fut) { } - ~_Reset() { _M_fut._M_state.reset(); } - __basic_future& _M_fut; + explicit _Reset(__basic_future& __fut) noexcept : _M_fut(__fut) { } + ~_Reset() { _M_fut._M_state.reset(); } + __basic_future& _M_fut; }; }; @@ -801,8 +801,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION friend class promise<_Res>; template friend class packaged_task; template - friend future<__async_result_of<_Fn, _Args...>> - async(launch, _Fn&&, _Args&&...); + friend future<__async_result_of<_Fn, _Args...>> + async(launch, _Fn&&, _Args&&...); typedef __basic_future<_Res> _Base_type; typedef typename _Base_type::__state_type __state_type; @@ -822,16 +822,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION future& operator=(future&& __fut) noexcept { - future(std::move(__fut))._M_swap(*this); - return *this; + future(std::move(__fut))._M_swap(*this); + return *this; } /// Retrieving the value _Res get() { - typename _Base_type::_Reset __reset(*this); - return std::move(this->_M_get_result()._M_value()); + typename _Base_type::_Reset __reset(*this); + return std::move(this->_M_get_result()._M_value()); } shared_future<_Res> share() noexcept; @@ -844,8 +844,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION friend class promise<_Res&>; template friend class packaged_task; template - friend future<__async_result_of<_Fn, _Args...>> - async(launch, _Fn&&, _Args&&...); + friend future<__async_result_of<_Fn, _Args...>> + async(launch, _Fn&&, _Args&&...); typedef __basic_future<_Res&> _Base_type; typedef typename 
_Base_type::__state_type __state_type; @@ -865,16 +865,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION future& operator=(future&& __fut) noexcept { - future(std::move(__fut))._M_swap(*this); - return *this; + future(std::move(__fut))._M_swap(*this); + return *this; } /// Retrieving the value _Res& get() { - typename _Base_type::_Reset __reset(*this); - return this->_M_get_result()._M_get(); + typename _Base_type::_Reset __reset(*this); + return this->_M_get_result()._M_get(); } shared_future<_Res&> share() noexcept; @@ -887,8 +887,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION friend class promise; template friend class packaged_task; template - friend future<__async_result_of<_Fn, _Args...>> - async(launch, _Fn&&, _Args&&...); + friend future<__async_result_of<_Fn, _Args...>> + async(launch, _Fn&&, _Args&&...); typedef __basic_future _Base_type; typedef typename _Base_type::__state_type __state_type; @@ -908,16 +908,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION future& operator=(future&& __fut) noexcept { - future(std::move(__fut))._M_swap(*this); - return *this; + future(std::move(__fut))._M_swap(*this); + return *this; } /// Retrieving the value void get() { - typename _Base_type::_Reset __reset(*this); - this->_M_get_result(); + typename _Base_type::_Reset __reset(*this); + this->_M_get_result(); } shared_future share() noexcept; @@ -955,14 +955,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION shared_future& operator=(const shared_future& __sf) noexcept { - shared_future(__sf)._M_swap(*this); - return *this; + shared_future(__sf)._M_swap(*this); + return *this; } shared_future& operator=(shared_future&& __sf) noexcept { - shared_future(std::move(__sf))._M_swap(*this); - return *this; + shared_future(std::move(__sf))._M_swap(*this); + return *this; } /// Retrieving the value @@ -994,14 +994,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION shared_future& operator=(const shared_future& __sf) { - shared_future(__sf)._M_swap(*this); - return *this; + shared_future(__sf)._M_swap(*this); + return *this; } 
shared_future& operator=(shared_future&& __sf) noexcept { - shared_future(std::move(__sf))._M_swap(*this); - return *this; + shared_future(std::move(__sf))._M_swap(*this); + return *this; } /// Retrieving the value @@ -1033,14 +1033,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION shared_future& operator=(const shared_future& __sf) { - shared_future(__sf)._M_swap(*this); - return *this; + shared_future(__sf)._M_swap(*this); + return *this; } shared_future& operator=(shared_future&& __sf) noexcept { - shared_future(std::move(__sf))._M_swap(*this); - return *this; + shared_future(std::move(__sf))._M_swap(*this); + return *this; } // Retrieving the value @@ -1115,31 +1115,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { } template - promise(allocator_arg_t, const _Allocator& __a) - : _M_future(std::allocate_shared<_State>(__a)), + promise(allocator_arg_t, const _Allocator& __a) + : _M_future(std::allocate_shared<_State>(__a)), _M_storage(__future_base::_S_allocate_result<_Res>(__a)) - { } + { } template - promise(allocator_arg_t, const _Allocator&, promise&& __rhs) - : _M_future(std::move(__rhs._M_future)), + promise(allocator_arg_t, const _Allocator&, promise&& __rhs) + : _M_future(std::move(__rhs._M_future)), _M_storage(std::move(__rhs._M_storage)) - { } + { } promise(const promise&) = delete; ~promise() { - if (static_cast(_M_future) && !_M_future.unique()) - _M_future->_M_break_promise(std::move(_M_storage)); + if (static_cast(_M_future) && !_M_future.unique()) + _M_future->_M_break_promise(std::move(_M_storage)); } // Assignment promise& operator=(promise&& __rhs) noexcept { - promise(std::move(__rhs)).swap(*this); - return *this; + promise(std::move(__rhs)).swap(*this); + return *this; } promise& operator=(const promise&) = delete; @@ -1147,8 +1147,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void swap(promise& __rhs) noexcept { - _M_future.swap(__rhs._M_future); - _M_storage.swap(__rhs._M_storage); + _M_future.swap(__rhs._M_future); + _M_storage.swap(__rhs._M_storage); } // Retrieving 
the result @@ -1234,31 +1234,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { } template - promise(allocator_arg_t, const _Allocator& __a) - : _M_future(std::allocate_shared<_State>(__a)), + promise(allocator_arg_t, const _Allocator& __a) + : _M_future(std::allocate_shared<_State>(__a)), _M_storage(__future_base::_S_allocate_result<_Res&>(__a)) - { } + { } template - promise(allocator_arg_t, const _Allocator&, promise&& __rhs) - : _M_future(std::move(__rhs._M_future)), + promise(allocator_arg_t, const _Allocator&, promise&& __rhs) + : _M_future(std::move(__rhs._M_future)), _M_storage(std::move(__rhs._M_storage)) - { } + { } promise(const promise&) = delete; ~promise() { - if (static_cast(_M_future) && !_M_future.unique()) - _M_future->_M_break_promise(std::move(_M_storage)); + if (static_cast(_M_future) && !_M_future.unique()) + _M_future->_M_break_promise(std::move(_M_storage)); } // Assignment promise& operator=(promise&& __rhs) noexcept { - promise(std::move(__rhs)).swap(*this); - return *this; + promise(std::move(__rhs)).swap(*this); + return *this; } promise& operator=(const promise&) = delete; @@ -1266,8 +1266,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void swap(promise& __rhs) noexcept { - _M_future.swap(__rhs._M_future); - _M_storage.swap(__rhs._M_storage); + _M_future.swap(__rhs._M_future); + _M_storage.swap(__rhs._M_storage); } // Retrieving the result @@ -1332,33 +1332,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { } template - promise(allocator_arg_t, const _Allocator& __a) - : _M_future(std::allocate_shared<_State>(__a)), + promise(allocator_arg_t, const _Allocator& __a) + : _M_future(std::allocate_shared<_State>(__a)), _M_storage(__future_base::_S_allocate_result(__a)) - { } + { } // _GLIBCXX_RESOLVE_LIB_DEFECTS // 2095. 
missing constructors needed for uses-allocator construction template - promise(allocator_arg_t, const _Allocator&, promise&& __rhs) - : _M_future(std::move(__rhs._M_future)), + promise(allocator_arg_t, const _Allocator&, promise&& __rhs) + : _M_future(std::move(__rhs._M_future)), _M_storage(std::move(__rhs._M_storage)) - { } + { } promise(const promise&) = delete; ~promise() { - if (static_cast(_M_future) && !_M_future.unique()) - _M_future->_M_break_promise(std::move(_M_storage)); + if (static_cast(_M_future) && !_M_future.unique()) + _M_future->_M_break_promise(std::move(_M_storage)); } // Assignment promise& operator=(promise&& __rhs) noexcept { - promise(std::move(__rhs)).swap(*this); - return *this; + promise(std::move(__rhs)).swap(*this); + return *this; } promise& operator=(const promise&) = delete; @@ -1366,8 +1366,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void swap(promise& __rhs) noexcept { - _M_future.swap(__rhs._M_future); - _M_storage.swap(__rhs._M_storage); + _M_future.swap(__rhs._M_future); + _M_storage.swap(__rhs._M_storage); } // Retrieving the result @@ -1596,7 +1596,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION ~packaged_task() { - if (static_cast(_M_state) && !_M_state.unique()) + if (static_cast(_M_state) && !_M_state.unique()) _M_state->_M_break_promise(std::move(_M_state->_M_result)); } @@ -1709,7 +1709,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // result in _M_result, swaps that with the base _M_result and makes // the state ready. Tell _M_set_result to ignore failure so all later // calls do nothing. 
- _M_set_result(_S_task_setter(_M_result, _M_fn), true); + _M_set_result(_S_task_setter(_M_result, _M_fn), true); } // Caller should check whether the state is ready first, because this From 0982552bc4eeffb5520deba10dedecfb2390a8de Mon Sep 17 00:00:00 2001 From: Takayuki 'January June' Suwa Date: Wed, 19 Jun 2024 13:59:54 +0900 Subject: [PATCH 07/22] xtensa: Eliminate double MEMW insertions for volatile memory This patch avoids inserting a MEMW instruction before a load/store instruction with volatile memory reference if there is already a MEMW immediately before it. gcc/ChangeLog: * config/xtensa/xtensa.cc (print_operand): When outputting MEMW before the instruction, check if the previous instruction is already that. --- gcc/config/xtensa/xtensa.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index bc127997ac6cb..e2549de5df050 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -3078,7 +3078,17 @@ print_operand (FILE *file, rtx x, int letter) /* For a volatile memory reference, emit a MEMW before the load or store. */ if (MEM_VOLATILE_P (x) && TARGET_SERIALIZE_VOLATILE) - fprintf (file, "memw\n\t"); + { + rtx_insn *prev_insn + = prev_nonnote_nondebug_insn (current_output_insn); + rtx pat, src; + + if (! (prev_insn && NONJUMP_INSN_P (prev_insn) + && GET_CODE (pat = PATTERN (prev_insn)) == SET + && GET_CODE (src = SET_SRC (pat)) == UNSPEC + && XINT (src, 1) == UNSPEC_MEMW)) + fprintf (file, "memw\n\t"); + } } else output_operand_lossage ("invalid %%v value"); From 6f6ea27d17e9bbc917b94ffea1c933755e736bdc Mon Sep 17 00:00:00 2001 From: mayshao Date: Wed, 19 Jun 2024 16:03:25 +0200 Subject: [PATCH 08/22] i386: Zhaoxin shijidadao enablement This patch enables -march/-mtune=shijidadao, costs and tunings are set according to the characteristics of the processor. gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_zhaoxin_cpu): Recognize shijidadao. 
* common/config/i386/i386-common.cc: Add shijidadao. * common/config/i386/i386-cpuinfo.h (enum processor_subtypes): Add ZHAOXIN_FAM7H_SHIJIDADAO. * config.gcc: Add shijidadao. * config/i386/driver-i386.cc (host_detect_local_cpu): Let -march=native recognize shijidadao processors. * config/i386/i386-c.cc (ix86_target_macros_internal): Add shijidadao. * config/i386/i386-options.cc (m_ZHAOXIN): Add m_SHIJIDADAO. (m_SHIJIDADAO): New definition. * config/i386/i386.h (enum processor_type): Add PROCESSOR_SHIJIDADAO. * config/i386/x86-tune-costs.h (struct processor_costs): Add shijidadao_cost. * config/i386/x86-tune-sched.cc (ix86_issue_rate): Add shijidadao. (ix86_adjust_cost): Ditto. * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Add m_SHIJIDADAO. (X86_TUNE_USE_GATHER_4PARTS): Ditto. (X86_TUNE_USE_GATHER_8PARTS): Ditto. (X86_TUNE_AVOID_128FMA_CHAINS): Ditto. * doc/extend.texi: Add details about shijidadao. * doc/invoke.texi: Ditto. gcc/testsuite/ChangeLog: * g++.target/i386/mv32.C: Handle new -march * gcc.target/i386/funcspec-56.inc: Ditto. 
--- gcc/common/config/i386/cpuinfo.h | 8 +- gcc/common/config/i386/i386-common.cc | 8 +- gcc/common/config/i386/i386-cpuinfo.h | 1 + gcc/config.gcc | 14 ++- gcc/config/i386/driver-i386.cc | 11 +- gcc/config/i386/i386-c.cc | 7 ++ gcc/config/i386/i386-options.cc | 4 +- gcc/config/i386/i386.h | 1 + gcc/config/i386/x86-tune-costs.h | 116 ++++++++++++++++++ gcc/config/i386/x86-tune-sched.cc | 2 + gcc/config/i386/x86-tune.def | 8 +- gcc/doc/extend.texi | 3 + gcc/doc/invoke.texi | 6 + gcc/testsuite/g++.target/i386/mv32.C | 6 + gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 + 15 files changed, 183 insertions(+), 14 deletions(-) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 4610bf6d6a458..936039725ab6c 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -667,12 +667,18 @@ get_zhaoxin_cpu (struct __processor_model *cpu_model, reset_cpu_feature (cpu_model, cpu_features2, FEATURE_F16C); cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_LUJIAZUI; } - else if (model >= 0x5b) + else if (model == 0x5b) { cpu = "yongfeng"; CHECK___builtin_cpu_is ("yongfeng"); cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_YONGFENG; } + else if (model >= 0x6b) + { + cpu = "shijidadao"; + CHECK___builtin_cpu_is ("shijidadao"); + cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_SHIJIDADAO; + } break; default: break; diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 5d9c188c9c7db..e38b1b22ffb10 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -2066,6 +2066,7 @@ const char *const processor_names[] = "intel", "lujiazui", "yongfeng", + "shijidadao", "geode", "k6", "athlon", @@ -2271,10 +2272,13 @@ const pta processor_alias_table[] = | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR, 0, P_NONE}, {"lujiazui", PROCESSOR_LUJIAZUI, CPU_LUJIAZUI, PTA_LUJIAZUI, - M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_NONE}, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_PROC_BMI}, 
{"yongfeng", PROCESSOR_YONGFENG, CPU_YONGFENG, PTA_YONGFENG, - M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_NONE}, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_PROC_AVX2}, + {"shijidadao", PROCESSOR_SHIJIDADAO, CPU_YONGFENG, + PTA_YONGFENG, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_SHIJIDADAO), P_PROC_AVX2}, {"k8", PROCESSOR_K8, CPU_K8, PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR, 0, P_NONE}, diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h index 3ec9e005a6ad5..ccc6deb63853e 100644 --- a/gcc/common/config/i386/i386-cpuinfo.h +++ b/gcc/common/config/i386/i386-cpuinfo.h @@ -104,6 +104,7 @@ enum processor_subtypes INTEL_COREI7_PANTHERLAKE, ZHAOXIN_FAM7H_YONGFENG, AMDFAM1AH_ZNVER5, + ZHAOXIN_FAM7H_SHIJIDADAO, CPU_SUBTYPE_MAX }; diff --git a/gcc/config.gcc b/gcc/config.gcc index e500ba63e3222..644c456290dc1 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -711,9 +711,9 @@ atom slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \ silvermont skylake-avx512 cannonlake icelake-client icelake-server \ skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \ sapphirerapids alderlake rocketlake eden-x2 nano nano-1000 nano-2000 nano-3000 \ -nano-x2 eden-x4 nano-x4 lujiazui yongfeng x86-64 x86-64-v2 x86-64-v3 x86-64-v4 \ -sierraforest graniterapids graniterapids-d grandridge arrowlake arrowlake-s \ -clearwaterforest pantherlake native" +nano-x2 eden-x4 nano-x4 lujiazui yongfeng shijidadao x86-64 x86-64-v2 \ +x86-64-v3 x86-64-v4 sierraforest graniterapids graniterapids-d grandridge \ +arrowlake arrowlake-s clearwaterforest pantherlake native" # Additional x86 processors supported by --with-cpu=. Each processor # MUST be separated by exactly one space. 
@@ -3855,6 +3855,10 @@ case ${target} in arch=yongfeng cpu=yongfeng ;; + shijidadao-*) + arch=shijidadao + cpu=shijidadao + ;; pentium2-*) arch=pentium2 cpu=pentium2 @@ -3980,6 +3984,10 @@ case ${target} in arch=yongfeng cpu=yongfeng ;; + shijidadao-*) + arch=shijidadao + cpu=shijidadao + ;; nocona-*) arch=nocona cpu=nocona diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc index 0176d8b6cd296..11470eaea1254 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc @@ -558,10 +558,12 @@ const char *host_detect_local_cpu (int argc, const char **argv) switch (family) { case 7: - if (model == 0x3b) - processor = PROCESSOR_LUJIAZUI; - else if (model >= 0x5b) + if (model >= 0x6b) + processor = PROCESSOR_SHIJIDADAO; + else if (model == 0x5b) processor = PROCESSOR_YONGFENG; + else if (model == 0x3b) + processor = PROCESSOR_LUJIAZUI; break; default: break; @@ -853,6 +855,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) case PROCESSOR_YONGFENG: cpu = "yongfeng"; break; + case PROCESSOR_SHIJIDADAO: + cpu = "shijidadao"; + break; default: /* Use something reasonable. 
*/ diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index 7b0ad9e9181ee..403475d5b6bb2 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -156,6 +156,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__yongfeng"); def_or_undef (parse_in, "__yongfeng__"); break; + case PROCESSOR_SHIJIDADAO: + def_or_undef (parse_in, "__shijidadao"); + def_or_undef (parse_in, "__shijidadao__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__pentium4"); def_or_undef (parse_in, "__pentium4__"); @@ -386,6 +390,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_YONGFENG: def_or_undef (parse_in, "__tune_yongfeng__"); break; + case PROCESSOR_SHIJIDADAO: + def_or_undef (parse_in, "__tune_shijidadao__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__tune_pentium4__"); break; diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index f2cecc0e2545b..65c5bad9c285e 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -155,7 +155,8 @@ along with GCC; see the file COPYING3. If not see #define m_LUJIAZUI (HOST_WIDE_INT_1U<integer move cost is 2. */ + 8, /* cost for loading QImode using movzbl. */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer registers. */ + 2, /* cost of reg,reg fld/fst. */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode. */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode. */ + 2, /* cost of moving MMX register. */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode. */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + {8, 8, 8, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit. 
*/ + {8, 8, 8, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit. */ + 8, 8, /* SSE->integer and integer->SSE moves. */ + 8, 8, /* mask->integer and integer->mask moves. */ + {8, 8, 8}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {8, 8, 8}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ + /* End of register allocator costs. */ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction. */ + COSTS_N_INSNS (1), /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (2), /* SI. */ + COSTS_N_INSNS (2), /* DI. */ + COSTS_N_INSNS (3)}, /* other. */ + 0, /* cost of multiply per each bit set. */ + {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (10), /* HI. */ + COSTS_N_INSNS (9), /* SI. */ + COSTS_N_INSNS (50), /* DI. */ + COSTS_N_INSNS (50)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 17, /* MOVE_RATIO. */ + 6, /* CLEAR_RATIO. */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer registers. */ + {8, 8, 8, 12, 15}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {8, 8, 8, 12, 15}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {8, 8, 8, 12, 15}, /* cost of unaligned loads. */ + {8, 8, 8, 12, 15}, /* cost of unaligned storess. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + 8, /* cost of moving SSE register to integer. */ + 18, 6, /* Gather load static, per_elt. */ + 18, 6, /* Gather store static, per_elt. */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block. 
*/ + 12, /* number of parallel prefetches. */ + 3, /* Branch cost. */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (3), /* cost of FMUL instruction. */ + COSTS_N_INSNS (13), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (3), /* cost of MULSS instruction. */ + COSTS_N_INSNS (3), /* cost of MULSD instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ + 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ + shijidadao_memcpy, + shijidadao_memset, + COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ + "16:11:8", /* Loop alignment. */ + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ + 4, /* Small unroll limit. */ + 2, /* Small unroll factor. */ +}; + + /* Generic should produce code tuned for Core-i7 (and newer chips) and btver1 (and newer chips). 
*/ diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index f70846e628e57..d77298b0e34dc 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -79,6 +79,7 @@ ix86_issue_rate (void) case PROCESSOR_CANNONLAKE: case PROCESSOR_ALDERLAKE: case PROCESSOR_YONGFENG: + case PROCESSOR_SHIJIDADAO: case PROCESSOR_GENERIC: return 4; @@ -446,6 +447,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, break; case PROCESSOR_YONGFENG: + case PROCESSOR_SHIJIDADAO: /* Stack engine allows to execute push&pop instructions in parallel. */ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 66512992b7b5b..343c32c291fa8 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -477,7 +477,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID - | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2 elements. */ @@ -488,7 +488,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts", elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID - | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4 elements. */ @@ -499,7 +499,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", elements. 
*/ DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM - | m_YONGFENG | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more elements. */ @@ -509,7 +509,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or smaller FMA chain. */ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 - | m_YONGFENG | m_GENERIC) + | m_YONGFENG | m_SHIJIDADAO | m_GENERIC) /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or smaller FMA chain. */ diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 173cdef013160..b2e41a581dd1c 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -26245,6 +26245,9 @@ ZHAOXIN lujiazui CPU. @item yongfeng ZHAOXIN yongfeng CPU. +@item shijidadao +ZHAOXIN shijidadao CPU. + @item amdfam10h AMD Family 10h CPU. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 5d7a87fde86c4..c790e2f35184c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -34873,6 +34873,12 @@ SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16, ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT instruction set support. +@item shijidadao +ZHAOXIN shijidadao CPU with x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, +SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16, +ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT +instruction set support. + @item geode AMD Geode embedded processor with MMX and 3DNow!@: instruction set support. 
@end table diff --git a/gcc/testsuite/g++.target/i386/mv32.C b/gcc/testsuite/g++.target/i386/mv32.C index 6c993218d01c3..b311c35baa3d9 100644 --- a/gcc/testsuite/g++.target/i386/mv32.C +++ b/gcc/testsuite/g++.target/i386/mv32.C @@ -21,6 +21,10 @@ int __attribute__ ((target("arch=yongfeng"))) foo () { return 2; } +int __attribute__ ((target("arch=shijidadao"))) foo () { + return 3; +} + int main () { int val = foo (); @@ -29,6 +33,8 @@ int main () assert (val == 1); else if (__builtin_cpu_is ("yongfeng")) assert (val == 2); + else if (__builtin_cpu_is ("shijidadao")) + assert (val == 3); else assert (val == 0); diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc index 2a50f5bf67c86..c4dc89367ef58 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc @@ -208,6 +208,7 @@ extern void test_arch_arrowlake_s (void) __attribute__((__target__("arch=arrowla extern void test_arch_pantherlake (void) __attribute__((__target__("arch=pantherlake"))); extern void test_arch_lujiazui (void) __attribute__((__target__("arch=lujiazui"))); extern void test_arch_yongfeng (void) __attribute__((__target__("arch=yongfeng"))); +extern void test_arch_shijidadao (void) __attribute__((__target__("arch=shijidadao"))); extern void test_arch_k8 (void) __attribute__((__target__("arch=k8"))); extern void test_arch_k8_sse3 (void) __attribute__((__target__("arch=k8-sse3"))); extern void test_arch_opteron (void) __attribute__((__target__("arch=opteron"))); @@ -233,6 +234,7 @@ extern void test_tune_corei7_avx (void) __attribute__((__target__("tune=corei7- extern void test_tune_core_avx2 (void) __attribute__((__target__("tune=core-avx2"))); extern void test_tune_lujiazui (void) __attribute__((__target__("tune=lujiazui"))); extern void test_tune_yongfeng (void) __attribute__((__target__("tune=yongfeng"))); +extern void test_tune_shijidadao (void) __attribute__((__target__("tune=shijidadao"))); 
extern void test_tune_k8 (void) __attribute__((__target__("tune=k8"))); extern void test_tune_k8_sse3 (void) __attribute__((__target__("tune=k8-sse3"))); extern void test_tune_opteron (void) __attribute__((__target__("tune=opteron"))); From 25860fd2a674373a6476af5ff0bd92354fc53d06 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 19 Jun 2024 21:10:39 +0200 Subject: [PATCH 09/22] bitint: Fix up lowering of COMPLEX_EXPR [PR115544] We don't really support _Complex _BitInt(N), the only place we use bitint complex types is for the .{ADD,SUB,MUL}_OVERFLOW internal function results and COMPLEX_EXPR in the usual case should be either not present yet because the ifns weren't folded and will be lowered, or optimized into something simpler, because normally the complex bitint should be used just for extracting the 2 subparts from it. Still, with disabled optimizations it can occasionally happen that it appears in the IL and that is why there is support for lowering those, but it doesn't handle optimizing those too much, so if it uses SSA_NAME, it relies on them having a backing VAR_DECL during the lowering. This is normally achieved through the && ((is_gimple_assign (use_stmt) && (gimple_assign_rhs_code (use_stmt) != COMPLEX_EXPR)) || gimple_code (use_stmt) == GIMPLE_COND) hunk in gimple_lower_bitint, but as the following testcase shows, there is one thing I've missed, the load optimization isn't guarded by the above stuff. So, either we'd need to add support for loads to lower_complexexpr_stmt, or because they should be really rare, this patch just disables the load optimization if at least one load use is a COMPLEX_EXPR (like we do already for PHIs, calls, asm). 2024-06-19 Jakub Jelinek PR tree-optimization/115544 * gimple-lower-bitint.cc (gimple_lower_bitint): Disable optimizing loads used by COMPLEX_EXPR operands. * gcc.dg/bitint-107.c: New test. 
--- gcc/gimple-lower-bitint.cc | 5 ++++- gcc/testsuite/gcc.dg/bitint-107.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/bitint-107.c diff --git a/gcc/gimple-lower-bitint.cc b/gcc/gimple-lower-bitint.cc index 56e5f826a8d9f..f955f3eabd9b6 100644 --- a/gcc/gimple-lower-bitint.cc +++ b/gcc/gimple-lower-bitint.cc @@ -6630,7 +6630,10 @@ gimple_lower_bitint (void) continue; if (gimple_code (use_stmt) == GIMPLE_PHI || is_gimple_call (use_stmt) - || gimple_code (use_stmt) == GIMPLE_ASM) + || gimple_code (use_stmt) == GIMPLE_ASM + || (is_gimple_assign (use_stmt) + && (gimple_assign_rhs_code (use_stmt) + == COMPLEX_EXPR))) { optimizable_load = false; break; diff --git a/gcc/testsuite/gcc.dg/bitint-107.c b/gcc/testsuite/gcc.dg/bitint-107.c new file mode 100644 index 0000000000000..a3f5f534088f3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/bitint-107.c @@ -0,0 +1,16 @@ +/* PR tree-optimization/115544 */ +/* { dg-do compile { target bitint } } */ +/* { dg-options "-O -fno-tree-fre -fno-tree-ccp -fno-tree-forwprop" } */ + +#if __BITINT_MAXWIDTH__ >= 129 +typedef _BitInt(129) B; +#else +typedef _BitInt(63) B; +#endif +B a, b; + +int +foo (void) +{ + return __builtin_mul_overflow (a, 1, &b); +} From e03583e7ee99552276a90a4094776fda55ab2e02 Mon Sep 17 00:00:00 2001 From: Patrick O'Neill Date: Tue, 18 Jun 2024 14:40:15 -0700 Subject: [PATCH 10/22] RISC-V: Promote Zaamo/Zalrsc to a when using an old binutils Binutils 2.42 and before don't support Zaamo/Zalrsc. When users specify both Zaamo and Zalrsc, promote them to 'a' in the -march string. This does not affect testsuite results for users with old versions of binutils. Testcases that failed due to 'call'/isa string continue to fail after this PATCH when using an old version of binutils. gcc/ChangeLog: * common/config/riscv/riscv-common.cc: Add 'a' extension to riscv_combine_info. 
Signed-off-by: Patrick O'Neill --- gcc/common/config/riscv/riscv-common.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/common/config/riscv/riscv-common.cc b/gcc/common/config/riscv/riscv-common.cc index 1dc1d9904c7bd..410e673f5e017 100644 --- a/gcc/common/config/riscv/riscv-common.cc +++ b/gcc/common/config/riscv/riscv-common.cc @@ -401,6 +401,7 @@ static const struct riscv_ext_version riscv_ext_version_table[] = /* Combine extensions defined in this table */ static const struct riscv_ext_version riscv_combine_info[] = { + {"a", ISA_SPEC_CLASS_20191213, 2, 1}, {"zk", ISA_SPEC_CLASS_NONE, 1, 0}, {"zkn", ISA_SPEC_CLASS_NONE, 1, 0}, {"zks", ISA_SPEC_CLASS_NONE, 1, 0}, From f0204ae3861e5f2e6099719c2cb1718e064c8c12 Mon Sep 17 00:00:00 2001 From: "demin.han" Date: Wed, 19 Jun 2024 16:21:13 -0600 Subject: [PATCH 11/22] [PATCH v2] RISC-V: Remove float vector eqne pattern We can unify eqne and other comparison operations. Tested on RV32 and RV64 gcc/ChangeLog: * config/riscv/riscv-vector-builtins-bases.cc: Remove eqne cond * config/riscv/vector.md (@pred_eqne_scalar): Remove patterns (*pred_eqne_scalar_merge_tie_mask): Ditto (*pred_eqne_scalar): Ditto (*pred_eqne_scalar_narrow): Ditto gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/float-point-cmp-eqne.c: New test. 
--- .../riscv/riscv-vector-builtins-bases.cc | 8 +- gcc/config/riscv/vector.md | 86 ------------------- .../riscv/rvv/base/float-point-cmp-eqne.c | 54 ++++++++++++ 3 files changed, 56 insertions(+), 92 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc index b6f6e4ff37e78..596b88cc8a3cd 100644 --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc @@ -1420,12 +1420,8 @@ class fcmp : public function_base switch (e.op_info->op) { case OP_TYPE_vf: { - if (CODE == EQ || CODE == NE) - return e.use_compare_insn (CODE, code_for_pred_eqne_scalar ( - e.vector_mode ())); - else - return e.use_compare_insn (CODE, code_for_pred_cmp_scalar ( - e.vector_mode ())); + return e.use_compare_insn (CODE, code_for_pred_cmp_scalar ( + e.vector_mode ())); } case OP_TYPE_vv: { return e.use_compare_insn (CODE, diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index fbcdf96f038ba..f8fae6557d935 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -7545,92 +7545,6 @@ (set_attr "mode" "") (set_attr "spec_restriction" "none,thv,thv,none,none")]) -(define_expand "@pred_eqne_scalar" - [(set (match_operand: 0 "register_operand") - (if_then_else: - (unspec: - [(match_operand: 1 "vector_mask_operand") - (match_operand 6 "vector_length_operand") - (match_operand 7 "const_int_operand") - (match_operand 8 "const_int_operand") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator: 3 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand: 5 "register_operand")) - (match_operand:V_VLSF 4 "register_operand")]) - (match_operand: 2 "vector_merge_operand")))] - "TARGET_VECTOR" - {}) - -(define_insn "*pred_eqne_scalar_merge_tie_mask" - [(set (match_operand: 0 "register_operand" "=vm") - (if_then_else: - (unspec: - 
[(match_operand: 1 "register_operand" " 0") - (match_operand 5 "vector_length_operand" " rK") - (match_operand 6 "const_int_operand" " i") - (match_operand 7 "const_int_operand" " i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator: 2 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand: 4 "register_operand" " f")) - (match_operand:V_VLSF 3 "register_operand" " vr")]) - (match_dup 1)))] - "TARGET_VECTOR" - "vmf%B2.vf\t%0,%3,%4,v0.t" - [(set_attr "type" "vfcmp") - (set_attr "mode" "") - (set_attr "merge_op_idx" "1") - (set_attr "vl_op_idx" "5") - (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[6])")) - (set (attr "avl_type_idx") (const_int 7))]) - -;; We don't use early-clobber for LMUL <= 1 to get better codegen. -(define_insn "*pred_eqne_scalar" - [(set (match_operand: 0 "register_operand" "=vr, vr, &vr, &vr") - (if_then_else: - (unspec: - [(match_operand: 1 "vector_mask_operand" "vmWc1,vmWc1,vmWc1,vmWc1") - (match_operand 6 "vector_length_operand" " rK, rK, rK, rK") - (match_operand 7 "const_int_operand" " i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator: 3 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand: 5 "register_operand" " f, f, f, f")) - (match_operand:V_VLSF 4 "register_operand" " vr, vr, vr, vr")]) - (match_operand: 2 "vector_merge_operand" " vu, 0, vu, 0")))] - "TARGET_VECTOR && riscv_vector::cmp_lmul_le_one (mode)" - "vmf%B3.vf\t%0,%4,%5%p1" - [(set_attr "type" "vfcmp") - (set_attr "mode" "") - (set_attr "spec_restriction" "thv,thv,rvv,rvv")]) - -;; We use early-clobber for source LMUL > dest LMUL. 
-(define_insn "*pred_eqne_scalar_narrow" - [(set (match_operand: 0 "register_operand" "=vm, vr, vr, &vr, &vr") - (if_then_else: - (unspec: - [(match_operand: 1 "vector_mask_operand" " 0,vmWc1,vmWc1,vmWc1,vmWc1") - (match_operand 6 "vector_length_operand" " rK, rK, rK, rK, rK") - (match_operand 7 "const_int_operand" " i, i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i, i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator: 3 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand: 5 "register_operand" " f, f, f, f, f")) - (match_operand:V_VLSF 4 "register_operand" " vr, 0, 0, vr, vr")]) - (match_operand: 2 "vector_merge_operand" " vu, vu, 0, vu, 0")))] - "TARGET_VECTOR && riscv_vector::cmp_lmul_gt_one (mode)" - "vmf%B3.vf\t%0,%4,%5%p1" - [(set_attr "type" "vfcmp") - (set_attr "mode" "") - (set_attr "spec_restriction" "none,thv,thv,none,none")]) - ;; ------------------------------------------------------------------------------- ;; ---- Predicated floating-point merge ;; ------------------------------------------------------------------------------- diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c new file mode 100644 index 0000000000000..572bcb8f291be --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64 -O3" } */ + +#include "riscv_vector.h" + +#define CMP_FLOAT_VF_1(ID, S, OP, IMM) \ + vbool##S##_t test_float_1_##ID##_##S (vfloat##S##m1_t op1, size_t vl) \ + { \ + return __riscv_vmf##OP##_vf_f##S##m1_b##S (op1, IMM, vl); \ + } + +CMP_FLOAT_VF_1 (0, 32, eq, 0.0) +CMP_FLOAT_VF_1 (1, 32, eq, 1.0) +CMP_FLOAT_VF_1 (2, 32, eq, __builtin_nanf ("123")) +CMP_FLOAT_VF_1 (3, 32, ne, 0.0) +CMP_FLOAT_VF_1 (4, 32, ne, 1.0) +CMP_FLOAT_VF_1 (5, 32, ne, __builtin_nanf ("123")) + +CMP_FLOAT_VF_1 (0, 64, eq, 0.0) 
+CMP_FLOAT_VF_1 (1, 64, eq, 1.0) +CMP_FLOAT_VF_1 (2, 64, eq, __builtin_nan ("123")) +CMP_FLOAT_VF_1 (3, 64, ne, 0.0) +CMP_FLOAT_VF_1 (4, 64, ne, 1.0) +CMP_FLOAT_VF_1 (5, 64, ne, __builtin_nan ("123")) + +#define CMP_FLOAT_VF_2(ID, S, OP, IMM) \ + vfloat##S##m1_t test_float_2_##ID##_##S (vfloat##S##m1_t op1, \ + vfloat##S##m1_t op2, size_t vl) \ + { \ + vfloat##S##m1_t op3 = __riscv_vfmv_s_f_f##S##m1 (IMM, vl); \ + vbool##S##_t mask1 = __riscv_vmf##OP##_vf_f##S##m1_b##S (op1, IMM, vl); \ + vbool##S##_t mask2 = __riscv_vmf##OP##_vv_f##S##m1_b##S (op1, op3, vl); \ + vbool##S##_t mask3 = __riscv_vmor (mask1, mask2, vl); \ + return __riscv_vmerge_vvm_f##S##m1_tu (op1, op1, op2, mask3, vl); \ + } + +CMP_FLOAT_VF_2 (0, 32, eq, 0.0) +CMP_FLOAT_VF_2 (1, 32, eq, 1.0) +CMP_FLOAT_VF_2 (2, 32, eq, __builtin_nanf ("123")) +CMP_FLOAT_VF_2 (3, 32, ne, 0.0) +CMP_FLOAT_VF_2 (4, 32, ne, 1.0) +CMP_FLOAT_VF_2 (5, 32, ne, __builtin_nanf ("123")) + +CMP_FLOAT_VF_2 (0, 64, eq, 0.0) +CMP_FLOAT_VF_2 (1, 64, eq, 1.0) +CMP_FLOAT_VF_2 (2, 64, eq, __builtin_nan ("123")) +CMP_FLOAT_VF_2 (3, 64, ne, 0.0) +CMP_FLOAT_VF_2 (4, 64, ne, 1.0) +CMP_FLOAT_VF_2 (5, 64, ne, __builtin_nan ("123")) + +/* { dg-final { scan-assembler-times {vmfeq\.vf} 12 } } */ +/* { dg-final { scan-assembler-times {vmfne\.vf} 12 } } */ +/* { dg-final { scan-assembler-times {vmfeq\.vv} 6 } } */ +/* { dg-final { scan-assembler-times {vmfne\.vv} 6 } } */ From a334189739e13f8de1f9af99f8d16970435cebc4 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Thu, 20 Jun 2024 07:02:33 +0800 Subject: [PATCH 12/22] Revert "Build: Fix typo ac_cv_search_pthread_crate" This reverts commit 8088374a868aacab4dff208ec3e3fde790a1d9a3. 
--- configure | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 1469cd735392a..6e95b27d9df4a 100755 --- a/configure +++ b/configure @@ -9002,7 +9002,7 @@ fi if test "$ac_cv_search_pthread_create" = -lpthread; then CRAB1_LIBS="$CRAB1_LIBS -lpthread" -elif test "$ac_cv_search_pthread_create" = no; then +elif test "$ac_cv_search_pthread_crate" = no; then missing_rust_dynlibs="$missing_rust_dynlibs, libpthread" fi diff --git a/configure.ac b/configure.ac index 20457005e2993..88576b31bfcd5 100644 --- a/configure.ac +++ b/configure.ac @@ -2053,7 +2053,7 @@ fi if test "$ac_cv_search_pthread_create" = -lpthread; then CRAB1_LIBS="$CRAB1_LIBS -lpthread" -elif test "$ac_cv_search_pthread_create" = no; then +elif test "$ac_cv_search_pthread_crate" = no; then missing_rust_dynlibs="$missing_rust_dynlibs, libpthread" fi From 6d6587bc37f2039225e4fba9acaf7b26e600e3d3 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Thu, 20 Jun 2024 07:02:47 +0800 Subject: [PATCH 13/22] Revert "build: Fix missing variable quotes" This reverts commit c6a9ab8c920f297c4efd289182aef9fbc73f5906. 
--- configure | 10 +++++----- configure.ac | 8 ++++---- gcc/configure | 2 +- gcc/configure.ac | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/configure b/configure index 6e95b27d9df4a..51576a41f3037 100755 --- a/configure +++ b/configure @@ -8994,15 +8994,15 @@ if test "$ac_res" != no; then : fi -if test "$ac_cv_search_dlopen" = -ldl; then +if test $ac_cv_search_dlopen = -ldl; then CRAB1_LIBS="$CRAB1_LIBS -ldl" -elif test "$ac_cv_search_dlopen" = no; then +elif test $ac_cv_search_dlopen = no; then missing_rust_dynlibs="libdl" fi -if test "$ac_cv_search_pthread_create" = -lpthread; then +if test $ac_cv_search_pthread_create = -lpthread; then CRAB1_LIBS="$CRAB1_LIBS -lpthread" -elif test "$ac_cv_search_pthread_crate" = no; then +elif test $ac_cv_search_pthread_crate = no; then missing_rust_dynlibs="$missing_rust_dynlibs, libpthread" fi @@ -19746,7 +19746,7 @@ config.status configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" -Copyright (C) Free Software Foundation, Inc. +Copyright (C) 2012 Free Software Foundation, Inc. This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." 
diff --git a/configure.ac b/configure.ac index 88576b31bfcd5..5eda8dcdbf726 100644 --- a/configure.ac +++ b/configure.ac @@ -2045,15 +2045,15 @@ missing_rust_dynlibs=none AC_SEARCH_LIBS([dlopen], [dl]) AC_SEARCH_LIBS([pthread_create], [pthread]) -if test "$ac_cv_search_dlopen" = -ldl; then +if test $ac_cv_search_dlopen = -ldl; then CRAB1_LIBS="$CRAB1_LIBS -ldl" -elif test "$ac_cv_search_dlopen" = no; then +elif test $ac_cv_search_dlopen = no; then missing_rust_dynlibs="libdl" fi -if test "$ac_cv_search_pthread_create" = -lpthread; then +if test $ac_cv_search_pthread_create = -lpthread; then CRAB1_LIBS="$CRAB1_LIBS -lpthread" -elif test "$ac_cv_search_pthread_crate" = no; then +elif test $ac_cv_search_pthread_crate = no; then missing_rust_dynlibs="$missing_rust_dynlibs, libpthread" fi diff --git a/gcc/configure b/gcc/configure index b536af664d3de..9dc0b65dfaace 100755 --- a/gcc/configure +++ b/gcc/configure @@ -30239,7 +30239,7 @@ else fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_as_mips_explicit_relocs_pcrel" >&5 $as_echo "$gcc_cv_as_mips_explicit_relocs_pcrel" >&6; } -if test "x$gcc_cv_as_mips_explicit_relocs_pcrel" = "xyes"; then +if test $gcc_cv_as_mips_explicit_relocs_pcrel = yes; then $as_echo "#define MIPS_EXPLICIT_RELOCS MIPS_EXPLICIT_RELOCS_PCREL" >>confdefs.h diff --git a/gcc/configure.ac b/gcc/configure.ac index 1501bf89c89da..b2243e9954aac 100644 --- a/gcc/configure.ac +++ b/gcc/configure.ac @@ -5317,7 +5317,7 @@ x: AC_MSG_CHECKING(assembler and linker for explicit JALR relocation) gcc_cv_as_ld_jalr_reloc=no - if test "x$gcc_cv_as_mips_explicit_relocs" = "xyes"; then + if test $gcc_cv_as_mips_explicit_relocs = yes; then if test $in_tree_ld = yes ; then if test "$gcc_cv_gld_major_version" -eq 2 -a "$gcc_cv_gld_minor_version" -ge 20 -o "$gcc_cv_gld_major_version" -gt 2 \ && test $in_tree_ld_is_elf = yes; then From ebfffb6c6557f1375c230ae6751f697cdfab4a60 Mon Sep 17 00:00:00 2001 From: GCC Administrator Date: Thu, 20 Jun 2024 00:17:14 +0000 
Subject: [PATCH 14/22] Daily bump. --- ChangeLog | 32 +++++ gcc/ChangeLog | 140 +++++++++++++++++++++ gcc/DATESTAMP | 2 +- gcc/fortran/ChangeLog | 16 +++ gcc/testsuite/ChangeLog | 261 ++++++++++++++++++++++++++++++++++++++++ libstdc++-v3/ChangeLog | 24 ++++ 6 files changed, 474 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index bdd1e5e342422..201193fee8c0a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,35 @@ +2024-06-19 YunQiang Su + + Revert: + 2024-06-19 Collin Funk + + * configure.ac: Quote variable result of AC_SEARCH_LIBS. + * configure: Regenerate. + +2024-06-19 YunQiang Su + + Revert: + 2024-06-19 YunQiang Su + + PR bootstrap/115453 + * configure.ac: Fix typo ac_cv_search_pthread_crate. + * configure: Regnerate. + +2024-06-19 YunQiang Su + + PR bootstrap/115453 + * configure.ac: Fix typo ac_cv_search_pthread_crate. + * configure: Regnerate. + +2024-06-19 Collin Funk + + * configure.ac: Quote variable result of AC_SEARCH_LIBS. + * configure: Regenerate. + +2024-06-19 Ramana Radhakrishnan + + * MAINTAINERS: Update my email address. + 2024-06-18 Kyrylo Tkachov * MAINTAINERS (aarch64 port): Update my email address. diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d64a751a55ea5..8610e76b07b30 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,143 @@ +2024-06-19 YunQiang Su + + Revert: + 2024-06-19 Collin Funk + + * configure.ac: Add missing quotation of variable + gcc_cv_as_mips_explicit_relocs. + * configure: Regenerate. + +2024-06-19 demin.han + + * config/riscv/riscv-vector-builtins-bases.cc: Remove eqne cond + * config/riscv/vector.md (@pred_eqne_scalar): Remove patterns + (*pred_eqne_scalar_merge_tie_mask): Ditto + (*pred_eqne_scalar): Ditto + (*pred_eqne_scalar_narrow): Ditto + +2024-06-19 Patrick O'Neill + + * common/config/riscv/riscv-common.cc: Add 'a' extension to + riscv_combine_info. 
+ +2024-06-19 Jakub Jelinek + + PR tree-optimization/115544 + * gimple-lower-bitint.cc (gimple_lower_bitint): Disable optimizing + loads used by COMPLEX_EXPR operands. + +2024-06-19 mayshao + + * common/config/i386/cpuinfo.h (get_zhaoxin_cpu): Recognize shijidadao. + * common/config/i386/i386-common.cc: Add shijidadao. + * common/config/i386/i386-cpuinfo.h (enum processor_subtypes): + Add ZHAOXIN_FAM7H_SHIJIDADAO. + * config.gcc: Add shijidadao. + * config/i386/driver-i386.cc (host_detect_local_cpu): + Let -march=native recognize shijidadao processors. + * config/i386/i386-c.cc (ix86_target_macros_internal): Add shijidadao. + * config/i386/i386-options.cc (m_ZHAOXIN): Add m_SHIJIDADAO. + (m_SHIJIDADAO): New definition. + * config/i386/i386.h (enum processor_type): Add PROCESSOR_SHIJIDADAO. + * config/i386/x86-tune-costs.h (struct processor_costs): + Add shijidadao_cost. + * config/i386/x86-tune-sched.cc (ix86_issue_rate): Add shijidadao. + (ix86_adjust_cost): Ditto. + * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Add m_SHIJIDADAO. + (X86_TUNE_USE_GATHER_4PARTS): Ditto. + (X86_TUNE_USE_GATHER_8PARTS): Ditto. + (X86_TUNE_AVOID_128FMA_CHAINS): Ditto. + * doc/extend.texi: Add details about shijidadao. + * doc/invoke.texi: Ditto. + +2024-06-19 Takayuki 'January June' Suwa + + * config/xtensa/xtensa.cc (print_operand): + When outputting MEMW before the instruction, check if the previous + instruction is already that. + +2024-06-19 Andre Vieira + Stam Markianos-Wright + + * config/arm/arm-protos.h (arm_target_bb_ok_for_lob): Change + declaration to pass basic_block. + (arm_attempt_dlstp_transform): New declaration. + * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): Define targethook. + (TARGET_PREDICT_DOLOOP_P): Likewise. + (arm_target_bb_ok_for_lob): Adapt condition. + (arm_mve_get_vctp_lanes): New function. + (arm_dl_usage_type): New internal enum. + (arm_get_required_vpr_reg): New function. + (arm_get_required_vpr_reg_param): New function. 
+ (arm_get_required_vpr_reg_ret_val): New function. + (arm_mve_get_loop_vctp): New function. + (arm_mve_insn_predicated_by): New function. + (arm_mve_across_lane_insn_p): New function. + (arm_mve_load_store_insn_p): New function. + (arm_mve_impl_pred_on_outputs_p): New function. + (arm_mve_impl_pred_on_inputs_p): New function. + (arm_last_vect_def_insn): New function. + (arm_mve_impl_predicated_p): New function. + (arm_mve_check_reg_origin_is_num_elems): New function. + (arm_mve_dlstp_check_inc_counter): New function. + (arm_mve_dlstp_check_dec_counter): New function. + (arm_mve_loop_valid_for_dlstp): New function. + (arm_predict_doloop_p): New function. + (arm_loop_unroll_adjust): New function. + (arm_emit_mve_unpredicated_insn_to_seq): New function. + (arm_attempt_dlstp_transform): New function. + * config/arm/arm.opt (mdlstp): New option. + * config/arm/iterators.md (dlstp_elemsize, letp_num_lanes, + letp_num_lanes_neg, letp_num_lanes_minus_1): New attributes. + (DLSTP, LETP): New iterators. + * config/arm/mve.md (predicated_doloop_end_internal, + dlstp_insn): New insn patterns. + * config/arm/thumb2.md (doloop_end): Adapt to support tail-predicated + loops. + (doloop_begin): Likewise. + * config/arm/types.md (mve_misc): New mve type to represent + predicated_loop_end insn sequences. + * config/arm/unspecs.md: + (DLSTP8, DLSTP16, DLSTP32, DSLTP64, + LETP8, LETP16, LETP32, LETP64): New unspecs for DLSTP and LETP. + +2024-06-19 Andre Vieira + Stam Markianos-Wright + + * df-core.cc (df_bb_regno_only_def_find): New helper function. + * df.h (df_bb_regno_only_def_find): Declare new function. + * loop-doloop.cc (doloop_condition_get): Add support for detecting + predicated vectorized hardware loops. + (doloop_modify): Add support for GTU condition checks. + (doloop_optimize): Update costing computation to support alterations to + desc->niter_expr by the backend. 
+ +2024-06-19 Collin Funk + + * configure.ac: Add missing quotation of variable + gcc_cv_as_mips_explicit_relocs. + * configure: Regenerate. + +2024-06-19 Takayuki 'January June' Suwa + + * config/xtensa/xtensa-protos.h (xtensa_constantsynth): + Change the second argument from HOST_WIDE_INT to rtx. + * config/xtensa/xtensa.cc (#include): + Add "context.h" and "pass_manager.h". + (machine_function): Add a new hash_map field "litpool_usage". + (xtensa_constantsynth): Make "src" (the second operand) accept + RTX literal instead of its value, and treat both bare and pooled + SI/SFmode literals equally by bit-exact canonicalization into + CONST_INT RTX internally. And then, make avoid synthesis if + such multiple identical canonicalized literals are found in same + function when optimizing for size. Finally, for literals where + synthesis is not possible or has been avoided, re-emit "move" + RTXes with canonicalized ones to increase the chances of sharing + literal pool entries. + * config/xtensa/xtensa.md (split patterns for constant synthesis): + Change to simply invoke xtensa_constantsynth() as mentioned above, + and add new patterns for when TARGET_AUTO_LITPOOLS is enabled. + 2024-06-18 Edwin Lu Robin Dapp diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index 6fe37f7c38677..9df1831b6e340 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20240619 +20240620 diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index a2275728a46b9..8fcd6d40c956c 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,19 @@ +2024-06-19 Harald Anlauf + + PR fortran/115390 + * trans-decl.cc (gfc_conv_cfi_to_gfc): Move derivation of type sizes + for character via gfc_trans_vla_type_sizes to after character length + has been set. + +2024-06-19 Andre Vehreschild + + PR fortran/90076 + * trans-decl.cc (gfc_generate_function_code): Set vptr for + results to declared class type. 
+ * trans-expr.cc (gfc_reset_vptr): Allow to provide the typespec + instead of the expression. + * trans.h (gfc_reset_vptr): Same. + 2024-06-17 Andre Vehreschild * trans.cc (gfc_deallocate_with_status): Check that object to deref diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2ae5731931d92..69e269330d9f1 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,264 @@ +2024-06-19 demin.han + + * gcc.target/riscv/rvv/base/float-point-cmp-eqne.c: New test. + +2024-06-19 Jakub Jelinek + + PR tree-optimization/115544 + * gcc.dg/bitint-107.c: New test. + +2024-06-19 mayshao + + * g++.target/i386/mv32.C: Handle new -march + * gcc.target/i386/funcspec-56.inc: Ditto. + +2024-06-19 Harald Anlauf + + PR fortran/115390 + * gfortran.dg/bind_c_char_11.f90: New test. + +2024-06-19 Andre Vieira + Stam Markianos-Wright + + * gcc.target/arm/lob.h: Add new helpers. + * gcc.target/arm/lob1.c: Use new helpers. + * gcc.target/arm/lob6.c: Likewise. + * gcc.target/arm/mve/dlstp-compile-asm-1.c: New test. + * gcc.target/arm/mve/dlstp-compile-asm-2.c: New test. + * gcc.target/arm/mve/dlstp-compile-asm-3.c: New test. + * gcc.target/arm/mve/dlstp-int8x16.c: New test. + * gcc.target/arm/mve/dlstp-int8x16-run.c: New test. + * gcc.target/arm/mve/dlstp-int16x8.c: New test. + * gcc.target/arm/mve/dlstp-int16x8-run.c: New test. + * gcc.target/arm/mve/dlstp-int32x4.c: New test. + * gcc.target/arm/mve/dlstp-int32x4-run.c: New test. + * gcc.target/arm/mve/dlstp-int64x2.c: New test. + * gcc.target/arm/mve/dlstp-int64x2-run.c: New test. + * gcc.target/arm/mve/dlstp-invalid-asm.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-37.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-38.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-39.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-40.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-37.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-38.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-39.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-40.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-33.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-34.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-35.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-36.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-33.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-34.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-35.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-36.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-32.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-32.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-28.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-28.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-24.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-24.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-20.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-20.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-16.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-16.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-9.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-9.c: New test. + +2024-06-19 Richard Biener + + * gcc.dg/vect/bb-slp-32.c: Add check for correctness. + +2024-06-19 Andre Vehreschild + + PR fortran/90076 + * gfortran.dg/class_76.f90: Add declared vtab occurrence. + * gfortran.dg/class_78.f90: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-32.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-32.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-28.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-28.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-24.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-24.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-20.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-20.c: New test. 
+ +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-16.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-16.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-9.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-9.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-5.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-6.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-7.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-8.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-5.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-6.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-7.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-8.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/sat_arith.h: Add helper macro for + testing. + * gcc.target/riscv/sat_u_sub-45.c: New test. + * gcc.target/riscv/sat_u_sub-46.c: New test. + * gcc.target/riscv/sat_u_sub-47.c: New test. + * gcc.target/riscv/sat_u_sub-48.c: New test. + * gcc.target/riscv/sat_u_sub-run-45.c: New test. + * gcc.target/riscv/sat_u_sub-run-46.c: New test. + * gcc.target/riscv/sat_u_sub-run-47.c: New test. + * gcc.target/riscv/sat_u_sub-run-48.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/sat_u_sub-41.c: New test. + * gcc.target/riscv/sat_u_sub-42.c: New test. + * gcc.target/riscv/sat_u_sub-43.c: New test. + * gcc.target/riscv/sat_u_sub-44.c: New test. + * gcc.target/riscv/sat_u_sub-run-41.c: New test. + * gcc.target/riscv/sat_u_sub-run-42.c: New test. + * gcc.target/riscv/sat_u_sub-run-43.c: New test. + * gcc.target/riscv/sat_u_sub-run-44.c: New test. + 2024-06-18 Jeff Law * gcc.target/riscv/zbs-ext-2.c: Do not run for -Os. diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 907f6cfb0e83c..94a5ce9a1329c 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,27 @@ +2024-06-19 Jonathan Wakely + + * include/std/future: Adjust whitespace to use tabs for + indentation. + +2024-06-19 Jonathan Wakely + + * include/std/future (_State_baseV2::_Setter): Add + noexcept to call operator. + (_State_baseV2::_Setter): Likewise. + +2024-06-19 Jonathan Wakely + + * include/bits/stl_pair.h [__cpp_lib_concepts] (pair()): Add + conditional noexcept. + +2024-06-19 Jonathan Wakely + + * include/bits/stl_tempbuf.h (__get_temporary_buffer): Cast + argument to size_t to handle negative values and suppress + -Wsign-compare warning. + (_Temporary_buffer): Move diagnostic pragmas to new location of + call to std::get_temporary_buffer. 
+ 2024-06-18 Jonathan Wakely * include/bits/cpp_type_traits.h: Fix outdated comment about the From 70466e6f9d9fb87f78ffe2e397ca876b380cb493 Mon Sep 17 00:00:00 2001 From: Feng Xue Date: Sat, 15 Jun 2024 23:17:10 +0800 Subject: [PATCH 15/22] vect: Add a function to check lane-reducing stmt Add a utility function to check if a statement is lane-reducing operation, which could simplify some existing code. 2024-06-16 Feng Xue gcc/ * tree-vectorizer.h (lane_reducing_stmt_p): New function. * tree-vect-slp.cc (vect_analyze_slp): Use new function lane_reducing_stmt_p to check statement. --- gcc/tree-vect-slp.cc | 4 +--- gcc/tree-vectorizer.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 7d18b5bfee5d2..a5665946a4ebc 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3919,7 +3919,6 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) scalar_stmts.create (loop_vinfo->reductions.length ()); for (auto next_info : loop_vinfo->reductions) { - gassign *g; next_info = vect_stmt_to_vectorize (next_info); if ((STMT_VINFO_RELEVANT_P (next_info) || STMT_VINFO_LIVE_P (next_info)) @@ -3931,8 +3930,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) { /* Do not discover SLP reductions combining lane-reducing ops, that will fail later. */ - if (!(g = dyn_cast (STMT_VINFO_STMT (next_info))) - || !lane_reducing_op_p (gimple_assign_rhs_code (g))) + if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info))) scalar_stmts.quick_push (next_info); else { diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 6bb0f5c3a56f6..60224f4e28472 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2169,12 +2169,24 @@ vect_apply_runtime_profitability_check_p (loop_vec_info loop_vinfo) && th >= vect_vf_for_cost (loop_vinfo)); } +/* Return true if CODE is a lane-reducing opcode. 
*/ + inline bool lane_reducing_op_p (code_helper code) { return code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR; } +/* Return true if STMT is a lane-reducing statement. */ + +inline bool +lane_reducing_stmt_p (gimple *stmt) +{ + if (auto *assign = dyn_cast (stmt)) + return lane_reducing_op_p (gimple_assign_rhs_code (assign)); + return false; +} + /* Source location + hotness information. */ extern dump_user_location_t vect_location; From a944e57506fc64b8eede79c2405ba0b498461f0b Mon Sep 17 00:00:00 2001 From: Feng Xue Date: Sun, 16 Jun 2024 12:08:56 +0800 Subject: [PATCH 16/22] vect: Remove duplicated check on reduction operand In vectorizable_reduction, one check on a reduction operand via index could be contained by another one check via pointer, so remove the former. 2024-06-16 Feng Xue gcc/ * tree-vect-loop.cc (vectorizable_reduction): Remove the duplicated check. --- gcc/tree-vect-loop.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index eeb75c09e91aa..aab408d1019db 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -7815,11 +7815,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "use not simple.\n"); return false; } - if (i == STMT_VINFO_REDUC_IDX (stmt_info)) - continue; - /* For an IFN_COND_OP we might hit the reduction definition operand - twice (once as definition, once as else). */ + /* Skip reduction operands, and for an IFN_COND_OP we might hit the + reduction operand twice (once as definition, once as else). */ if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)]) continue; From 0726f1cde5459ccdbaa6af8c6904276a28d572ba Mon Sep 17 00:00:00 2001 From: Feng Xue Date: Sun, 16 Jun 2024 12:17:26 +0800 Subject: [PATCH 17/22] vect: Use one reduction_type local variable Two local variables were defined to refer same STMT_VINFO_REDUC_TYPE, better to keep only one. 
2024-06-16 Feng Xue gcc/ * tree-vect-loop.cc (vectorizable_reduction): Remove v_reduc_type, and replace it to another local variable reduction_type. --- gcc/tree-vect-loop.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index aab408d1019db..27f77ed8b0b60 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -7868,10 +7868,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (lane_reducing) STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in; - enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); - STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; + enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info); + STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type; /* If we have a condition reduction, see if we can simplify it further. */ - if (v_reduc_type == COND_REDUCTION) + if (reduction_type == COND_REDUCTION) { if (slp_node && SLP_TREE_LANES (slp_node) != 1) return false; @@ -8038,7 +8038,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; - vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); + reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); if (reduction_type == TREE_CODE_REDUCTION) { /* Check whether it's ok to change the order of the computation. From b9c369d900ccfbd2271028611af3f08b5cf6f998 Mon Sep 17 00:00:00 2001 From: Feng Xue Date: Sun, 16 Jun 2024 13:21:13 +0800 Subject: [PATCH 18/22] vect: Use an array to replace 3 relevant variables It's better to place 3 relevant independent variables into array, since we have requirement to access them via an index in the following patch. At the same time, this change may get some duplicated code be more compact. 2024-06-16 Feng Xue gcc/ * tree-vect-loop.cc (vect_transform_reduction): Replace vec_oprnds0/1/2 with one new array variable vec_oprnds[3]. 
--- gcc/tree-vect-loop.cc | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 27f77ed8b0b60..1d60ac47e5531 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -8580,9 +8580,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, /* Transform. */ tree new_temp = NULL_TREE; - auto_vec vec_oprnds0; - auto_vec vec_oprnds1; - auto_vec vec_oprnds2; + auto_vec vec_oprnds[3]; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); @@ -8630,14 +8628,15 @@ vect_transform_reduction (loop_vec_info loop_vinfo, definition. */ if (!cond_fn_p) { + gcc_assert (reduc_index >= 0 && reduc_index <= 2); vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, single_defuse_cycle && reduc_index == 0 - ? NULL_TREE : op.ops[0], &vec_oprnds0, + ? NULL_TREE : op.ops[0], &vec_oprnds[0], single_defuse_cycle && reduc_index == 1 - ? NULL_TREE : op.ops[1], &vec_oprnds1, + ? NULL_TREE : op.ops[1], &vec_oprnds[1], op.num_ops == 3 && !(single_defuse_cycle && reduc_index == 2) - ? op.ops[2] : NULL_TREE, &vec_oprnds2); + ? op.ops[2] : NULL_TREE, &vec_oprnds[2]); } else { @@ -8645,12 +8644,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo, vectype. */ gcc_assert (single_defuse_cycle && (reduc_index == 1 || reduc_index == 2)); - vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, - op.ops[0], truth_type_for (vectype_in), &vec_oprnds0, + vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, op.ops[0], + truth_type_for (vectype_in), &vec_oprnds[0], reduc_index == 1 ? NULL_TREE : op.ops[1], - NULL_TREE, &vec_oprnds1, + NULL_TREE, &vec_oprnds[1], reduc_index == 2 ? 
NULL_TREE : op.ops[2], - NULL_TREE, &vec_oprnds2); + NULL_TREE, &vec_oprnds[2]); } /* For single def-use cycles get one copy of the vectorized reduction @@ -8658,20 +8657,21 @@ vect_transform_reduction (loop_vec_info loop_vinfo, if (single_defuse_cycle) { vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1, - reduc_index == 0 ? op.ops[0] : NULL_TREE, &vec_oprnds0, - reduc_index == 1 ? op.ops[1] : NULL_TREE, &vec_oprnds1, + reduc_index == 0 ? op.ops[0] : NULL_TREE, + &vec_oprnds[0], + reduc_index == 1 ? op.ops[1] : NULL_TREE, + &vec_oprnds[1], reduc_index == 2 ? op.ops[2] : NULL_TREE, - &vec_oprnds2); + &vec_oprnds[2]); } bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); + unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length (); - unsigned num = (reduc_index == 0 - ? vec_oprnds1.length () : vec_oprnds0.length ()); for (unsigned i = 0; i < num; ++i) { gimple *new_stmt; - tree vop[3] = { vec_oprnds0[i], vec_oprnds1[i], NULL_TREE }; + tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE }; if (masked_loop_p && !mask_by_cond_expr) { /* No conditional ifns have been defined for dot-product yet. 
*/ @@ -8696,7 +8696,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, else { if (op.num_ops >= 3) - vop[2] = vec_oprnds2[i]; + vop[2] = vec_oprnds[2][i]; if (masked_loop_p && mask_by_cond_expr) { @@ -8727,14 +8727,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, } if (single_defuse_cycle && i < num - 1) - { - if (reduc_index == 0) - vec_oprnds0.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 1) - vec_oprnds1.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 2) - vec_oprnds2.safe_push (gimple_get_lhs (new_stmt)); - } + vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt)); else if (slp_node) slp_node->push_vec_def (new_stmt); else From ecbc96bb2873e453b0bd33d602ce34ad0d9d9cfd Mon Sep 17 00:00:00 2001 From: Feng Xue Date: Sun, 16 Jun 2024 13:33:52 +0800 Subject: [PATCH 19/22] vect: Tighten an assertion for lane-reducing in transform According to logic of code nearby the assertion, all lane-reducing operations should not appear, not just DOT_PROD_EXPR. Since "use_mask_by_cond_expr_p" treats SAD_EXPR same as DOT_PROD_EXPR, and WIDEN_SUM_EXPR should not be allowed by the following assertion "gcc_assert (commutative_binary_op_p (...))", so tighten the assertion. 2024-06-16 Feng Xue gcc/ * tree-vect-loop.cc (vect_transform_reduction): Change assertion to cover all lane-reducing ops. 
--- gcc/tree-vect-loop.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 1d60ac47e5531..347dac97e497e 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -8618,7 +8618,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, } bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); - gcc_assert (single_defuse_cycle || lane_reducing_op_p (code)); + bool lane_reducing = lane_reducing_op_p (code); + gcc_assert (single_defuse_cycle || lane_reducing); /* Create the destination vector */ tree scalar_dest = gimple_get_lhs (stmt_info->stmt); @@ -8674,8 +8675,9 @@ vect_transform_reduction (loop_vec_info loop_vinfo, tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE }; if (masked_loop_p && !mask_by_cond_expr) { - /* No conditional ifns have been defined for dot-product yet. */ - gcc_assert (code != DOT_PROD_EXPR); + /* No conditional ifns have been defined for lane-reducing op + yet. */ + gcc_assert (!lane_reducing); /* Make sure that the reduction accumulator is vop[0]. */ if (reduc_index == 1) From bea447a2982f3094aa3423b5045cea929f4f4700 Mon Sep 17 00:00:00 2001 From: Collin Funk Date: Wed, 19 Jun 2024 16:36:50 -0700 Subject: [PATCH 20/22] build: Fix missing variable quotes and typo When dlopen and pthread_create are in libc the variable is set to "none required", therefore running configure will show the following errors: ./configure: line 8997: test: too many arguments ./configure: line 8999: test: too many arguments ./configure: line 9003: test: too many arguments ./configure: line 9005: test: =: unary operator expected ChangeLog: PR bootstrap/115453 * configure.ac: Quote variable result of AC_SEARCH_LIBS. Fix typo ac_cv_search_pthread_crate. * configure: Regenerate. 
Signed-off-by: Collin Funk --- configure | 8 ++++---- configure.ac | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configure b/configure index 51576a41f3037..51bf1d1add185 100755 --- a/configure +++ b/configure @@ -8994,15 +8994,15 @@ if test "$ac_res" != no; then : fi -if test $ac_cv_search_dlopen = -ldl; then +if test "$ac_cv_search_dlopen" = -ldl; then CRAB1_LIBS="$CRAB1_LIBS -ldl" -elif test $ac_cv_search_dlopen = no; then +elif test "$ac_cv_search_dlopen" = no; then missing_rust_dynlibs="libdl" fi -if test $ac_cv_search_pthread_create = -lpthread; then +if test "$ac_cv_search_pthread_create" = -lpthread; then CRAB1_LIBS="$CRAB1_LIBS -lpthread" -elif test $ac_cv_search_pthread_crate = no; then +elif test "$ac_cv_search_pthread_create" = no; then missing_rust_dynlibs="$missing_rust_dynlibs, libpthread" fi diff --git a/configure.ac b/configure.ac index 5eda8dcdbf726..20457005e2993 100644 --- a/configure.ac +++ b/configure.ac @@ -2045,15 +2045,15 @@ missing_rust_dynlibs=none AC_SEARCH_LIBS([dlopen], [dl]) AC_SEARCH_LIBS([pthread_create], [pthread]) -if test $ac_cv_search_dlopen = -ldl; then +if test "$ac_cv_search_dlopen" = -ldl; then CRAB1_LIBS="$CRAB1_LIBS -ldl" -elif test $ac_cv_search_dlopen = no; then +elif test "$ac_cv_search_dlopen" = no; then missing_rust_dynlibs="libdl" fi -if test $ac_cv_search_pthread_create = -lpthread; then +if test "$ac_cv_search_pthread_create" = -lpthread; then CRAB1_LIBS="$CRAB1_LIBS -lpthread" -elif test $ac_cv_search_pthread_crate = no; then +elif test "$ac_cv_search_pthread_create" = no; then missing_rust_dynlibs="$missing_rust_dynlibs, libpthread" fi From 492b983aef615f329845af0929aa8560bd82a3f2 Mon Sep 17 00:00:00 2001 From: li1115 <103242378+zijunlii@users.noreply.github.com> Date: Thu, 20 Jun 2024 00:54:07 -0400 Subject: [PATCH 21/22] Create afmv_test_2.exp --- .../g++.target/aarch64/afmv/afmv_test_2.exp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 
gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp new file mode 100644 index 0000000000000..4182078ee1841 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp @@ -0,0 +1,15 @@ +# Load the DejaGnu framework +load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_2" +set srcfile "${srcdir}/afmv_test_2.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish From 2ddeca783d876b828d3d7b22925d0fd860a7b97c Mon Sep 17 00:00:00 2001 From: li1115 <103242378+zijunlii@users.noreply.github.com> Date: Thu, 20 Jun 2024 01:01:15 -0400 Subject: [PATCH 22/22] Add files via upload --- .../g++.target/aarch64/afmv/Makefile | 1 + .../g++.target/aarch64/afmv/afmv_test_1.c | 11 +++++++ .../g++.target/aarch64/afmv/afmv_test_1.exp | 15 ++++++++++ .../g++.target/aarch64/afmv/afmv_test_2.c | 11 +++++++ .../g++.target/aarch64/afmv/afmv_test_2.exp | 30 +++++++++---------- .../g++.target/aarch64/afmv/afmv_test_3.c | 11 +++++++ .../g++.target/aarch64/afmv/afmv_test_3.exp | 15 ++++++++++ 7 files changed, 79 insertions(+), 15 deletions(-) create mode 100644 gcc/testsuite/g++.target/aarch64/afmv/Makefile create mode 100644 gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c create mode 100644 gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp create mode 100644 gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c create mode 100644 gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c create mode 100644 gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp diff --git a/gcc/testsuite/g++.target/aarch64/afmv/Makefile b/gcc/testsuite/g++.target/aarch64/afmv/Makefile new file mode 100644 index 0000000000000..ebaf89cba1b5b --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/Makefile @@ -0,0 +1 @@ +dg-runtest $(srcdir)/gcc.target/aarch64/afmv/*.c diff 
--git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c new file mode 100644 index 0000000000000..a24e8e8805d60 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c @@ -0,0 +1,11 @@ +#include + +__attribute__((target_clones("default", "sse4.2", "avx2"))) +void foo() { + printf("Function foo\n"); +} + +int main() { + foo(); + return 0; +} diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp new file mode 100644 index 0000000000000..4f98486cc7537 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp @@ -0,0 +1,15 @@ +# Load the DejaGnu framework +load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_1" +set srcfile "${srcdir}/afmv_test_1.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c new file mode 100644 index 0000000000000..19133d253f104 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c @@ -0,0 +1,11 @@ +#include + +__attribute__((target_clones("default", "sse4.2", "avx2"))) +void bar() { + printf("Function bar\n"); +} + +int main() { + bar(); + return 0; +} diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp index 4182078ee1841..782baf0b5bb69 100644 --- a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp @@ -1,15 +1,15 @@ -# Load the DejaGnu framework -load_lib gcc-dg.exp - -# Set the source directory -set srcdir [file dirname [info script]] - -# Define the test -dg-init - -set testname "afmv_test_2" -set srcfile "${srcdir}/afmv_test_2.c" - -dg-runtest $srcfile "-O2 -fdump-tree-all" "" - -dg-finish +# Load the DejaGnu 
framework +load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_2" +set srcfile "${srcdir}/afmv_test_2.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c new file mode 100644 index 0000000000000..b20356ba86123 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c @@ -0,0 +1,11 @@ +#include + +__attribute__((target_clones("default", "sse4.2", "avx2"))) +inline void baz() { + printf("Function baz\n"); +} + +int main() { + baz(); + return 0; +} diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp new file mode 100644 index 0000000000000..9fa6b66ce6182 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp @@ -0,0 +1,15 @@ +# Load the DejaGnu framework +load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_3" +set srcfile "${srcdir}/afmv_test_3.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish