diff --git a/ChangeLog b/ChangeLog index bdd1e5e342422..201193fee8c0a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,35 @@ +2024-06-19 YunQiang Su + + Revert: + 2024-06-19 Collin Funk + + * configure.ac: Quote variable result of AC_SEARCH_LIBS. + * configure: Regenerate. + +2024-06-19 YunQiang Su + + Revert: + 2024-06-19 YunQiang Su + + PR bootstrap/115453 + * configure.ac: Fix typo ac_cv_search_pthread_crate. + * configure: Regnerate. + +2024-06-19 YunQiang Su + + PR bootstrap/115453 + * configure.ac: Fix typo ac_cv_search_pthread_crate. + * configure: Regnerate. + +2024-06-19 Collin Funk + + * configure.ac: Quote variable result of AC_SEARCH_LIBS. + * configure: Regenerate. + +2024-06-19 Ramana Radhakrishnan + + * MAINTAINERS: Update my email address. + 2024-06-18 Kyrylo Tkachov * MAINTAINERS (aarch64 port): Update my email address. diff --git a/configure b/configure index 1469cd735392a..51bf1d1add185 100755 --- a/configure +++ b/configure @@ -19746,7 +19746,7 @@ config.status configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" -Copyright (C) Free Software Foundation, Inc. +Copyright (C) 2012 Free Software Foundation, Inc. This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d64a751a55ea5..8610e76b07b30 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,143 @@ +2024-06-19 YunQiang Su + + Revert: + 2024-06-19 Collin Funk + + * configure.ac: Add missing quotation of variable + gcc_cv_as_mips_explicit_relocs. + * configure: Regenerate. + +2024-06-19 demin.han + + * config/riscv/riscv-vector-builtins-bases.cc: Remove eqne cond + * config/riscv/vector.md (@pred_eqne_scalar): Remove patterns + (*pred_eqne_scalar_merge_tie_mask): Ditto + (*pred_eqne_scalar): Ditto + (*pred_eqne_scalar_narrow): Ditto + +2024-06-19 Patrick O'Neill + + * common/config/riscv/riscv-common.cc: Add 'a' extension to + riscv_combine_info. + +2024-06-19 Jakub Jelinek + + PR tree-optimization/115544 + * gimple-lower-bitint.cc (gimple_lower_bitint): Disable optimizing + loads used by COMPLEX_EXPR operands. + +2024-06-19 mayshao + + * common/config/i386/cpuinfo.h (get_zhaoxin_cpu): Recognize shijidadao. + * common/config/i386/i386-common.cc: Add shijidadao. + * common/config/i386/i386-cpuinfo.h (enum processor_subtypes): + Add ZHAOXIN_FAM7H_SHIJIDADAO. + * config.gcc: Add shijidadao. + * config/i386/driver-i386.cc (host_detect_local_cpu): + Let -march=native recognize shijidadao processors. + * config/i386/i386-c.cc (ix86_target_macros_internal): Add shijidadao. + * config/i386/i386-options.cc (m_ZHAOXIN): Add m_SHIJIDADAO. + (m_SHIJIDADAO): New definition. + * config/i386/i386.h (enum processor_type): Add PROCESSOR_SHIJIDADAO. + * config/i386/x86-tune-costs.h (struct processor_costs): + Add shijidadao_cost. + * config/i386/x86-tune-sched.cc (ix86_issue_rate): Add shijidadao. + (ix86_adjust_cost): Ditto. + * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Add m_SHIJIDADAO. + (X86_TUNE_USE_GATHER_4PARTS): Ditto. + (X86_TUNE_USE_GATHER_8PARTS): Ditto. + (X86_TUNE_AVOID_128FMA_CHAINS): Ditto. + * doc/extend.texi: Add details about shijidadao. + * doc/invoke.texi: Ditto. + +2024-06-19 Takayuki 'January June' Suwa + + * config/xtensa/xtensa.cc (print_operand): + When outputting MEMW before the instruction, check if the previous + instruction is already that. 
+ +2024-06-19 Andre Vieira + Stam Markianos-Wright + + * config/arm/arm-protos.h (arm_target_bb_ok_for_lob): Change + declaration to pass basic_block. + (arm_attempt_dlstp_transform): New declaration. + * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): Define targethook. + (TARGET_PREDICT_DOLOOP_P): Likewise. + (arm_target_bb_ok_for_lob): Adapt condition. + (arm_mve_get_vctp_lanes): New function. + (arm_dl_usage_type): New internal enum. + (arm_get_required_vpr_reg): New function. + (arm_get_required_vpr_reg_param): New function. + (arm_get_required_vpr_reg_ret_val): New function. + (arm_mve_get_loop_vctp): New function. + (arm_mve_insn_predicated_by): New function. + (arm_mve_across_lane_insn_p): New function. + (arm_mve_load_store_insn_p): New function. + (arm_mve_impl_pred_on_outputs_p): New function. + (arm_mve_impl_pred_on_inputs_p): New function. + (arm_last_vect_def_insn): New function. + (arm_mve_impl_predicated_p): New function. + (arm_mve_check_reg_origin_is_num_elems): New function. + (arm_mve_dlstp_check_inc_counter): New function. + (arm_mve_dlstp_check_dec_counter): New function. + (arm_mve_loop_valid_for_dlstp): New function. + (arm_predict_doloop_p): New function. + (arm_loop_unroll_adjust): New function. + (arm_emit_mve_unpredicated_insn_to_seq): New function. + (arm_attempt_dlstp_transform): New function. + * config/arm/arm.opt (mdlstp): New option. + * config/arm/iterators.md (dlstp_elemsize, letp_num_lanes, + letp_num_lanes_neg, letp_num_lanes_minus_1): New attributes. + (DLSTP, LETP): New iterators. + * config/arm/mve.md (predicated_doloop_end_internal, + dlstp_insn): New insn patterns. + * config/arm/thumb2.md (doloop_end): Adapt to support tail-predicated + loops. + (doloop_begin): Likewise. + * config/arm/types.md (mve_misc): New mve type to represent + predicated_loop_end insn sequences. + * config/arm/unspecs.md: + (DLSTP8, DLSTP16, DLSTP32, DSLTP64, + LETP8, LETP16, LETP32, LETP64): New unspecs for DLSTP and LETP. + +2024-06-19 Andre Vieira + Stam Markianos-Wright + + * df-core.cc (df_bb_regno_only_def_find): New helper function. + * df.h (df_bb_regno_only_def_find): Declare new function. + * loop-doloop.cc (doloop_condition_get): Add support for detecting + predicated vectorized hardware loops. + (doloop_modify): Add support for GTU condition checks. + (doloop_optimize): Update costing computation to support alterations to + desc->niter_expr by the backend. + +2024-06-19 Collin Funk + + * configure.ac: Add missing quotation of variable + gcc_cv_as_mips_explicit_relocs. + * configure: Regenerate. + +2024-06-19 Takayuki 'January June' Suwa + + * config/xtensa/xtensa-protos.h (xtensa_constantsynth): + Change the second argument from HOST_WIDE_INT to rtx. + * config/xtensa/xtensa.cc (#include): + Add "context.h" and "pass_manager.h". + (machine_function): Add a new hash_map field "litpool_usage". + (xtensa_constantsynth): Make "src" (the second operand) accept + RTX literal instead of its value, and treat both bare and pooled + SI/SFmode literals equally by bit-exact canonicalization into + CONST_INT RTX internally. And then, make avoid synthesis if + such multiple identical canonicalized literals are found in same + function when optimizing for size. Finally, for literals where + synthesis is not possible or has been avoided, re-emit "move" + RTXes with canonicalized ones to increase the chances of sharing + literal pool entries. 
+ * config/xtensa/xtensa.md (split patterns for constant synthesis): + Change to simply invoke xtensa_constantsynth() as mentioned above, + and add new patterns for when TARGET_AUTO_LITPOOLS is enabled. + 2024-06-18 Edwin Lu Robin Dapp diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index 6fe37f7c38677..9df1831b6e340 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20240619 +20240620 diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 4610bf6d6a458..936039725ab6c 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -667,12 +667,18 @@ get_zhaoxin_cpu (struct __processor_model *cpu_model, reset_cpu_feature (cpu_model, cpu_features2, FEATURE_F16C); cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_LUJIAZUI; } - else if (model >= 0x5b) + else if (model == 0x5b) { cpu = "yongfeng"; CHECK___builtin_cpu_is ("yongfeng"); cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_YONGFENG; } + else if (model >= 0x6b) + { + cpu = "shijidadao"; + CHECK___builtin_cpu_is ("shijidadao"); + cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_SHIJIDADAO; + } break; default: break; diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 5d9c188c9c7db..e38b1b22ffb10 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -2066,6 +2066,7 @@ const char *const processor_names[] = "intel", "lujiazui", "yongfeng", + "shijidadao", "geode", "k6", "athlon", @@ -2271,10 +2272,13 @@ const pta processor_alias_table[] = | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR, 0, P_NONE}, {"lujiazui", PROCESSOR_LUJIAZUI, CPU_LUJIAZUI, PTA_LUJIAZUI, - M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_NONE}, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_PROC_BMI}, {"yongfeng", PROCESSOR_YONGFENG, CPU_YONGFENG, PTA_YONGFENG, - M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_NONE}, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_PROC_AVX2}, + {"shijidadao", PROCESSOR_SHIJIDADAO, CPU_YONGFENG, + PTA_YONGFENG, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_SHIJIDADAO), P_PROC_AVX2}, {"k8", PROCESSOR_K8, CPU_K8, PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR, 0, P_NONE}, diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h index 3ec9e005a6ad5..ccc6deb63853e 100644 --- a/gcc/common/config/i386/i386-cpuinfo.h +++ b/gcc/common/config/i386/i386-cpuinfo.h @@ -104,6 +104,7 @@ enum processor_subtypes INTEL_COREI7_PANTHERLAKE, ZHAOXIN_FAM7H_YONGFENG, AMDFAM1AH_ZNVER5, + ZHAOXIN_FAM7H_SHIJIDADAO, CPU_SUBTYPE_MAX }; diff --git a/gcc/common/config/riscv/riscv-common.cc b/gcc/common/config/riscv/riscv-common.cc index 1dc1d9904c7bd..410e673f5e017 100644 --- a/gcc/common/config/riscv/riscv-common.cc +++ b/gcc/common/config/riscv/riscv-common.cc @@ -401,6 +401,7 @@ static const struct riscv_ext_version riscv_ext_version_table[] = /* Combine extensions defined in this table */ static const struct riscv_ext_version riscv_combine_info[] = { + {"a", ISA_SPEC_CLASS_20191213, 2, 1}, {"zk", ISA_SPEC_CLASS_NONE, 1, 0}, {"zkn", ISA_SPEC_CLASS_NONE, 1, 0}, {"zks", ISA_SPEC_CLASS_NONE, 1, 0}, diff --git a/gcc/config.gcc b/gcc/config.gcc index e500ba63e3222..644c456290dc1 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -711,9 +711,9 @@ atom slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \ silvermont skylake-avx512 cannonlake icelake-client icelake-server \ skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \ sapphirerapids alderlake rocketlake eden-x2 nano 
nano-1000 nano-2000 nano-3000 \
-nano-x2 eden-x4 nano-x4 lujiazui yongfeng x86-64 x86-64-v2 x86-64-v3 x86-64-v4 \
-sierraforest graniterapids graniterapids-d grandridge arrowlake arrowlake-s \
-clearwaterforest pantherlake native"
+nano-x2 eden-x4 nano-x4 lujiazui yongfeng shijidadao x86-64 x86-64-v2 \
+x86-64-v3 x86-64-v4 sierraforest graniterapids graniterapids-d grandridge \
+arrowlake arrowlake-s clearwaterforest pantherlake native"
 
 # Additional x86 processors supported by --with-cpu=.  Each processor
 # MUST be separated by exactly one space.
@@ -3855,6 +3855,10 @@ case ${target} in
 			arch=yongfeng
 			cpu=yongfeng
 			;;
+		shijidadao-*)
+			arch=shijidadao
+			cpu=shijidadao
+			;;
 		pentium2-*)
 			arch=pentium2
 			cpu=pentium2
 			;;
@@ -3980,6 +3984,10 @@ case ${target} in
 			arch=yongfeng
 			cpu=yongfeng
 			;;
+		shijidadao-*)
+			arch=shijidadao
+			cpu=shijidadao
+			;;
 		nocona-*)
 			arch=nocona
 			cpu=nocona
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2cd560c99254b..34d6be76e94ac 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
 extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern int arm_attempt_dlstp_transform (rtx);
 #ifdef RTX_CODE
 enum reg_class arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index b8c32db0a1d7f..7d67d2cfee9f4 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] =
 #undef TARGET_HAVE_CONDITIONAL_EXECUTION
 #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
 
@@ -34659,19 +34665,1236 @@ arm_invalid_within_doloop (const rtx_insn *insn)
 }
 
 bool
-arm_target_insn_ok_for_lob (rtx insn)
-{
-  basic_block bb = BLOCK_FOR_INSN (insn);
-  /* Make sure the basic block of the target insn is a simple latch
-     having as single predecessor and successor the body of the loop
-     itself.  Only simple loops with a single basic block as body are
-     supported for 'low over head loop' making sure that LE target is
-     above LE itself in the generated code.  */
-
-  return single_succ_p (bb)
-    && single_pred_p (bb)
-    && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-    && contains_no_active_insn_p (bb);
+arm_target_bb_ok_for_lob (basic_block bb)
+{
+  /* Make sure the basic block is a simple latch having as the single
+     predecessor and successor the body of the loop itself.
+     Only simple loops with a single basic block as body are supported for
+     low-overhead loops, making sure that the LE target is above the LE
+     instruction in the generated code.  */
+  return (single_succ_p (bb)
+	  && single_pred_p (bb)
+	  && single_succ_edge (bb)->dest == single_pred_edge (bb)->src);
+}
+
+/* Utility function: Given a VCTP or a VCTP_M insn, return the number of MVE
+   lanes based on the machine mode being used.
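+
+   For example, MVE vectors are 128 bits wide, so a vctp.16 insn produces a
+   V8BImode predicate and this function returns 8 lanes, while a vctp.32
+   insn (V4BImode) yields 4; an insn whose mode is not a valid MVE predicate
+   mode yields 0.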
*/
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+  rtx insn_set = single_set (insn);
+  if (insn_set
+      && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+      && (XINT (SET_SRC (insn_set), 1) == VCTP
+	  || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+    {
+      machine_mode mode = GET_MODE (SET_SRC (insn_set));
+      return ((VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+	      ? GET_MODE_NUNITS (mode) : 0);
+    }
+  return 0;
+}
+
+enum arm_dl_usage_type { DL_USAGE_ANY = 0,
+			 DL_USAGE_READ = 1,
+			 DL_USAGE_WRITE = 2 };
+
+/* Check if INSN requires the use of the VPR reg; if it does, return the
+   sub-rtx of the VPR reg.  The TYPE argument controls whether
+   this function should:
+   * For TYPE == DL_USAGE_ANY, check all operands, including the OUT operands,
+     and return the first occurrence of the VPR reg.
+   * For TYPE == DL_USAGE_READ, only check the input operands.
+   * For TYPE == DL_USAGE_WRITE, only check the output operands.
+   (INOUT operands are considered both as input and output operands)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn,
+			  arm_dl_usage_type type = DL_USAGE_ANY)
+{
+  gcc_assert (type < 3);
+  if (!NONJUMP_INSN_P (insn))
+    return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+    return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+     this insn.  */
+  preprocess_constraints (insn);
+
+  for (int op = 0; op < n_operands; op++)
+    {
+      requires_vpr = true;
+      if (type == DL_USAGE_READ
+	  && recog_data.operand_type[op] == OP_OUT)
+	continue;
+      else if (type == DL_USAGE_WRITE
+	       && recog_data.operand_type[op] == OP_IN)
+	continue;
+
+      /* Iterate through alternatives of operand "op" in recog_op_alt and
+	 identify if the operand is required to be the VPR.  */
+      for (int alt = 0; alt < recog_data.n_alternatives; alt++)
+	{
+	  const operand_alternative *op_alt
+	    = &recog_op_alt[alt * n_operands];
+	  /* Fetch the reg_class for each entry and check it against the
+	     VPR_REG reg_class.  */
+	  if (alternative_class (op_alt, op) != VPR_REG)
+	    requires_vpr = false;
+	}
+      /* If all alternatives of the insn require the VPR reg for this operand,
+	 it means that either this is a VPR-generating instruction, like a
+	 vctp, vcmp, etc., or it is a VPT-predicated instruction.  Return the
+	 subrtx of the VPR reg operand.  */
+      if (requires_vpr)
+	return recog_data.operand[op];
+    }
+  return NULL_RTX;
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_READ,
+   so return the VPR only if it is an input operand to the insn.  */
+
+static rtx
+arm_get_required_vpr_reg_param (rtx_insn *insn)
+{
+  return arm_get_required_vpr_reg (insn, DL_USAGE_READ);
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_WRITE,
+   so return the VPR only if it is the return value, an output of, or is
+   clobbered by the insn.  */
+
+static rtx
+arm_get_required_vpr_reg_ret_val (rtx_insn *insn)
+{
+  return arm_get_required_vpr_reg (insn, DL_USAGE_WRITE);
+}
+
+/* Return the first VCTP instruction in BB, if it exists, or NULL otherwise.  */
+
+static rtx_insn *
+arm_mve_get_loop_vctp (basic_block bb)
+{
+  rtx_insn *insn = BB_HEAD (bb);
+
+  /* Now scan through all the instruction patterns and pick out the VCTP
+     instruction.  We require arm_get_required_vpr_reg_param to be false
+     to make sure we pick up a VCTP, rather than a VCTP_M.
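+     (A VCTP_M is the variant of VCTP that executes inside a VPT block and
+     therefore takes an existing predicate as an input parameter as well as
+     producing one, so arm_get_required_vpr_reg_param is non-null for it and
+     it is rejected here.)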
*/
+  FOR_BB_INSNS (bb, insn)
+    if (NONDEBUG_INSN_P (insn))
+      if (arm_get_required_vpr_reg_ret_val (insn)
+	  && (arm_mve_get_vctp_lanes (insn) != 0)
+	  && !arm_get_required_vpr_reg_param (insn))
+	return insn;
+  return NULL;
+}
+
+/* Return true if INSN is an MVE instruction that is VPT-predicable and is
+   predicated on VPR_REG.  */
+
+static bool
+arm_mve_insn_predicated_by (rtx_insn *insn, rtx vpr_reg)
+{
+  rtx insn_vpr_reg_operand = (MVE_VPT_PREDICATED_INSN_P (insn)
+			      ? arm_get_required_vpr_reg_param (insn)
+			      : NULL_RTX);
+  return (insn_vpr_reg_operand
+	  && rtx_equal_p (vpr_reg, insn_vpr_reg_operand));
+}
+
+/* Utility function to identify if INSN is an MVE instruction that performs
+   some across lane operation (and as a result does not align with normal
+   lane predication rules).  All such instructions give only one scalar
+   output, except for vshlcq which gives a PARALLEL of a vector and a scalar
+   (one vector result and one carry output).  */
+
+static bool
+arm_mve_across_lane_insn_p (rtx_insn* insn)
+{
+  df_ref insn_defs = NULL;
+  if (!MVE_VPT_PREDICABLE_INSN_P (insn))
+    return false;
+
+  FOR_EACH_INSN_DEF (insn_defs, insn)
+    if (!VALID_MVE_MODE (GET_MODE (DF_REF_REG (insn_defs)))
+	&& !arm_get_required_vpr_reg_ret_val (insn))
+      return true;
+
+  return false;
+}
+
+/* Utility function to identify if INSN is an MVE load or store instruction.
+   * For TYPE == DL_USAGE_ANY, check all operands.  If the function returns
+     true, INSN is a load or a store insn.
+   * For TYPE == DL_USAGE_READ, only check the input operands.  If the
+     function returns true, INSN is a load insn.
+   * For TYPE == DL_USAGE_WRITE, only check the output operands.  If the
+     function returns true, INSN is a store insn.  */
+
+static bool
+arm_mve_load_store_insn_p (rtx_insn* insn,
+			   arm_dl_usage_type type = DL_USAGE_ANY)
+{
+  gcc_assert (type < 3);
+  int n_operands = recog_data.n_operands;
+  extract_insn (insn);
+
+  for (int op = 0; op < n_operands; op++)
+    {
+      if (type == DL_USAGE_READ && recog_data.operand_type[op] == OP_OUT)
+	continue;
+      else if (type == DL_USAGE_WRITE && recog_data.operand_type[op] == OP_IN)
+	continue;
+      if (mve_memory_operand (recog_data.operand[op],
+			      GET_MODE (recog_data.operand[op])))
+	return true;
+    }
+  return false;
+}
+
+/* Return TRUE if INSN is validated for implicit predication by how its
+   outputs are used.
+
+   If INSN is an MVE operation across lanes that is not predicated by
+   VCTP_VPR_GENERATED it cannot be validated by the use of its outputs.
+
+   Any other INSN is safe to implicitly predicate if we don't use its outputs
+   outside the loop.  The instructions that use this INSN's outputs will be
+   validated as we go through the analysis.  */
+
+static bool
+arm_mve_impl_pred_on_outputs_p (rtx_insn *insn, rtx vctp_vpr_generated)
+{
+  /* Reject any unpredicated across lane operation.  */
+  if (!arm_mve_insn_predicated_by (insn, vctp_vpr_generated)
+      && arm_mve_across_lane_insn_p (insn))
+    return false;
+
+  /* Next, scan forward to the various USEs of the DEFs in this insn.
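+
+     As an illustration, a vldrw whose destination is only read by other
+     insns in this same loop body may safely become implicitly predicated,
+     whereas if its destination were also read after the loop the implicit
+     predication could alter that later use, so any use outside this basic
+     block rejects the insn.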
*/
+  df_ref insn_def = NULL;
+  basic_block insn_bb = BLOCK_FOR_INSN (insn);
+  FOR_EACH_INSN_DEF (insn_def, insn)
+    {
+      for (df_ref use = DF_REG_USE_CHAIN (DF_REF_REGNO (insn_def));
+	   use;
+	   use = DF_REF_NEXT_REG (use))
+	{
+	  rtx_insn *next_use_insn = DF_REF_INSN (use);
+	  if (!INSN_P (next_use_insn) || DEBUG_INSN_P (next_use_insn))
+	    continue;
+
+	  if (insn_bb != BLOCK_FOR_INSN (next_use_insn))
+	    return false;
+	}
+    }
+  return true;
+}
+
+
+/* Returns the prevailing definition of OP before CUR_INSN in the same
+   basic block as CUR_INSN, if one exists; returns NULL otherwise.  */
+
+static rtx_insn*
+arm_last_vect_def_insn (rtx op, rtx_insn *cur_insn)
+{
+  if (!REG_P (op)
+      || !BLOCK_FOR_INSN (cur_insn))
+    return NULL;
+
+  df_ref def_insns;
+  rtx_insn *last_def = NULL;
+  for (def_insns = DF_REG_DEF_CHAIN (REGNO (op));
+       def_insns;
+       def_insns = DF_REF_NEXT_REG (def_insns))
+    {
+      rtx_insn *def_insn = DF_REF_INSN (def_insns);
+      /* Definition not in the loop body or after the current insn.  */
+      if (DF_REF_BB (def_insns) != BLOCK_FOR_INSN (cur_insn)
+	  || INSN_UID (def_insn) >= INSN_UID (cur_insn))
+	continue;
+
+      if (!last_def || INSN_UID (def_insn) > INSN_UID (last_def))
+	last_def = def_insn;
+    }
+  return last_def;
+}
+
+
+/* This function returns TRUE if we can validate the implicit predication of
+   INSN_IN with VCTP_VPR_GENERATED based on the definition of the
+   instruction's input operands.
+
+   If INSN_IN is an MVE operation across lanes then all of its MVE vector
+   operands must have their tail-predicated lanes be zeroes.  We keep track
+   of any instructions that define vector operands for which this is true in
+   PROPS_ZERO_SET.
+
+   For any other INSN_IN, the definition of all its operands must be defined
+   inside the loop body by an instruction that comes before INSN_IN and not
+   be an MVE load predicated by a different VPR.  These instructions have all
+   been validated for explicit or implicit predication.
+ */
+
+static bool
+arm_mve_impl_pred_on_inputs_p (vec<rtx_insn *> *props_zero_set,
+			       rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+  /* If all inputs come from instructions that are explicitly or
+     implicitly predicated by the same predicate then it is safe to
+     implicitly predicate this instruction.  */
+  df_ref insn_uses = NULL;
+  bool across_lane = arm_mve_across_lane_insn_p (insn_in);
+  FOR_EACH_INSN_USE (insn_uses, insn_in)
+    {
+      rtx op = DF_REF_REG (insn_uses);
+      rtx_insn *def_insn = arm_last_vect_def_insn (op, insn_in);
+      if (across_lane)
+	{
+	  if (!VALID_MVE_MODE (GET_MODE (op)))
+	    continue;
+	  if (!def_insn || !props_zero_set->contains (def_insn))
+	    return false;
+
+	  continue;
+	}
+
+      if (!def_insn
+	  || (!arm_mve_insn_predicated_by (def_insn, vctp_vpr_generated)
+	      && arm_mve_load_store_insn_p (def_insn, DL_USAGE_READ)))
+	return false;
+    }
+
+  return true;
+}
+
+
+/* Determine whether INSN_IN is safe to implicitly predicate based on the
+   type of instruction and, where needed, the definition of its inputs and
+   the uses of its outputs.
+   Return TRUE if it is safe to implicitly predicate and FALSE otherwise.
+
+   * If INSN_IN is a store, then it is always unsafe to implicitly predicate
+     it.
+   * If INSN_IN is a load, only reject implicit predication if its uses
+     directly invalidate it.
+   * If INSN_IN operates across vector lanes and does not have the
+     "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to
+     implicitly predicate.
+   * If INSN_IN operates on Floating Point elements and we are not compiling
+     with -Ofast, then it is unsafe to implicitly predicate it as we may be
+     changing exception and cumulative bits behaviour.
+   * If INSN_IN is a VCTP instruction, then it is safe to implicitly
+     predicate, but instructions that use this predicate will need to be
+     checked just like any other UNPREDICATED MVE instruction.
+   * Otherwise check if INSN_IN's inputs or uses of outputs can validate its
+     implicit predication.
+
+   * If all inputs come from instructions that are explicitly or implicitly
+     predicated by the same predicate then it is safe to implicitly predicate
+     this instruction.
+   * If INSN_IN is an operation across lanes with the
+     "mve_safe_imp_xlane_pred" attribute, then all its operands must have
+     zeroed falsely-predicated tail lanes.
+
+   * Otherwise, check if the implicit predication of INSN_IN can be validated
+     based on its inputs, and if not check whether it can be validated based
+     on how its outputs are used.  */
+
+static bool
+arm_mve_impl_predicated_p (vec<rtx_insn *> *props_zero_set,
+			   rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+
+  /* If INSN_IN is a store, then it is always unsafe to implicitly
+     predicate it.  */
+  if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_WRITE))
+    return false;
+
+  /* If INSN_IN is a load, only reject implicit predication if its uses
+     directly invalidate it.  */
+  if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_READ))
+    {
+      if (!arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated))
+	return false;
+      return true;
+    }
+
+  /* If INSN_IN operates across vector lanes and does not have the
+     "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to
+     implicitly predicate.  */
+  if (arm_mve_across_lane_insn_p (insn_in)
+      && (get_attr_mve_safe_imp_xlane_pred (insn_in)
+	  != MVE_SAFE_IMP_XLANE_PRED_YES))
+    return false;
+
+  /* If INSN_IN operates on Floating Point elements and we are not compiling
+     with -Ofast, then it is unsafe to implicitly predicate it as we may be
+     changing exception and cumulative bits behaviour.  */
+  if (!flag_unsafe_math_optimizations
+      && flag_trapping_math
+      && MVE_VPT_UNPREDICATED_INSN_P (insn_in))
+    {
+      df_ref def;
+      FOR_EACH_INSN_DEF (def, insn_in)
+	if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+	    && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+	  return false;
+      FOR_EACH_INSN_USE (def, insn_in)
+	if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+	    && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+	  return false;
+    }
+
+  /* If INSN_IN is a VCTP instruction, then it is safe to implicitly
+     predicate, but instructions that use this predicate will need to be
+     checked just like any other UNPREDICATED MVE instruction.  */
+  if (arm_get_required_vpr_reg_ret_val (insn_in)
+      && (arm_mve_get_vctp_lanes (insn_in) != 0))
+    return true;
+
+  /* Otherwise, check if the implicit predication of INSN_IN can be validated
+     based on its inputs, and if not check whether it can be validated based
+     on how its outputs are used.  */
+  return (arm_mve_impl_pred_on_inputs_p (props_zero_set, insn_in,
+					 vctp_vpr_generated)
+	  || arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated));
+}
+
+/* Helper function to `arm_mve_dlstp_check_inc_counter` and to
+   `arm_mve_dlstp_check_dec_counter`.  In the situations where the loop
+   counter is incrementing by 1 or decrementing by 1 in each iteration,
+   ensure that the number of iterations, the value of REG, going into the
+   loop, was calculated as:
+     REG = (N + [1, VCTP_STEP - 1]) / VCTP_STEP
+
+   where N is equivalent to the VCTP_REG.
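+
+   For example, with VCTP_STEP == 4 (a vctp.32 loop) this accepts the usual
+   rounded-up iteration count REG = (N + 3) >> 2, i.e. a PLUS of N and any
+   constant in [1, 3] followed by a logical right shift by log2 (VCTP_STEP).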
+*/
+
+static bool
+arm_mve_check_reg_origin_is_num_elems (loop *loop, rtx reg, rtx vctp_step,
+				       rtx vctp_reg)
+{
+  df_ref counter_max_last_def = NULL;
+
+  /* More than one reaching definition.  */
+  if (DF_REG_DEF_COUNT (REGNO (reg)) > 2)
+    return false;
+
+  /* Look for a single definition of REG going into the loop.  The DEF_CHAIN
+     will have at least two values, as this is a loop induction variable that
+     is defined outside the loop.  */
+  for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
+       def;
+       def = DF_REF_NEXT_REG (def))
+    {
+      /* Skip the update inside the loop; this has already been checked by
+	 the iv_analyze call earlier.  */
+      if (DF_REF_BB (def) == loop->header)
+	continue;
+
+      counter_max_last_def = def;
+      break;
+    }
+
+  if (!counter_max_last_def)
+    return false;
+
+  rtx counter_max_last_set = single_set (DF_REF_INSN (counter_max_last_def));
+
+  if (!counter_max_last_set)
+    return false;
+
+  /* If we encounter a simple SET from a REG, follow it through.  */
+  if (REG_P (SET_SRC (counter_max_last_set)))
+    {
+      if (DF_REG_DEF_COUNT (REGNO (SET_SRC (counter_max_last_set))) != 1)
+	return false;
+
+      counter_max_last_def
+	= DF_REG_DEF_CHAIN (REGNO (SET_SRC (counter_max_last_set)));
+      counter_max_last_set
+	= single_set (DF_REF_INSN (counter_max_last_def));
+
+      if (!counter_max_last_set)
+	return false;
+    }
+
+  /* We are looking for:
+     COUNTER_MAX_LAST_SET = (N + VCTP_STEP - 1) / VCTP_STEP.
+     We currently only support the unsigned VCTP_OP case.  */
+  rtx division = SET_SRC (counter_max_last_set);
+  if (GET_CODE (division) != LSHIFTRT)
+    return false;
+
+  /* Now check that we are dividing by VCTP_STEP, i.e. the number of lanes.  */
+  rtx divisor = XEXP (division, 1);
+  unsigned vctp_step_cst = abs_hwi (INTVAL (vctp_step));
+  if (!CONST_INT_P (divisor)
+      || (1U << INTVAL (divisor) != vctp_step_cst))
+    return false;
+
+  rtx dividend = XEXP (division, 0);
+  if (!REG_P (dividend))
+    /* Subreg?  */
+    return false;
+
+  /* For now only support the simple case; this only works for unsigned N,
+     as any signed N will have further computations to deal with overflow.  */
+  if (DF_REG_DEF_COUNT (REGNO (dividend)) != 1)
+    return false;
+
+  rtx_insn *dividend_insn = DF_REF_INSN (DF_REG_DEF_CHAIN (REGNO (dividend)));
+  rtx dividend_op = single_set (dividend_insn);
+  if (!dividend_op
+      || GET_CODE (SET_SRC (dividend_op)) != PLUS)
+    return false;
+
+  /* Check if PLUS_OP is (VCTP_OP + VAL), where VAL = [1, VCTP_STEP - 1].  */
+  rtx plus_op = SET_SRC (dividend_op);
+  if (!REG_P (XEXP (plus_op, 0))
+      || !CONST_INT_P (XEXP (plus_op, 1))
+      || !IN_RANGE (INTVAL (XEXP (plus_op, 1)), 1, vctp_step_cst - 1))
+    return false;
+
+  /* VCTP_REG may have been copied before entering the loop; let's see if we
+     can trace such a copy back.  If we have more than one reaching
+     definition then bail out as analysis will be too difficult.  */
+  if (DF_REG_DEF_COUNT (REGNO (vctp_reg)) > 2)
+    return false;
+
+  /* Look for the definition of N.
*/
+  for (df_ref def = DF_REG_DEF_CHAIN (REGNO (vctp_reg));
+       def;
+       def = DF_REF_NEXT_REG (def))
+    {
+      if (DF_REF_BB (def) == loop->header)
+	continue;
+      rtx set = single_set (DF_REF_INSN (def));
+      if (set
+	  && REG_P (SET_SRC (set))
+	  && !HARD_REGISTER_P (SET_SRC (set)))
+	vctp_reg = SET_SRC (set);
+    }
+
+  return rtx_equal_p (vctp_reg, XEXP (plus_op, 0));
+}
+
+/* If we have identified the loop to have an incrementing counter, we need to
+   make sure that it increments by 1 and that the loop is structured
+   correctly:
+   * The counter starts from 0
+   * The counter terminates at (num_of_elem + num_of_lanes - 1) / num_of_lanes
+   * The vctp insn uses a reg that decrements appropriately in each iteration.
+*/
+
+static rtx_insn*
+arm_mve_dlstp_check_inc_counter (loop *loop, rtx_insn* vctp_insn,
+				 rtx condconst, rtx condcount)
+{
+  rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0);
+  /* The loop latch has to be empty.  When compiling all the known MVE LoLs
+     in user applications, none of those with incrementing counters had any
+     real insns in the loop latch.  As such, this function has only been
+     tested with an empty latch and may misbehave or ICE if we somehow get
+     here with an increment in the latch, so, for correctness, error out
+     early.  */
+  if (!empty_block_p (loop->latch))
+    return NULL;
+
+  class rtx_iv vctp_reg_iv;
+  /* For loops of DLSTP_TYPE_B, the loop counter is independent of the
+     decrement of the reg used in the vctp_insn.  So run iv analysis on that
+     reg.  This has to succeed for such loops to be supported.  */
+  if (!iv_analyze (vctp_insn, as_a <scalar_int_mode> (GET_MODE (vctp_reg)),
+		   vctp_reg, &vctp_reg_iv))
+    return NULL;
+
+  /* Extract the decrementnum of the vctp reg from the iv.  This decrementnum
+     is the number of lanes/elements it decrements from the remaining number
+     of lanes/elements to process in the loop; for this reason it is always a
+     negative number, but to simplify later checks we use its absolute value.  */
+  HOST_WIDE_INT decrementnum = INTVAL (vctp_reg_iv.step);
+  if (decrementnum >= 0)
+    return NULL;
+  decrementnum = abs_hwi (decrementnum);
+
+  /* Find where both of those are modified in the loop header bb.  */
+  df_ref condcount_reg_set_df = df_bb_regno_only_def_find (loop->header,
+							   REGNO (condcount));
+  df_ref vctp_reg_set_df = df_bb_regno_only_def_find (loop->header,
+						      REGNO (vctp_reg));
+  if (!condcount_reg_set_df || !vctp_reg_set_df)
+    return NULL;
+  rtx condcount_reg_set = single_set (DF_REF_INSN (condcount_reg_set_df));
+  rtx vctp_reg_set = single_set (DF_REF_INSN (vctp_reg_set_df));
+  if (!condcount_reg_set || !vctp_reg_set)
+    return NULL;
+
+  /* Ensure the modification of the vctp reg from df is consistent with
+     the iv and the number of lanes on the vctp insn.  */
+  if (GET_CODE (SET_SRC (vctp_reg_set)) != PLUS
+      || !REG_P (SET_DEST (vctp_reg_set))
+      || !REG_P (XEXP (SET_SRC (vctp_reg_set), 0))
+      || REGNO (SET_DEST (vctp_reg_set))
+	 != REGNO (XEXP (SET_SRC (vctp_reg_set), 0))
+      || !CONST_INT_P (XEXP (SET_SRC (vctp_reg_set), 1))
+      || INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)) >= 0
+      || decrementnum != abs_hwi (INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)))
+      || decrementnum != arm_mve_get_vctp_lanes (vctp_insn))
+    return NULL;
+
+  if (REG_P (condcount) && REG_P (condconst))
+    {
+      /* First we need to prove that the loop is going 0..condconst with an
+	 inc of 1 in each iteration.
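+
+	 For example (illustrative), a DLSTP_TYPE_B loop compiled from
+	   for (i = 0; i < (n + 3) / 4; i++)
+	 satisfies these checks: the header increments i by 1, the only other
+	 definition of i sets it to 0, and the bound is the rounded-up
+	 element count checked by arm_mve_check_reg_origin_is_num_elems.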
*/ + if (GET_CODE (SET_SRC (condcount_reg_set)) == PLUS + && CONST_INT_P (XEXP (SET_SRC (condcount_reg_set), 1)) + && INTVAL (XEXP (SET_SRC (condcount_reg_set), 1)) == 1) + { + rtx counter_reg = SET_DEST (condcount_reg_set); + /* Check that the counter did indeed start from zero. */ + df_ref this_set = DF_REG_DEF_CHAIN (REGNO (counter_reg)); + if (!this_set) + return NULL; + df_ref last_set_def = DF_REF_NEXT_REG (this_set); + if (!last_set_def) + return NULL; + rtx_insn* last_set_insn = DF_REF_INSN (last_set_def); + rtx last_set = single_set (last_set_insn); + if (!last_set) + return NULL; + rtx counter_orig_set; + counter_orig_set = SET_SRC (last_set); + if (!CONST_INT_P (counter_orig_set) + || (INTVAL (counter_orig_set) != 0)) + return NULL; + /* And finally check that the target value of the counter, + condconst, is of the correct shape. */ + if (!arm_mve_check_reg_origin_is_num_elems (loop, condconst, + vctp_reg_iv.step, + vctp_reg)) + return NULL; + } + else + return NULL; + } + else + return NULL; + + /* Everything looks valid. */ + return vctp_insn; +} + +/* Helper function to `arm_mve_loop_valid_for_dlstp`. In the case of a + counter that is decrementing, ensure that it is decrementing by the + right amount in each iteration and that the target condition is what + we expect. */ + +static rtx_insn* +arm_mve_dlstp_check_dec_counter (loop *loop, rtx_insn* vctp_insn, + rtx condconst, rtx condcount) +{ + rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0); + class rtx_iv vctp_reg_iv; + HOST_WIDE_INT decrementnum; + /* For decrementing loops of DLSTP_TYPE_A, the counter is usually present in the + loop latch. Here we simply need to verify that this counter is the same + reg that is also used in the vctp_insn and that it is not otherwise + modified. */ + rtx_insn *dec_insn = BB_END (loop->latch); + /* If not in the loop latch, try to find the decrement in the loop header. */ + if (!NONDEBUG_INSN_P (dec_insn)) + { + df_ref temp = df_bb_regno_only_def_find (loop->header, REGNO (condcount)); + /* If we haven't been able to find the decrement, bail out. */ + if (!temp) + return NULL; + dec_insn = DF_REF_INSN (temp); + } + + rtx dec_set = single_set (dec_insn); + + /* Next, ensure that it is a PLUS of the form: + (set (reg a) (plus (reg a) (const_int))) + where (reg a) is the same as condcount. */ + if (!dec_set + || !REG_P (SET_DEST (dec_set)) + || !REG_P (XEXP (SET_SRC (dec_set), 0)) + || !CONST_INT_P (XEXP (SET_SRC (dec_set), 1)) + || REGNO (SET_DEST (dec_set)) + != REGNO (XEXP (SET_SRC (dec_set), 0)) + || REGNO (SET_DEST (dec_set)) != REGNO (condcount)) + return NULL; + + decrementnum = INTVAL (XEXP (SET_SRC (dec_set), 1)); + + /* This decrementnum is the number of lanes/elements it decrements from the + remaining number of lanes/elements to process in the loop, for this reason + this is always a negative number, but to simplify later checks we use its + absolute value. */ + if (decrementnum >= 0) + return NULL; + decrementnum = -decrementnum; + + /* If the decrementnum is a 1, then we need to look at the loop vctp_reg and + verify that it also decrements correctly. + Then, we need to establish that the starting value of the loop decrement + originates from the starting value of the vctp decrement. */ + if (decrementnum == 1) + { + class rtx_iv vctp_reg_iv, condcount_reg_iv; + /* The loop counter is found to be independent of the decrement + of the reg used in the vctp_insn, again. Ensure that IV analysis + succeeds and check the step. 
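+	 E.g. for a vctp.16 loop the step of the vctp reg must be -8,
+	 matching the 8 lanes processed per iteration.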
*/
+      if (!iv_analyze (vctp_insn, as_a <scalar_int_mode> (GET_MODE (vctp_reg)),
+		       vctp_reg, &vctp_reg_iv))
+	return NULL;
+      /* Ensure it matches the number of lanes of the vctp instruction.  */
+      if (abs (INTVAL (vctp_reg_iv.step))
+	  != arm_mve_get_vctp_lanes (vctp_insn))
+	return NULL;
+
+      if (!arm_mve_check_reg_origin_is_num_elems (loop, condcount,
+						  vctp_reg_iv.step,
+						  vctp_reg))
+	return NULL;
+    }
+  /* If the decrements are the same, then the situation is simple: either
+     they are also the same reg, which is safe, or they are different
+     registers, in which case make sure that there is only a simple SET from
+     one to the other inside the loop.  */
+  else if (decrementnum == arm_mve_get_vctp_lanes (vctp_insn))
+    {
+      if (REGNO (condcount) != REGNO (vctp_reg))
+	{
+	  /* It wasn't the same reg, but it could be behind a
+	     (set (vctp_reg) (condcount)), so instead find where
+	     the VCTP insn is DEF'd inside the loop.  */
+	  rtx_insn *vctp_reg_insn
+	    = DF_REF_INSN (df_bb_regno_only_def_find (loop->header,
+						      REGNO (vctp_reg)));
+	  rtx vctp_reg_set = single_set (vctp_reg_insn);
+	  /* This must just be a simple SET from the condcount.  */
+	  if (!vctp_reg_set
+	      || !REG_P (SET_DEST (vctp_reg_set))
+	      || !REG_P (SET_SRC (vctp_reg_set))
+	      || REGNO (SET_SRC (vctp_reg_set)) != REGNO (condcount))
+	    return NULL;
+	}
+    }
+  else
+    return NULL;
+
+  /* We now only need to find out that the loop terminates with a LE
+     zero condition.  If condconst is a const_int, then this is easy.
+     If it's a REG, look at the last condition+jump in a bb before
+     the loop, because that usually will have a branch jumping over
+     the loop header.  */
+  rtx_insn *jump_insn = BB_END (loop->header);
+  if (CONST_INT_P (condconst)
+      && !(INTVAL (condconst) == 0 && JUMP_P (jump_insn)
+	   && GET_CODE (XEXP (PATTERN (jump_insn), 1)) == IF_THEN_ELSE
+	   && (GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == NE
+	       || GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == GT)))
+    return NULL;
+  else if (REG_P (condconst))
+    {
+      basic_block pre_loop_bb = single_pred (loop_preheader_edge (loop)->src);
+      if (!pre_loop_bb)
+	return NULL;
+
+      rtx initial_compare = NULL_RTX;
+      if (!(prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb))
+	    && INSN_P (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)))))
+	return NULL;
+      else
+	initial_compare
+	  = single_set (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)));
+      if (!(initial_compare
+	    && cc_register (SET_DEST (initial_compare), VOIDmode)
+	    && GET_CODE (SET_SRC (initial_compare)) == COMPARE
+	    && CONST_INT_P (XEXP (SET_SRC (initial_compare), 1))
+	    && INTVAL (XEXP (SET_SRC (initial_compare), 1)) == 0))
+	return NULL;
+
+      /* Usually this is a LE condition, but it can also just be a GT or an
+	 EQ condition (if the value is unsigned or the compiler knows it's
+	 not negative).  */
+      rtx_insn *loop_jumpover = BB_END (pre_loop_bb);
+      if (!(JUMP_P (loop_jumpover)
+	    && GET_CODE (XEXP (PATTERN (loop_jumpover), 1)) == IF_THEN_ELSE
+	    && (GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == LE
+		|| GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == GT
+		|| GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == EQ)))
+	return NULL;
+    }
+
+  /* Everything looks valid.  */
+  return vctp_insn;
+}
+
+/* Function to check a loop's structure to see if it is a valid candidate for
+   an MVE Tail Predicated Low-Overhead Loop.  Returns the loop's VCTP_INSN if
+   it is valid, or NULL if it isn't.
*/
+
+static rtx_insn*
+arm_mve_loop_valid_for_dlstp (loop *loop)
+{
+  /* Doloop can only be done "elementwise" with predicated dlstp/letp if it
+     contains a VCTP on the number of elements processed by the loop.
+     Find the VCTP predicate generation inside the loop body BB.  */
+  rtx_insn *vctp_insn = arm_mve_get_loop_vctp (loop->header);
+  if (!vctp_insn)
+    return NULL;
+
+  /* We only support two loop forms for tail predication:
+     DLSTP_TYPE_A) Loops of the form:
+	int num_of_lanes = 128 / elem_size;
+	while (num_of_elem > 0)
+	  {
+	    p = vctp<size> (num_of_elem);
+	    num_of_elem -= num_of_lanes;
+	  }
+     DLSTP_TYPE_B) Loops of the form:
+	int num_of_lanes = 128 / elem_size;
+	int num_of_iters = (num_of_elem + num_of_lanes - 1) / num_of_lanes;
+	for (i = 0; i < num_of_iters; i++)
+	  {
+	    p = vctp<size> (num_of_elem);
+	    num_of_elem -= num_of_lanes;
+	  }
+
+     Then, depending on the type of loop above, we will need to do different
+     sets of checks.  */
+  iv_analysis_loop_init (loop);
+
+  /* In order to find out if the loop is of DLSTP_TYPE_A or DLSTP_TYPE_B
+     above, look for the loop counter: it will either be incrementing by one
+     per iteration or it will be decrementing by num_of_lanes.  We can find
+     the loop counter in the condition at the end of the loop.  */
+  rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (loop->header));
+  if (!(cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode)
+	&& GET_CODE (XEXP (PATTERN (loop_cond), 1)) == COMPARE))
+    return NULL;
+
+  /* The operands in the condition:  Try to identify which one is the
+     constant and which is the counter and run IV analysis on the latter.  */
+  rtx cond_arg_1 = XEXP (XEXP (PATTERN (loop_cond), 1), 0);
+  rtx cond_arg_2 = XEXP (XEXP (PATTERN (loop_cond), 1), 1);
+
+  rtx loop_cond_constant;
+  rtx loop_counter;
+  class rtx_iv cond_counter_iv, cond_temp_iv;
+
+  if (CONST_INT_P (cond_arg_1))
+    {
+      /* cond_arg_1 is the constant and cond_arg_2 is the counter.  */
+      loop_cond_constant = cond_arg_1;
+      loop_counter = cond_arg_2;
+      iv_analyze (loop_cond, as_a <scalar_int_mode> (GET_MODE (cond_arg_2)),
+		  cond_arg_2, &cond_counter_iv);
+    }
+  else if (CONST_INT_P (cond_arg_2))
+    {
+      /* cond_arg_2 is the constant and cond_arg_1 is the counter.  */
+      loop_cond_constant = cond_arg_2;
+      loop_counter = cond_arg_1;
+      iv_analyze (loop_cond, as_a <scalar_int_mode> (GET_MODE (cond_arg_1)),
+		  cond_arg_1, &cond_counter_iv);
+    }
+  else if (REG_P (cond_arg_1) && REG_P (cond_arg_2))
+    {
+      /* If both operands to the compare are REGs, we can safely
+	 run IV analysis on both and then determine which is the
+	 constant by looking at the step.
+	 First assume cond_arg_1 is the counter.  */
+      loop_counter = cond_arg_1;
+      loop_cond_constant = cond_arg_2;
+      iv_analyze (loop_cond, as_a <scalar_int_mode> (GET_MODE (cond_arg_1)),
+		  cond_arg_1, &cond_counter_iv);
+      iv_analyze (loop_cond, as_a <scalar_int_mode> (GET_MODE (cond_arg_2)),
+		  cond_arg_2, &cond_temp_iv);
+
+      /* Look at the steps and swap around the rtx's if needed.  Error out if
+	 one of them cannot be identified as constant.
*/
+      if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step))
+	return NULL;
+      if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0)
+	return NULL;
+      if (INTVAL (cond_counter_iv.step) == 0 && INTVAL (cond_temp_iv.step) != 0)
+	{
+	  loop_counter = cond_arg_2;
+	  loop_cond_constant = cond_arg_1;
+	  cond_counter_iv = cond_temp_iv;
+	}
+    }
+  else
+    return NULL;
+
+  if (!REG_P (loop_counter))
+    return NULL;
+  if (!(REG_P (loop_cond_constant) || CONST_INT_P (loop_cond_constant)))
+    return NULL;
+
+  /* Now we have extracted the IV step of the loop counter, call the
+     appropriate checking function.  */
+  if (INTVAL (cond_counter_iv.step) > 0)
+    return arm_mve_dlstp_check_inc_counter (loop, vctp_insn,
+					    loop_cond_constant, loop_counter);
+  else if (INTVAL (cond_counter_iv.step) < 0)
+    return arm_mve_dlstp_check_dec_counter (loop, vctp_insn,
+					    loop_cond_constant, loop_counter);
+  else
+    return NULL;
+}
+
+/* Predict whether the given loop in gimple will be transformed in the RTL
+   doloop_optimize pass.  It could be argued that turning large enough loops
+   into low-overhead loops would not show a significant performance boost.
+   However, in the case of tail predication we would still avoid using
+   VPT/VPST instructions inside the loop, and in either case using
+   low-overhead loops would not be detrimental, so we decided to not consider
+   size, avoiding the need of a heuristic to determine what an appropriate
+   size boundary is.  */
+
+static bool
+arm_predict_doloop_p (struct loop *loop)
+{
+  gcc_assert (loop);
+  /* On arm, targetm.can_use_doloop_p is actually
+     can_use_doloop_if_innermost.  Ensure the loop is innermost, that it is
+     valid as per arm_target_bb_ok_for_lob, and that the correct architecture
+     flags are enabled.  */
+  if (!(TARGET_HAVE_LOB && optimize > 0))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " target architecture or optimisation flags.\n");
+      return false;
+    }
+  else if (loop->inner != NULL)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " loop nesting.\n");
+      return false;
+    }
+  else if (!arm_target_bb_ok_for_lob (loop->header->next_bb))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " loop bb complexity.\n");
+      return false;
+    }
+
+  return true;
+}
+
+/* Implement targetm.loop_unroll_adjust.  Use this to block unrolling of
+   loops that may later be turned into MVE Tail Predicated Low Overhead
+   Loops.  The performance benefit of an MVE LoL is likely to be much higher
+   than that of the unrolling.  */
+
+unsigned
+arm_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  if (TARGET_HAVE_MVE
+      && arm_target_bb_ok_for_lob (loop->latch)
+      && arm_mve_loop_valid_for_dlstp (loop))
+    return 0;
+  else
+    return nunroll;
+}
+
+/* Function to handle emitting a VPT-unpredicated version of a VPT-predicated
+   insn to a sequence.
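+
+   For example (illustrative), a loop-body insn matching the predicated
+   pattern behind
+     vaddt.i32 q0, q1, q2
+   is re-emitted as its unpredicated equivalent
+     vadd.i32 q0, q1, q2
+   since the enclosing dlstp/letp loop will apply the same predication
+   implicitly.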
*/
+
+static bool
+arm_emit_mve_unpredicated_insn_to_seq (rtx_insn* insn)
+{
+  rtx insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn);
+  int new_icode = get_attr_mve_unpredicated_insn (insn);
+  if (!in_sequence_p ()
+      || !MVE_VPT_PREDICATED_INSN_P (insn)
+      || (!insn_vpr_reg_operand)
+      || (!new_icode))
+    return false;
+
+  extract_insn (insn);
+  rtx arr[8];
+  int j = 0;
+
+  /* When transforming a VPT-predicated instruction into its unpredicated
+     equivalent we need to drop the VPR operand and we may need to also drop
+     a merge "vuninit" input operand, depending on the instruction pattern.
+     Here ensure that we have at most a two-operand difference between the
+     two instructions.  */
+  int n_operands_diff
+    = recog_data.n_operands - insn_data[new_icode].n_operands;
+  if (!(n_operands_diff > 0 && n_operands_diff <= 2))
+    return false;
+
+  rtx move = NULL_RTX;
+  /* Then, loop through the operands of the predicated
+     instruction, and retain the ones that map to the
+     unpredicated instruction.  */
+  for (int i = 0; i < recog_data.n_operands; i++)
+    {
+      /* Ignore the VPR and, if needed, the vuninit
+	 operand.  */
+      if (insn_vpr_reg_operand == recog_data.operand[i])
+	continue;
+      if (n_operands_diff == 2
+	  && !strcmp (recog_data.constraints[i], "0"))
+	{
+	  move = gen_rtx_SET (arr[0], recog_data.operand[i]);
+	  arr[0] = recog_data.operand[i];
+	}
+      else
+	arr[j++] = recog_data.operand[i];
+    }
+
+  /* Finally, emit the unpredicated instruction.  */
+  rtx_insn *new_insn;
+  switch (j)
+    {
+      case 1:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0]));
+	break;
+      case 2:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1]));
+	break;
+      case 3:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2]));
+	break;
+      case 4:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						   arr[3]));
+	break;
+      case 5:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						   arr[3], arr[4]));
+	break;
+      case 6:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						   arr[3], arr[4], arr[5]));
+	break;
+      case 7:
+	new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						   arr[3], arr[4], arr[5],
+						   arr[6]));
+	break;
+      default:
+	gcc_unreachable ();
+    }
+  INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+  if (move)
+    {
+      new_insn = emit_insn (move);
+      INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+    }
+  return true;
+}
+
+/* Return TRUE if INSN defines an MVE vector operand that has zeroed
+   tail-predicated lanes.  This is either true if:
+   * INSN is predicated by VCTP_VPR_GENERATED and the 'invalid lanes' operand
+     is in the PROPS_ZERO_SET,
+   * all MVE vector operands are in the PROPS_ZERO_SET
+*/
+
+static bool
+arm_mve_propagate_zero_pred_p (vec<rtx_insn *> *props_zero_set,
+			       rtx_insn *insn, rtx vctp_vpr_generated)
+{
+  if (arm_mve_load_store_insn_p (insn, DL_USAGE_READ))
+    return true;
+  if (arm_mve_load_store_insn_p (insn, DL_USAGE_WRITE))
+    return false;
+
+  int inactive_idx = -1;
+
+  extract_insn (insn);
+  /* If INSN is predicated by VCTP_VPR_GENERATED, then all tail-predicated
+     lanes will keep the value that is in the 'invalid lanes' register, which
+     we identify by the "0" constraint, to ensure it is the same as the
+     'result' register of this instruction.
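+
+     E.g. (illustrative) for a vaddt.i32 q0, q0, q1 predicated on the VCTP
+     VPR, the inactive lanes of the result pass through from q0, so the
+     result has zeroed tail lanes only if the definition of q0 is already in
+     PROPS_ZERO_SET.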
*/
+  if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+    {
+      for (int i = 0; i < recog_data.n_operands; i++)
+	{
+	  if (strcmp (recog_data.constraints[i], "0") == 0
+	      && VALID_MVE_MODE (GET_MODE (recog_data.operand[i])))
+	    {
+	      inactive_idx = i;
+	      break;
+	    }
+	}
+    }
+
+  if (inactive_idx > 0)
+    {
+      rtx op = recog_data.operand[inactive_idx];
+      rtx_insn *def_insn = arm_last_vect_def_insn (op, insn);
+      return def_insn != NULL_RTX && props_zero_set->contains (def_insn);
+    }
+
+  /* If this instruction is not predicated by VCTP_VPR_GENERATED, then we
+     must check that all vector operands have zeroed tail-predicated lanes,
+     and that it has at least one vector operand.  */
+  bool at_least_one_vector = false;
+  df_ref insn_uses;
+  FOR_EACH_INSN_USE (insn_uses, insn)
+    {
+      rtx reg = DF_REF_REG (insn_uses);
+      if (!VALID_MVE_MODE (GET_MODE (reg)))
+	continue;
+
+      rtx_insn *def_insn = arm_last_vect_def_insn (reg, insn);
+      if (def_insn && props_zero_set->contains (def_insn))
+	at_least_one_vector |= true;
+      else
+	return false;
+
+    }
+  return at_least_one_vector;
+}
+
+
+/* Attempt to transform the loop contents of the loop basic block from VPT
+   predicated insns into unpredicated insns for a dlstp/letp loop.  Returns
+   the number to decrement from the total number of elements each iteration.
+   Returns 1 if tail predication cannot be performed, in which case we fall
+   back to scalar low-overhead loops.  */
+
+int
+arm_attempt_dlstp_transform (rtx label)
+{
+  if (!dlstp_enabled)
+    return 1;
+
+  basic_block body = single_succ (BLOCK_FOR_INSN (label));
+
+  /* Ensure that the bb is within a loop that has all required metadata.  */
+  if (!body->loop_father || !body->loop_father->header
+      || !body->loop_father->simple_loop_desc)
+    return 1;
+
+  loop *loop = body->loop_father;
+  /* Instruction that sets the predicate mask depending on how many elements
+     are left to process.  */
+  rtx_insn *vctp_insn = arm_mve_loop_valid_for_dlstp (loop);
+  if (!vctp_insn)
+    return 1;
+
+  gcc_assert (single_set (vctp_insn));
+
+  rtx vctp_vpr_generated = single_set (vctp_insn);
+  if (!vctp_vpr_generated)
+    return 1;
+
+  vctp_vpr_generated = SET_DEST (vctp_vpr_generated);
+
+  if (!vctp_vpr_generated || !REG_P (vctp_vpr_generated)
+      || !VALID_MVE_PRED_MODE (GET_MODE (vctp_vpr_generated)))
+    return 1;
+
+  /* decrementnum is already known to be valid at this point.  */
+  int decrementnum = arm_mve_get_vctp_lanes (vctp_insn);
+
+  rtx_insn *insn = 0;
+  rtx_insn *cur_insn = 0;
+  rtx_insn *seq;
+  auto_vec<rtx_insn *> props_zero_set;
+
+  /* Scan through the insns in the loop bb and emit the transformed bb
+     insns to a sequence.  */
+  start_sequence ();
+  FOR_BB_INSNS (body, insn)
+    {
+      if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))
+	continue;
+      else if (NOTE_P (insn))
+	emit_note ((enum insn_note)NOTE_KIND (insn));
+      else if (DEBUG_INSN_P (insn))
+	emit_debug_insn (PATTERN (insn));
+      else if (!INSN_P (insn))
+	{
+	  end_sequence ();
+	  return 1;
+	}
+      /* If the transformation is successful we no longer need the vctp
+	 instruction.  */
+      else if (insn == vctp_insn)
+	continue;
+      /* If the insn pattern requires the use of the VPR value from the
+	 vctp as an input parameter for predication.  */
+      else if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+	{
+	  /* Check whether this INSN propagates the zeroed tail-predication
+	     lanes.
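+	     (E.g. a predicated load zeroes its inactive lanes, so it
+	     propagates; a predicated add only does so if its "0"-tied
+	     input already did.)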
*/ + if (arm_mve_propagate_zero_pred_p (&props_zero_set, insn, + vctp_vpr_generated)) + props_zero_set.safe_push (insn); + bool success = arm_emit_mve_unpredicated_insn_to_seq (insn); + if (!success) + { + end_sequence (); + return 1; + } + } + /* If the insn isn't VPT predicated on vctp_vpr_generated, we need to + make sure that it is still valid within the dlstp/letp loop. */ + else + { + /* If this instruction USE-s the vctp_vpr_generated other than for + predication, this blocks the transformation as we are not allowed + to optimise the VPR value away. */ + df_ref insn_uses = NULL; + FOR_EACH_INSN_USE (insn_uses, insn) + { + if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses))) + { + end_sequence (); + return 1; + } + } + /* If within the loop we have an MVE vector instruction that is + unpredicated, the dlstp/letp looping will add implicit + predication to it. This will result in a change in behaviour + of the instruction, so we need to find out if any instructions + that feed into the current instruction were implicitly + predicated. */ + if (MVE_VPT_PREDICABLE_INSN_P (insn) + && !arm_mve_impl_predicated_p (&props_zero_set, insn, + vctp_vpr_generated)) + { + end_sequence (); + return 1; + } + emit_insn (PATTERN (insn)); + } + } + seq = get_insns (); + end_sequence (); + + /* Re-write the entire BB contents with the transformed + sequence. */ + FOR_BB_INSNS_SAFE (body, insn, cur_insn) + if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))) + delete_insn (insn); + + emit_insn_after (seq, BB_END (body)); + + /* The transformation has succeeded, so now modify the "count" + (a.k.a. niter_expr) for the middle-end. Also set noloop_assumptions + to NULL to stop the middle-end from making assumptions about the + number of iterations. */ + simple_loop_desc (body->loop_father)->niter_expr + = XVECEXP (SET_SRC (PATTERN (vctp_insn)), 0, 0); + simple_loop_desc (body->loop_father)->noloop_assumptions = NULL_RTX; + return decrementnum; } #if CHECKING_P diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index 0cd3fc2cd0cc6..d88c7a52e7591 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -363,5 +363,8 @@ Target Joined RejectNegative String Var(arm_stack_protector_guard_offset_str) Use an immediate to offset from the TLS register. This option is for use with fstack-protector-guard=tls and not for use in user-land code. 
+mdlstp +Target Var(dlstp_enabled) Init(1) Undocumented + TargetVariable long arm_stack_protector_guard_offset = 0 diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 8d066fcf05df6..987602da1bf5f 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -2686,6 +2686,17 @@ (define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")]) (define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")]) +(define_int_attr dlstp_elemsize [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32") + (DLSTP64 "64")]) + +(define_int_attr letp_num_lanes [(LETP8 "16") (LETP16 "8") (LETP32 "4") + (LETP64 "2")]) +(define_int_attr letp_num_lanes_neg [(LETP8 "-16") (LETP16 "-8") (LETP32 "-4") + (LETP64 "-2")]) + +(define_int_attr letp_num_lanes_minus_1 [(LETP8 "15") (LETP16 "7") (LETP32 "3") + (LETP64 "1")]) + (define_int_attr opsuffix [(UNSPEC_DOT_S "s8") (UNSPEC_DOT_U "u8") (UNSPEC_DOT_US "s8") @@ -2926,6 +2937,10 @@ (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) (define_int_iterator VQSHLUQ_M_N [VQSHLUQ_M_N_S]) (define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S]) +(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32 + DLSTP64]) +(define_int_iterator LETP [LETP8 LETP16 LETP32 + LETP64]) ;; Define iterators for VCMLA operations (define_int_iterator VCMLA_OP [UNSPEC_VCMLA diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 9fe51298cdc2b..4b4d6298ffb18 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -6930,3 +6930,53 @@ } } ) + +;; Originally expanded by 'predicated_doloop_end'. +;; In the rare situation where the branch is too far, we do also need to +;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration. +(define_insn "predicated_doloop_end_internal" + [(set (pc) + (if_then_else + (gtu (plus:SI (reg:SI LR_REGNUM) + (const_int )) + (const_int )) + (match_operand 0 "" "") + (pc))) + (set (reg:SI LR_REGNUM) + (plus:SI (reg:SI LR_REGNUM) (const_int ))) + ;; We use UNSPEC here to guarantee this pattern can not be + ;; generated by a RTL optimization and be matched by other + ;; patterns, since this pattern is also responsible for turning off + ;; the tail predication machinery if we were to exit the loop. + ;; This is done by either the LETP or the LCTP instructions that + ;; this pattern generates. + (use (unspec:SI [(const_int 0)] LETP)) + (clobber (reg:CC CC_REGNUM))] + "TARGET_HAVE_MVE" + { + if (get_attr_length (insn) == 4) + return "letp\t%|lr, %l0"; + else + return "subs\t%|lr, #\n\tbhi\t%l0\n\tlctp"; + } + [(set (attr "length") + (if_then_else + (ltu (minus (pc) (match_dup 0)) (const_int 1024)) + (const_int 4) + (const_int 12))) + (set_attr "type" "branch") + (set_attr "conds" "unconditional")]) + +(define_insn "dlstp_insn" + [ + (set (reg:SI LR_REGNUM) +;; Similar to the previous pattern, we use UNSPEC here to make sure this +;; rtx construct is not matched by other patterns, as this pattern is also +;; responsible for setting the element size of the tail predication machinery +;; using the dlsp. instruction. 
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 84c9c3dfe8009..66b3ae6040c0c 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1613,7 +1613,7 @@ (use (match_operand 1 "" ""))] ; label "TARGET_32BIT" " - { +{ /* Currently SMS relies on the do-loop pattern to recognize loops where (1) the control part consists of all insns defining and/or using a certain 'count' register and (2) the loop count can be @@ -1623,41 +1623,75 @@ Also used to implement the low over head loops feature, which is part of the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */ - if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) - { - rtx s0; - rtx bcomp; - rtx loc_ref; - rtx cc_reg; - rtx insn; - rtx cmp; - - if (GET_MODE (operands[0]) != SImode) - FAIL; - - s0 = operands [0]; - - /* Low over head loop instructions require the first operand to be LR. */ - if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1])) - s0 = gen_rtx_REG (SImode, LR_REGNUM); - - if (TARGET_THUMB2) - insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1))); - else - insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); - - cmp = XVECEXP (PATTERN (insn), 0, 0); - cc_reg = SET_DEST (cmp); - bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); - loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]); - emit_jump_insn (gen_rtx_SET (pc_rtx, - gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, - loc_ref, pc_rtx))); - DONE; - } - else - FAIL; - }") + if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) + { + rtx s0; + rtx bcomp; + rtx loc_ref; + rtx cc_reg; + rtx insn; + rtx cmp; + int decrement_num; + + if (GET_MODE (operands[0]) != SImode) + FAIL; + + s0 = operands[0]; + + if (TARGET_HAVE_LOB + && arm_target_bb_ok_for_lob (BLOCK_FOR_INSN (operands[1]))) + { + /* If we have a compatible MVE target, try and analyse the loop + contents to determine if we can use predicated dlstp/letp + looping. These patterns implicitly use LR as the loop counter. */ + if (TARGET_HAVE_MVE + && ((decrement_num = arm_attempt_dlstp_transform (operands[1])) + != 1)) + { + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + switch (decrement_num) + { + case 2: + insn = gen_predicated_doloop_end_internal2 (loc_ref); + break; + case 4: + insn = gen_predicated_doloop_end_internal4 (loc_ref); + break; + case 8: + insn = gen_predicated_doloop_end_internal8 (loc_ref); + break; + case 16: + insn = gen_predicated_doloop_end_internal16 (loc_ref); + break; + default: + gcc_unreachable (); + } + emit_jump_insn (insn); + DONE; + } + /* Remaining LOB cases need to explicitly use LR. */ + s0 = gen_rtx_REG (SImode, LR_REGNUM); + } + + /* Otherwise, try standard decrement-by-one dls/le looping.
*/ + if (TARGET_THUMB2) + insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, + GEN_INT (-1))); + else + insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1))); + + cmp = XVECEXP (PATTERN (insn), 0, 0); + cc_reg = SET_DEST (cmp); + bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); + emit_jump_insn (gen_rtx_SET (pc_rtx, + gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, + loc_ref, pc_rtx))); + DONE; + } + else + FAIL; +}") (define_insn "*clear_apsr" [(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR) @@ -1755,7 +1789,37 @@ { if (REGNO (operands[0]) == LR_REGNUM) { - emit_insn (gen_dls_insn (operands[0])); + /* Pick out the number by which we are decrementing the loop counter + in every iteration. If it's > 1, then use dlstp. */ + int const_int_dec_num + = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1), + 1), + 1))); + switch (const_int_dec_num) + { + case 16: + emit_insn (gen_dlstp8_insn (operands[0])); + break; + + case 8: + emit_insn (gen_dlstp16_insn (operands[0])); + break; + + case 4: + emit_insn (gen_dlstp32_insn (operands[0])); + break; + + case 2: + emit_insn (gen_dlstp64_insn (operands[0])); + break; + + case 1: + emit_insn (gen_dls_insn (operands[0])); + break; + + default: + gcc_unreachable (); + } DONE; } else diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md index e2b70da1001f2..9527bdb9e87e7 100644 --- a/gcc/config/arm/types.md +++ b/gcc/config/arm/types.md @@ -574,6 +574,7 @@ ; mve_move ; mve_store ; mve_load +; mve_misc (define_attr "type" "adc_imm,\ @@ -1126,7 +1127,8 @@ ls64,\ mve_move,\ mve_store,\ - mve_load" + mve_load, \ + mve_misc" (cond [(eq_attr "autodetect_type" "alu_shift_lsr_op2,alu_shift_asr_op2") (const_string "alu_shift_imm_other") (eq_attr "autodetect_type" "alu_shift_lsl_op2") @@ -1292,7 +1294,7 @@ ;; No otherwise. (define_attr "is_mve_type" "yes,no" (if_then_else (eq_attr "type" - "mve_move, mve_load, mve_store, mrs") + "mve_move, mve_load, mve_store, mrs, mve_misc") (const_string "yes") (const_string "no"))) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 46ac8b3715740..f5f4d1543645b 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -591,6 +591,10 @@ VADDLVQ_U VCTP VCTP_M + LETP8 + LETP16 + LETP32 + LETP64 VPNOT VCREATEQ_F VCVTQ_N_TO_F_S @@ -1259,6 +1263,14 @@ UQRSHLL_48 SQRSHRL_64 SQRSHRL_48 - VSHLCQ_M_ REINTERPRET ]) + +; DLSTP unspecs must be volatile to guarantee the scheduler does not reschedule +; these instructions within the loop preheader. +(define_c_enum "unspecv" [ + DLSTP8 + DLSTP16 + DLSTP32 + DLSTP64 +]) diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc index 0176d8b6cd296..11470eaea1254 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc @@ -558,10 +558,12 @@ const char *host_detect_local_cpu (int argc, const char **argv) switch (family) { case 7: - if (model == 0x3b) - processor = PROCESSOR_LUJIAZUI; - else if (model >= 0x5b) + if (model >= 0x6b) + processor = PROCESSOR_SHIJIDADAO; + else if (model == 0x5b) processor = PROCESSOR_YONGFENG; + else if (model == 0x3b) + processor = PROCESSOR_LUJIAZUI; break; default: break; @@ -853,6 +855,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) case PROCESSOR_YONGFENG: cpu = "yongfeng"; break; + case PROCESSOR_SHIJIDADAO: + cpu = "shijidadao"; + break; default: /* Use something reasonable. 
*/ diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index 7b0ad9e9181ee..403475d5b6bb2 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -156,6 +156,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__yongfeng"); def_or_undef (parse_in, "__yongfeng__"); break; + case PROCESSOR_SHIJIDADAO: + def_or_undef (parse_in, "__shijidadao"); + def_or_undef (parse_in, "__shijidadao__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__pentium4"); def_or_undef (parse_in, "__pentium4__"); @@ -386,6 +390,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_YONGFENG: def_or_undef (parse_in, "__tune_yongfeng__"); break; + case PROCESSOR_SHIJIDADAO: + def_or_undef (parse_in, "__tune_shijidadao__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__tune_pentium4__"); break; diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index f2cecc0e2545b..65c5bad9c285e 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -155,7 +155,8 @@ along with GCC; see the file COPYING3. If not see #define m_LUJIAZUI (HOST_WIDE_INT_1U<<PROCESSOR_LUJIAZUI) [...] +struct processor_costs shijidadao_cost = { + { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 8, /* cost for loading QImode using movzbl. */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer registers. */ + 2, /* cost of reg,reg fld/fst. */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode. */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode. */ + 2, /* cost of moving MMX register. */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode. */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + {8, 8, 8, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit. */ + {8, 8, 8, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit. */ + 8, 8, /* SSE->integer and integer->SSE moves. */ + 8, 8, /* mask->integer and integer->mask moves. */ + {8, 8, 8}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {8, 8, 8}, /* cost of storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ + /* End of register allocator costs. */ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction. */ + COSTS_N_INSNS (1), /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (2), /* SI. */ + COSTS_N_INSNS (2), /* DI. */ + COSTS_N_INSNS (3)}, /* other. */ + 0, /* cost of multiply per each bit set. */ + {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (10), /* HI. */ + COSTS_N_INSNS (9), /* SI. */ + COSTS_N_INSNS (50), /* DI. */ + COSTS_N_INSNS (50)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 17, /* MOVE_RATIO. */ + 6, /* CLEAR_RATIO. */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer registers. */ + {8, 8, 8, 12, 15}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {8, 8, 8, 12, 15}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {8, 8, 8, 12, 15}, /* cost of unaligned loads. */ + {8, 8, 8, 12, 15}, /* cost of unaligned stores. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + 8, /* cost of moving SSE register to integer. */ + 18, 6, /* Gather load static, per_elt. */ + 18, 6, /* Gather store static, per_elt. */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block. */ + 12, /* number of parallel prefetches. */ + 3, /* Branch cost. */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (3), /* cost of FMUL instruction. */ + COSTS_N_INSNS (13), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (3), /* cost of MULSS instruction. */ + COSTS_N_INSNS (3), /* cost of MULSD instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ + 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ + shijidadao_memcpy, + shijidadao_memset, + COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ + "16:11:8", /* Loop alignment. */ + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ + 4, /* Small unroll limit. */ + 2, /* Small unroll factor. */ +}; + + /* Generic should produce code tuned for Core-i7 (and newer chips) and btver1 (and newer chips). */
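Since PROCESSOR_SHIJIDADAO is wired into the generic x86 CPU-detection tables, the usual runtime-dispatch builtins work with the new processor name. A minimal sketch (the printed strings are placeholders; the name string is the one the mv32.C test below relies on):

    #include <stdio.h>

    int
    main (void)
    {
      __builtin_cpu_init ();
      /* "shijidadao" is the CPU name added by this patch.  */
      if (__builtin_cpu_is ("shijidadao"))
        puts ("running on ZHAOXIN shijidadao");
      else
        puts ("running on something else");
      return 0;
    }

The same spelling is accepted by -march=shijidadao and by __attribute__ ((target ("arch=shijidadao"))), per the driver-i386.cc, i386-c.cc, and invoke.texi hunks.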
diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index f70846e628e57..d77298b0e34dc 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -79,6 +79,7 @@ ix86_issue_rate (void) case PROCESSOR_CANNONLAKE: case PROCESSOR_ALDERLAKE: case PROCESSOR_YONGFENG: + case PROCESSOR_SHIJIDADAO: case PROCESSOR_GENERIC: return 4; @@ -446,6 +447,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, break; case PROCESSOR_YONGFENG: + case PROCESSOR_SHIJIDADAO: /* Stack engine allows to execute push&pop instructions in parallel. */ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 66512992b7b5b..343c32c291fa8 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -477,7 +477,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID - | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2 elements. */ @@ -488,7 +488,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts", elements.
*/ DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID - | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4 elements. */ @@ -499,7 +499,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM - | m_YONGFENG | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more elements. */ @@ -509,7 +509,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or smaller FMA chain. */ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 - | m_YONGFENG | m_GENERIC) + | m_YONGFENG | m_SHIJIDADAO | m_GENERIC) /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or smaller FMA chain. */ diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc index b6f6e4ff37e78..596b88cc8a3cd 100644 --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc @@ -1420,12 +1420,8 @@ class fcmp : public function_base switch (e.op_info->op) { case OP_TYPE_vf: { - if (CODE == EQ || CODE == NE) - return e.use_compare_insn (CODE, code_for_pred_eqne_scalar ( - e.vector_mode ())); - else - return e.use_compare_insn (CODE, code_for_pred_cmp_scalar ( - e.vector_mode ())); + return e.use_compare_insn (CODE, code_for_pred_cmp_scalar ( + e.vector_mode ())); } case OP_TYPE_vv: { return e.use_compare_insn (CODE, diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index fbcdf96f038ba..f8fae6557d935 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -7545,92 +7545,6 @@ (set_attr "mode" "<MODE>") (set_attr "spec_restriction" "none,thv,thv,none,none")]) -(define_expand "@pred_eqne<mode>_scalar" - [(set (match_operand:<VM> 0 "register_operand") - (if_then_else:<VM> - (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand") - (match_operand 6 "vector_length_operand") - (match_operand 7 "const_int_operand") - (match_operand 8 "const_int_operand") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator:<VM> 3 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand:<VEL> 5 "register_operand")) - (match_operand:V_VLSF 4 "register_operand")]) - (match_operand:<VM> 2 "vector_merge_operand")))] - "TARGET_VECTOR" - {}) - -(define_insn "*pred_eqne<mode>_scalar_merge_tie_mask" - [(set (match_operand:<VM> 0 "register_operand" "=vm") - (if_then_else:<VM> - (unspec:<VM> - [(match_operand:<VM> 1 "register_operand" " 0") - (match_operand 5 "vector_length_operand" " rK") - (match_operand 6 "const_int_operand" " i") - (match_operand 7 "const_int_operand" " i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator:<VM> 2 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand:<VEL> 4 "register_operand" " f")) - (match_operand:V_VLSF 3 "register_operand" " vr")]) - (match_dup 1)))] - "TARGET_VECTOR" - "vmf%B2.vf\t%0,%3,%4,v0.t" - [(set_attr "type" "vfcmp") - (set_attr "mode" "<MODE>") - (set_attr "merge_op_idx" "1") - (set_attr "vl_op_idx" "5") - (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[6])")) - (set (attr "avl_type_idx") (const_int 7))]) - -;; We don't use early-clobber for LMUL <= 1 to get better codegen. -(define_insn "*pred_eqne<mode>_scalar" - [(set (match_operand:<VM> 0 "register_operand" "=vr, vr, &vr, &vr") - (if_then_else:<VM> - (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1,vmWc1,vmWc1,vmWc1") - (match_operand 6 "vector_length_operand" " rK, rK, rK, rK") - (match_operand 7 "const_int_operand" " i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator:<VM> 3 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand:<VEL> 5 "register_operand" " f, f, f, f")) - (match_operand:V_VLSF 4 "register_operand" " vr, vr, vr, vr")]) - (match_operand:<VM> 2 "vector_merge_operand" " vu, 0, vu, 0")))] - "TARGET_VECTOR && riscv_vector::cmp_lmul_le_one (<MODE>mode)" - "vmf%B3.vf\t%0,%4,%5%p1" - [(set_attr "type" "vfcmp") - (set_attr "mode" "<MODE>") - (set_attr "spec_restriction" "thv,thv,rvv,rvv")]) - -;; We use early-clobber for source LMUL > dest LMUL. -(define_insn "*pred_eqne<mode>_scalar_narrow" - [(set (match_operand:<VM> 0 "register_operand" "=vm, vr, vr, &vr, &vr") - (if_then_else:<VM> - (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand" " 0,vmWc1,vmWc1,vmWc1,vmWc1") - (match_operand 6 "vector_length_operand" " rK, rK, rK, rK, rK") - (match_operand 7 "const_int_operand" " i, i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i, i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (match_operator:<VM> 3 "equality_operator" - [(vec_duplicate:V_VLSF - (match_operand:<VEL> 5 "register_operand" " f, f, f, f, f")) - (match_operand:V_VLSF 4 "register_operand" " vr, 0, 0, vr, vr")]) - (match_operand:<VM> 2 "vector_merge_operand" " vu, vu, 0, vu, 0")))] - "TARGET_VECTOR && riscv_vector::cmp_lmul_gt_one (<MODE>mode)" - "vmf%B3.vf\t%0,%4,%5%p1" - [(set_attr "type" "vfcmp") - (set_attr "mode" "<MODE>") - (set_attr "spec_restriction" "none,thv,thv,none,none")]) - ;; ------------------------------------------------------------------------------- ;; ---- Predicated floating-point merge ;; ------------------------------------------------------------------------------- diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index bc127997ac6cb..e2549de5df050 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -3078,7 +3078,17 @@ print_operand (FILE *file, rtx x, int letter) /* For a volatile memory reference, emit a MEMW before the load or store. */ if (MEM_VOLATILE_P (x) && TARGET_SERIALIZE_VOLATILE) - fprintf (file, "memw\n\t"); + { + rtx_insn *prev_insn + = prev_nonnote_nondebug_insn (current_output_insn); + rtx pat, src; + + if (!
(prev_insn && NONJUMP_INSN_P (prev_insn) + && GET_CODE (pat = PATTERN (prev_insn)) == SET + && GET_CODE (src = SET_SRC (pat)) == UNSPEC + && XINT (src, 1) == UNSPEC_MEMW)) + fprintf (file, "memw\n\t"); + } } else output_operand_lossage ("invalid %%v value"); diff --git a/gcc/configure b/gcc/configure index b536af664d3de..9dc0b65dfaace 100755 --- a/gcc/configure +++ b/gcc/configure @@ -30239,7 +30239,7 @@ else fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_as_mips_explicit_relocs_pcrel" >&5 $as_echo "$gcc_cv_as_mips_explicit_relocs_pcrel" >&6; } -if test "x$gcc_cv_as_mips_explicit_relocs_pcrel" = "xyes"; then +if test $gcc_cv_as_mips_explicit_relocs_pcrel = yes; then $as_echo "#define MIPS_EXPLICIT_RELOCS MIPS_EXPLICIT_RELOCS_PCREL" >>confdefs.h diff --git a/gcc/configure.ac b/gcc/configure.ac index 1501bf89c89da..b2243e9954aac 100644 --- a/gcc/configure.ac +++ b/gcc/configure.ac @@ -5317,7 +5317,7 @@ x: AC_MSG_CHECKING(assembler and linker for explicit JALR relocation) gcc_cv_as_ld_jalr_reloc=no - if test "x$gcc_cv_as_mips_explicit_relocs" = "xyes"; then + if test $gcc_cv_as_mips_explicit_relocs = yes; then if test $in_tree_ld = yes ; then if test "$gcc_cv_gld_major_version" -eq 2 -a "$gcc_cv_gld_minor_version" -ge 20 -o "$gcc_cv_gld_major_version" -gt 2 \ && test $in_tree_ld_is_elf = yes; then diff --git a/gcc/df-core.cc b/gcc/df-core.cc index f0eb4c93957ff..b0e8a88d433bb 100644 --- a/gcc/df-core.cc +++ b/gcc/df-core.cc @@ -1964,6 +1964,21 @@ df_bb_regno_last_def_find (basic_block bb, unsigned int regno) return NULL; } +/* Return the one and only def of REGNO within BB. If there is no def or + there are multiple defs, return NULL. */ + +df_ref +df_bb_regno_only_def_find (basic_block bb, unsigned int regno) +{ + df_ref temp = df_bb_regno_first_def_find (bb, regno); + if (!temp) + return NULL; + else if (temp == df_bb_regno_last_def_find (bb, regno)) + return temp; + else + return NULL; +} + /* Finds the reference corresponding to the definition of REG in INSN. DF is the dataflow object. */ diff --git a/gcc/df.h b/gcc/df.h index 84e5aa8b524df..c4e690b40cf21 100644 --- a/gcc/df.h +++ b/gcc/df.h @@ -987,6 +987,7 @@ extern void df_check_cfg_clean (void); #endif extern df_ref df_bb_regno_first_def_find (basic_block, unsigned int); extern df_ref df_bb_regno_last_def_find (basic_block, unsigned int); +extern df_ref df_bb_regno_only_def_find (basic_block, unsigned int); extern df_ref df_find_def (rtx_insn *, rtx); extern bool df_reg_defined (rtx_insn *, rtx); extern df_ref df_find_use (rtx_insn *, rtx); diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 173cdef013160..b2e41a581dd1c 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -26245,6 +26245,9 @@ ZHAOXIN lujiazui CPU. @item yongfeng ZHAOXIN yongfeng CPU. +@item shijidadao +ZHAOXIN shijidadao CPU. + @item amdfam10h AMD Family 10h CPU. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 5d7a87fde86c4..c790e2f35184c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -34873,6 +34873,12 @@ SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16, ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT instruction set support. +@item shijidadao +ZHAOXIN shijidadao CPU with x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, +SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16, +ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT +instruction set support. + @item geode AMD Geode embedded processor with MMX and 3DNow!@: instruction set support. 
@end table diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index a2275728a46b9..8fcd6d40c956c 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,19 @@ +2024-06-19 Harald Anlauf + + PR fortran/115390 + * trans-decl.cc (gfc_conv_cfi_to_gfc): Move derivation of type sizes + for character via gfc_trans_vla_type_sizes to after character length + has been set. + +2024-06-19 Andre Vehreschild + + PR fortran/90076 + * trans-decl.cc (gfc_generate_function_code): Set vptr for + results to declared class type. + * trans-expr.cc (gfc_reset_vptr): Allow to provide the typespec + instead of the expression. + * trans.h (gfc_reset_vptr): Same. + 2024-06-17 Andre Vehreschild * trans.cc (gfc_deallocate_with_status): Check that object to deref diff --git a/gcc/fortran/trans-decl.cc b/gcc/fortran/trans-decl.cc index 88538713a02b4..f7fb6eec336a8 100644 --- a/gcc/fortran/trans-decl.cc +++ b/gcc/fortran/trans-decl.cc @@ -7063,8 +7063,8 @@ gfc_conv_cfi_to_gfc (stmtblock_t *init, stmtblock_t *finally, if (sym->ts.type == BT_CHARACTER && !INTEGER_CST_P (sym->ts.u.cl->backend_decl)) { - gfc_conv_string_length (sym->ts.u.cl, NULL, init); - gfc_trans_vla_type_sizes (sym, init); + gfc_conv_string_length (sym->ts.u.cl, NULL, &block); + gfc_trans_vla_type_sizes (sym, &block); } /* gfc->data = cfi->base_addr - or for scalars: gfc = cfi->base_addr. diff --git a/gcc/gimple-lower-bitint.cc b/gcc/gimple-lower-bitint.cc index 56e5f826a8d9f..f955f3eabd9b6 100644 --- a/gcc/gimple-lower-bitint.cc +++ b/gcc/gimple-lower-bitint.cc @@ -6630,7 +6630,10 @@ gimple_lower_bitint (void) continue; if (gimple_code (use_stmt) == GIMPLE_PHI || is_gimple_call (use_stmt) - || gimple_code (use_stmt) == GIMPLE_ASM) + || gimple_code (use_stmt) == GIMPLE_ASM + || (is_gimple_assign (use_stmt) + && (gimple_assign_rhs_code (use_stmt) + == COMPLEX_EXPR))) { optimizable_load = false; break; diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc index 529e810e530c2..8953e1de96094 100644 --- a/gcc/loop-doloop.cc +++ b/gcc/loop-doloop.cc @@ -85,10 +85,10 @@ doloop_condition_get (rtx_insn *doloop_pat) forms: 1) (parallel [(set (pc) (if_then_else (condition) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) The branch must be the first entry of the parallel (also required by jump.cc), and the second entry of the parallel must be a set of @@ -96,19 +96,33 @@ doloop_condition_get (rtx_insn *doloop_pat) the loop counter in an if_then_else too. 2) (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). - Some targets (ARM) do the comparison before the branch, as in the + 3) Some targets (Arm) do the comparison before the branch, as in the following form: - 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0))) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) */ - + (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0)) + (set (reg) (plus (reg) (const_int -1)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) + + 4) This form supports a construct that is used to represent a vectorized + do loop with predication, however we do not need to care about the + details of the predication here. + Arm uses this construct to support MVE tail predication. 
+ + (parallel + [(set (pc) + (if_then_else (gtu (plus (reg) (const_int -n)) + (const_int n-1)) + (label_ref) + (pc))) + (set (reg) (plus (reg) (const_int -n))) + (additional clobbers and uses)]) + */ pattern = PATTERN (doloop_pat); if (GET_CODE (pattern) != PARALLEL) @@ -173,15 +187,17 @@ doloop_condition_get (rtx_insn *doloop_pat) if (! REG_P (reg)) return 0; - /* Check if something = (plus (reg) (const_int -1)). + /* Check if something = (plus (reg) (const_int -n)). On IA-64, this decrement is wrapped in an if_then_else. */ inc_src = SET_SRC (inc); if (GET_CODE (inc_src) == IF_THEN_ELSE) inc_src = XEXP (inc_src, 1); if (GET_CODE (inc_src) != PLUS - || XEXP (inc_src, 0) != reg - || XEXP (inc_src, 1) != constm1_rtx) + || !rtx_equal_p (XEXP (inc_src, 0), reg) + || !CONST_INT_P (XEXP (inc_src, 1)) + || INTVAL (XEXP (inc_src, 1)) >= 0) return 0; + int dec_num = -INTVAL (XEXP (inc_src, 1)); /* Check for (set (pc) (if_then_else (condition) (label_ref (label)) @@ -196,60 +212,63 @@ doloop_condition_get (rtx_insn *doloop_pat) /* Extract loop termination condition. */ condition = XEXP (SET_SRC (cmp), 0); - /* We expect a GE or NE comparison with 0 or 1. */ - if ((GET_CODE (condition) != GE - && GET_CODE (condition) != NE) - || (XEXP (condition, 1) != const0_rtx - && XEXP (condition, 1) != const1_rtx)) + /* We expect a GE or NE comparison with 0 or 1, or a GTU comparison with + dec_num - 1. */ + if (!((GET_CODE (condition) == GE + || GET_CODE (condition) == NE) + && (XEXP (condition, 1) == const0_rtx + || XEXP (condition, 1) == const1_rtx)) + && !(GET_CODE (condition) == GTU + && ((INTVAL (XEXP (condition, 1))) == (dec_num - 1)))) return 0; - if ((XEXP (condition, 0) == reg) + if (rtx_equal_p (XEXP (condition, 0), reg) /* For the third case: */ || ((cc_reg != NULL_RTX) && (XEXP (condition, 0) == cc_reg) - && (reg_orig == reg)) + && (rtx_equal_p (reg_orig, reg))) || (GET_CODE (XEXP (condition, 0)) == PLUS - && XEXP (XEXP (condition, 0), 0) == reg)) - { - if (GET_CODE (pattern) != PARALLEL) - /* For the second form we expect: + && rtx_equal_p (XEXP (XEXP (condition, 0), 0), reg))) + { + if (GET_CODE (pattern) != PARALLEL) + /* For the second form we expect: - (set (reg) (plus (reg) (const_int -1)) - (set (pc) (if_then_else (reg != 0) - (label_ref (label)) - (pc))). + (set (reg) (plus (reg) (const_int -1)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))).
- is equivalent to the following: + is equivalent to the following: - (parallel [(set (pc) (if_then_else (reg != 1) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + (parallel [(set (pc) (if_then_else (reg != 1) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) - For the third form we expect: + For the third form we expect: - (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0)) - (set (reg) (plus (reg) (const_int -1)))]) - (set (pc) (if_then_else (cc == NE) - (label_ref (label)) - (pc))) + (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0)) + (set (reg) (plus (reg) (const_int -1)))]) + (set (pc) (if_then_else (cc == NE) + (label_ref (label)) + (pc))) - which is equivalent to the following: + which is equivalent to the following: - (parallel [(set (cc) (compare (reg, 1)) - (set (reg) (plus (reg) (const_int -1))) - (set (pc) (if_then_else (NE == cc) - (label_ref (label)) - (pc))))]) + (parallel [(set (cc) (compare (reg, 1)) + (set (reg) (plus (reg) (const_int -1))) + (set (pc) (if_then_else (NE == cc) + (label_ref (label)) + (pc))))]) - So we return the second form instead for the two cases. + So we return the second form instead for the two cases. */ - condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); + condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); return condition; - } + } /* ??? If a machine uses a funny comparison, we could return a canonicalized form here. */ @@ -507,6 +526,11 @@ doloop_modify (class loop *loop, class niter_desc *desc, nonneg = 1; break; + case GTU: + /* The iteration count does not need incrementing for a GTU test. */ + increment_count = false; + break; + /* Abort if an invalid doloop pattern has been generated. */ default: gcc_unreachable (); @@ -529,6 +553,10 @@ doloop_modify (class loop *loop, class niter_desc *desc, if (desc->noloop_assumptions) { + /* The GTU case has only been implemented for Arm, where + noloop_assumptions gets explicitly set to NULL for that case, so + assert here for safety. */ + gcc_assert (GET_CODE (condition) != GTU); rtx ass = copy_rtx (desc->noloop_assumptions); basic_block preheader = loop_preheader_edge (loop)->src; basic_block set_zero = split_edge (loop_preheader_edge (loop)); @@ -642,7 +670,7 @@ doloop_optimize (class loop *loop) { scalar_int_mode mode; rtx doloop_reg; - rtx count; + rtx count = NULL_RTX; widest_int iterations, iterations_max; rtx_code_label *start_label; rtx condition; @@ -685,17 +713,6 @@ doloop_optimize (class loop *loop) return false; } - max_cost - = COSTS_N_INSNS (param_max_iterations_computation_cost); - if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop)) - > max_cost) - { - if (dump_file) - fprintf (dump_file, - "Doloop: number of iterations too costly to compute.\n"); - return false; - } - if (desc->const_iter) iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode), UNSIGNED); @@ -716,12 +733,25 @@ doloop_optimize (class loop *loop) /* Generate looping insn. If the pattern FAILs then give up trying to modify the loop since there is some aspect the back-end does - not like. */ - count = copy_rtx (desc->niter_expr); + not like. If this succeeds, there is a chance that the loop + desc->niter_expr has been altered by the backend, so only extract + that data after the gen_doloop_end. 
*/ start_label = block_label (desc->in_edge->dest); doloop_reg = gen_reg_rtx (mode); rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label); + max_cost + = COSTS_N_INSNS (param_max_iterations_computation_cost); + if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop)) + > max_cost) + { + if (dump_file) + fprintf (dump_file, + "Doloop: number of iterations too costly to compute.\n"); + return false; + } + + count = copy_rtx (desc->niter_expr); word_mode_size = GET_MODE_PRECISION (word_mode); word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1; if (! doloop_seq diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2ae5731931d92..69e269330d9f1 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,264 @@ +2024-06-19 demin.han + + * gcc.target/riscv/rvv/base/float-point-cmp-eqne.c: New test. + +2024-06-19 Jakub Jelinek + + PR tree-optimization/115544 + * gcc.dg/bitint-107.c: New test. + +2024-06-19 mayshao + + * g++.target/i386/mv32.C: Handle new -march + * gcc.target/i386/funcspec-56.inc: Ditto. + +2024-06-19 Harald Anlauf + + PR fortran/115390 + * gfortran.dg/bind_c_char_11.f90: New test. + +2024-06-19 Andre Vieira + Stam Markianos-Wright + + * gcc.target/arm/lob.h: Add new helpers. + * gcc.target/arm/lob1.c: Use new helpers. + * gcc.target/arm/lob6.c: Likewise. + * gcc.target/arm/mve/dlstp-compile-asm-1.c: New test. + * gcc.target/arm/mve/dlstp-compile-asm-2.c: New test. + * gcc.target/arm/mve/dlstp-compile-asm-3.c: New test. + * gcc.target/arm/mve/dlstp-int8x16.c: New test. + * gcc.target/arm/mve/dlstp-int8x16-run.c: New test. + * gcc.target/arm/mve/dlstp-int16x8.c: New test. + * gcc.target/arm/mve/dlstp-int16x8-run.c: New test. + * gcc.target/arm/mve/dlstp-int32x4.c: New test. + * gcc.target/arm/mve/dlstp-int32x4-run.c: New test. + * gcc.target/arm/mve/dlstp-int64x2.c: New test. + * gcc.target/arm/mve/dlstp-int64x2-run.c: New test. + * gcc.target/arm/mve/dlstp-invalid-asm.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-37.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-38.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-39.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-40.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-37.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-38.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-39.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-40.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-33.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-34.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-35.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-36.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-33.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-34.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-35.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-36.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-29.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-32.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-32.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-28.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-28.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-24.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-24.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-20.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-20.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-16.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-16.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-9.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-9.c: New test. + +2024-06-19 Richard Biener + + * gcc.dg/vect/bb-slp-32.c: Add check for correctness. + +2024-06-19 Andre Vehreschild + + PR fortran/90076 + * gfortran.dg/class_76.f90: Add declared vtab occurrence. + * gfortran.dg/class_78.f90: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-32.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-29.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-30.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-31.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-32.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-28.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-25.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-26.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-27.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-28.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-24.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-21.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-22.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-23.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-24.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-20.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-17.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-18.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-19.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-20.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-14.c: New test. 
+ * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-16.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-13.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-14.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-15.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-16.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-9.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-10.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-11.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-12.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-9.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-5.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-6.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-7.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-8.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-5.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-6.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-7.c: New test. + * gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-8.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/sat_arith.h: Add helper macro for + testing. + * gcc.target/riscv/sat_u_sub-45.c: New test. + * gcc.target/riscv/sat_u_sub-46.c: New test. + * gcc.target/riscv/sat_u_sub-47.c: New test. + * gcc.target/riscv/sat_u_sub-48.c: New test. + * gcc.target/riscv/sat_u_sub-run-45.c: New test. + * gcc.target/riscv/sat_u_sub-run-46.c: New test. + * gcc.target/riscv/sat_u_sub-run-47.c: New test. + * gcc.target/riscv/sat_u_sub-run-48.c: New test. + +2024-06-19 Pan Li + + * gcc.target/riscv/sat_arith.h: Add helper + macro for testing. + * gcc.target/riscv/sat_u_sub-41.c: New test. + * gcc.target/riscv/sat_u_sub-42.c: New test. + * gcc.target/riscv/sat_u_sub-43.c: New test. + * gcc.target/riscv/sat_u_sub-44.c: New test. + * gcc.target/riscv/sat_u_sub-run-41.c: New test. + * gcc.target/riscv/sat_u_sub-run-42.c: New test. + * gcc.target/riscv/sat_u_sub-run-43.c: New test. + * gcc.target/riscv/sat_u_sub-run-44.c: New test. + 2024-06-18 Jeff Law * gcc.target/riscv/zbs-ext-2.c: Do not run for -Os. 
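The vec_sat_u_add/vec_sat_u_sub batches listed above all instantiate one test-macro per type width and check that GCC recognizes branchless saturating arithmetic. For orientation, a scalar sketch of the unsigned saturating-subtract shape involved (function name and width are illustrative, not taken from the testsuite headers):

    #include <stdint.h>

    /* Unsigned saturating subtract: clamp at 0 instead of wrapping.
       The mask is all-ones when a >= b (no underflow), zero otherwise.
       Illustrative only; the testsuite generates these via macros.  */
    static inline uint64_t
    sat_u_sub_64 (uint64_t a, uint64_t b)
    {
      return (a - b) & (uint64_t) -(uint64_t) (a >= b);
    }

The scalar gcc.target/riscv/sat_u_sub-*.c tests check the branchless lowering of this shape, and the rvv/autovec variants check that the equivalent loop form vectorizes.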
diff --git a/gcc/testsuite/g++.target/aarch64/afmv/Makefile b/gcc/testsuite/g++.target/aarch64/afmv/Makefile new file mode 100644 index 0000000000000..ebaf89cba1b5b --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/Makefile @@ -0,0 +1 @@ +dg-runtest $(srcdir)/gcc.target/aarch64/afmv/*.c diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c new file mode 100644 index 0000000000000..a24e8e8805d60 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.c @@ -0,0 +1,11 @@ +#include <stdio.h> + +__attribute__((target_clones("default", "sse4.2", "avx2"))) +void foo() { + printf("Function foo\n"); +} + +int main() { + foo(); + return 0; +} diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp new file mode 100644 index 0000000000000..4f98486cc7537 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_1.exp @@ -0,0 +1,15 @@ +# Load the DejaGnu framework +load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_1" +set srcfile "${srcdir}/afmv_test_1.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c new file mode 100644 index 0000000000000..19133d253f104 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.c @@ -0,0 +1,11 @@ +#include <stdio.h> + +__attribute__((target_clones("default", "sse4.2", "avx2"))) +void bar() { + printf("Function bar\n"); +} + +int main() { + bar(); + return 0; +} diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp new file mode 100644 index 0000000000000..782baf0b5bb69 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_2.exp @@ -0,0 +1,15 @@ +# Load the DejaGnu framework +load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_2" +set srcfile "${srcdir}/afmv_test_2.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c new file mode 100644 index 0000000000000..b20356ba86123 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.c @@ -0,0 +1,11 @@ +#include <stdio.h> + +__attribute__((target_clones("default", "sse4.2", "avx2"))) +inline void baz() { + printf("Function baz\n"); +} + +int main() { + baz(); + return 0; +} diff --git a/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp new file mode 100644 index 0000000000000..9fa6b66ce6182 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/afmv/afmv_test_3.exp @@ -0,0 +1,15 @@ +# Load the DejaGnu framework
load_lib gcc-dg.exp + +# Set the source directory +set srcdir [file dirname [info script]] + +# Define the test +dg-init + +set testname "afmv_test_3" +set srcfile "${srcdir}/afmv_test_3.c" + +dg-runtest $srcfile "-O2 -fdump-tree-all" "" + +dg-finish
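These afmv files exercise the target_clones attribute, which makes the compiler emit one specialized body per listed target plus a resolver that binds the call at load time through an ifunc. A rough hand-written equivalent of that dispatch, for orientation (all names hypothetical; ELF ifunc support assumed; x86 feature names as in the tests above):

    #include <stdio.h>

    static void foo_default (void) { printf ("default foo\n"); }
    static void foo_avx2 (void) { printf ("avx2 foo\n"); }

    /* Approximately what the compiler-generated resolver does: pick a
       clone once, based on the running CPU's features.  */
    static void (*resolve_foo (void)) (void)
    {
      __builtin_cpu_init ();
      return __builtin_cpu_supports ("avx2") ? foo_avx2 : foo_default;
    }

    void foo (void) __attribute__ ((ifunc ("resolve_foo")));

    int
    main (void)
    {
      foo ();
      return 0;
    }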
diff --git a/gcc/testsuite/g++.target/i386/mv32.C b/gcc/testsuite/g++.target/i386/mv32.C index 6c993218d01c3..b311c35baa3d9 100644 --- a/gcc/testsuite/g++.target/i386/mv32.C +++ b/gcc/testsuite/g++.target/i386/mv32.C @@ -21,6 +21,10 @@ int __attribute__ ((target("arch=yongfeng"))) foo () { return 2; } +int __attribute__ ((target("arch=shijidadao"))) foo () { + return 3; +} + int main () { int val = foo (); @@ -29,6 +33,8 @@ int main () assert (val == 1); else if (__builtin_cpu_is ("yongfeng")) assert (val == 2); + else if (__builtin_cpu_is ("shijidadao")) + assert (val == 3); else assert (val == 0); diff --git a/gcc/testsuite/gcc.dg/bitint-107.c b/gcc/testsuite/gcc.dg/bitint-107.c new file mode 100644 index 0000000000000..a3f5f534088f3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/bitint-107.c @@ -0,0 +1,16 @@ +/* PR tree-optimization/115544 */ +/* { dg-do compile { target bitint } } */ +/* { dg-options "-O -fno-tree-fre -fno-tree-ccp -fno-tree-forwprop" } */ + +#if __BITINT_MAXWIDTH__ >= 129 +typedef _BitInt(129) B; +#else +typedef _BitInt(63) B; +#endif +B a, b; + +int +foo (void) +{ + return __builtin_mul_overflow (a, 1, &b); +} diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h index feaae7cc89959..3941fe7a8b620 100644 --- a/gcc/testsuite/gcc.target/arm/lob.h +++ b/gcc/testsuite/gcc.target/arm/lob.h @@ -1,15 +1,131 @@ #include <string.h> - +#include <stdint.h> /* Common code for lob tests. */ #define NO_LOB asm volatile ("@ clobber lr" : : : "lr" ) -#define N 10000 +#define N 100 + +static void +reset_data (int *a, int *b, int *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (b, -1, x * sizeof (*b)); + memset (c, 0, x * sizeof (*c)); +} + +static void +reset_data64 (int64_t *a, int64_t *c, int x) +{ + memset (a, -1, x * sizeof (*a)); + memset (c, 0, x * sizeof (*c)); +} + +static void +check_plus (int *a, int *b, int *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} + +static void +check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != (a[i] + b[i])) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } +} static void -reset_data (int *a, int *b, int *c) +check_memcpy64 (int64_t *a, int64_t *c, int x) { - memset (a, -1, N * sizeof (*a)); - memset (b, -1, N * sizeof (*b)); - memset (c, -1, N * sizeof (*c)); + for (int i = 0; i < N; i++) + { + NO_LOB; + if (i < x) + { + if (c[i] != a[i]) abort (); + } + else + { + if (c[i] != 0) abort (); + } + } }
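The reworked helpers now take the element count as an explicit argument, so a test can zero-initialize the destination, run a predicated loop, and then verify both the computed prefix and the untouched tail in one pass. A typical shape, sketched against the dlstp run-tests added in this patch (loop_int8 is a hypothetical stand-in for the function under test):

    int8_t a[N], b[N], c[N];

    int
    main (void)
    {
      reset_data8 (a, b, c, N);   /* a[i] = b[i] = -1, c[i] = 0.  */
      loop_int8 (a, b, c);        /* hypothetical predicated c[i] = a[i] + b[i] loop.  */
      check_plus8 (a, b, c, N);   /* aborts on any wrong sum.  */
      return 0;
    }

Calling check_plus8 with x < N additionally asserts that c[x..N-1] stayed zero, which is exactly the property an over-wide (badly predicated) vector store would violate.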
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c index ba5c82cd55c58..c8ce653a5c39f 100644 --- a/gcc/testsuite/gcc.target/arm/lob1.c +++ b/gcc/testsuite/gcc.target/arm/lob1.c @@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c) } while (i < N); } -void -check (int *a, int *b, int *c) -{ - for (int i = 0; i < N; i++) - { - NO_LOB; - if (c[i] != a[i] + b[i]) - abort (); - } -} - int main (void) { - reset_data (a, b, c); + reset_data (a, b, c, N); loop1 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop2 (a, b ,c); - check (a, b ,c); - reset_data (a, b, c); + check_plus (a, b, c, N); + reset_data (a, b, c, N); loop3 (a, b ,c); - check (a, b ,c); + check_plus (a, b, c, N); return 0; } diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c index 17b6124295e8a..4fe116e2c2be3 100644 --- a/gcc/testsuite/gcc.target/arm/lob6.c +++ b/gcc/testsuite/gcc.target/arm/lob6.c @@ -79,14 +79,14 @@ check (void) int main (void) { - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop1 (a1, b1, c1); ref1 (a2, b2, c2); check (); - reset_data (a1, b1, c1); - reset_data (a2, b2, c2); + reset_data (a1, b1, c1, N); + reset_data (a2, b2, c2, N); loop2 (a1, b1, c1); ref2 (a2, b2, c2); check (); diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c new file mode 100644 index 0000000000000..6e6da3d3d596b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c @@ -0,0 +1,146 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <arm_mve.h> + +#define IMM 5 + +#define TEST_COMPILE_IN_DLSTP_TERNARY(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \ + TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (va, vb, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + b += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (32, 4, w, NAME, PRED) + + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vaddq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vmulq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vsubq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vhaddq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vorrq, _x) + + +#define TEST_COMPILE_IN_DLSTP_TERNARY_M(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \ +
TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (__inactive, va, vb, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + b += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (32, 4, w, NAME, PRED) + + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vaddq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vmulq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vsubq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vhaddq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vorrq, _m) + +#define TEST_COMPILE_IN_DLSTP_TERNARY_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (va, IMM, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (32, 4, w, NAME, PRED) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vaddq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vmulq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vsubq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vhaddq, _x) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vbrsrq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshlq, _x) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshrq, _x) + +#define TEST_COMPILE_IN_DLSTP_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \ +void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \ +{ \ + while (n > 0) \ + { \ + mve_pred16_t p = vctp##BITS##q (n); \ + TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \ + TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (__inactive, va, IMM, p); \ + vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \ + c += LANES; \ + a += LANES; \ + n -= LANES; \ + } \ +} + +#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED) + +#define 
TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N(NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (8, 16, b, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (16, 8, h, NAME, PRED) \ +TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (32, 4, w, NAME, PRED) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vaddq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vmulq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vsubq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vhaddq, _m) + +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vbrsrq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshlq, _m) +TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshrq, _m) + +/* The final number of DLSTPs is the number of + `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macro invocations + above (24) multiplied by 6, since each invocation expands to three element + sizes in both signed and unsigned form. */ +/* { dg-final { scan-assembler-times {\tdlstp} 144 } } */ +/* { dg-final { scan-assembler-times {\tletp} 144 } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c new file mode 100644 index 0000000000000..84f4a2fc4f9bc --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c @@ -0,0 +1,749 @@ + +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps -fno-schedule-insns2 " } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-additional-options "-mtune=cortex-m55" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include <arm_mve.h> +/* Using a >=1 condition. */ +void test1 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n >= 1) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} +/* +** test1: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Test a for loop format of decrementing to zero */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test2 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i > 0; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} +/* +** test2: +**... +** dlstp.32 lr, r1 +**... +** vldrw.32 (q[0-9]+), \[r3\], #-16 +** vstrw.32 \1, \[r0\], #-16 +** letp lr, .* +**... +*/ + +/* Iteration counter counting up to num_iter. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} + +/* +** test3: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrb.8 \3, \[(r[0-9]+|ip)\] +**... +** letp lr, .* +**... +*/ + +/* Iteration counter counting down from num_iter. 
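As in test3, num_iter = (n + 15)/16 is ceil(n/16), so the loop runs once
   per 16-lane block and the vctp8q (n) predicate masks off the final
   partial block.  As an illustrative sketch (not part of the test), each
   predicated iteration computes the scalar equivalent of:

     for (int lane = 0; lane < 16 && lane < n; lane++)
       c[lane] = a[lane] + b[lane];

   which is what lets the iteration bookkeeping collapse into dlstp/letp.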
*/ +void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = num_iter; i > 0; i--) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} +/* +** test4: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrb.8 \3, \[(r[0-9]+|ip)\] +**... +** letp lr, .* +**... +*/ + +/* Using an unpredicated arithmetic instruction within the loop. */ +void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + /* Is affected by implicit predication, because vb also + came from an unpredicated load, but there is no functional + problem, because the result is used in a predicated store. */ + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + vstrbq_p_u8 (d, vd, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} + +/* +** test5: +**... +** dlstp.8 lr, r[0-9]+ +**... +** vldrb.8 q[0-9]+, \[r1\] +** vldrb.8 q[0-9]+, \[r2\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +**... +** vstrb.8 \1, \[r2\] +** vstrb.8 \1, \[r3\] +** letp lr, .* +**... +*/ + +/* Using a different VPR value for one instruction in the loop. */ +void test6 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test6: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using another VPR value in the loop, with a vctp. + The doloop logic will always try to do the transform on the first + vctp it encounters, so this is still expected to work. */ +void test7 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} +/* +** test7: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vctp, + but this time the p1 will also change in every loop (still fine) */ +void test8 (int32_t *a, int32_t *b, int32_t *c, int n, int g) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q (g); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + g++; + } +} + +/* +** test8: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vctp.32 r4 +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +**... 
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vctp_m + that is independent of the loop vctp VPR. */ +void test9 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p2 = vctp32q_m (n, p1); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test9: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vmsr p0, (r[0-9]+) @ movhi +** vpst +** vctpt.32 r3 +** vmrs (r[0-9]+), p0 @ movhi +** vmsr p0, \1 @ movhi +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vmsr p0, \2 @ movhi +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +**... +** vstrw.32 \3, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, + with a vctp_m that is tied to the base vctp VPR. This + is still fine, because the vctp_m will be transformed + into a vctp and be implicitly predicated. */ +void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + mve_pred16_t p1 = vctp32q_m (n, p); + int32x4_t vb = vldrwq_z_s32 (b, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} +/* + We don't need that extra vctp in the loop, but we currently do not optimize + it away, however, it is not wrong to use it... +*/ +/* +** test10: +**... +** dlstp.32 lr, r3 +** vctp.32 r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +**... +** vpst +** vldrwt.32 q[0-9]+, \[r1\], #16 +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vcmp. */ +void test11 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p1 = vcmpeqq_s32 (va, vb); + int32x4_t vc = vaddq_x_s32 (va, vb, p1); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test11: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vcmp.i32 eq, q[0-9]+, q[0-9]+ +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vcmp_m. */ +void test12 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test12: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vmsr p0, (r[0-9]+|ip) @ movhi +** vpst +** vcmpt.i32 eq, q[0-9]+, q[0-9]+ +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \2, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Generating and using a different VPR value in the loop, with a vcmp_m + that is tied to the base vctp VPR (same as above, this will be turned + into a vcmp and be implicitly predicated). 
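This is safe because both compare inputs come from vldrwq_z loads under
   the loop predicate: lanes at or beyond n compare zeroed data, and under
   tail predication those lanes are never written back by the predicated
   store, so dropping the explicit _m makes no observable difference.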
*/ +void test13 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p2); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* +** test13: +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vcmp.i32 eq, q[0-9]+, q[0-9]+ +** vpst +** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... +*/ + +/* Similar to test27 in dlstp-invalid-asm.c, but use a predicated load to make + it safe to implicitly predicate the vaddv. */ +void test14 (int32_t *a, int32_t *c, int n) +{ + int32_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + res += vaddvq_s32 (va); + int32x4_t vc = vdupq_n_s32 (res); + vstrwq_p_s32 (c, vc, p); + a += 4; + n -= 4; + } +} + +/* +** test14: +**... +** dlstp.32 lr, r2 +** vldrw.32 (q[0-9]+), \[r0\], #16 +** vaddv.s32 (r[0-9]+|ip), \1 +** add (r[0-9]+|ip), \3, \2 +** vdup.32 (q[0-9]+), \3 +** vstrw.32 \4, \[r1\] +** letp lr, .* +**... +*/ + +uint8_t test15 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq_m (vc, va, vc, p); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +/* +** test15: +**... +** dlstp.8 lr, r2 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +**... +** letp lr, .* +** vmov.u8 r[0-9]+, \2\[5\] +**... +*/ + +uint8_t test16 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq (va, vc); + vc = vaddq_m (vc, va, vc, p); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +/* +** test16: +**... +** dlstp.8 lr, r2 +**... +** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\] +**... +** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+ +** vadd.i8 \2, q[0-9]+, q[0-9]+ +** letp lr, .* +** vmov.u8 r[0-9]+, \2\[5\] +**... +*/ + + + +/* Using an across-vector unpredicated instruction in a valid way. + This tests that "vc" has correctly masked the risky "vb". */ +uint16_t test18 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + uint16x8_t vc = vaddq_m_u16 (va, va, vb, p); + res += vaddvq_u16 (vc); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* +** test18: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** vadd.i16 \1, q[0-9]+, q[0-9]+ +** vaddv.u16 (r[0-9]+|ip), \1 +** add (r[0-9]+|ip), \3, \2 +** uxth \3, \3 +** letp lr, .* +**... +*/ + +/* Using an across-vector unpredicated instruction with implicit scalar adding from outside the loop. 
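This stays valid because vc comes from a vaddq_m whose inactive lanes
   fall back to va, and va is produced by a zeroing vldrhq_z load.  As a
   sketch of the lane-wise behaviour:

     lane < n  : vc[lane] = a[lane] + b[lane]
     lane >= n : vc[lane] = 0, contributing nothing to the vaddva

   so the running scalar res is unaffected by the masked-off tail.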
*/ +uint16_t test19 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + uint16x8_t vc = vaddq_m_u16 (va, va, vb, p); + res = vaddvaq_u16 (res, vc); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* +** test19: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** vadd.i16 \1, q[0-9]+, q[0-9]+ +** vaddva.u16 (r[0-9]+|ip), \1 +** uxth \2, \2 +** letp lr, .* +**... +*/ + + +/* Using an across-vector predicated instruction in a valid way. */ +uint16_t test20 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_u16 (a); + res = vaddvaq_p_u16 (res, va, p); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* The uxth could be moved outside the loop. */ +/* +** test20: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** vaddva.u16 (r[0-9]+|ip), \1 +** uxth \2, \2 +** letp lr, .* +**... +*/ + +/* Using an across-vector predicated instruction in a valid way. */ +uint16_t test21 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_u16 (a); + res++; + res = vaddvaq_p_u16 (res, va, p); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* Also think it'd be safe to move uxth outside of the loop here. */ +/* +** test21: +**... +** dlstp.16 lr, r3 +** vldrh.16 (q[0-9]+), \[r2\], #16 +** adds (r[0-9]+|ip), \2, #1 +** uxth \2, \2 +** vaddva.u16 \2, \1 +** uxth \2, \2 +** letp lr, .* +**... +*/ + +int test22 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + res = vmaxvq (res, va); + n-=16; + a+=16; + } + return res; +} + +/* +** test22: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 (q[0-9]+), \[r[0-9]+\] +**... +** vmaxv.u8 (r[0-9]+|ip), \1 +** uxtb \2, \2 +** letp lr, .* +**... +*/ + +int test23 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vmaxavq (res, va); + n-=16; + a+=16; + } + return res; +} + +/* +** test23: +**... +** dlstp.8 lr, r3 +**... +** vldrb.8 (q[0-9]+), \[r3\] +**... +** vmaxav.s8 (r[0-9]+|ip), \1 +** uxtb \2, \2 +** letp lr, .* +**... +*/ + +/* Like test1, but update n before vctp, meaning we should only iterate for n-4 + elements. */ +void test24 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n >= 1) + { + n-=4; + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + } +} +/* +** test24: +**... +** subs r3, r3, #4 +**... +** dlstp.32 lr, r3 +** vldrw.32 q[0-9]+, \[r0\], #16 +** vldrw.32 q[0-9]+, \[r1\], #16 +** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+ +** vstrw.32 \1, \[r2\], #16 +** letp lr, .* +**... 
+*/ + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c new file mode 100644 index 0000000000000..c784f54013177 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c @@ -0,0 +1,46 @@ + +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <arm_mve.h> + +/* We don't support pattern recognition of signed N values when computing num_iter. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n-=16; + a += 16; + b += 16; + c += 16; + } +} + +/* Using a predicated vcmp to generate a new predicate value in the + loop and then using it in a predicated store insn. */ +void test17 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_s32 (va, vb); + mve_pred16_t p1 = vcmpeqq_m_s32 (va, vc, p); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} +/* This is an example of a loop that we could tail predicate but currently don't. */ +/* { dg-final { scan-assembler "letp" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c new file mode 100644 index 0000000000000..6966a3966046f --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c @@ -0,0 +1,44 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ +#include "dlstp-int16x8.c" + +int main () +{ + int i; + int16_t temp1[N]; + int16_t temp2[N]; + int16_t temp3[N]; + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus16 (temp1, temp2, temp3, 0); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus16 (temp1, temp2, temp3, 1); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 7); + check_plus16 (temp1, temp2, temp3, 7); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus16 (temp1, temp2, temp3, 8); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus16 (temp1, temp2, temp3, 9); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus16 (temp1, temp2, temp3, 16); + + reset_data16 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus16 (temp1, temp2, temp3, 17); + + reset_data16 (temp1, temp2, temp3, N); +} + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c new file mode 100644 index 0000000000000..33632c5f14dc6 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <stdlib.h> +#include <arm_mve.h> +#include "../lob.h" + +void __attribute__ ((noinline)) test (int16_t *a, 
int16_t *b, int16_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + int16x8_t va = vldrhq_z_s16 (a, p); + int16x8_t vb = vldrhq_z_s16 (b, p); + int16x8_t vc = vaddq_x_s16 (va, vb, p); + vstrhq_p_s16 (c, vc, p); + c+=8; + a+=8; + b+=8; + n-=8; + } +} + +/* { dg-final { scan-assembler-times {\tdlstp.16} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c new file mode 100644 index 0000000000000..6833dddde92b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c @@ -0,0 +1,45 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include "dlstp-int32x4.c" + +int main () +{ + int i; + int32_t temp1[N]; + int32_t temp2[N]; + int32_t temp3[N]; + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus32 (temp1, temp2, temp3, 0); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus32 (temp1, temp2, temp3, 1); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 3); + check_plus32 (temp1, temp2, temp3, 3); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 4); + check_plus32 (temp1, temp2, temp3, 4); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 5); + check_plus32 (temp1, temp2, temp3, 5); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 8); + check_plus32 (temp1, temp2, temp3, 8); + + reset_data32 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 9); + check_plus32 (temp1, temp2, temp3, 9); + + reset_data32 (temp1, temp2, temp3, N); +} + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c new file mode 100644 index 0000000000000..5d09f784b7716 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <stdlib.h> +#include <arm_mve.h> +#include "../lob.h" + +void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +/* { dg-final { scan-assembler-times {\tdlstp.32} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c new file mode 100644 index 0000000000000..cc0b9ce7ee9a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c @@ -0,0 +1,48 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 
-save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include "dlstp-int64x2.c" + +int main () +{ + int i; + int64_t temp1[N]; + int64_t temp3[N]; + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 0); + check_memcpy64 (temp1, temp3, 0); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 1); + check_memcpy64 (temp1, temp3, 1); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 2); + check_memcpy64 (temp1, temp3, 2); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 3); + check_memcpy64 (temp1, temp3, 3); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 4); + check_memcpy64 (temp1, temp3, 4); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 5); + check_memcpy64 (temp1, temp3, 5); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 6); + check_memcpy64 (temp1, temp3, 6); + + reset_data64 (temp1, temp3, N); + test (temp1, temp3, 7); + check_memcpy64 (temp1, temp3, 7); + + reset_data64 (temp1, temp3, N); +} + diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c new file mode 100644 index 0000000000000..21e882424ec3b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c @@ -0,0 +1,28 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <stdlib.h> +#include <arm_mve.h> +#include "../lob.h" + +void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp64q (n); + int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (0, 8), p); + vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (0, 8), va, p); + c+=2; + a+=2; + n-=2; + } +} + +/* { dg-final { scan-assembler-times {\tdlstp.64} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c new file mode 100644 index 0000000000000..d46571f329cf3 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c @@ -0,0 +1,44 @@ +/* { dg-do run { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include "dlstp-int8x16.c" + +int main () +{ + int i; + int8_t temp1[N]; + int8_t temp2[N]; + int8_t temp3[N]; + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 0); + check_plus8 (temp1, temp2, temp3, 0); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 1); + check_plus8 (temp1, temp2, temp3, 1); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 15); + check_plus8 (temp1, temp2, temp3, 15); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 16); + check_plus8 (temp1, temp2, temp3, 16); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 17); + check_plus8 (temp1, temp2, temp3, 17); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 32); + check_plus8 (temp1, temp2, temp3, 32); + + reset_data8 (temp1, temp2, temp3, N); + test (temp1, temp2, temp3, 33); + check_plus8 (temp1, temp2, temp3, 33); + + reset_data8 (temp1, temp2, temp3, N); +} diff --git 
a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c new file mode 100644 index 0000000000000..d5f22b5026259 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c @@ -0,0 +1,32 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O2 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <stdlib.h> +#include <arm_mve.h> +#include "../lob.h" + +void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + int8x16_t vb = vldrbq_z_s8 (b, p); + int8x16_t vc = vaddq_x_s8 (va, vb, p); + vstrbq_p_s8 (c, vc, p); + c+=16; + a+=16; + b+=16; + n-=16; + } +} + + +/* { dg-final { scan-assembler-times {\tdlstp.8} 1 } } */ +/* { dg-final { scan-assembler-times {\tletp} 1 } } */ +/* { dg-final { scan-assembler-not "\tvctp" } } */ +/* { dg-final { scan-assembler-not "\tvpst" } } */ +/* { dg-final { scan-assembler-not "p0" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c new file mode 100644 index 0000000000000..26df2d30523ce --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c @@ -0,0 +1,521 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-options "-O3 -save-temps" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +#include <stdint.h> +#include <arm_mve.h> + +/* Terminating on a non-zero number of elements. */ +void test0 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + while (n > 1) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Terminating on n >= 0. */ +void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + while (n >= 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Similar, terminating on a non-zero number of elements, but in a for loop + format. */ +int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7}; +void test2 (int32_t *b, int num_elems) +{ + for (int i = num_elems; i >= 2; i-= 4) + { + mve_pred16_t p = vctp32q (i); + int32x4_t va = vldrwq_z_s32 (&(a[i]), p); + vstrwq_p_s32 (b + i, va, p); + } +} + +/* Iteration counter counting up to num_iter, with a non-zero starting num. */ +void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 1; i < num_iter; i++) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Iteration counter counting up to num_iter, with a larger increment */ +void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int num_iter = (n + 15)/16; + for (int i = 0; i < num_iter; i+=2) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_x_u8 (va, vb, p); + vstrbq_p_u8 (c, vc, p); + n -= 16; + } +} + +/* Using an unpredicated store instruction within the loop. 
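An unpredicated vstrbq_u8 writes all 16 lanes on every iteration,
   including lanes past the last valid element in the final iteration.
   Implicitly predicating it would suppress those trailing writes and so
   change the function's observable behaviour, which is why no dlstp/letp
   may be generated here.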
*/ +void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_u8 (va, vb); + uint8x16_t vd = vaddq_x_u8 (va, vb, p); + vstrbq_u8 (d, vd); + n -= 16; + } +} + +/* Using an unpredicated store outside the loop. */ +void test6 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + vx = vaddq_u8 (vx, vc); + a += 16; + b += 16; + n -= 16; + } + vstrbq_u8 (c, vx); +} + +/* Using a VPR that gets modified within the loop. */ +void test9 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + p++; + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using a VPR that gets re-generated within the loop. */ +void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + mve_pred16_t p = vctp32q (n); + while (n > 0) + { + int32x4_t va = vldrwq_z_s32 (a, p); + p = vctp32q (n); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using vctp32q_m instead of vctp32q. */ +void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q_m (n, p0); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an unpredicated op with a scalar output, where the result is valid + outside the bb. This is invalid, because one of the inputs to the + unpredicated op is also unpredicated. */ +uint8_t test12 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + uint8x16_t vc = vaddq_u8 (va, vb); + sum += vaddvq_u8 (vc); + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +/* Using an unpredicated vcmp to generate a new predicate value in the + loop and then using that VPR to predicate a store insn. */ +void test13 (int32_t *a, int32_t *b, int32x4_t vc, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_s32 (a); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_s32 (va, vb); + mve_pred16_t p1 = vcmpeqq_s32 (va, vc); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an across-vector unpredicated instruction. "vb" is the risk. */ +uint16_t test14 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + vb = vaddq_u16 (va, vb); + res = vaddvq_u16 (vb); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +/* Using an across-vector unpredicated instruction. "vc" is the risk. 
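Concretely: vc = vaddq_u16 (va, vb) is unpredicated, so for lanes at or
   beyond n it holds 0 + vb[lane], i.e. live data from the unpredicated vb
   load, and the unpredicated vaddvaq then folds those lanes into res.
   Tail-predicating the loop would drop them and change the accumulated
   value.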
*/ +uint16_t test15 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16x8_t vb = vldrhq_u16 (b); + uint16_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t va = vldrhq_z_u16 (a, p); + uint16x8_t vc = vaddq_u16 (va, vb); + res = vaddvaq_u16 (res, vc); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +uint16_t test16 (uint16_t *a, uint16_t *b, uint16_t *c, int n) +{ + uint16_t res =0; + while (n > 0) + { + mve_pred16_t p = vctp16q (n); + uint16x8_t vb = vldrhq_u16 (b); + uint16x8_t va = vldrhq_z_u16 (a, p); + res = vaddvaq_u16 (res, vb); + res = vaddvaq_p_u16 (res, va, p); + c += 8; + a += 8; + b += 8; + n -= 8; + } + return res; +} + +int test17 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vmaxvq (res, va); + n-=16; + a+=16; + } + return res; +} + + + +int test18 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vminvq (res, va); + n-=16; + a+=16; + } + return res; +} + +int test19 (int8_t *a, int8_t *b, int8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vminavq (res, va); + n-=16; + a+=16; + } + return res; +} + +int test20 (uint8_t *a, uint8_t *b, uint8_t *c, int n) +{ + int res = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + res = vminvq (res, va); + n-=16; + a+=16; + } + return res; +} + +uint8x16_t test21 (uint8_t *a, uint32_t *b, int n, uint8x16_t res) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + res = vshlcq_u8 (va, b, 1); + n-=16; + a+=16; + } + return res; +} + +int8x16_t test22 (int8_t *a, int32_t *b, int n, int8x16_t res) +{ + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + int8x16_t va = vldrbq_z_s8 (a, p); + res = vshlcq_s8 (va, b, 1); + n-=16; + a+=16; + } + return res; +} + +/* Using an unsigned number of elements to count down from, with a >0*/ +void test23 (int32_t *a, int32_t *b, int32_t *c, unsigned int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +/* Using an unsigned number of elements to count up to, with a = 1) + { + n-=4; + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c+=4; + a+=4; + b+=4; + n-=4; + } +} + +void test27 (int32_t *a, int32_t *c, int n) +{ + int32_t res = 0; + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_s32 (a); + res += vaddvq_s32 (va); + int32x4_t vc = vdupq_n_s32 (res); + vstrwq_p_s32 (c, vc, p); + a += 4; + n -= 4; + } +} + +/* Using an unpredicated vcmp to generate a new predicate value in the + loop and then using it in a predicated store insn. 
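Because the vcmp is unpredicated, p1 is also computed for the lanes at
   or beyond n, so it can be true there and the vstrwq_p may legitimately
   write those lanes.  An implicitly predicated loop would force p1 false
   beyond n and suppress those writes, so the transform must be rejected.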
*/ +void test28 (int32_t *a, int32_t *b, int32_t *c, int n) +{ + while (n > 0) + { + mve_pred16_t p = vctp32q (n); + int32x4_t va = vldrwq_z_s32 (a, p); + int32x4_t vb = vldrwq_s32 (b); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + mve_pred16_t p1 = vcmpeqq_s32 (va, vc); + vstrwq_p_s32 (c, vc, p1); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + +/* Using an unpredicated op with a scalar output, where the result is valid + outside the bb. The unpredicated lanes are not guaranteed zero, so would + affect the vaddv in the non-tail predicated case. */ +uint8_t test29 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + sum += vaddvq_u8 (vc); + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +/* Same as above, but with another scalar op between the unpredicated op and + the scalar op outside the loop. */ +uint8_t test30 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx, int g) +{ + uint8_t sum = 0; + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_z_u8 (b, p); + uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p); + sum += vaddvq_u8 (vc); + sum += g; + a += 16; + b += 16; + n -= 16; + } + return sum; +} + +uint8_t test31 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq (vb, vc); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +uint8_t test32 (uint8_t *a, uint8_t *b, int n) +{ + uint8_t res = 0; + uint8x16_t vc = vdupq_n_u8 (0); + while (n > 0) + { + mve_pred16_t p = vctp8q (n); + uint8x16_t va = vldrbq_z_u8 (a, p); + uint8x16_t vb = vldrbq_u8 (b); + vc = vaddq_m (vc, va, vc, p); + vc = vaddq (vb, vc); + res = vgetq_lane (vc, 5); + + a += 16; + b += 16; + n -= 16; + } + return res; +} + +/* { dg-final { scan-assembler-not "\tdlstp" } } */ +/* { dg-final { scan-assembler-not "\tletp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc index 2a50f5bf67c86..c4dc89367ef58 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc @@ -208,6 +208,7 @@ extern void test_arch_arrowlake_s (void) __attribute__((__target__("arch=arrowla extern void test_arch_pantherlake (void) __attribute__((__target__("arch=pantherlake"))); extern void test_arch_lujiazui (void) __attribute__((__target__("arch=lujiazui"))); extern void test_arch_yongfeng (void) __attribute__((__target__("arch=yongfeng"))); +extern void test_arch_shijidadao (void) __attribute__((__target__("arch=shijidadao"))); extern void test_arch_k8 (void) __attribute__((__target__("arch=k8"))); extern void test_arch_k8_sse3 (void) __attribute__((__target__("arch=k8-sse3"))); extern void test_arch_opteron (void) __attribute__((__target__("arch=opteron"))); @@ -233,6 +234,7 @@ extern void test_tune_corei7_avx (void) __attribute__((__target__("tune=corei7- extern void test_tune_core_avx2 (void) __attribute__((__target__("tune=core-avx2"))); extern void test_tune_lujiazui (void) __attribute__((__target__("tune=lujiazui"))); extern void test_tune_yongfeng (void) __attribute__((__target__("tune=yongfeng"))); +extern void test_tune_shijidadao (void) 
__attribute__((__target__("tune=shijidadao"))); extern void test_tune_k8 (void) __attribute__((__target__("tune=k8"))); extern void test_tune_k8_sse3 (void) __attribute__((__target__("tune=k8-sse3"))); extern void test_tune_opteron (void) __attribute__((__target__("tune=opteron"))); diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c new file mode 100644 index 0000000000000..572bcb8f291be --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64 -O3" } */ + +#include "riscv_vector.h" + +#define CMP_FLOAT_VF_1(ID, S, OP, IMM) \ + vbool##S##_t test_float_1_##ID##_##S (vfloat##S##m1_t op1, size_t vl) \ + { \ + return __riscv_vmf##OP##_vf_f##S##m1_b##S (op1, IMM, vl); \ + } + +CMP_FLOAT_VF_1 (0, 32, eq, 0.0) +CMP_FLOAT_VF_1 (1, 32, eq, 1.0) +CMP_FLOAT_VF_1 (2, 32, eq, __builtin_nanf ("123")) +CMP_FLOAT_VF_1 (3, 32, ne, 0.0) +CMP_FLOAT_VF_1 (4, 32, ne, 1.0) +CMP_FLOAT_VF_1 (5, 32, ne, __builtin_nanf ("123")) + +CMP_FLOAT_VF_1 (0, 64, eq, 0.0) +CMP_FLOAT_VF_1 (1, 64, eq, 1.0) +CMP_FLOAT_VF_1 (2, 64, eq, __builtin_nan ("123")) +CMP_FLOAT_VF_1 (3, 64, ne, 0.0) +CMP_FLOAT_VF_1 (4, 64, ne, 1.0) +CMP_FLOAT_VF_1 (5, 64, ne, __builtin_nan ("123")) + +#define CMP_FLOAT_VF_2(ID, S, OP, IMM) \ + vfloat##S##m1_t test_float_2_##ID##_##S (vfloat##S##m1_t op1, \ + vfloat##S##m1_t op2, size_t vl) \ + { \ + vfloat##S##m1_t op3 = __riscv_vfmv_s_f_f##S##m1 (IMM, vl); \ + vbool##S##_t mask1 = __riscv_vmf##OP##_vf_f##S##m1_b##S (op1, IMM, vl); \ + vbool##S##_t mask2 = __riscv_vmf##OP##_vv_f##S##m1_b##S (op1, op3, vl); \ + vbool##S##_t mask3 = __riscv_vmor (mask1, mask2, vl); \ + return __riscv_vmerge_vvm_f##S##m1_tu (op1, op1, op2, mask3, vl); \ + } + +CMP_FLOAT_VF_2 (0, 32, eq, 0.0) +CMP_FLOAT_VF_2 (1, 32, eq, 1.0) +CMP_FLOAT_VF_2 (2, 32, eq, __builtin_nanf ("123")) +CMP_FLOAT_VF_2 (3, 32, ne, 0.0) +CMP_FLOAT_VF_2 (4, 32, ne, 1.0) +CMP_FLOAT_VF_2 (5, 32, ne, __builtin_nanf ("123")) + +CMP_FLOAT_VF_2 (0, 64, eq, 0.0) +CMP_FLOAT_VF_2 (1, 64, eq, 1.0) +CMP_FLOAT_VF_2 (2, 64, eq, __builtin_nan ("123")) +CMP_FLOAT_VF_2 (3, 64, ne, 0.0) +CMP_FLOAT_VF_2 (4, 64, ne, 1.0) +CMP_FLOAT_VF_2 (5, 64, ne, __builtin_nan ("123")) + +/* { dg-final { scan-assembler-times {vmfeq\.vf} 12 } } */ +/* { dg-final { scan-assembler-times {vmfne\.vf} 12 } } */ +/* { dg-final { scan-assembler-times {vmfeq\.vv} 6 } } */ +/* { dg-final { scan-assembler-times {vmfne\.vv} 6 } } */ diff --git a/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 b/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 new file mode 100644 index 0000000000000..5ed8e82853bf0 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 @@ -0,0 +1,45 @@ +! { dg-do compile } +! { dg-additional-options "-Wuninitialized" } +! +! 
PR fortran/115390 - fixes for CHARACTER(len=*) dummies with bind(C) + +module test + implicit none +contains + subroutine bar(s,t) bind(c) + character(*), intent(in) :: s,t + optional :: t + call foo(s,t) + end + subroutine bar1(s,t) bind(c) + character(*), intent(in) :: s(:),t(:) + optional :: t + call foo1(s,t) + end + subroutine bar4(s,t) bind(c) + character(len=*,kind=4), intent(in) :: s,t + optional :: t + call foo4(s,t) + end + subroutine bar5(s,t) bind(c) + character(len=*,kind=4), intent(in) :: s(:),t(:) + optional :: t + call foo5(s,t) + end + subroutine foo(s,t) + character(*), intent(in) :: s,t + optional :: t + end + subroutine foo1(s,t) + character(*), intent(in) :: s(:),t(:) + optional :: t + end + subroutine foo4(s,t) + character(len=*,kind=4), intent(in) :: s,t + optional :: t + end + subroutine foo5(s,t) + character(len=*,kind=4), intent(in) :: s(:),t(:) + optional :: t + end +end diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index eeb75c09e91aa..347dac97e497e 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -7815,11 +7815,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "use not simple.\n"); return false; } - if (i == STMT_VINFO_REDUC_IDX (stmt_info)) - continue; - /* For an IFN_COND_OP we might hit the reduction definition operand - twice (once as definition, once as else). */ + /* Skip reduction operands, and for an IFN_COND_OP we might hit the + reduction operand twice (once as definition, once as else). */ if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)]) continue; @@ -7870,10 +7868,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (lane_reducing) STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in; - enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); - STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; + enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info); + STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type; /* If we have a condition reduction, see if we can simplify it further. */ - if (v_reduc_type == COND_REDUCTION) + if (reduction_type == COND_REDUCTION) { if (slp_node && SLP_TREE_LANES (slp_node) != 1) return false; @@ -8040,7 +8038,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; - vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); + reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); if (reduction_type == TREE_CODE_REDUCTION) { /* Check whether it's ok to change the order of the computation. @@ -8582,9 +8580,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, /* Transform. */ tree new_temp = NULL_TREE; - auto_vec<tree> vec_oprnds0; - auto_vec<tree> vec_oprnds1; - auto_vec<tree> vec_oprnds2; + auto_vec<tree> vec_oprnds[3]; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); @@ -8622,7 +8618,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, } bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); - gcc_assert (single_defuse_cycle || lane_reducing_op_p (code)); + bool lane_reducing = lane_reducing_op_p (code); + gcc_assert (single_defuse_cycle || lane_reducing); /* Create the destination vector */ tree scalar_dest = gimple_get_lhs (stmt_info->stmt); @@ -8632,14 +8629,15 @@ definition. */ if (!cond_fn_p) { + gcc_assert (reduc_index >= 0 && reduc_index <= 2); vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, single_defuse_cycle && reduc_index == 0 - ? 
NULL_TREE : op.ops[0], &vec_oprnds0, + ? NULL_TREE : op.ops[0], &vec_oprnds[0], single_defuse_cycle && reduc_index == 1 - ? NULL_TREE : op.ops[1], &vec_oprnds1, + ? NULL_TREE : op.ops[1], &vec_oprnds[1], op.num_ops == 3 && !(single_defuse_cycle && reduc_index == 2) - ? op.ops[2] : NULL_TREE, &vec_oprnds2); + ? op.ops[2] : NULL_TREE, &vec_oprnds[2]); } else { @@ -8647,12 +8645,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo, vectype. */ gcc_assert (single_defuse_cycle && (reduc_index == 1 || reduc_index == 2)); - vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, - op.ops[0], truth_type_for (vectype_in), &vec_oprnds0, + vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, op.ops[0], + truth_type_for (vectype_in), &vec_oprnds[0], reduc_index == 1 ? NULL_TREE : op.ops[1], - NULL_TREE, &vec_oprnds1, + NULL_TREE, &vec_oprnds[1], reduc_index == 2 ? NULL_TREE : op.ops[2], - NULL_TREE, &vec_oprnds2); + NULL_TREE, &vec_oprnds[2]); } /* For single def-use cycles get one copy of the vectorized reduction @@ -8660,24 +8658,26 @@ vect_transform_reduction (loop_vec_info loop_vinfo, if (single_defuse_cycle) { vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1, - reduc_index == 0 ? op.ops[0] : NULL_TREE, &vec_oprnds0, - reduc_index == 1 ? op.ops[1] : NULL_TREE, &vec_oprnds1, + reduc_index == 0 ? op.ops[0] : NULL_TREE, + &vec_oprnds[0], + reduc_index == 1 ? op.ops[1] : NULL_TREE, + &vec_oprnds[1], reduc_index == 2 ? op.ops[2] : NULL_TREE, - &vec_oprnds2); + &vec_oprnds[2]); } bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); + unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length (); - unsigned num = (reduc_index == 0 - ? vec_oprnds1.length () : vec_oprnds0.length ()); for (unsigned i = 0; i < num; ++i) { gimple *new_stmt; - tree vop[3] = { vec_oprnds0[i], vec_oprnds1[i], NULL_TREE }; + tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE }; if (masked_loop_p && !mask_by_cond_expr) { - /* No conditional ifns have been defined for dot-product yet. */ - gcc_assert (code != DOT_PROD_EXPR); + /* No conditional ifns have been defined for lane-reducing op + yet. */ + gcc_assert (!lane_reducing); /* Make sure that the reduction accumulator is vop[0]. 
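(When reduc_index == 1, the accumulator arrived as the second operand
     and the code below swaps vop[0] and vop[1]; this relies on the
     operation being commutative in that case.)  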
*/ if (reduc_index == 1) @@ -8698,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, else { if (op.num_ops >= 3) - vop[2] = vec_oprnds2[i]; + vop[2] = vec_oprnds[2][i]; if (masked_loop_p && mask_by_cond_expr) { @@ -8729,14 +8729,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, } if (single_defuse_cycle && i < num - 1) - { - if (reduc_index == 0) - vec_oprnds0.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 1) - vec_oprnds1.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 2) - vec_oprnds2.safe_push (gimple_get_lhs (new_stmt)); - } + vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt)); else if (slp_node) slp_node->push_vec_def (new_stmt); else diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 7d18b5bfee5d2..a5665946a4ebc 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3919,7 +3919,6 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) scalar_stmts.create (loop_vinfo->reductions.length ()); for (auto next_info : loop_vinfo->reductions) { - gassign *g; next_info = vect_stmt_to_vectorize (next_info); if ((STMT_VINFO_RELEVANT_P (next_info) || STMT_VINFO_LIVE_P (next_info)) @@ -3931,8 +3930,7 @@ { /* Do not discover SLP reductions combining lane-reducing ops, that will fail later. */ - if (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info))) - || !lane_reducing_op_p (gimple_assign_rhs_code (g))) + if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info))) scalar_stmts.quick_push (next_info); else { diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 6bb0f5c3a56f6..60224f4e28472 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2169,12 +2169,24 @@ vect_apply_runtime_profitability_check_p (loop_vec_info loop_vinfo) && th >= vect_vf_for_cost (loop_vinfo)); } +/* Return true if CODE is a lane-reducing opcode. */ + inline bool lane_reducing_op_p (code_helper code) { return code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR; } +/* Return true if STMT is a lane-reducing statement. */ + +inline bool +lane_reducing_stmt_p (gimple *stmt) +{ + if (auto *assign = dyn_cast <gassign *> (stmt)) + return lane_reducing_op_p (gimple_assign_rhs_code (assign)); + return false; +} + /* Source location + hotness information. */ extern dump_user_location_t vect_location; diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 907f6cfb0e83c..94a5ce9a1329c 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,27 @@ +2024-06-19 Jonathan Wakely + + * include/std/future: Adjust whitespace to use tabs for + indentation. + +2024-06-19 Jonathan Wakely + + * include/std/future (_State_baseV2::_Setter<_Res, void>): Add + noexcept to call operator. + (_State_baseV2::_Setter<_Res, __exception_ptr_tag>): Likewise. + +2024-06-19 Jonathan Wakely + + * include/bits/stl_pair.h [__cpp_lib_concepts] (pair()): Add + conditional noexcept. + +2024-06-19 Jonathan Wakely + + * include/bits/stl_tempbuf.h (__get_temporary_buffer): Cast + argument to size_t to handle negative values and suppress + -Wsign-compare warning. + (_Temporary_buffer): Move diagnostic pragmas to new location of + call to std::get_temporary_buffer. 
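A minimal sketch of the sign-compare pattern the stl_tempbuf.h entry above
describes, assuming the usual shape of a temporary-buffer request (this is an
illustration, not the actual library code):

    ptrdiff_t len = -1;                    /* a signed, possibly negative request */
    size_t max = (size_t) -1 / sizeof (int);
    if ((size_t) len > max)                /* negative len wraps to a huge unsigned
                                              value and is rejected; the explicit
                                              cast also silences -Wsign-compare */
      len = 0;

The cast thus serves both purposes named in the entry: it rejects negative
sizes and removes the mixed signed/unsigned comparison.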
+ 2024-06-18 Jonathan Wakely * include/bits/cpp_type_traits.h: Fix outdated comment about the diff --git a/libstdc++-v3/include/bits/stl_pair.h b/libstdc++-v3/include/bits/stl_pair.h index 0c1e5719a1a3b..0d60eaba1941e 100644 --- a/libstdc++-v3/include/bits/stl_pair.h +++ b/libstdc++-v3/include/bits/stl_pair.h @@ -344,6 +344,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION explicit(__not_<__and_<__is_implicitly_default_constructible<_T1>, __is_implicitly_default_constructible<_T2>>>()) pair() + noexcept(is_nothrow_default_constructible_v<_T1> + && is_nothrow_default_constructible_v<_T2>) requires is_default_constructible_v<_T1> && is_default_constructible_v<_T2> : first(), second() diff --git a/libstdc++-v3/include/std/future b/libstdc++-v3/include/std/future index 9e75ae98b13d2..6ce7d89ca3ffe 100644 --- a/libstdc++-v3/include/std/future +++ b/libstdc++-v3/include/std/future @@ -292,7 +292,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { using __allocator_type = __alloc_rebind<_Alloc, _Result_alloc>; - explicit + explicit _Result_alloc(const _Alloc& __a) : _Result<_Res>(), _Alloc(__a) { } @@ -362,9 +362,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - future_status - wait_for(const chrono::duration<_Rep, _Period>& __rel) - { + future_status + wait_for(const chrono::duration<_Rep, _Period>& __rel) + { // First, check if the future has been made ready. Use acquire MO // to synchronize with the thread that made it ready. if (_M_status._M_load(memory_order_acquire) == _Status::__ready) @@ -396,9 +396,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - future_status - wait_until(const chrono::time_point<_Clock, _Duration>& __abs) - { + future_status + wait_until(const chrono::time_point<_Clock, _Duration>& __abs) + { #if __cplusplus > 201703L static_assert(chrono::is_clock_v<_Clock>); #endif @@ -430,8 +430,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_set_result(function<_Ptr_type()> __res, bool __ignore_failure = false) { bool __did_set = false; - // all calls to this function are serialized, - // side-effects of invoking __res only happen once + // all calls to this function are serialized, + // side-effects of invoking __res only happen once call_once(_M_once, &_State_baseV2::_M_do_set, this, std::__addressof(__res), std::__addressof(__did_set)); if (__did_set) @@ -439,7 +439,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_status._M_store_notify_all(_Status::__ready, memory_order_release); else if (!__ignore_failure) - __throw_future_error(int(future_errc::promise_already_satisfied)); + __throw_future_error(int(future_errc::promise_already_satisfied)); } // Provide a result to the shared state but delay making it ready @@ -451,12 +451,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { bool __did_set = false; unique_ptr<_Make_ready> __mr{new _Make_ready}; - // all calls to this function are serialized, - // side-effects of invoking __res only happen once + // all calls to this function are serialized, + // side-effects of invoking __res only happen once call_once(_M_once, &_State_baseV2::_M_do_set, this, std::__addressof(__res), std::__addressof(__did_set)); if (!__did_set) - __throw_future_error(int(future_errc::promise_already_satisfied)); + __throw_future_error(int(future_errc::promise_already_satisfied)); __mr->_M_shared_state = std::move(__self); __mr->_M_set(); __mr.release(); @@ -490,41 +490,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - struct _Setter; + struct _Setter; // set lvalues template - struct _Setter<_Res, _Arg&> - { - // check this is only used by promise::set_value(const R&) - // or promise::set_value(R&) - 
static_assert(is_same<_Res, _Arg&>::value // promise - || is_same::value, // promise - "Invalid specialisation"); + struct _Setter<_Res, _Arg&> + { + // check this is only used by promise::set_value(const R&) + // or promise::set_value(R&) + static_assert(is_same<_Res, _Arg&>::value // promise + || is_same::value, // promise + "Invalid specialisation"); // Used by std::promise to copy construct the result. - typename promise<_Res>::_Ptr_type operator()() const - { - _M_promise->_M_storage->_M_set(*_M_arg); - return std::move(_M_promise->_M_storage); - } - promise<_Res>* _M_promise; - _Arg* _M_arg; - }; + typename promise<_Res>::_Ptr_type operator()() const + { + _M_promise->_M_storage->_M_set(*_M_arg); + return std::move(_M_promise->_M_storage); + } + promise<_Res>* _M_promise; + _Arg* _M_arg; + }; // set rvalues template - struct _Setter<_Res, _Res&&> - { + struct _Setter<_Res, _Res&&> + { // Used by std::promise to move construct the result. - typename promise<_Res>::_Ptr_type operator()() const - { - _M_promise->_M_storage->_M_set(std::move(*_M_arg)); - return std::move(_M_promise->_M_storage); - } - promise<_Res>* _M_promise; - _Res* _M_arg; - }; + typename promise<_Res>::_Ptr_type operator()() const + { + _M_promise->_M_storage->_M_set(std::move(*_M_arg)); + return std::move(_M_promise->_M_storage); + } + promise<_Res>* _M_promise; + _Res* _M_arg; + }; // set void template @@ -532,7 +532,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { static_assert(is_void<_Res>::value, "Only used for promise"); - typename promise<_Res>::_Ptr_type operator()() const + typename promise<_Res>::_Ptr_type operator()() const noexcept { return std::move(_M_promise->_M_storage); } promise<_Res>* _M_promise; @@ -542,35 +542,35 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // set exceptions template - struct _Setter<_Res, __exception_ptr_tag> - { + struct _Setter<_Res, __exception_ptr_tag> + { // Used by std::promise to store an exception as the result. 
@@ -542,35 +542,35 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
      // set exceptions
      template<typename _Res>
-	struct _Setter<_Res, __exception_ptr_tag>
-	{
+	struct _Setter<_Res, __exception_ptr_tag>
+	{
	  // Used by std::promise to store an exception as the result.
-	  typename promise<_Res>::_Ptr_type operator()() const
-	  {
-	    _M_promise->_M_storage->_M_error = *_M_ex;
-	    return std::move(_M_promise->_M_storage);
-	  }
+	  typename promise<_Res>::_Ptr_type operator()() const noexcept
+	  {
+	    _M_promise->_M_storage->_M_error = *_M_ex;
+	    return std::move(_M_promise->_M_storage);
+	  }
 
-	  promise<_Res>* _M_promise;
-	  exception_ptr* _M_ex;
-	};
+	  promise<_Res>* _M_promise;
+	  exception_ptr* _M_ex;
+	};
 
      template<typename _Res, typename _Arg>
	__attribute__((__always_inline__))
-	static _Setter<_Res, _Arg&&>
-	__setter(promise<_Res>* __prom, _Arg&& __arg) noexcept
-	{
-	  return _Setter<_Res, _Arg&&>{ __prom, std::__addressof(__arg) };
-	}
+	static _Setter<_Res, _Arg&&>
+	__setter(promise<_Res>* __prom, _Arg&& __arg) noexcept
+	{
+	  return _Setter<_Res, _Arg&&>{ __prom, std::__addressof(__arg) };
+	}
 
      template<typename _Res>
	__attribute__((__always_inline__))
-	static _Setter<_Res, __exception_ptr_tag>
-	__setter(exception_ptr& __ex, promise<_Res>* __prom) noexcept
-	{
-	  __glibcxx_assert(__ex != nullptr); // LWG 2276
-	  return _Setter<_Res, __exception_ptr_tag>{ __prom, &__ex };
-	}
+	static _Setter<_Res, __exception_ptr_tag>
+	__setter(exception_ptr& __ex, promise<_Res>* __prom) noexcept
+	{
+	  __glibcxx_assert(__ex != nullptr); // LWG 2276
+	  return _Setter<_Res, __exception_ptr_tag>{ __prom, &__ex };
+	}
 
      template<typename _Res>
	__attribute__((__always_inline__))
@@ -581,24 +581,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
	}
 
      template<typename _Tp>
-	static void
-	_S_check(const shared_ptr<_Tp>& __p)
-	{
-	  if (!static_cast<bool>(__p))
-	    __throw_future_error((int)future_errc::no_state);
-	}
+	static void
+	_S_check(const shared_ptr<_Tp>& __p)
+	{
+	  if (!static_cast<bool>(__p))
+	    __throw_future_error((int)future_errc::no_state);
+	}
 
    private:
      // The function invoked with std::call_once(_M_once, ...).
      void
      _M_do_set(function<_Ptr_type()>* __f, bool* __did_set)
      {
-	_Ptr_type __res = (*__f)();
-	// Notify the caller that we did try to set; if we do not throw an
-	// exception, the caller will be aware that it did set (e.g., see
-	// _M_set_result).
+	_Ptr_type __res = (*__f)();
+	// Notify the caller that we did try to set; if we do not throw an
+	// exception, the caller will be aware that it did set (e.g., see
+	// _M_set_result).
	*__did_set = true;
-	_M_result.swap(__res); // nothrow
+	_M_result.swap(__res); // nothrow
      }
 
      // Wait for completion of async function.
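(Aside, not part of the patch: the _Setter specializations and __setter helpers reindented above are small function objects that bind a promise and an argument so the actual store can run later, under call_once. A simplified analogue with invented names:)

#include <memory>
#include <utility>

// Hypothetical stand-in for _Setter<_Res, _Res&&>: defers the move into
// storage until operator() is invoked.
template<typename T>
struct toy_setter
{
  T* storage;
  T* arg;

  T* operator()() const
  {
    *storage = std::move(*arg);
    return storage;
  }
};

template<typename T>
toy_setter<T> make_toy_setter(T* storage, T& arg) noexcept
{ return toy_setter<T>{ storage, std::addressof(arg) }; }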
@@ -719,49 +719,49 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      void
      wait() const
      {
-	_State_base::_S_check(_M_state);
-	_M_state->wait();
+	_State_base::_S_check(_M_state);
+	_M_state->wait();
      }
 
      template<typename _Rep, typename _Period>
-	future_status
-	wait_for(const chrono::duration<_Rep, _Period>& __rel) const
-	{
-	  _State_base::_S_check(_M_state);
-	  return _M_state->wait_for(__rel);
-	}
+	future_status
+	wait_for(const chrono::duration<_Rep, _Period>& __rel) const
+	{
+	  _State_base::_S_check(_M_state);
+	  return _M_state->wait_for(__rel);
+	}
 
      template<typename _Clock, typename _Duration>
-	future_status
-	wait_until(const chrono::time_point<_Clock, _Duration>& __abs) const
-	{
-	  _State_base::_S_check(_M_state);
-	  return _M_state->wait_until(__abs);
-	}
+	future_status
+	wait_until(const chrono::time_point<_Clock, _Duration>& __abs) const
+	{
+	  _State_base::_S_check(_M_state);
+	  return _M_state->wait_until(__abs);
+	}
 
    protected:
      /// Wait for the state to be ready and rethrow any stored exception
      __result_type
      _M_get_result() const
      {
-	_State_base::_S_check(_M_state);
-	_Result_base& __res = _M_state->wait();
-	if (!(__res._M_error == nullptr))
-	  rethrow_exception(__res._M_error);
-	return static_cast<__result_type>(__res);
+	_State_base::_S_check(_M_state);
+	_Result_base& __res = _M_state->wait();
+	if (!(__res._M_error == nullptr))
+	  rethrow_exception(__res._M_error);
+	return static_cast<__result_type>(__res);
      }
 
      void _M_swap(__basic_future& __that) noexcept
      {
-	_M_state.swap(__that._M_state);
+	_M_state.swap(__that._M_state);
      }
 
      // Construction of a future by promise::get_future()
      explicit
      __basic_future(const __state_type& __state) : _M_state(__state)
      {
-	_State_base::_S_check(_M_state);
-	_M_state->_M_set_retrieved_flag();
+	_State_base::_S_check(_M_state);
+	_M_state->_M_set_retrieved_flag();
      }
 
      // Copy construction from a shared_future
@@ -780,9 +780,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
      struct _Reset
      {
-	explicit _Reset(__basic_future& __fut) noexcept : _M_fut(__fut) { }
-	~_Reset() { _M_fut._M_state.reset(); }
-	__basic_future& _M_fut;
+	explicit _Reset(__basic_future& __fut) noexcept : _M_fut(__fut) { }
+	~_Reset() { _M_fut._M_state.reset(); }
+	__basic_future& _M_fut;
      };
    };
 
@@ -801,8 +801,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      friend class promise<_Res>;
      template<typename> friend class packaged_task;
      template<typename _Fn, typename... _Args>
-	friend future<__async_result_of<_Fn, _Args...>>
-	async(launch, _Fn&&, _Args&&...);
+	friend future<__async_result_of<_Fn, _Args...>>
+	async(launch, _Fn&&, _Args&&...);
 
      typedef __basic_future<_Res> _Base_type;
      typedef typename _Base_type::__state_type __state_type;
@@ -822,16 +822,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      future& operator=(future&& __fut) noexcept
      {
-	future(std::move(__fut))._M_swap(*this);
-	return *this;
+	future(std::move(__fut))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
      _Res
      get()
      {
-	typename _Base_type::_Reset __reset(*this);
-	return std::move(this->_M_get_result()._M_value());
+	typename _Base_type::_Reset __reset(*this);
+	return std::move(this->_M_get_result()._M_value());
      }
 
      shared_future<_Res> share() noexcept;
@@ -844,8 +844,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      friend class promise<_Res&>;
      template<typename> friend class packaged_task;
      template<typename _Fn, typename... _Args>
-	friend future<__async_result_of<_Fn, _Args...>>
-	async(launch, _Fn&&, _Args&&...);
+	friend future<__async_result_of<_Fn, _Args...>>
+	async(launch, _Fn&&, _Args&&...);
 
      typedef __basic_future<_Res&> _Base_type;
      typedef typename _Base_type::__state_type __state_type;
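(Aside, not part of the patch: future's move assignment above uses the construct-and-swap idiom. A temporary absorbs the source's shared state, swaps with *this, and its destructor releases the old state, which also makes self-move harmless. The same shape on a toy handle, names invented:)

#include <memory>
#include <utility>

struct toy_handle
{
  std::shared_ptr<int> state;

  toy_handle() = default;
  toy_handle(toy_handle&&) = default;

  void swap_with(toy_handle& that) noexcept { state.swap(that.state); }

  toy_handle& operator=(toy_handle&& h) noexcept
  {
    toy_handle(std::move(h)).swap_with(*this); // temporary takes old state
    return *this;
  }
};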
@@ -865,16 +865,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      future& operator=(future&& __fut) noexcept
      {
-	future(std::move(__fut))._M_swap(*this);
-	return *this;
+	future(std::move(__fut))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
      _Res&
      get()
      {
-	typename _Base_type::_Reset __reset(*this);
-	return this->_M_get_result()._M_get();
+	typename _Base_type::_Reset __reset(*this);
+	return this->_M_get_result()._M_get();
      }
 
      shared_future<_Res&> share() noexcept;
@@ -887,8 +887,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      friend class promise<void>;
      template<typename> friend class packaged_task;
      template<typename _Fn, typename... _Args>
-	friend future<__async_result_of<_Fn, _Args...>>
-	async(launch, _Fn&&, _Args&&...);
+	friend future<__async_result_of<_Fn, _Args...>>
+	async(launch, _Fn&&, _Args&&...);
 
      typedef __basic_future<void> _Base_type;
      typedef typename _Base_type::__state_type __state_type;
@@ -908,16 +908,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      future& operator=(future&& __fut) noexcept
      {
-	future(std::move(__fut))._M_swap(*this);
-	return *this;
+	future(std::move(__fut))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
      void
      get()
      {
-	typename _Base_type::_Reset __reset(*this);
-	this->_M_get_result();
+	typename _Base_type::_Reset __reset(*this);
+	this->_M_get_result();
      }
 
      shared_future<void> share() noexcept;
@@ -955,14 +955,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      shared_future& operator=(const shared_future& __sf) noexcept
      {
-	shared_future(__sf)._M_swap(*this);
-	return *this;
+	shared_future(__sf)._M_swap(*this);
+	return *this;
      }
 
      shared_future& operator=(shared_future&& __sf) noexcept
      {
-	shared_future(std::move(__sf))._M_swap(*this);
-	return *this;
+	shared_future(std::move(__sf))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
@@ -994,14 +994,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      shared_future& operator=(const shared_future& __sf)
      {
-	shared_future(__sf)._M_swap(*this);
-	return *this;
+	shared_future(__sf)._M_swap(*this);
+	return *this;
      }
 
      shared_future& operator=(shared_future&& __sf) noexcept
      {
-	shared_future(std::move(__sf))._M_swap(*this);
-	return *this;
+	shared_future(std::move(__sf))._M_swap(*this);
+	return *this;
      }
 
      /// Retrieving the value
@@ -1033,14 +1033,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      shared_future& operator=(const shared_future& __sf)
      {
-	shared_future(__sf)._M_swap(*this);
-	return *this;
+	shared_future(__sf)._M_swap(*this);
+	return *this;
      }
 
      shared_future& operator=(shared_future&& __sf) noexcept
      {
-	shared_future(std::move(__sf))._M_swap(*this);
-	return *this;
+	shared_future(std::move(__sf))._M_swap(*this);
+	return *this;
      }
 
      // Retrieving the value
@@ -1115,31 +1115,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      { }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator& __a)
-	: _M_future(std::allocate_shared<_State>(__a)),
+	promise(allocator_arg_t, const _Allocator& __a)
+	: _M_future(std::allocate_shared<_State>(__a)),
	  _M_storage(__future_base::_S_allocate_result<_Res>(__a))
-	{ }
+	{ }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
-	: _M_future(std::move(__rhs._M_future)),
+	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
+	: _M_future(std::move(__rhs._M_future)),
	  _M_storage(std::move(__rhs._M_storage))
-	{ }
+	{ }
 
      promise(const promise&) = delete;
 
      ~promise()
      {
-	if (static_cast<bool>(_M_future) && !_M_future.unique())
-	  _M_future->_M_break_promise(std::move(_M_storage));
+	if (static_cast<bool>(_M_future) && !_M_future.unique())
+	  _M_future->_M_break_promise(std::move(_M_storage));
      }
 
      // Assignment
      promise& operator=(promise&& __rhs) noexcept
      {
-	promise(std::move(__rhs)).swap(*this);
-	return *this;
+	promise(std::move(__rhs)).swap(*this);
+	return *this;
      }
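(Aside, not part of the patch: the promise(allocator_arg_t, const _Allocator&) constructors above obtain the shared state via std::allocate_shared. The observable use is the standard allocator-aware construction, unchanged by this whitespace cleanup:)

#include <future>
#include <memory>

int main()
{
  std::allocator<int> alloc;
  // Shared state and result storage are allocated through `alloc`.
  std::promise<int> p(std::allocator_arg, alloc);
  std::future<int> f = p.get_future();
  p.set_value(42);
  return f.get() == 42 ? 0 : 1;
}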
 
      promise& operator=(const promise&) = delete;
@@ -1147,8 +1147,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      void
      swap(promise& __rhs) noexcept
      {
-	_M_future.swap(__rhs._M_future);
-	_M_storage.swap(__rhs._M_storage);
+	_M_future.swap(__rhs._M_future);
+	_M_storage.swap(__rhs._M_storage);
      }
 
      // Retrieving the result
@@ -1234,31 +1234,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      { }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator& __a)
-	: _M_future(std::allocate_shared<_State>(__a)),
+	promise(allocator_arg_t, const _Allocator& __a)
+	: _M_future(std::allocate_shared<_State>(__a)),
	  _M_storage(__future_base::_S_allocate_result<_Res&>(__a))
-	{ }
+	{ }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
-	: _M_future(std::move(__rhs._M_future)),
+	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
+	: _M_future(std::move(__rhs._M_future)),
	  _M_storage(std::move(__rhs._M_storage))
-	{ }
+	{ }
 
      promise(const promise&) = delete;
 
      ~promise()
      {
-	if (static_cast<bool>(_M_future) && !_M_future.unique())
-	  _M_future->_M_break_promise(std::move(_M_storage));
+	if (static_cast<bool>(_M_future) && !_M_future.unique())
+	  _M_future->_M_break_promise(std::move(_M_storage));
      }
 
      // Assignment
      promise& operator=(promise&& __rhs) noexcept
      {
-	promise(std::move(__rhs)).swap(*this);
-	return *this;
+	promise(std::move(__rhs)).swap(*this);
+	return *this;
      }
 
      promise& operator=(const promise&) = delete;
@@ -1266,8 +1266,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      void
      swap(promise& __rhs) noexcept
      {
-	_M_future.swap(__rhs._M_future);
-	_M_storage.swap(__rhs._M_storage);
+	_M_future.swap(__rhs._M_future);
+	_M_storage.swap(__rhs._M_storage);
      }
 
      // Retrieving the result
@@ -1332,33 +1332,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      { }
 
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator& __a)
-	: _M_future(std::allocate_shared<_State>(__a)),
+	promise(allocator_arg_t, const _Allocator& __a)
+	: _M_future(std::allocate_shared<_State>(__a)),
	  _M_storage(__future_base::_S_allocate_result<void>(__a))
-	{ }
+	{ }
 
      // _GLIBCXX_RESOLVE_LIB_DEFECTS
      // 2095.  missing constructors needed for uses-allocator construction
      template<typename _Allocator>
-	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
-	: _M_future(std::move(__rhs._M_future)),
+	promise(allocator_arg_t, const _Allocator&, promise&& __rhs)
+	: _M_future(std::move(__rhs._M_future)),
	  _M_storage(std::move(__rhs._M_storage))
-	{ }
+	{ }
 
      promise(const promise&) = delete;
 
      ~promise()
      {
-	if (static_cast<bool>(_M_future) && !_M_future.unique())
-	  _M_future->_M_break_promise(std::move(_M_storage));
+	if (static_cast<bool>(_M_future) && !_M_future.unique())
+	  _M_future->_M_break_promise(std::move(_M_storage));
      }
 
      // Assignment
      promise& operator=(promise&& __rhs) noexcept
      {
-	promise(std::move(__rhs)).swap(*this);
-	return *this;
+	promise(std::move(__rhs)).swap(*this);
+	return *this;
      }
 
      promise& operator=(const promise&) = delete;
@@ -1366,8 +1366,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      void
      swap(promise& __rhs) noexcept
      {
-	_M_future.swap(__rhs._M_future);
-	_M_storage.swap(__rhs._M_storage);
+	_M_future.swap(__rhs._M_future);
+	_M_storage.swap(__rhs._M_storage);
      }
 
      // Retrieving the result
@@ -1596,7 +1596,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      ~packaged_task()
      {
-	if (static_cast<bool>(_M_state) && !_M_state.unique())
+	if (static_cast<bool>(_M_state) && !_M_state.unique())
	  _M_state->_M_break_promise(std::move(_M_state->_M_result));
      }
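(Aside, not part of the patch: the destructors above call _M_break_promise when a promise or packaged_task is abandoned while a future still holds the shared state; on the consumer side this surfaces as future_errc::broken_promise. Standard behavior, shown for reference:)

#include <future>
#include <iostream>

int main()
{
  std::future<int> f;
  {
    std::promise<int> p;
    f = p.get_future();
  } // p destroyed without a result: the shared state is broken

  try {
    f.get();
  } catch (const std::future_error& e) {
    // e.code() == std::future_errc::broken_promise
    std::cout << e.code().message() << '\n';
  }
}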
@@ -1709,7 +1709,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
	// result in _M_result, swaps that with the base _M_result and makes
	// the state ready.  Tell _M_set_result to ignore failure so all later
	// calls do nothing.
-	_M_set_result(_S_task_setter(_M_result, _M_fn), true);
+	_M_set_result(_S_task_setter(_M_result, _M_fn), true);
      }
 
      // Caller should check whether the state is ready first, because this