diff --git a/llvm/include/llvm/CodeGen/LiveInterval.h b/llvm/include/llvm/CodeGen/LiveInterval.h index 290a2381d9c9..fe5adb59dac2 100644 --- a/llvm/include/llvm/CodeGen/LiveInterval.h +++ b/llvm/include/llvm/CodeGen/LiveInterval.h @@ -836,10 +836,35 @@ namespace llvm { /// don't defne the related lane masks after they get shrunk. E.g., /// when L000F gets split into L0007 and L0008 maybe only a subset /// of the VNIs that defined L000F defines L0007. + /// + /// The clean up of the VNIs need to look at the actual instructions + /// to decide what is or is not live at a definition point. If the + /// update of the subranges occurs while the IR does not reflect these + /// changes, \p ComposeSubRegIdx can be used to specify how the + /// definition are going to be rewritten. + /// E.g., let say we want to merge: + /// V1.sub1:<2 x s32> = COPY V2.sub3:<4 x s32> + /// We do that by choosing a class where sub1:<2 x s32> and sub3:<4 x s32> + /// overlap, i.e., by choosing a class where we can find "offset + 1 == 3". + /// Put differently we align V2's sub3 with V1's sub1: + /// V2: sub0 sub1 sub2 sub3 + /// V1: sub0 sub1 + /// + /// This offset will look like a composed subregidx in the the class: + /// V1.(composed sub2 with sub1):<4 x s32> = COPY V2.sub3:<4 x s32> + /// => V1.(composed sub2 with sub1):<4 x s32> = COPY V2.sub3:<4 x s32> + /// + /// Now if we didn't rewrite the uses and def of V1, all the checks for V1 + /// need to account for this offset. + /// This happens during coalescing where we update the live-ranges while + /// still having the old IR around because updating the IR on-the-fly + /// would actually clobber some information on how the live-ranges that + /// are being updated look like. void refineSubRanges(BumpPtrAllocator &Allocator, LaneBitmask LaneMask, std::function Apply, const SlotIndexes &Indexes, - const TargetRegisterInfo &TRI); + const TargetRegisterInfo &TRI, + unsigned ComposeSubRegIdx = 0); bool operator<(const LiveInterval& other) const { const SlotIndex &thisIndex = beginIndex(); diff --git a/llvm/lib/CodeGen/LiveInterval.cpp b/llvm/lib/CodeGen/LiveInterval.cpp index 54ac46f2e7ce..930dc116205a 100644 --- a/llvm/lib/CodeGen/LiveInterval.cpp +++ b/llvm/lib/CodeGen/LiveInterval.cpp @@ -883,7 +883,8 @@ void LiveInterval::clearSubRanges() { static void stripValuesNotDefiningMask(unsigned Reg, LiveInterval::SubRange &SR, LaneBitmask LaneMask, const SlotIndexes &Indexes, - const TargetRegisterInfo &TRI) { + const TargetRegisterInfo &TRI, + unsigned ComposeSubRegIdx) { // Phys reg should not be tracked at subreg level. // Same for noreg (Reg == 0). if (!Register::isVirtualRegister(Reg) || !Reg) @@ -905,7 +906,12 @@ static void stripValuesNotDefiningMask(unsigned Reg, LiveInterval::SubRange &SR, continue; if (MOI->getReg() != Reg) continue; - if ((TRI.getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask).none()) + LaneBitmask OrigMask = TRI.getSubRegIndexLaneMask(MOI->getSubReg()); + LaneBitmask ExpectedDefMask = + ComposeSubRegIdx + ? TRI.composeSubRegIndexLaneMask(ComposeSubRegIdx, OrigMask) + : OrigMask; + if ((ExpectedDefMask & LaneMask).none()) continue; hasDef = true; break; @@ -924,7 +930,8 @@ static void stripValuesNotDefiningMask(unsigned Reg, LiveInterval::SubRange &SR, void LiveInterval::refineSubRanges( BumpPtrAllocator &Allocator, LaneBitmask LaneMask, std::function Apply, - const SlotIndexes &Indexes, const TargetRegisterInfo &TRI) { + const SlotIndexes &Indexes, const TargetRegisterInfo &TRI, + unsigned ComposeSubRegIdx) { LaneBitmask ToApply = LaneMask; for (SubRange &SR : subranges()) { LaneBitmask SRMask = SR.LaneMask; @@ -944,8 +951,10 @@ void LiveInterval::refineSubRanges( MatchingRange = createSubRangeFrom(Allocator, Matching, SR); // Now that the subrange is split in half, make sure we // only keep in the subranges the VNIs that touch the related half. - stripValuesNotDefiningMask(reg, *MatchingRange, Matching, Indexes, TRI); - stripValuesNotDefiningMask(reg, SR, SR.LaneMask, Indexes, TRI); + stripValuesNotDefiningMask(reg, *MatchingRange, Matching, Indexes, TRI, + ComposeSubRegIdx); + stripValuesNotDefiningMask(reg, SR, SR.LaneMask, Indexes, TRI, + ComposeSubRegIdx); } Apply(*MatchingRange); ToApply &= ~Matching; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index c44a302c4996..e25f0638d68e 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -225,7 +225,8 @@ namespace { /// @p ToMerge will occupy in the coalescer register. @p LI has its subrange /// lanemasks already adjusted to the coalesced register. void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, - LaneBitmask LaneMask, CoalescerPair &CP); + LaneBitmask LaneMask, CoalescerPair &CP, + unsigned DstIdx); /// Join the liveranges of two subregisters. Joins @p RRange into /// @p LRange, @p RRange may be invalid afterwards. @@ -3271,7 +3272,8 @@ void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, LaneBitmask LaneMask, - CoalescerPair &CP) { + CoalescerPair &CP, + unsigned ComposeSubRegIdx) { BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); LI.refineSubRanges( Allocator, LaneMask, @@ -3284,7 +3286,7 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP); } }, - *LIS->getSlotIndexes(), *TRI); + *LIS->getSlotIndexes(), *TRI, ComposeSubRegIdx); } bool RegisterCoalescer::isHighCostLiveInterval(LiveInterval &LI) { @@ -3350,12 +3352,12 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { if (!RHS.hasSubRanges()) { LaneBitmask Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask() : TRI->getSubRegIndexLaneMask(SrcIdx); - mergeSubRangeInto(LHS, RHS, Mask, CP); + mergeSubRangeInto(LHS, RHS, Mask, CP, DstIdx); } else { // Pair up subranges and merge. for (LiveInterval::SubRange &R : RHS.subranges()) { LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask); - mergeSubRangeInto(LHS, R, Mask, CP); + mergeSubRangeInto(LHS, R, Mask, CP, DstIdx); } } LLVM_DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n"); diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index c9316a71bdfa..eb4d39b01cbb 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -72,6 +72,9 @@ static cl::opt ForceFastISel("arm-force-fast-isel", cl::init(false), cl::Hidden); +static cl::opt EnableSubRegLiveness("arm-enable-subreg-liveness", + cl::init(false), cl::Hidden); + /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, @@ -379,6 +382,8 @@ bool ARMSubtarget::enableMachineScheduler() const { return useMachineScheduler(); } +bool ARMSubtarget::enableSubRegLiveness() const { return EnableSubRegLiveness; } + // This overrides the PostRAScheduler bit in the SchedModel for any CPU. bool ARMSubtarget::enablePostRAScheduler() const { if (enableMachineScheduler()) diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 8478665979f6..f582a92f6563 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -806,6 +806,9 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// True for some subtargets at > -O0. bool enablePostRAMachineScheduler() const override; + /// Check whether this subtarget wants to use subregister liveness. + bool enableSubRegLiveness() const override; + /// Enable use of alias analysis during code generation (during MI /// scheduling, DAGCombine, etc.). bool useAA() const override { return true; } diff --git a/llvm/test/CodeGen/ARM/regcoal-invalid-subrange-update.mir b/llvm/test/CodeGen/ARM/regcoal-invalid-subrange-update.mir new file mode 100644 index 000000000000..ef1bfa652ec7 --- /dev/null +++ b/llvm/test/CodeGen/ARM/regcoal-invalid-subrange-update.mir @@ -0,0 +1,81 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc %s -start-before simple-register-coalescing -mtriple=arm-apple-ios -stop-after machine-scheduler -o - -arm-enable-subreg-liveness -verify-machineinstrs | FileCheck %s + +# Check that when we merge live-ranges that imply offseting +# the definition of a subregister by some other subreg index, +# we take that new index into account while updating the subrange. +# +# For this specific test case, the coalescer is going to get rid +# of `%5.dsub_1:dtriple = COPY %4.dsub_3` by aligning +# %5.dsub_1:<3 x s64> with %4.dsub_3:<4 x s64>. +# This is done by moving to a bigger register class <5 x s64> +# and offseting %5 definitions with a new subregidx: +# NewVar: <5 x s64> dsub_0 dsub_1 dsub_2 dsub_3 dsub_4 +# %4: <4 x s64> dsub_0 dsub_1 dsub_2 dsub_3 +# %5: <3 x s64> <==offset===> dsub_0 dsub_1 dsub_2 +# +# In other %5.dsub_0 needs to be mapped to NewVar.dsub_2, %5.dsub_1 +# to NewVar.dsub_3 and so on. So essentially we are offseting %5 by +# dsub_2. +# +# When updating the live-ranges, the register coalescer actually +# has not rewritten the original code, so we need to fake the +# rewrite to do that update. +# This used to be wrong and this test was failling with a machine +# verifier error: No live segment at def. +# +# The test case runs through the coalescer *and* the scheduler, just +# to force the live intervals to be carried around so that the verifier +# gets a chance to verify those. If we were to just run the coalescer, +# the live intervals would be dropped before running the verifier since +# no other pass would need that analysis around. +# +# Note: The test case looks slightly more complicated than just the +# offseting part. That's because the bug needs three things to +# trigger: +# 1. Overlapping subreg lanes: here, dsub0 == +# 2. Tuple registers with a possibility to coalesce the subreg index: +# here, what we explain with %5.dsub_1 == %4.dsub_3 +# 3. Subreg liveness enabled. +# #1 is required to trigger the splitting of subranges that implies +# looking at the IR to decide what is alive and what is not. +# #2 is what produces the IR to be out-of-synce with what the reg coalescer +# maintains for the live-ranges information. +# #3 is, well, the problem has to do with subranges updates! +# +# In the end, the expected result is to have all the variables +# being coalesced in one big (qqqq) variable. +--- +name: main +alignment: 1 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $d2, $s1, $d4 + + + ; CHECK-LABEL: name: main + ; CHECK: liveins: $d2, $s1, $d4 + ; CHECK: undef %4.dsub_0:qqqqpr_with_ssub_4 = COPY $d4 + ; CHECK: %4.ssub_4:qqqqpr_with_ssub_4 = COPY $s1 + ; CHECK: %4.dsub_1:qqqqpr_with_ssub_4 = COPY $d2 + ; CHECK: %4.dsub_3:qqqqpr_with_ssub_4 = COPY %4.dsub_1 + ; CHECK: KILL implicit-def %4.dsub_2, implicit %4.qqsub_0 + ; CHECK: %4.dsub_4:qqqqpr_with_ssub_4 = COPY %4.dsub_1 + ; CHECK: tBX_RET 14, $noreg, implicit %4.ssub_4_ssub_5_ssub_6_ssub_7_ssub_8_ssub_9 + %3:dpr_vfp2 = COPY $d4 + undef %0.ssub_0:dpr_vfp2 = COPY $s1 + %1:dpr_vfp2 = COPY $d2 + undef %4.dsub_0:dquad = COPY %3 + %4.dsub_1:dquad = COPY %1 + %4.dsub_2:dquad = COPY %0 + %4.dsub_3:dquad = COPY %1 + KILL implicit-def undef %5.dsub_0:dtriple, implicit %4 + %5.dsub_1:dtriple = COPY %4.dsub_3 + %5.dsub_2:dtriple = COPY %1 + tBX_RET 14, $noreg, implicit %5 + +...