merge ld/st for float

kuaiwei · Apr 11, 2024 · 4da0ba6 · 4da0ba6
1 parent 464c5aa
commit 4da0ba6
Show file tree

Hide file tree

Showing 3 changed files with 246 additions and 2 deletions.
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -2022,6 +2022,35 @@ void MacroAssembler::strw(Register Rw, const Address &adr) {
   }
 }
 
+// Load a 64-bit FP register from adr. If try_merge_ldst_float() manages to
+// fuse this load with the one emitted just before it into a single ldp,
+// nothing more is emitted; otherwise a plain ldr is generated.
+void MacroAssembler::ldrd(FloatRegister Rd, const Address &adr) {
+  if (try_merge_ldst_float(Rd, adr, 8, false)) {
+    return;  // already covered by the merged ldp
+  }
+  Assembler::ldrd(Rd, adr);
+}
+
+// Load a 32-bit FP register from adr. If try_merge_ldst_float() manages to
+// fuse this load with the one emitted just before it into a single ldp,
+// nothing more is emitted; otherwise a plain ldr is generated.
+void MacroAssembler::ldrs(FloatRegister Rs, const Address &adr) {
+  if (try_merge_ldst_float(Rs, adr, 4, false)) {
+    return;  // already covered by the merged ldp
+  }
+  Assembler::ldrs(Rs, adr);
+}
+
+// Store a 64-bit FP register to adr. If try_merge_ldst_float() manages to
+// fuse this store with the one emitted just before it into a single stp,
+// nothing more is emitted; otherwise a plain str is generated.
+void MacroAssembler::strd(FloatRegister Rd, const Address &adr) {
+  if (try_merge_ldst_float(Rd, adr, 8, true)) {
+    return;  // already covered by the merged stp
+  }
+  Assembler::strd(Rd, adr);
+}
+
+// Store a 32-bit FP register to adr. If try_merge_ldst_float() manages to
+// fuse this store with the one emitted just before it into a single stp,
+// nothing more is emitted; otherwise a plain str is generated.
+void MacroAssembler::strs(FloatRegister Rs, const Address &adr) {
+  if (try_merge_ldst_float(Rs, adr, 4, true)) {
+    return;  // already covered by the merged stp
+  }
+  Assembler::strs(Rs, adr);
+}
+
+
 // MacroAssembler routines found actually to be needed
 
 void MacroAssembler::push(Register src)
@@ -3035,6 +3064,133 @@ void MacroAssembler::merge_ldst(Register rt,
   }
 }
 
+// Decides whether the FP load/store about to be emitted (register rt, address
+// adr, cur_size_in_bytes wide) can be fused with the instruction emitted
+// immediately before it into a single ldp/stp.
+//
+// Returns true only when all of the following hold:
+//  - the code buffer's recorded last instruction is an immediate-offset FP
+//    load/store and sits directly before the current pc;
+//  - adr uses base_plus_offset addressing;
+//  - both accesses have the same width and the same direction;
+//  - both use the same base register and their offsets differ by exactly
+//    one access width;
+//  - for loads, the two target registers are different;
+//  - the lower of the two offsets lies in [-64 * size, 63 * size];
+//  - merge_alignment_check() accepts the pair.
+bool MacroAssembler::ldst_can_merge_float(FloatRegister rt,
+                                          const Address &adr,
+                                          size_t cur_size_in_bytes,
+                                          bool is_store) const {
+  address prev = pc() - NativeInstruction::instruction_size;
+  address last = code()->last_insn();
+
+  // There must be a recorded candidate, and it must be an FP load/store.
+  if (last == NULL) {
+    return false;
+  }
+  if (!nativeInstruction_at(last)->is_Imm_LdSt_Float()) {
+    return false;
+  }
+
+  // Only base_plus_offset accesses are merged, and the candidate has to be
+  // the instruction right before the current position.
+  if (adr.getMode() != Address::base_plus_offset) {
+    return false;
+  }
+  if (prev != last) {
+    return false;
+  }
+
+  NativeLdStFloat* prev_ldst = NativeLdStFloat_at(prev);
+  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
+
+  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
+  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
+
+  // Width and direction have to agree.
+  if (cur_size_in_bytes != prev_size_in_bytes) {
+    return false;
+  }
+  if (is_store != prev_ldst->is_store()) {
+    return false;
+  }
+
+  const int64_t max_offset = 63 * prev_size_in_bytes;
+  const int64_t min_offset = -64 * prev_size_in_bytes;
+
+  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
+
+  // Only same base can be merged.
+  if (adr.base() != prev_ldst->base()) {
+    return false;
+  }
+
+  // The two slots must be exactly one element apart.
+  const int64_t cur_offset = adr.offset();
+  const int64_t prev_offset = prev_ldst->offset();
+  const size_t distance = abs(cur_offset - prev_offset);
+  if (distance != prev_size_in_bytes) {
+    return false;
+  }
+
+  // Two loads into the same FP register can not be merged:
+  //   ldr d2, [x3, #8]
+  //   ldr d2, [x3, #16]
+  // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
+  if (!is_store) {
+    if (rt == prev_ldst->target()) {
+      return false;
+    }
+  }
+
+  // The pair instruction is addressed by the lower of the two offsets, which
+  // must be in ldp/stp instruction's range.
+  int64_t low_offset;
+  if (prev_offset > cur_offset) {
+    low_offset = cur_offset;
+  } else {
+    low_offset = prev_offset;
+  }
+  const bool in_pair_range = (min_offset <= low_offset) && (low_offset <= max_offset);
+  if (!in_pair_range) {
+    return false;
+  }
+
+  return merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset);
+}
+
+// Attempts to fuse the FP load/store described by (rt, adr, size_in_bytes,
+// is_store) with the previously emitted one.
+//
+// On success the merged ldp/stp has been emitted, the recorded last
+// instruction is cleared and true is returned. On failure nothing is emitted
+// and false is returned, so the caller emits the single access itself; if adr
+// is base_plus_offset with an offset that is a multiple of the access size,
+// the current pc is recorded as the last instruction so that the next access
+// may merge with it.
+bool MacroAssembler::try_merge_ldst_float(FloatRegister rt, const Address &adr, size_t size_in_bytes, bool is_store) {
+  if (!ldst_can_merge_float(rt, adr, size_in_bytes, is_store)) {
+    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
+    // only supports base_plus_offset.
+    if (adr.getMode() == Address::base_plus_offset) {
+      const uint64_t low_bits = size_in_bytes - 1;
+      if ((adr.offset() & low_bits) == 0) {
+        code()->set_last_insn(pc());
+      }
+    }
+    return false;
+  }
+
+  merge_ldst_float(rt, adr, size_in_bytes, is_store);
+  code()->clear_last_insn();
+  return true;
+}
+
+// Replaces the previously emitted FP load/store with one ldp/stp that also
+// covers the current access (rt, adr). The register belonging to the lower
+// offset becomes the first operand of the pair and the lower offset becomes
+// the pair's address. Callers must have established mergeability via
+// ldst_can_merge_float().
+void MacroAssembler::merge_ldst_float(FloatRegister rt,
+                                      const Address &adr,
+                                      size_t cur_size_in_bytes,
+                                      bool is_store) {
+  assert(ldst_can_merge_float(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
+
+  address prev = pc() - NativeInstruction::instruction_size;
+  NativeLdStFloat* prev_ldst = NativeLdStFloat_at(prev);
+
+  // Work out which of the two accesses touches the lower address.
+  const bool cur_is_lower = adr.offset() < prev_ldst->offset();
+  int64_t offset = cur_is_lower ? adr.offset() : prev_ldst->offset();
+  FloatRegister rt_low = cur_is_lower ? rt : prev_ldst->target();
+  FloatRegister rt_high = cur_is_lower ? prev_ldst->target() : rt;
+
+  Address adr_p(prev_ldst->base(), offset);
+  const size_t sz = prev_ldst->size_in_bytes();
+
+  // Overwrite previous generated binary.
+  code_section()->set_end(prev);
+
+  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
+  if (is_store) {
+    BLOCK_COMMENT("merged str float pair");
+    if (sz == 8) {
+      stpd(rt_low, rt_high, adr_p);
+    } else {
+      stps(rt_low, rt_high, adr_p);
+    }
+  } else {
+    BLOCK_COMMENT("merged ldr float pair");
+    if (sz == 8) {
+      ldpd(rt_low, rt_high, adr_p);
+    } else {
+      ldps(rt_low, rt_high, adr_p);
+    }
+  }
+}
+
 /**
  * Multiply 64 bit by 64 bit first loop.
  */

diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -178,6 +178,11 @@ class MacroAssembler: public Assembler {
   void str(Register Rx, const Address &adr);
   void strw(Register Rx, const Address &adr);
 
+  // Single (s) and double (d) precision FP loads/stores. Each one first tries
+  // to merge with the immediately preceding FP load/store into one ldp/stp
+  // and only emits a separate instruction when that is not possible.
+  void ldrs(FloatRegister Rs, const Address &adr);
+  void ldrd(FloatRegister Rd, const Address &adr);
+  void strs(FloatRegister Rs, const Address &adr);
+  void strd(FloatRegister Rd, const Address &adr);
+
   // Frame creation and destruction shared between JITs.
   void build_frame(int framesize);
   void remove_frame(int framesize);
@@ -1376,6 +1381,15 @@ class MacroAssembler: public Assembler {
   // Try to merge two loads/stores into ldp/stp. If success, returns true else false.
   bool try_merge_ldst(Register rt, const Address &adr, size_t cur_size_in_bytes, bool is_store);
 
+  // Check whether the FP load/store described by the arguments can be merged with the previously emitted FP load/store into ldp/stp.
+  bool ldst_can_merge_float(FloatRegister vx, const Address &adr, size_t cur_size_in_bytes, bool is_store) const;
+
+  // Merge the current FP load/store with the previous one into ldp/stp, overwriting the previously emitted instruction.
+  void merge_ldst_float(FloatRegister vx, const Address &adr, size_t cur_size_in_bytes, bool is_store);
+
+  // Try to merge two FP loads/stores into ldp/stp. Returns true on success; on false nothing was emitted and the caller emits the access.
+  bool try_merge_ldst_float(FloatRegister vx, const Address &adr, size_t cur_size_in_bytes, bool is_store);
+
 public:
   void spill(Register Rx, bool is64, int offset) {
     if (is64) {
@@ -1385,7 +1399,13 @@ class MacroAssembler: public Assembler {
     }
   }
   void spill(FloatRegister Vx, SIMD_RegVariant T, int offset) {
-    str(Vx, T, spill_address(1 << (int)T, offset));
+    // D and S spills go through strd/strs, which attempt stp merging;
+    // every other variant uses the generic str.
+    switch (T) {
+      case D:
+        strd(Vx, spill_address(8, offset));
+        break;
+      case S:
+        strs(Vx, spill_address(4, offset));
+        break;
+      default:
+        str(Vx, T, spill_address(1 << (int) T, offset));
+        break;
+    }
   }
   void unspill(Register Rx, bool is64, int offset) {
     if (is64) {
@@ -1395,7 +1415,13 @@ class MacroAssembler: public Assembler {
     }
   }
   void unspill(FloatRegister Vx, SIMD_RegVariant T, int offset) {
-    ldr(Vx, T, spill_address(1 << (int)T, offset));
+    // D and S reloads go through ldrd/ldrs, which attempt ldp merging;
+    // every other variant uses the generic ldr.
+    switch (T) {
+      case D:
+        ldrd(Vx, spill_address(8, offset));
+        break;
+      case S:
+        ldrs(Vx, spill_address(4, offset));
+        break;
+      default:
+        ldr(Vx, T, spill_address(1 << (int) T, offset));
+        break;
+    }
   }
   void spill_copy128(int src_offset, int dst_offset,
                      Register tmp1=rscratch1, Register tmp2=rscratch2) {

diff --git a/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp b/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp
@@ -139,6 +139,13 @@ class NativeInstruction {
       Instruction_aarch64::extract(insn, 23, 23) == 0b0 &&
       Instruction_aarch64::extract(insn, 26, 25) == 0b00;
   }
+
+  // True when the instruction word at this address has bits 29..25 equal to
+  // 0b11110 and bit 23 clear, the pattern used here to recognize an
+  // immediate-offset FP load/store.
+  bool is_Imm_LdSt_Float() {
+    const unsigned int insn = uint_at(0);
+    const bool class_bits_match = Instruction_aarch64::extract(insn, 29, 25) == 0b11110;
+    const bool bit23_clear = Instruction_aarch64::extract(insn, 23, 23) == 0b0;
+    return class_bits_match && bit23_clear;
+  }
 };
 
 inline NativeInstruction* nativeInstruction_at(address address) {
@@ -701,4 +708,59 @@ inline NativeLdSt *NativeLdSt_at(address addr) {
   assert(nativeInstruction_at(addr)->is_Imm_LdSt(), "no immediate load/store found");
   return (NativeLdSt*)addr;
 }
+
+// Decoder for a single immediate-offset FP load/store instruction word.
+// Exposes the target FP register, base register, byte offset, access size and
+// direction as read from the instruction's bit fields.
+class NativeLdStFloat : public NativeInstruction {
+private:
+  // Raw size field (bits 31..30); the access width is 1 << size() bytes.
+  int32_t size() {
+    return Instruction_aarch64::extract(uint_at(0), 31, 30);
+  }
+
+  // Check whether instruction is with unscaled offset.
+  bool is_ldst_ur_float() {
+    const uint32_t insn = uint_at(0);
+    const uint32_t op_bits = Instruction_aarch64::extract(insn, 29, 21);
+    if (op_bits != 0b111100010 && op_bits != 0b111100000) {
+      return false;
+    }
+    return Instruction_aarch64::extract(insn, 11, 10) == 0b00;
+  }
+
+  // Check whether instruction uses the scaled, unsigned 12-bit offset form.
+  bool is_ldst_unsigned_offset_float() {
+    const uint32_t op_bits = Instruction_aarch64::extract(uint_at(0), 29, 22);
+    return op_bits == 0b11110101 || op_bits == 0b11110100;
+  }
+
+public:
+  // FP register encoded in bits 4..0.
+  FloatRegister target() {
+    const uint32_t r = Instruction_aarch64::extract(uint_at(0), 4, 0);
+    return as_FloatRegister(r);
+  }
+
+  // Base register encoded in bits 9..5; encoding 0x1f is reported as sp.
+  Register base() {
+    const uint32_t b = Instruction_aarch64::extract(uint_at(0), 9, 5);
+    if (b == 0x1f) {
+      return sp;
+    }
+    return as_Register(b);
+  }
+
+  // Byte offset of the access: the sign-extended 9-bit field for the unscaled
+  // form, or the 12-bit field shifted left by size() for the unsigned-offset
+  // form. Any other form is not expected here.
+  int64_t offset() {
+    if (is_ldst_ur_float()) {
+      return Instruction_aarch64::sextract(uint_at(0), 20, 12);
+    }
+    if (is_ldst_unsigned_offset_float()) {
+      return Instruction_aarch64::extract(uint_at(0), 21, 10) << size();
+    }
+    // others like: pre-index or post-index.
+    ShouldNotReachHere();
+    return 0;
+  }
+
+  // Access width in bytes.
+  size_t size_in_bytes() {
+    return 1ULL << size();
+  }
+
+  // True for the two supported immediate forms (unscaled or unsigned offset).
+  bool is_not_pre_post_index() {
+    if (is_ldst_ur_float()) {
+      return true;
+    }
+    return is_ldst_unsigned_offset_float();
+  }
+
+  // True when bits 23..22 are 0b01.
+  bool is_load() {
+    assert(Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b01 ||
+           Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b00, "must be ldr or str");
+
+    const uint32_t opc = Instruction_aarch64::extract(uint_at(0), 23, 22);
+    return opc == 0b01;
+  }
+
+  // True when bits 23..22 are 0b00.
+  bool is_store() {
+    assert(Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b01 ||
+           Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b00, "must be ldr or str");
+
+    const uint32_t opc = Instruction_aarch64::extract(uint_at(0), 23, 22);
+    return opc == 0b00;
+  }
+};
+
+
+// Reinterprets addr as a NativeLdStFloat. In builds with asserts enabled, the
+// instruction at addr is checked to be an immediate-offset FP load/store.
+inline NativeLdStFloat *NativeLdStFloat_at(address addr) {
+  assert(nativeInstruction_at(addr)->is_Imm_LdSt_Float(), "no immediate load/store float found");
+  return reinterpret_cast<NativeLdStFloat*>(addr);
+}
 #endif // CPU_AARCH64_VM_NATIVEINST_AARCH64_HPP