From 437fef500002a96d78773a08824738ad5c037980 Mon Sep 17 00:00:00 2001
From: "rodolph.perfetta@arm.com"
 <rodolph.perfetta@arm.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Date: Thu, 7 Aug 2014 10:46:40 +0000
Subject: [PATCH] ARM64: Support arbitrary offset in load/store pair.

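TF calls can generate load/store pair offsets that exceed the range of the
instruction's immediate field. In that case the macro assembler now emits a
separate Add alongside the load/store pair instruction.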

BUG=
R=bmeurer@chromium.org

Review URL: https://codereview.chromium.org/440303004

git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@22969 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/arm64/assembler-arm64.cc          |   6 ++
 src/arm64/assembler-arm64.h           |   8 +-
 src/arm64/macro-assembler-arm64-inl.h |  37 ++-----
 src/arm64/macro-assembler-arm64.cc    |  33 ++++++
 src/arm64/macro-assembler-arm64.h     |  22 ++--
 test/cctest/test-assembler-arm64.cc   | 184 ++++++++++++++++++++++++++++++++++
 6 files changed, 250 insertions(+), 40 deletions(-)

diff --git a/src/arm64/assembler-arm64.cc b/src/arm64/assembler-arm64.cc
index dc2d587..7f86e14 100644
--- a/src/arm64/assembler-arm64.cc
+++ b/src/arm64/assembler-arm64.cc
@@ -2503,6 +2503,12 @@ bool Assembler::IsImmLSScaled(ptrdiff_t offset, LSDataSize size) {
 }
 
 
+bool Assembler::IsImmLSPair(ptrdiff_t offset, LSDataSize size) {
+  const ptrdiff_t scaled = offset >> size;  // Offset shifted down by size.
+  return ((scaled << size) == offset) && is_int7(scaled);
+}
+
+
 // Test if a given value can be encoded in the immediate field of a logical
 // instruction.
 // If it can be encoded, the function returns true, and values pointed to by n,
diff --git a/src/arm64/assembler-arm64.h b/src/arm64/assembler-arm64.h
index e16ea87..d0effa7 100644
--- a/src/arm64/assembler-arm64.h
+++ b/src/arm64/assembler-arm64.h
@@ -1945,6 +1945,10 @@ class Assembler : public AssemblerBase {
   static bool IsImmLSUnscaled(ptrdiff_t offset);
   static bool IsImmLSScaled(ptrdiff_t offset, LSDataSize size);
 
+  void LoadStorePair(const CPURegister& rt, const CPURegister& rt2,
+                     const MemOperand& addr, LoadStorePairOp op);
+  static bool IsImmLSPair(ptrdiff_t offset, LSDataSize size);
+
   void Logical(const Register& rd,
                const Register& rn,
                const Operand& operand,
@@ -2027,10 +2031,6 @@ class Assembler : public AssemblerBase {
                                 const Operand& operand,
                                 FlagsUpdate S,
                                 Instr op);
-  void LoadStorePair(const CPURegister& rt,
-                     const CPURegister& rt2,
-                     const MemOperand& addr,
-                     LoadStorePairOp op);
   void LoadStorePairNonTemporal(const CPURegister& rt,
                                 const CPURegister& rt2,
                                 const MemOperand& addr,
diff --git a/src/arm64/macro-assembler-arm64-inl.h b/src/arm64/macro-assembler-arm64-inl.h
index cf5062d..f7c7248 100644
--- a/src/arm64/macro-assembler-arm64-inl.h
+++ b/src/arm64/macro-assembler-arm64-inl.h
@@ -299,6 +299,16 @@ LS_MACRO_LIST(DEFINE_FUNCTION)
 #undef DEFINE_FUNCTION
 
 
+#define DEFINE_FUNCTION(FN, REGTYPE, REG, REG2, OP)              \
+  void MacroAssembler::FN(const REGTYPE REG, const REGTYPE REG2, \
+                          const MemOperand& addr) {              \
+    DCHECK(allow_macro_instructions_);                           \
+    LoadStorePairMacro(REG, REG2, addr, OP);                     \
+  }
+LSPAIR_MACRO_LIST(DEFINE_FUNCTION)  // Defines Ldp, Stp and Ldpsw.
+#undef DEFINE_FUNCTION
+
+
 void MacroAssembler::Asr(const Register& rd,
                          const Register& rn,
                          unsigned shift) {
@@ -861,25 +871,6 @@ void MacroAssembler::Ldnp(const CPURegister& rt,
 }
 
 
-void MacroAssembler::Ldp(const CPURegister& rt,
-                         const CPURegister& rt2,
-                         const MemOperand& src) {
-  DCHECK(allow_macro_instructions_);
-  DCHECK(!AreAliased(rt, rt2));
-  ldp(rt, rt2, src);
-}
-
-
-void MacroAssembler::Ldpsw(const Register& rt,
-                           const Register& rt2,
-                           const MemOperand& src) {
-  DCHECK(allow_macro_instructions_);
-  DCHECK(!rt.IsZero());
-  DCHECK(!rt2.IsZero());
-  ldpsw(rt, rt2, src);
-}
-
-
 void MacroAssembler::Ldr(const CPURegister& rt, const Immediate& imm) {
   DCHECK(allow_macro_instructions_);
   ldr(rt, imm);
@@ -1136,14 +1127,6 @@ void MacroAssembler::Stnp(const CPURegister& rt,
 }
 
 
-void MacroAssembler::Stp(const CPURegister& rt,
-                         const CPURegister& rt2,
-                         const MemOperand& dst) {
-  DCHECK(allow_macro_instructions_);
-  stp(rt, rt2, dst);
-}
-
-
 void MacroAssembler::Sxtb(const Register& rd, const Register& rn) {
   DCHECK(allow_macro_instructions_);
   DCHECK(!rd.IsZero());
diff --git a/src/arm64/macro-assembler-arm64.cc b/src/arm64/macro-assembler-arm64.cc
index 98a970e..658497b 100644
--- a/src/arm64/macro-assembler-arm64.cc
+++ b/src/arm64/macro-assembler-arm64.cc
@@ -588,6 +588,39 @@ void MacroAssembler::LoadStoreMacro(const CPURegister& rt,
   }
 }
 
+void MacroAssembler::LoadStorePairMacro(const CPURegister& rt,
+                                        const CPURegister& rt2,
+                                        const MemOperand& addr,
+                                        LoadStorePairOp op) {
+  // TODO(all): Should we support register offset for load-store-pair?
+  DCHECK(!addr.IsRegisterOffset());
+
+  const int64_t imm = addr.offset();
+  const LSDataSize access_size = CalcLSPairDataSize(op);
+
+  // Fast path: the offset is encodable in one load/store pair instruction.
+  if (IsImmLSPair(imm, access_size)) {
+    LoadStorePair(rt, rt2, addr, op);
+    return;
+  }
+
+  // Otherwise emit two instructions: a separate Add plus the access itself.
+  Register base = addr.base();
+  if (addr.IsImmediateOffset()) {
+    UseScratchRegisterScope temps(this);
+    Register scratch = temps.AcquireSameSizeAs(base);
+    Add(scratch, base, imm);
+    LoadStorePair(rt, rt2, MemOperand(scratch), op);
+  } else if (addr.IsPostIndex()) {
+    LoadStorePair(rt, rt2, MemOperand(base), op);  // Access the old base,
+    Add(base, base, imm);                          // then write it back.
+  } else {
+    DCHECK(addr.IsPreIndex());
+    Add(base, base, imm);  // Update the base first,
+    LoadStorePair(rt, rt2, MemOperand(base), op);  // then access through it.
+  }
+}
+
 
 void MacroAssembler::Load(const Register& rt,
                           const MemOperand& addr,
diff --git a/src/arm64/macro-assembler-arm64.h b/src/arm64/macro-assembler-arm64.h
index efb6bcf..47ffff1 100644
--- a/src/arm64/macro-assembler-arm64.h
+++ b/src/arm64/macro-assembler-arm64.h
@@ -43,6 +43,11 @@ namespace internal {
   V(Str, CPURegister&, rt, StoreOpFor(rt))                    \
   V(Ldrsw, Register&, rt, LDRSW_x)
 
+#define LSPAIR_MACRO_LIST(V)                             \
+  V(Ldp, CPURegister&, rt, rt2, LoadPairOpFor(rt, rt2))  \
+  V(Stp, CPURegister&, rt, rt2, StorePairOpFor(rt, rt2)) \
+  V(Ldpsw, Register&, rt, rt2, LDPSW_x)  // Integer registers only, like Ldrsw.
+
 
 // ----------------------------------------------------------------------------
 // Static helper functions
@@ -261,6 +266,14 @@ class MacroAssembler : public Assembler {
                       const MemOperand& addr,
                       LoadStoreOp op);
 
+#define DECLARE_FUNCTION(FN, REGTYPE, REG, REG2, OP) \
+  inline void FN(const REGTYPE REG, const REGTYPE REG2, const MemOperand& addr);
+  LSPAIR_MACRO_LIST(DECLARE_FUNCTION)  // Declares Ldp, Stp and Ldpsw.
+#undef DECLARE_FUNCTION
+
+  void LoadStorePairMacro(const CPURegister& rt, const CPURegister& rt2,
+                          const MemOperand& addr, LoadStorePairOp op);
+
   // V8-specific load/store helpers.
   void Load(const Register& rt, const MemOperand& addr, Representation r);
   void Store(const Register& rt, const MemOperand& addr, Representation r);
@@ -418,12 +431,6 @@ class MacroAssembler : public Assembler {
   inline void Ldnp(const CPURegister& rt,
                    const CPURegister& rt2,
                    const MemOperand& src);
-  inline void Ldp(const CPURegister& rt,
-                  const CPURegister& rt2,
-                  const MemOperand& src);
-  inline void Ldpsw(const Register& rt,
-                    const Register& rt2,
-                    const MemOperand& src);
   // Load a literal from the inline constant pool.
   inline void Ldr(const CPURegister& rt, const Immediate& imm);
   // Helper function for double immediate.
@@ -483,9 +490,6 @@ class MacroAssembler : public Assembler {
   inline void Stnp(const CPURegister& rt,
                    const CPURegister& rt2,
                    const MemOperand& dst);
-  inline void Stp(const CPURegister& rt,
-                  const CPURegister& rt2,
-                  const MemOperand& dst);
   inline void Sxtb(const Register& rd, const Register& rn);
   inline void Sxth(const Register& rd, const Register& rn);
   inline void Sxtw(const Register& rd, const Register& rn);
diff --git a/test/cctest/test-assembler-arm64.cc b/test/cctest/test-assembler-arm64.cc
index 8183920..3d05487 100644
--- a/test/cctest/test-assembler-arm64.cc
+++ b/test/cctest/test-assembler-arm64.cc
@@ -2911,6 +2911,64 @@ TEST(ldp_stp_offset) {
 }
 
 
+TEST(ldp_stp_offset_wide) {
+  INIT_V8();
+  SETUP();
+
+  uint64_t src[3] = {0x0011223344556677, 0x8899aabbccddeeff,
+                     0xffeeddccbbaa9988};
+  uint64_t dst[7] = {0, 0, 0, 0, 0, 0, 0};
+  uintptr_t src_base = reinterpret_cast<uintptr_t>(src);
+  uintptr_t dst_base = reinterpret_cast<uintptr_t>(dst);
+  // Move base too far from the array to force multiple instructions
+  // to be emitted.
+  const int64_t base_offset = 1024;
+
+  START();
+  __ Mov(x20, src_base - base_offset);
+  __ Mov(x21, dst_base - base_offset);
+  __ Mov(x18, src_base + base_offset + 24);
+  __ Mov(x19, dst_base + base_offset + 56);
+  __ Ldp(w0, w1, MemOperand(x20, base_offset));  // src + 0
+  __ Ldp(w2, w3, MemOperand(x20, base_offset + 4));  // src + 4
+  __ Ldp(x4, x5, MemOperand(x20, base_offset + 8));  // src + 8
+  __ Ldp(w6, w7, MemOperand(x18, -12 - base_offset));  // src + 12
+  __ Ldp(x8, x9, MemOperand(x18, -16 - base_offset));  // src + 8
+  __ Stp(w0, w1, MemOperand(x21, base_offset));  // dst + 0
+  __ Stp(w2, w3, MemOperand(x21, base_offset + 8));  // dst + 8
+  __ Stp(x4, x5, MemOperand(x21, base_offset + 16));  // dst + 16
+  __ Stp(w6, w7, MemOperand(x19, -24 - base_offset));  // dst + 32
+  __ Stp(x8, x9, MemOperand(x19, -16 - base_offset));  // dst + 40
+  END();
+
+  RUN();
+
+  CHECK_EQUAL_64(0x44556677, x0);
+  CHECK_EQUAL_64(0x00112233, x1);
+  CHECK_EQUAL_64(0x0011223344556677UL, dst[0]);
+  CHECK_EQUAL_64(0x00112233, x2);
+  CHECK_EQUAL_64(0xccddeeff, x3);
+  CHECK_EQUAL_64(0xccddeeff00112233UL, dst[1]);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, x4);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, dst[2]);
+  CHECK_EQUAL_64(0xffeeddccbbaa9988UL, x5);
+  CHECK_EQUAL_64(0xffeeddccbbaa9988UL, dst[3]);
+  CHECK_EQUAL_64(0x8899aabb, x6);
+  CHECK_EQUAL_64(0xbbaa9988, x7);
+  CHECK_EQUAL_64(0xbbaa99888899aabbUL, dst[4]);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, x8);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, dst[5]);
+  CHECK_EQUAL_64(0xffeeddccbbaa9988UL, x9);
+  CHECK_EQUAL_64(0xffeeddccbbaa9988UL, dst[6]);
+  CHECK_EQUAL_64(src_base - base_offset, x20);  // Bases are unchanged.
+  CHECK_EQUAL_64(dst_base - base_offset, x21);
+  CHECK_EQUAL_64(src_base + base_offset + 24, x18);
+  CHECK_EQUAL_64(dst_base + base_offset + 56, x19);
+
+  TEARDOWN();
+}
+
+
 TEST(ldnp_stnp_offset) {
   INIT_V8();
   SETUP();
@@ -3021,6 +3079,69 @@ TEST(ldp_stp_preindex) {
 }
 
 
+TEST(ldp_stp_preindex_wide) {
+  INIT_V8();
+  SETUP();
+
+  uint64_t src[3] = {0x0011223344556677, 0x8899aabbccddeeff,
+                     0xffeeddccbbaa9988};
+  uint64_t dst[5] = {0, 0, 0, 0, 0};
+  uintptr_t src_base = reinterpret_cast<uintptr_t>(src);
+  uintptr_t dst_base = reinterpret_cast<uintptr_t>(dst);
+  // Move base too far from the array to force multiple instructions
+  // to be emitted.
+  const int64_t base_offset = 1024;
+
+  START();
+  __ Mov(x24, src_base - base_offset);
+  __ Mov(x25, dst_base + base_offset);
+  __ Mov(x18, dst_base + base_offset + 16);
+  __ Ldp(w0, w1, MemOperand(x24, base_offset + 4, PreIndex));  // src + 4
+  __ Mov(x19, x24);  // Keep the written-back base for checking.
+  __ Mov(x24, src_base - base_offset + 4);
+  __ Ldp(w2, w3, MemOperand(x24, base_offset - 4, PreIndex));  // src + 0
+  __ Stp(w2, w3, MemOperand(x25, 4 - base_offset, PreIndex));  // dst + 4
+  __ Mov(x20, x25);  // Keep the written-back base for checking.
+  __ Mov(x25, dst_base + base_offset + 4);
+  __ Mov(x24, src_base - base_offset);
+  __ Stp(w0, w1, MemOperand(x25, -4 - base_offset, PreIndex));  // dst + 0
+  __ Ldp(x4, x5, MemOperand(x24, base_offset + 8, PreIndex));  // src + 8
+  __ Mov(x21, x24);  // Keep the written-back base for checking.
+  __ Mov(x24, src_base - base_offset + 8);
+  __ Ldp(x6, x7, MemOperand(x24, base_offset - 8, PreIndex));  // src + 0
+  __ Stp(x7, x6, MemOperand(x18, 8 - base_offset, PreIndex));  // dst + 24
+  __ Mov(x22, x18);  // Keep the written-back base for checking.
+  __ Mov(x18, dst_base + base_offset + 16 + 8);
+  __ Stp(x5, x4, MemOperand(x18, -8 - base_offset, PreIndex));  // dst + 16
+  END();
+
+  RUN();
+
+  CHECK_EQUAL_64(0x00112233, x0);
+  CHECK_EQUAL_64(0xccddeeff, x1);
+  CHECK_EQUAL_64(0x44556677, x2);
+  CHECK_EQUAL_64(0x00112233, x3);
+  CHECK_EQUAL_64(0xccddeeff00112233UL, dst[0]);
+  CHECK_EQUAL_64(0x0000000000112233UL, dst[1]);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, x4);
+  CHECK_EQUAL_64(0xffeeddccbbaa9988UL, x5);
+  CHECK_EQUAL_64(0x0011223344556677UL, x6);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, x7);
+  CHECK_EQUAL_64(0xffeeddccbbaa9988UL, dst[2]);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, dst[3]);
+  CHECK_EQUAL_64(0x0011223344556677UL, dst[4]);
+  CHECK_EQUAL_64(src_base, x24);  // Bases hold the written-back addresses.
+  CHECK_EQUAL_64(dst_base, x25);
+  CHECK_EQUAL_64(dst_base + 16, x18);
+  CHECK_EQUAL_64(src_base + 4, x19);
+  CHECK_EQUAL_64(dst_base + 4, x20);
+  CHECK_EQUAL_64(src_base + 8, x21);
+  CHECK_EQUAL_64(dst_base + 24, x22);
+
+  TEARDOWN();
+}
+
+
 TEST(ldp_stp_postindex) {
   INIT_V8();
   SETUP();
@@ -3076,6 +3197,69 @@ TEST(ldp_stp_postindex) {
 }
 
 
+TEST(ldp_stp_postindex_wide) {
+  INIT_V8();
+  SETUP();
+
+  uint64_t src[4] = {0x0011223344556677, 0x8899aabbccddeeff, 0xffeeddccbbaa9988,
+                     0x7766554433221100};
+  uint64_t dst[5] = {0, 0, 0, 0, 0};
+  uintptr_t src_base = reinterpret_cast<uintptr_t>(src);
+  uintptr_t dst_base = reinterpret_cast<uintptr_t>(dst);
+  // Use write-back offsets too large to encode, to force multiple
+  // instructions to be emitted.
+  const int64_t base_offset = 1024;
+
+  START();
+  __ Mov(x24, src_base);
+  __ Mov(x25, dst_base);
+  __ Mov(x18, dst_base + 16);
+  __ Ldp(w0, w1, MemOperand(x24, base_offset + 4, PostIndex));  // src + 0
+  __ Mov(x19, x24);  // Keep the written-back base for checking.
+  __ Sub(x24, x24, base_offset);
+  __ Ldp(w2, w3, MemOperand(x24, base_offset - 4, PostIndex));  // src + 4
+  __ Stp(w2, w3, MemOperand(x25, 4 - base_offset, PostIndex));  // dst + 0
+  __ Mov(x20, x25);  // Keep the written-back base for checking.
+  __ Sub(x24, x24, base_offset);
+  __ Add(x25, x25, base_offset);
+  __ Stp(w0, w1, MemOperand(x25, -4 - base_offset, PostIndex));  // dst + 4
+  __ Ldp(x4, x5, MemOperand(x24, base_offset + 8, PostIndex));  // src + 0
+  __ Mov(x21, x24);  // Keep the written-back base for checking.
+  __ Sub(x24, x24, base_offset);
+  __ Ldp(x6, x7, MemOperand(x24, base_offset - 8, PostIndex));  // src + 8
+  __ Stp(x7, x6, MemOperand(x18, 8 - base_offset, PostIndex));  // dst + 16
+  __ Mov(x22, x18);  // Keep the written-back base for checking.
+  __ Add(x18, x18, base_offset);
+  __ Stp(x5, x4, MemOperand(x18, -8 - base_offset, PostIndex));  // dst + 24
+  END();
+
+  RUN();
+
+  CHECK_EQUAL_64(0x44556677, x0);
+  CHECK_EQUAL_64(0x00112233, x1);
+  CHECK_EQUAL_64(0x00112233, x2);
+  CHECK_EQUAL_64(0xccddeeff, x3);
+  CHECK_EQUAL_64(0x4455667700112233UL, dst[0]);
+  CHECK_EQUAL_64(0x0000000000112233UL, dst[1]);
+  CHECK_EQUAL_64(0x0011223344556677UL, x4);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, x5);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, x6);
+  CHECK_EQUAL_64(0xffeeddccbbaa9988UL, x7);
+  CHECK_EQUAL_64(0xffeeddccbbaa9988UL, dst[2]);
+  CHECK_EQUAL_64(0x8899aabbccddeeffUL, dst[3]);
+  CHECK_EQUAL_64(0x0011223344556677UL, dst[4]);
+  CHECK_EQUAL_64(src_base + base_offset, x24);  // Written-back addresses.
+  CHECK_EQUAL_64(dst_base - base_offset, x25);
+  CHECK_EQUAL_64(dst_base - base_offset + 16, x18);
+  CHECK_EQUAL_64(src_base + base_offset + 4, x19);
+  CHECK_EQUAL_64(dst_base - base_offset + 4, x20);
+  CHECK_EQUAL_64(src_base + base_offset + 8, x21);
+  CHECK_EQUAL_64(dst_base - base_offset + 24, x22);
+
+  TEARDOWN();
+}
+
+
 TEST(ldp_sign_extend) {
   INIT_V8();
   SETUP();
-- 
2.7.4