ARM: Introduce conservative load/store optimization mode

author Matthias Braun <matze@braunis.de>

Wed, 2 Mar 2016 19:20:00 +0000 (19:20 +0000)

committer Matthias Braun <matze@braunis.de>

Wed, 2 Mar 2016 19:20:00 +0000 (19:20 +0000)
author Matthias Braun <matze@braunis.de>
Wed, 2 Mar 2016 19:20:00 +0000 (19:20 +0000)
committer Matthias Braun <matze@braunis.de>
Wed, 2 Mar 2016 19:20:00 +0000 (19:20 +0000)
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp

index 5ee6641..cc49f9d 100644 (file)
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -60,6 +60,15 @@ STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
  STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
  STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
  
+/// This switch disables formation of double/multi instructions that could
+/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
+/// disabled. This can be used to create libraries that are robust even when
+/// users provoke undefined behaviour by supplying misaligned pointers.
+/// \see mayCombineMisaligned()
+static cl::opt<bool>
+AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
+    cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
+
  namespace llvm {
  void initializeARMLoadStoreOptPass(PassRegistry &);
  }
@@ -916,6 +925,24 @@ static bool isValidLSDoubleOffset(int Offset) {
    return (Value % 4) == 0 && Value < 1024;
  }
  
+/// Return true for loads/stores that can be combined into a double/multi
+/// operation without increasing the requirements for alignment.
+static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
+                                 const MachineInstr &MI) {
+  // vldr/vstr trap on misaligned pointers anyway; forming vldm makes no
+  // difference.
+  unsigned Opcode = MI.getOpcode();
+  if (!isi32Load(Opcode) && !isi32Store(Opcode))
+    return true;
+
+  // Stack pointer alignment is out of the programmer's control so we can trust
+  // SP-relative loads/stores.
+  if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
+      STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+    return true;
+  return false;
+}
+
  /// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
  void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
    const MachineInstr *FirstMI = MemOps[0].MI;
@@ -954,6 +981,10 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
      if (PReg == ARM::SP || PReg == ARM::PC)
        CanMergeToLSMulti = CanMergeToLSDouble = false;
  
+    // Should we be conservative?
+    if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+
      // Merge following instructions where possible.
      for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
        int NewOffset = MemOps[I].Offset;
@@ -1926,6 +1957,9 @@ INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt",
                  ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
  
  bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (AssumeMisalignedLoadStores)
+    return false;
+
    TD = &Fn.getDataLayout();
    STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
    TII = STI->getInstrInfo();
diff --git a/llvm/test/CodeGen/ARM/ldrd.ll b/llvm/test/CodeGen/ARM/ldrd.ll

index b259634..dd97fbf 100644 (file)
--- a/llvm/test/CodeGen/ARM/ldrd.ll
+++ b/llvm/test/CodeGen/ARM/ldrd.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK -check-prefix=NORMAL
  ; rdar://6949835
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK -check-prefix=NORMAL
+
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-assume-misaligned-load-store | FileCheck %s -check-prefix=CHECK -check-prefix=CONSERVATIVE
  
  ; Magic ARM pair hints works best with linearscan / fast.
  
@@ -15,12 +17,13 @@ declare void @use_i64(i64 %v)
  
  define void @test_ldrd(i64 %a) nounwind readonly {
  ; CHECK-LABEL: test_ldrd:
-; CHECK: bl{{x?}} _get_ptr
+; NORMAL: bl{{x?}} _get_ptr
  ; A8: ldrd r0, r1, [r0]
  ; Cortex-M3 errata 602117: LDRD with base in list may result in incorrect base
  ; register when interrupted or faulted.
  ; M3-NOT: ldrd r[[REGNUM:[0-9]+]], {{r[0-9]+}}, [r[[REGNUM]]]
-; CHECK: bl{{x?}} _use_i64
+; CONSERVATIVE-NOT: ldrd
+; NORMAL: bl{{x?}} _use_i64
    %ptr = call i64* @get_ptr()
    %v = load i64, i64* %ptr, align 8
    call void @use_i64(i64 %v)
@@ -39,11 +42,10 @@ define void @test_ldrd(i64 %a) nounwind readonly {
  ; evict another live range or use callee saved regs. Sorry if the test
  ; is sensitive to Regalloc changes, but it is an interesting case.
  ;
-; BASIC: @f
+; CHECK-LABEL: f:
  ; BASIC: %bb
  ; BASIC: ldrd
  ; BASIC: str
-; GREEDY: @f
  ; GREEDY: %bb
  ; GREEDY: ldrd
  ; GREEDY: str
@@ -76,14 +78,15 @@ return:                                           ; preds = %bb, %entry
  
  @TestVar = external global %struct.Test
  
+; CHECK-LABEL: Func1:
  define void @Func1() nounwind ssp {
-; CHECK: @Func1
  entry: 
  ; A8: movw [[BASE:r[0-9]+]], :lower16:{{.*}}TestVar{{.*}}
  ; A8: movt [[BASE]], :upper16:{{.*}}TestVar{{.*}}
  ; A8: ldrd [[FIELD1:r[0-9]+]], [[FIELD2:r[0-9]+]], {{\[}}[[BASE]], #4]
  ; A8-NEXT: add [[FIELD1]], [[FIELD2]]
  ; A8-NEXT: str [[FIELD1]], {{\[}}[[BASE]]{{\]}}
+; CONSERVATIVE-NOT: ldrd
    %orig_blocks = alloca [256 x i16], align 2
    %0 = bitcast [256 x i16]* %orig_blocks to i8*call void @llvm.lifetime.start(i64 512, i8* %0) nounwind
    %tmp1 = load i32, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 1), align 4
@@ -97,8 +100,9 @@ entry:
  declare void @extfunc(i32, i32, i32, i32)
  
  ; CHECK-LABEL: Func2:
+; CONSERVATIVE-NOT: ldrd
  ; A8: ldrd
-; A8: blx
+; CHECK: bl{{x?}} _extfunc
  ; A8: pop
  define void @Func2(i32* %p) {
  entry:
@@ -116,12 +120,14 @@ entry:
  ; M3: strd r1, r0, [sp, #-8]!
  ; BASIC: strd r1, r0, [sp, #-8]!
  ; GREEDY: strd r0, r1, [sp, #-8]!
-; CHECK: @ InlineAsm Start
-; CHECK: @ InlineAsm End
+; CONSERVATIVE: strd r0, r1, [sp, #-8]!
+; NORMAL: @ InlineAsm Start
+; NORMAL: @ InlineAsm End
  ; A8: ldrd r2, r1, [sp]
  ; M3: ldrd r2, r1, [sp]
  ; BASIC: ldrd r2, r1, [sp]
  ; GREEDY: ldrd r1, r2, [sp]
+; CONSERVATIVE: ldrd r1, r2, [sp]
  ; CHECK: bl{{x?}} _extfunc
  define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
    ; force %v0 and %v1 to be spilled
@@ -134,8 +140,9 @@ define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
  declare void @extfunc2(i32*, i32, i32)
  
  ; CHECK-LABEL: ldrd_postupdate_dec:
-; CHECK: ldrd r1, r2, [r0], #-8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; NORMAL: ldrd r1, r2, [r0], #-8
+; CONSERVATIVE-NOT: ldrd
+; CHECK: bl{{x?}} _extfunc
  define void @ldrd_postupdate_dec(i32* %p0) {
    %p0.1 = getelementptr i32, i32* %p0, i32 1
    %v0 = load i32, i32* %p0
@@ -146,8 +153,9 @@ define void @ldrd_postupdate_dec(i32* %p0) {
  }
  
  ; CHECK-LABEL: ldrd_postupdate_inc:
-; CHECK: ldrd r1, r2, [r0], #8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; NORMAL: ldrd r1, r2, [r0], #8
+; CONSERVATIVE-NOT: ldrd
+; CHECK: bl{{x?}} _extfunc
  define void @ldrd_postupdate_inc(i32* %p0) {
    %p0.1 = getelementptr i32, i32* %p0, i32 1
    %v0 = load i32, i32* %p0
@@ -158,8 +166,9 @@ define void @ldrd_postupdate_inc(i32* %p0) {
  }
  
  ; CHECK-LABEL: strd_postupdate_dec:
-; CHECK: strd r1, r2, [r0], #-8
-; CHECK-NEXT: bx lr
+; NORMAL: strd r1, r2, [r0], #-8
+; CONSERVATIVE-NOT: strd
+; CHECK: bx lr
  define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
    %p0.1 = getelementptr i32, i32* %p0, i32 1
    store i32 %v0, i32* %p0
@@ -169,8 +178,9 @@ define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
  }
  
  ; CHECK-LABEL: strd_postupdate_inc:
-; CHECK: strd r1, r2, [r0], #8
-; CHECK-NEXT: bx lr
+; NORMAL: strd r1, r2, [r0], #8
+; CONSERVATIVE-NOT: strd
+; CHECK: bx lr
  define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) {
    %p0.1 = getelementptr i32, i32* %p0, i32 1
    store i32 %v0, i32* %p0
diff --git a/llvm/test/CodeGen/ARM/swift-vldm.ll b/llvm/test/CodeGen/ARM/swift-vldm.ll

index 9e50727..a53b241 100644 (file)
--- a/llvm/test/CodeGen/ARM/swift-vldm.ll
+++ b/llvm/test/CodeGen/ARM/swift-vldm.ll
@@ -1,4 +1,5 @@
  ; RUN: llc < %s -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
+; RUN: llc < %s -arm-assume-misaligned-load-store -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
  
  ; Check that we avoid producing vldm instructions using d registers that
  ; begin in the most-significant half of a q register. These require more
diff --git a/llvm/test/CodeGen/Thumb2/thumb2-ldm.ll b/llvm/test/CodeGen/Thumb2/thumb2-ldm.ll

index 28903ac..a5b4741 100644 (file)
--- a/llvm/test/CodeGen/Thumb2/thumb2-ldm.ll
+++ b/llvm/test/CodeGen/Thumb2/thumb2-ldm.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s -check-prefix=ALL -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 -arm-assume-misaligned-load-store | FileCheck %s -check-prefix=ALL -check-prefix=CONSERVATIVE
  
  @X = external global [0 x i32]          ; <[0 x i32]*> [#uses=5]
  
  define i32 @t1() {
-; CHECK-LABEL: t1:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t1:
+; ALL: push {r7, lr}
  ; CHECK: ldrd
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
          %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 0)            ; <i32> [#uses=1]
          %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1)           ; <i32> [#uses=1]
          %tmp4 = call i32 @f1( i32 %tmp, i32 %tmp3 )                ; <i32> [#uses=1]
@@ -14,10 +17,12 @@ define i32 @t1() {
  }
  
  define i32 @t2() {
-; CHECK-LABEL: t2:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t2:
+; ALL: push {r7, lr}
  ; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
          %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2)            ; <i32> [#uses=1]
          %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3)           ; <i32> [#uses=1]
          %tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 4)           ; <i32> [#uses=1]
@@ -26,10 +31,12 @@ define i32 @t2() {
  }
  
  define i32 @t3() {
-; CHECK-LABEL: t3:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t3:
+; ALL: push {r7, lr}
  ; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
          %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1)            ; <i32> [#uses=1]
          %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2)           ; <i32> [#uses=1]
          %tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3)           ; <i32> [#uses=1]
@@ -37,6 +44,34 @@ define i32 @t3() {
          ret i32 %tmp6
  }
  
+@g = common global i32* null
+
+define void @t4(i32 %a0, i32 %a1, i32 %a2) {
+; ALL-LABEL: t4:
+; ALL: stm.w sp, {r0, r1, r2}
+; ALL: blx _ext
+; ALL: ldm.w sp, {r0, r1, r2}
+; ALL: blx _f2
+  %arr = alloca [4 x i32], align 4
+  %p0 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 0
+  %p1 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 1
+  %p2 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 2
+  store i32* %p0, i32** @g, align 8
+
+  store i32 %a0, i32* %p0, align 4
+  store i32 %a1, i32* %p1, align 4
+  store i32 %a2, i32* %p2, align 4
+  call void @ext()
+
+  %v0 = load i32, i32* %p0, align 4
+  %v1 = load i32, i32* %p1, align 4
+  %v2 = load i32, i32* %p2, align 4
+  call i32 @f2(i32 %v0, i32 %v1, i32 %v2)
+  ret void
+}
+
  declare i32 @f1(i32, i32)
  
  declare i32 @f2(i32, i32, i32)
+
+declare void @ext()
author	Matthias Braun <matze@braunis.de>
	Wed, 2 Mar 2016 19:20:00 +0000 (19:20 +0000)
committer	Matthias Braun <matze@braunis.de>
	Wed, 2 Mar 2016 19:20:00 +0000 (19:20 +0000)
llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp		patch \| blob \| history
llvm/test/CodeGen/ARM/ldrd.ll		patch \| blob \| history
llvm/test/CodeGen/ARM/swift-vldm.ll		patch \| blob \| history
llvm/test/CodeGen/Thumb2/thumb2-ldm.ll		patch \| blob \| history