From bb7d7b3d33e856e8a640d394954a29a59b32a3e6 Mon Sep 17 00:00:00 2001
From: Tim Northover <tnorthover@apple.com>
Date: Fri, 7 Sep 2018 09:21:25 +0000
Subject: [PATCH] ARM: fix Thumb2 CodeGen for ldrex with folded frame-index.

Because t2LDREX (& t2STREX) were marked as AddrModeNone, but did allow a
FrameIndex operand, rewriteT2FrameIndex asserted. This gives them a
proper addressing-mode and tells the rewriter about it so that encodable
offsets are exploited and others are rejected.

Should fix PR38828.

llvm-svn: 341642
---
 llvm/lib/Target/ARM/ARMFrameLowering.cpp       |  1 +
 llvm/lib/Target/ARM/ARMInstrFormats.td         |  1 +
 llvm/lib/Target/ARM/ARMInstrThumb2.td          |  4 +-
 llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h |  4 +-
 llvm/lib/Target/ARM/Thumb2InstrInfo.cpp        |  5 ++
 llvm/test/CodeGen/ARM/ldrex-frame-size.ll      | 36 +++++++++++
 llvm/test/CodeGen/ARM/ldstrex.ll               | 85 ++++++++++++++++++++++++++
 7 files changed, 133 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/ldrex-frame-size.ll

diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 2f57019..742d5dd 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1516,6 +1516,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
           break;
         case ARMII::AddrMode5:
         case ARMII::AddrModeT2_i8s4:
+        case ARMII::AddrModeT2_ldrex:
           Limit = std::min(Limit, ((1U << 8) - 1) * 4);
           break;
         case ARMII::AddrModeT2_i12:
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 87a2802..0df48ba 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -109,6 +109,7 @@ def AddrModeT2_pc   : AddrMode<14>;
 def AddrModeT2_i8s4 : AddrMode<15>;
 def AddrMode_i12    : AddrMode<16>;
 def AddrMode5FP16   : AddrMode<17>;
+def AddrModeT2_ldrex : AddrMode<18>;
 
 // Load / store index mode.
 class IndexMode<bits<2> val> {
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index c005481..95195e8 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -3275,7 +3275,7 @@ def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
                          [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>,
                Requires<[IsThumb, HasV8MBaseline]>;
 def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
-                       AddrModeNone, 4, NoItinerary,
+                       AddrModeT2_ldrex, 4, NoItinerary,
                        "ldrex", "\t$Rt, $addr", "",
                      [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]>,
                Requires<[IsThumb, HasV8MBaseline]> {
@@ -3354,7 +3354,7 @@ def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
 
 def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
                              t2addrmode_imm0_1020s4:$addr),
-                  AddrModeNone, 4, NoItinerary,
+                  AddrModeT2_ldrex, 4, NoItinerary,
                   "strex", "\t$Rd, $Rt, $addr", "",
                   [(set rGPR:$Rd,
                         (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]>,
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index cdadf19..33c32d5 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -201,7 +201,8 @@ namespace ARMII {
     AddrModeT2_pc   = 14, // +/- i12 for pc relative data
     AddrModeT2_i8s4 = 15, // i8 * 4
     AddrMode_i12    = 16,
-    AddrMode5FP16   = 17  // i8 * 2
+    AddrMode5FP16   = 17,  // i8 * 2
+    AddrModeT2_ldrex = 18, // i8 * 4, with unscaled offset in MCInst
   };
 
   inline static const char *AddrModeToString(AddrMode addrmode) {
@@ -224,6 +225,7 @@ namespace ARMII {
     case AddrModeT2_pc:   return "AddrModeT2_pc";
     case AddrModeT2_i8s4: return "AddrModeT2_i8s4";
     case AddrMode_i12:    return "AddrMode_i12";
+    case AddrModeT2_ldrex:return "AddrModeT2_ldrex";
     }
   }
 
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index d5f0ba9..1a91a70 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -621,6 +621,11 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
       // MCInst operand expects already scaled value.
       Scale = 1;
       assert((Offset & 3) == 0 && "Can't encode this offset!");
+    } else if (AddrMode == ARMII::AddrModeT2_ldrex) {
+      Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
+      NumBits = 8; // 8 bits scaled by 4
+      Scale = 4;
+      assert((Offset & 3) == 0 && "Can't encode this offset!");
     } else {
       llvm_unreachable("Unsupported addressing mode!");
     }
diff --git a/llvm/test/CodeGen/ARM/ldrex-frame-size.ll b/llvm/test/CodeGen/ARM/ldrex-frame-size.ll
new file mode 100644
index 0000000..5955405
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ldrex-frame-size.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -o - %s | FileCheck %s
+
+; This alloca is just large enough that FrameLowering decides it needs a frame
+; to guarantee access, based on the range of ldrex.
+
+; The actual alloca size is a bit of black magic, unfortunately: the real
+; maximum accessible is 1020, but FrameLowering adds 16 bytes to its estimated
+; stack size just because so the alloca is not actually the what the limit gets
+; compared to. The important point is that we don't go up to ~4096, which is the
+; default with no strange instructions.
+define void @test_large_frame() {
+; CHECK-LABEL: test_large_frame:
+; CHECK: push
+; CHECK: sub.w sp, sp, #1004
+
+  %ptr = alloca i32, i32 251
+
+  %addr = getelementptr i32, i32* %ptr, i32 1
+  call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
+  ret void
+}
+
+; This alloca is just is just the other side of the limit, so no frame
+define void @test_small_frame() {
+; CHECK-LABEL: test_small_frame:
+; CHECK-NOT: push
+; CHECK: sub.w sp, sp, #1000
+
+  %ptr = alloca i32, i32 250
+
+  %addr = getelementptr i32, i32* %ptr, i32 1
+  call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
+  ret void
+}
+
+declare i32 @llvm.arm.ldrex.p0i32(i32*)
diff --git a/llvm/test/CodeGen/ARM/ldstrex.ll b/llvm/test/CodeGen/ARM/ldstrex.ll
index 59349f7..73afa0e 100644
--- a/llvm/test/CodeGen/ARM/ldstrex.ll
+++ b/llvm/test/CodeGen/ARM/ldstrex.ll
@@ -142,6 +142,91 @@ define void @excl_addrmode() {
   ret void
 }
 
+define void @test_excl_addrmode_folded() {
+; CHECK-LABEL: test_excl_addrmode_folded:
+  %local = alloca i8, i32 4096
+
+  %local.0 = getelementptr i8, i8* %local, i32 4
+  %local32.0 = bitcast i8* %local.0 to i32*
+  call i32 @llvm.arm.ldrex.p0i32(i32* %local32.0)
+  call i32 @llvm.arm.strex.p0i32(i32 0, i32* %local32.0)
+; CHECK-T2ADDRMODE: ldrex {{r[0-9]+}}, [sp, #4]
+; CHECK-T2ADDRMODE: strex {{r[0-9]+}}, {{r[0-9]+}}, [sp, #4]
+
+  %local.1 = getelementptr i8, i8* %local, i32 1020
+  %local32.1 = bitcast i8* %local.1 to i32*
+  call i32 @llvm.arm.ldrex.p0i32(i32* %local32.1)
+  call i32 @llvm.arm.strex.p0i32(i32 0, i32* %local32.1)
+; CHECK-T2ADDRMODE: ldrex {{r[0-9]+}}, [sp, #1020]
+; CHECK-T2ADDRMODE: strex {{r[0-9]+}}, {{r[0-9]+}}, [sp, #1020]
+
+  ret void
+}
+
+define void @test_excl_addrmode_range() {
+; CHECK-LABEL: test_excl_addrmode_range:
+  %local = alloca i8, i32 4096
+
+  %local.0 = getelementptr i8, i8* %local, i32 1024
+  %local32.0 = bitcast i8* %local.0 to i32*
+  call i32 @llvm.arm.ldrex.p0i32(i32* %local32.0)
+  call i32 @llvm.arm.strex.p0i32(i32 0, i32* %local32.0)
+; CHECK-T2ADDRMODE: mov r[[TMP:[0-9]+]], sp
+; CHECK-T2ADDRMODE: add.w r[[ADDR:[0-9]+]], r[[TMP]], #1024
+; CHECK-T2ADDRMODE: ldrex {{r[0-9]+}}, [r[[ADDR]]]
+; CHECK-T2ADDRMODE: strex {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
+
+  ret void
+}
+
+define void @test_excl_addrmode_align() {
+; CHECK-LABEL: test_excl_addrmode_align:
+  %local = alloca i8, i32 4096
+
+  %local.0 = getelementptr i8, i8* %local, i32 2
+  %local32.0 = bitcast i8* %local.0 to i32*
+  call i32 @llvm.arm.ldrex.p0i32(i32* %local32.0)
+  call i32 @llvm.arm.strex.p0i32(i32 0, i32* %local32.0)
+; CHECK-T2ADDRMODE: mov r[[ADDR:[0-9]+]], sp
+; CHECK-T2ADDRMODE: adds r[[ADDR:[0-9]+]], #2
+; CHECK-T2ADDRMODE: ldrex {{r[0-9]+}}, [r[[ADDR]]]
+; CHECK-T2ADDRMODE: strex {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
+
+  ret void
+}
+
+define void @test_excl_addrmode_sign() {
+; CHECK-LABEL: test_excl_addrmode_sign:
+  %local = alloca i8, i32 4096
+
+  %local.0 = getelementptr i8, i8* %local, i32 -4
+  %local32.0 = bitcast i8* %local.0 to i32*
+  call i32 @llvm.arm.ldrex.p0i32(i32* %local32.0)
+  call i32 @llvm.arm.strex.p0i32(i32 0, i32* %local32.0)
+; CHECK-T2ADDRMODE: mov r[[ADDR:[0-9]+]], sp
+; CHECK-T2ADDRMODE: subs r[[ADDR:[0-9]+]], #4
+; CHECK-T2ADDRMODE: ldrex {{r[0-9]+}}, [r[[ADDR]]]
+; CHECK-T2ADDRMODE: strex {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
+
+  ret void
+}
+
+define void @test_excl_addrmode_combination() {
+; CHECK-LABEL: test_excl_addrmode_combination:
+  %local = alloca i8, i32 4096
+  %unused = alloca i8, i32 64
+
+  %local.0 = getelementptr i8, i8* %local, i32 4
+  %local32.0 = bitcast i8* %local.0 to i32*
+  call i32 @llvm.arm.ldrex.p0i32(i32* %local32.0)
+  call i32 @llvm.arm.strex.p0i32(i32 0, i32* %local32.0)
+; CHECK-T2ADDRMODE: ldrex {{r[0-9]+}}, [sp, #68]
+; CHECK-T2ADDRMODE: strex {{r[0-9]+}}, {{r[0-9]+}}, [sp, #68]
+
+  ret void
+}
+
+
 ; LLVM should know, even across basic blocks, that ldrex is setting the high
 ; bits of its i32 to 0. There should be no zero-extend operation.
 define zeroext i8 @test_cross_block_zext_i8(i1 %tst, i8* %addr) {
-- 
2.7.4