[LoongArch] Split SP adjustment

author wanglei <wanglei@loongson.cn>

Fri, 28 Oct 2022 08:18:40 +0000 (16:18 +0800)

committer Weining Lu <luweining@loongson.cn>

Fri, 28 Oct 2022 08:39:00 +0000 (16:39 +0800)
author wanglei <wanglei@loongson.cn>
Fri, 28 Oct 2022 08:18:40 +0000 (16:18 +0800)
committer Weining Lu <luweining@loongson.cn>
Fri, 28 Oct 2022 08:39:00 +0000 (16:39 +0800)
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp

index 4547215..e8985d9 100644 (file)
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -138,11 +138,17 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
  
    // First, compute final stack size.
    uint64_t StackSize = MFI.getStackSize();
+  uint64_t RealStackSize = StackSize;
  
    // Early exit if there is no need to allocate space in the stack.
    if (StackSize == 0 && !MFI.adjustsStack())
      return;
  
+  uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+  // Split the SP adjustment to reduce the offsets of callee saved spill.
+  if (FirstSPAdjustAmount)
+    StackSize = FirstSPAdjustAmount;
+
    // Adjust stack.
    adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
    // Emit ".cfi_def_cfa_offset StackSize".
@@ -184,7 +190,29 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Emit the second SP adjustment after saving callee saved registers.
+  if (FirstSPAdjustAmount) {
+    uint64_t SecondSPAdjustAmount = RealStackSize - FirstSPAdjustAmount;
+    assert(SecondSPAdjustAmount > 0 &&
+           "SecondSPAdjustAmount should be greater than zero");
+    adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount,
+              MachineInstr::FrameSetup);
  
+    if (!hasFP(MF)) {
+      // If we are using a frame-pointer, and thus emitted ".cfi_def_cfa fp, 0",
+      // don't emit an sp-based .cfi_def_cfa_offset
+      // Emit ".cfi_def_cfa_offset RealStackSize"
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (hasFP(MF)) {
      // Realign stack.
      if (RI->hasStackRealignment(MF)) {
        unsigned ShiftAmount = Log2(MFI.getMaxAlign());
@@ -244,10 +272,47 @@ void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF,
                MachineInstr::FrameDestroy);
    }
  
+  uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+  if (FirstSPAdjustAmount) {
+    uint64_t SecondSPAdjustAmount = StackSize - FirstSPAdjustAmount;
+    assert(SecondSPAdjustAmount > 0 &&
+           "SecondSPAdjustAmount should be greater than zero");
+
+    adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, SecondSPAdjustAmount,
+              MachineInstr::FrameDestroy);
+    StackSize = FirstSPAdjustAmount;
+  }
+
    // Deallocate stack
    adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
  }
  
+// Returns the size of the first of two SP adjustments when the frame is
+// split, or 0 when no split is needed. Splitting keeps the offsets of the
+// callee-saved register spills small enough to fit in a single store.
+// e.g.
+//   addi.d  $sp, $sp, -2032
+//   st.d    $ra, $sp,  2024
+//   st.d    $fp, $sp,  2016
+//   addi.d  $sp, $sp,   -16
+uint64_t LoongArchFrameLowering::getFirstSPAdjustAmount(
+    const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  // Split only when the StackSize does not fit in a signed 12-bit immediate
+  // and at least one callee-saved register needs to be pushed.
+  if (!isInt<12>(MFI.getStackSize()) && (CSI.size() > 0)) {
+    // FirstSPAdjustAmount is chosen as (2048 - StackAlign) because an
+    // adjustment of exactly 2048 would cause sp = sp + 2048 in the epilogue
+    // to be split into multiple instructions. Offsets smaller than 2048 fit
+    // in a single load/store instruction, and the amount must also respect
+    // the stack alignment, which (2048 - StackAlign) does.
+    return 2048 - getStackAlign().value();
+  }
+  return 0;
+}
+
  void LoongArchFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                    BitVector &SavedRegs,
                                                    RegScavenger *RS) const {
@@ -307,6 +372,7 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference(
    const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
    auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
    uint64_t StackSize = MFI.getStackSize();
+  uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
  
    // Callee-saved registers should be referenced relative to the stack
    // pointer (positive offset), otherwise use the frame pointer (negative
@@ -325,7 +391,10 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference(
  
    if (FI >= MinCSFI && FI <= MaxCSFI) {
      FrameReg = LoongArch::R3;
-    Offset += StackOffset::getFixed(StackSize);
+    if (FirstSPAdjustAmount)
+      Offset += StackOffset::getFixed(FirstSPAdjustAmount);
+    else
+      Offset += StackOffset::getFixed(StackSize);
    } else if (RI->hasStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) {
      // If the stack was realigned, the frame pointer is set in order to allow
      // SP to be restored, so we need another base register to record the stack
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h

index e1e3e26..7ef79aa 100644 (file)
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
@@ -45,6 +45,8 @@ public:
    bool hasFP(const MachineFunction &MF) const override;
    bool hasBP(const MachineFunction &MF) const;
  
+  uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const;
+
  private:
    void determineFrameLayout(MachineFunction &MF) const;
    void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
diff --git a/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll

new file mode 100644 (file)

index 0000000..093c92b
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s \
+; RUN:   | FileCheck %s
+
+;; The stack size is 2048 and the SP adjustment will be split.
+define i32 @SplitSP() nounwind {
+; CHECK-LABEL: SplitSP:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -2032
+; CHECK-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    addi.d $a0, $sp, 12
+; CHECK-NEXT:    bl %plt(foo)
+; CHECK-NEXT:    move $a0, $zero
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 2032
+; CHECK-NEXT:    ret
+entry:
+  %xx = alloca [2028 x i8], align 1
+  %0 = getelementptr inbounds [2028 x i8], ptr %xx, i32 0, i32 0
+  %call = call i32 @foo(ptr nonnull %0)
+  ret i32 0
+}
+
+;; The stack size is 2032 and the SP adjustment will not be split.
+define i32 @NoSplitSP() nounwind {
+; CHECK-LABEL: NoSplitSP:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -2032
+; CHECK-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bl %plt(foo)
+; CHECK-NEXT:    move $a0, $zero
+; CHECK-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 2032
+; CHECK-NEXT:    ret
+entry:
+  %xx = alloca [2024 x i8], align 1
+  %0 = getelementptr inbounds [2024 x i8], ptr %xx, i32 0, i32 0
+  %call = call i32 @foo(ptr nonnull %0)
+  ret i32 0
+}
+
+declare i32 @foo(ptr)
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll b/llvm/test/CodeGen/LoongArch/stack-realignment.ll

index 16c7bcd..89672fb 100644 (file)
--- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
@@ -453,46 +453,46 @@ define void @caller_no_realign1024() "no-realign-stack" {
  define void @caller2048() {
  ; LA32-LABEL: caller2048:
  ; LA32:       # %bb.0:
-; LA32-NEXT:    addi.w $sp, $sp, -2048
-; LA32-NEXT:    .cfi_def_cfa_offset 2048
-; LA32-NEXT:    st.w $ra, $sp, 2044 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 2040 # 4-byte Folded Spill
+; LA32-NEXT:    addi.w $sp, $sp, -2032
+; LA32-NEXT:    .cfi_def_cfa_offset 2032
+; LA32-NEXT:    st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 2024 # 4-byte Folded Spill
  ; LA32-NEXT:    .cfi_offset 1, -4
  ; LA32-NEXT:    .cfi_offset 22, -8
  ; LA32-NEXT:    addi.w $fp, $sp, 2032
-; LA32-NEXT:    addi.w $fp, $fp, 16
  ; LA32-NEXT:    .cfi_def_cfa 22, 0
+; LA32-NEXT:    addi.w $sp, $sp, -16
  ; LA32-NEXT:    srli.w $a0, $sp, 11
  ; LA32-NEXT:    slli.w $sp, $a0, 11
  ; LA32-NEXT:    addi.w $a0, $sp, 0
  ; LA32-NEXT:    bl %plt(callee)
  ; LA32-NEXT:    addi.w $sp, $fp, -2048
-; LA32-NEXT:    ld.w $fp, $sp, 2040 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 2044 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 2032
  ; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 2032
  ; LA32-NEXT:    ret
  ;
  ; LA64-LABEL: caller2048:
  ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -2048
-; LA64-NEXT:    .cfi_def_cfa_offset 2048
-; LA64-NEXT:    st.d $ra, $sp, 2040 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 2032 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -2032
+; LA64-NEXT:    .cfi_def_cfa_offset 2032
+; LA64-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 2016 # 8-byte Folded Spill
  ; LA64-NEXT:    .cfi_offset 1, -8
  ; LA64-NEXT:    .cfi_offset 22, -16
  ; LA64-NEXT:    addi.d $fp, $sp, 2032
-; LA64-NEXT:    addi.d $fp, $fp, 16
  ; LA64-NEXT:    .cfi_def_cfa 22, 0
+; LA64-NEXT:    addi.d $sp, $sp, -16
  ; LA64-NEXT:    srli.d $a0, $sp, 11
  ; LA64-NEXT:    slli.d $sp, $a0, 11
  ; LA64-NEXT:    addi.d $a0, $sp, 0
  ; LA64-NEXT:    bl %plt(callee)
  ; LA64-NEXT:    addi.d $sp, $fp, -2048
-; LA64-NEXT:    ld.d $fp, $sp, 2032 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 2040 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 2032
  ; LA64-NEXT:    addi.d $sp, $sp, 16
+; LA64-NEXT:    ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 2032
  ; LA64-NEXT:    ret
    %1 = alloca i8, align 2048
    call void @callee(i8* %1)
@@ -531,66 +531,52 @@ define void @caller_no_realign2048() "no-realign-stack" {
  define void @caller4096() {
  ; LA32-LABEL: caller4096:
  ; LA32:       # %bb.0:
-; LA32-NEXT:    lu12i.w $a0, 1
-; LA32-NEXT:    sub.w $sp, $sp, $a0
-; LA32-NEXT:    .cfi_def_cfa_offset 4096
-; LA32-NEXT:    ori $a0, $zero, 4092
-; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    st.w $ra, $a0, 0 # 4-byte Folded Spill
-; LA32-NEXT:    ori $a0, $zero, 4088
-; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    st.w $fp, $a0, 0 # 4-byte Folded Spill
+; LA32-NEXT:    addi.w $sp, $sp, -2032
+; LA32-NEXT:    .cfi_def_cfa_offset 2032
+; LA32-NEXT:    st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 2024 # 4-byte Folded Spill
  ; LA32-NEXT:    .cfi_offset 1, -4
  ; LA32-NEXT:    .cfi_offset 22, -8
-; LA32-NEXT:    lu12i.w $a0, 1
-; LA32-NEXT:    add.w $fp, $sp, $a0
+; LA32-NEXT:    addi.w $fp, $sp, 2032
  ; LA32-NEXT:    .cfi_def_cfa 22, 0
+; LA32-NEXT:    addi.w $sp, $sp, -2048
+; LA32-NEXT:    addi.w $sp, $sp, -16
  ; LA32-NEXT:    srli.w $a0, $sp, 12
  ; LA32-NEXT:    slli.w $sp, $a0, 12
  ; LA32-NEXT:    addi.w $a0, $sp, 0
  ; LA32-NEXT:    bl %plt(callee)
  ; LA32-NEXT:    lu12i.w $a0, 1
  ; LA32-NEXT:    sub.w $sp, $fp, $a0
-; LA32-NEXT:    ori $a0, $zero, 4088
-; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    ld.w $fp, $a0, 0 # 4-byte Folded Reload
-; LA32-NEXT:    ori $a0, $zero, 4092
-; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    ld.w $ra, $a0, 0 # 4-byte Folded Reload
-; LA32-NEXT:    lu12i.w $a0, 1
-; LA32-NEXT:    add.w $sp, $sp, $a0
+; LA32-NEXT:    addi.w $sp, $sp, 2032
+; LA32-NEXT:    addi.w $sp, $sp, 32
+; LA32-NEXT:    ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 2032
  ; LA32-NEXT:    ret
  ;
  ; LA64-LABEL: caller4096:
  ; LA64:       # %bb.0:
-; LA64-NEXT:    lu12i.w $a0, 1
-; LA64-NEXT:    sub.d $sp, $sp, $a0
-; LA64-NEXT:    .cfi_def_cfa_offset 4096
-; LA64-NEXT:    ori $a0, $zero, 4088
-; LA64-NEXT:    add.d $a0, $sp, $a0
-; LA64-NEXT:    st.d $ra, $a0, 0 # 8-byte Folded Spill
-; LA64-NEXT:    ori $a0, $zero, 4080
-; LA64-NEXT:    add.d $a0, $sp, $a0
-; LA64-NEXT:    st.d $fp, $a0, 0 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -2032
+; LA64-NEXT:    .cfi_def_cfa_offset 2032
+; LA64-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 2016 # 8-byte Folded Spill
  ; LA64-NEXT:    .cfi_offset 1, -8
  ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    lu12i.w $a0, 1
-; LA64-NEXT:    add.d $fp, $sp, $a0
+; LA64-NEXT:    addi.d $fp, $sp, 2032
  ; LA64-NEXT:    .cfi_def_cfa 22, 0
+; LA64-NEXT:    addi.d $sp, $sp, -2048
+; LA64-NEXT:    addi.d $sp, $sp, -16
  ; LA64-NEXT:    srli.d $a0, $sp, 12
  ; LA64-NEXT:    slli.d $sp, $a0, 12
  ; LA64-NEXT:    addi.d $a0, $sp, 0
  ; LA64-NEXT:    bl %plt(callee)
  ; LA64-NEXT:    lu12i.w $a0, 1
  ; LA64-NEXT:    sub.d $sp, $fp, $a0
-; LA64-NEXT:    ori $a0, $zero, 4080
-; LA64-NEXT:    add.d $a0, $sp, $a0
-; LA64-NEXT:    ld.d $fp, $a0, 0 # 8-byte Folded Reload
-; LA64-NEXT:    ori $a0, $zero, 4088
-; LA64-NEXT:    add.d $a0, $sp, $a0
-; LA64-NEXT:    ld.d $ra, $a0, 0 # 8-byte Folded Reload
-; LA64-NEXT:    lu12i.w $a0, 1
-; LA64-NEXT:    add.d $sp, $sp, $a0
+; LA64-NEXT:    addi.d $sp, $sp, 2032
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 2032
  ; LA64-NEXT:    ret
    %1 = alloca i8, align 4096
    call void @callee(i8* %1)
author	wanglei <wanglei@loongson.cn>
	Fri, 28 Oct 2022 08:18:40 +0000 (16:18 +0800)
committer	Weining Lu <luweining@loongson.cn>
	Fri, 28 Oct 2022 08:39:00 +0000 (16:39 +0800)
llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp		patch \| blob \| history
llvm/lib/Target/LoongArch/LoongArchFrameLowering.h		patch \| blob \| history
llvm/test/CodeGen/LoongArch/split-sp-adjust.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/LoongArch/stack-realignment.ll		patch \| blob \| history