From f7cc34cae890fdd711173fcb633cd262ee343764 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Thu, 31 Jan 2019 08:07:30 +0000 Subject: [PATCH] [SelectionDAG] Codesize: don't expand SHIFT to SHIFT_PARTS And instead just generate a libcall. My motivating example on ARM was a simple: shl i64 %A, %B for which the code bloat is quite significant. For other targets that also accept __int128/i128 such as AArch64 and X86, it is also beneficial for these cases to generate a libcall when optimising for minsize. On these 64-bit targets, the 64-bit shifts are of course unaffected because the SHIFT/SHIFT_PARTS lowering operation action is not set to custom/expand. Differential Revision: https://reviews.llvm.org/D57386 llvm-svn: 352736 --- llvm/include/llvm/CodeGen/TargetLowering.h | 7 ++ .../CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 10 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6 + llvm/lib/Target/ARM/ARMISelLowering.h | 6 + llvm/lib/Target/X86/X86ISelLowering.h | 6 + llvm/test/CodeGen/AArch64/shift_minsize.ll | 122 +++++++++++++++++++ llvm/test/CodeGen/ARM/shift_minsize.ll | 32 +++++ llvm/test/CodeGen/X86/shift_minsize.ll | 134 +++++++++++++++++++++ 8 files changed, 320 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/shift_minsize.ll create mode 100644 llvm/test/CodeGen/ARM/shift_minsize.ll create mode 100644 llvm/test/CodeGen/X86/shift_minsize.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index c21eb79..72535c56 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -642,6 +642,13 @@ public: return RepRegClassCostForVT[VT.SimpleTy]; } + /// Return true if SHIFT instructions should be expanded to SHIFT_PARTS + /// instructions, and false if a library call is preferred (e.g. for code-size + /// reasons). 
+ virtual bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { + return true; /* Default: keep expanding to SHIFT_PARTS; a target overrides this hook to ask for a libcall instead. */ + } + /// Return true if the target has native support for the specified value type. /// This means that it has a register that directly holds it without /// promotions or expansions. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 81d9b65..9f63d66 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2765,11 +2765,15 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, } // Next check to see if the target supports this SHL_PARTS operation or if it - // will custom expand it. + // will custom expand it. Don't lower this to SHL_PARTS when we optimise for + // size, but create a libcall instead. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT); - if ((Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) || - Action == TargetLowering::Custom) { + const bool LegalOrCustom = + (Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) || + Action == TargetLowering::Custom; + + if (LegalOrCustom && TLI.shouldExpandShift(DAG, N)) { // Expand the subcomponents. 
SDValue LHSL, LHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 1b6466f..100e330 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -469,6 +469,12 @@ public: return VT.getSizeInBits() >= 64; // vector 'bic' } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { + if (DAG.getMachineFunction().getFunction().optForMinSize()) + return false; /* minsize function: ask for a libcall rather than the SHIFT_PARTS expansion */ + return true; + } + bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override { // For vectors, we don't have a preference.. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 5c8b9fd..5a4b326 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -567,6 +567,12 @@ class VectorType; return HasStandaloneRem; } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { + if (DAG.getMachineFunction().getFunction().optForMinSize()) + return false; /* minsize function: ask for a libcall rather than the SHIFT_PARTS expansion */ + return true; + } + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const; CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 60bc276..1b497b7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -831,6 +831,12 @@ namespace llvm { return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { + if (DAG.getMachineFunction().getFunction().optForMinSize()) + return false; /* minsize function: ask for a libcall rather than the SHIFT_PARTS expansion */ + return true; + } + bool shouldSplatInsEltVarIndex(EVT VT) const override; bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { diff --git a/llvm/test/CodeGen/AArch64/shift_minsize.ll b/llvm/test/CodeGen/AArch64/shift_minsize.ll new file mode 100644 index 
0000000..d1b95e8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/shift_minsize.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s + +define i64 @f0(i64 %val, i64 %amt) minsize optsize { ; i64 shl stays a single native lsl under minsize, no libcall +; CHECK-LABEL: f0: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl x0, x0, x1 +; CHECK-NEXT: ret + %res = shl i64 %val, %amt + ret i64 %res +} + +define i32 @f1(i64 %x, i64 %y) minsize optsize { ; i64 shl with the result truncated to i32: still a native lsl +; CHECK-LABEL: f1: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl x0, x0, x1 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %a = shl i64 %x, %y + %b = trunc i64 %a to i32 + ret i32 %b +} + +define i32 @f2(i64 %x, i64 %y) minsize optsize { ; i64 ashr with the result truncated to i32: a native asr is expected +; CHECK-LABEL: f2: +; CHECK: // %bb.0: +; CHECK-NEXT: asr x0, x0, x1 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %a = ashr i64 %x, %y + %b = trunc i64 %a to i32 + ret i32 %b +} + +define i32 @f3(i64 %x, i64 %y) minsize optsize { ; i64 lshr with the result truncated to i32: a native lsr is expected +; CHECK-LABEL: f3: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr x0, x0, x1 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %a = lshr i64 %x, %y + %b = trunc i64 %a to i32 + ret i32 %b +} + +define dso_local { i64, i64 } @shl128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize { ; i128 shl under minsize: a call to __ashlti3 is expected +; CHECK-LABEL: shl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: bl __ashlti3 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128 + %x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64 + %x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128 + %x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext + %conv = sext i8 %y to i32 + %sh_prom = zext i32 %conv to i128 + %shl = shl i128 %x.sroa.0.0.insert.insert, %sh_prom + %retval.sroa.0.0.extract.trunc = trunc i128 %shl to i64 + %retval.sroa.2.0.extract.shift = lshr i128 %shl, 64 + %retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64 + %.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0 + %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1 + ret { i64, i64 } %.fca.1.insert +} + +define dso_local { i64, i64 } @ashr128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize { ; i128 ashr under minsize: a call to __ashrti3 is expected +; CHECK-LABEL: ashr128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: bl __ashrti3 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128 + %x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64 + %x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128 + %x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext + %conv = sext i8 %y to i32 + %sh_prom = zext i32 %conv to i128 + %shr = ashr i128 %x.sroa.0.0.insert.insert, %sh_prom + %retval.sroa.0.0.extract.trunc = trunc i128 %shr to i64 + %retval.sroa.2.0.extract.shift = lshr i128 %shr, 64 + %retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64 + %.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0 + %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1 + ret { i64, i64 } %.fca.1.insert +} + +define dso_local { i64, i64 } @lshr128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize { ; i128 lshr under minsize: a call to __lshrti3 is expected +; CHECK-LABEL: lshr128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: bl __lshrti3 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128 + %x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64 + %x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128 + %x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext + %conv = sext i8 %y to i32 + %sh_prom = zext i32 %conv to i128 + %shr = lshr i128 %x.sroa.0.0.insert.insert, %sh_prom + %retval.sroa.0.0.extract.trunc = trunc i128 %shr to i64 + %retval.sroa.2.0.extract.shift = lshr i128 %shr, 64 + %retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64 + %.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0 + %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1 + ret { i64, i64 } %.fca.1.insert +} diff --git a/llvm/test/CodeGen/ARM/shift_minsize.ll b/llvm/test/CodeGen/ARM/shift_minsize.ll new file mode 100644 index 0000000..4d10c64 --- /dev/null +++ b/llvm/test/CodeGen/ARM/shift_minsize.ll @@ -0,0 +1,32 @@ +; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s + +define i64 @f0(i64 %val, i64 %amt) minsize optsize { ; i64 shl under minsize: a call to __aeabi_llsl is expected +; CHECK-LABEL: f0: +; CHECK: bl __aeabi_llsl + %res = shl i64 %val, %amt + ret i64 %res +} + +define i32 @f1(i64 %x, i64 %y) minsize optsize { ; i64 shl with the result truncated to i32: still a call to __aeabi_llsl +; CHECK-LABEL: f1: +; CHECK: bl __aeabi_llsl + %a = shl i64 %x, %y + %b = trunc i64 %a to i32 + ret i32 %b +} + +define i32 @f2(i64 %x, i64 %y) minsize optsize { ; i64 ashr under minsize: a call to __aeabi_lasr is expected +; CHECK-LABEL: f2: +; CHECK: bl __aeabi_lasr + %a = ashr i64 %x, %y + %b = trunc i64 %a to i32 + ret i32 %b +} + +define i32 @f3(i64 %x, i64 %y) minsize optsize { ; i64 lshr under minsize: a call to __aeabi_llsr is expected +; CHECK-LABEL: f3: +; CHECK: bl __aeabi_llsr + %a = lshr i64 %x, %y + %b = trunc i64 %a to i32 + ret i32 %b +} diff --git 
a/llvm/test/CodeGen/X86/shift_minsize.ll b/llvm/test/CodeGen/X86/shift_minsize.ll new file mode 100644 index 0000000..5ba4654 --- /dev/null +++ b/llvm/test/CodeGen/X86/shift_minsize.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s + +define i64 @f0(i64 %val, i64 %amt) minsize optsize { ; i64 shl stays a native shlq under minsize, no libcall +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shlq %cl, %rax +; CHECK-NEXT: retq + %res = shl i64 %val, %amt + ret i64 %res +} + +define i32 @f1(i64 %x, i64 %y) minsize optsize { ; i64 shl with the result truncated to i32: still a native shlq +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shlq %cl, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %a = shl i64 %x, %y + %b = trunc i64 %a to i32 + ret i32 %b +} + +define i32 @f2(i64 %x, i64 %y) minsize optsize { ; i64 ashr with the result truncated to i32: a native sarq is expected +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: sarq %cl, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %a = ashr i64 %x, %y + %b = trunc i64 %a to i32 + ret i32 %b +} + +define i32 @f3(i64 %x, i64 %y) minsize optsize { ; i64 lshr with the result truncated to i32: a native shrq is expected +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shrq %cl, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %a = lshr i64 %x, %y + %b = trunc i64 %a to i32 + ret i32 %b +} + +define dso_local { i64, i64 } @shl128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize { ; i128 shl under minsize: a call to __ashlti3 is expected +; CHECK-LABEL: shl128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 
16 +; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: callq __ashlti3 +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128 + %x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64 + %x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128 + %x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext + %conv = sext i8 %y to i32 + %sh_prom = zext i32 %conv to i128 + %shl = shl i128 %x.sroa.0.0.insert.insert, %sh_prom + %retval.sroa.0.0.extract.trunc = trunc i128 %shl to i64 + %retval.sroa.2.0.extract.shift = lshr i128 %shl, 64 + %retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64 + %.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0 + %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1 + ret { i64, i64 } %.fca.1.insert +} + +define dso_local { i64, i64 } @ashr128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize { ; i128 ashr under minsize: a call to __ashrti3 is expected +; CHECK-LABEL: ashr128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq __ashrti3 +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128 + %x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64 + %x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128 + %x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext + %conv = sext i8 %y to i32 + %sh_prom = zext i32 %conv to i128 + %shr = ashr i128 %x.sroa.0.0.insert.insert, %sh_prom + %retval.sroa.0.0.extract.trunc = trunc i128 %shr to i64 + %retval.sroa.2.0.extract.shift = lshr i128 %shr, 64 + %retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64 + %.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0 + %.fca.1.insert = 
insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1 + ret { i64, i64 } %.fca.1.insert +} + +define dso_local { i64, i64 } @lshr128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize { ; i128 lshr under minsize: a call to __lshrti3 is expected +; CHECK-LABEL: lshr128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: callq __lshrti3 +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128 + %x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64 + %x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128 + %x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext + %conv = sext i8 %y to i32 + %sh_prom = zext i32 %conv to i128 + %shr = lshr i128 %x.sroa.0.0.insert.insert, %sh_prom + %retval.sroa.0.0.extract.trunc = trunc i128 %shr to i64 + %retval.sroa.2.0.extract.shift = lshr i128 %shr, 64 + %retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64 + %.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0 + %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1 + ret { i64, i64 } %.fca.1.insert +} + -- 2.7.4