From 2dace6e54bf977b255e49d70d7999da7ae4a5932 Mon Sep 17 00:00:00 2001
From: Juergen Ributzka
Date: Tue, 10 Jun 2014 23:52:44 +0000
Subject: [PATCH] [FastISel][X86] Extend support for {s|u}{add|sub|mul}.with.overflow intrinsics.

llvm-svn: 210610
---
 llvm/include/llvm/CodeGen/FastISel.h |   6 +-
 llvm/lib/Target/X86/X86FastISel.cpp  | 119 +++++++++----
 llvm/test/CodeGen/X86/xaluo.ll       | 337 +++++++++++++++++++++++++++++++++++
 3 files changed, 429 insertions(+), 33 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/xaluo.ll

diff --git a/llvm/include/llvm/CodeGen/FastISel.h b/llvm/include/llvm/CodeGen/FastISel.h
index bfeede2..b1bb8eb 100644
--- a/llvm/include/llvm/CodeGen/FastISel.h
+++ b/llvm/include/llvm/CodeGen/FastISel.h
@@ -373,6 +373,9 @@ protected:
   /// - \c Add has a constant operand.
   bool canFoldAddIntoGEP(const User *GEP, const Value *Add);
 
+  /// Test whether the given value has exactly one use.
+  bool hasTrivialKill(const Value *V) const;
+
 private:
   bool SelectBinaryOp(const User *I, unsigned ISDOpcode);
 
@@ -408,9 +411,6 @@ private:
   /// beginning of the block. It helps to avoid spilling cached variables across
   /// heavy instructions like calls.
   void flushLocalValueMap();
-
-  /// Test whether the given value has exactly one use.
-  bool hasTrivialKill(const Value *V) const;
 };
 
 }
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 2ef4bf2..c77e95c 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -1637,6 +1637,18 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
   return true;
 }
 
+static bool isCommutativeIntrinsic(IntrinsicInst const &I) {
+  switch (I.getIntrinsicID()) {
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
   // FIXME: Handle more intrinsics.
   switch (I.getIntrinsicID()) {
@@ -1718,47 +1730,94 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
     return true;
   }
   case Intrinsic::sadd_with_overflow:
-  case Intrinsic::uadd_with_overflow: {
-    // FIXME: Should fold immediates.
-
-    // Replace "add with overflow" intrinsics with an "add" instruction followed
-    // by a seto/setc instruction.
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow: {
+    // This implements the basic lowering of the xalu with overflow intrinsics
+    // into an add/sub/mul followed by either a seto or setb instruction.
     const Function *Callee = I.getCalledFunction();
-    Type *RetTy =
-      cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+    auto *Ty = cast<StructType>(Callee->getReturnType());
+    Type *RetTy = Ty->getTypeAtIndex(0U);
+    Type *CondTy = Ty->getTypeAtIndex(1);
 
     MVT VT;
     if (!isTypeLegal(RetTy, VT))
       return false;
 
-    const Value *Op1 = I.getArgOperand(0);
-    const Value *Op2 = I.getArgOperand(1);
-    unsigned Reg1 = getRegForValue(Op1);
-    unsigned Reg2 = getRegForValue(Op2);
+    if (VT < MVT::i8 || VT > MVT::i64)
+      return false;
 
-    if (Reg1 == 0 || Reg2 == 0)
-      // FIXME: Handle values *not* in registers.
+    const Value *LHS = I.getArgOperand(0);
+    const Value *RHS = I.getArgOperand(1);
+
+    // Canonicalize immediates to the RHS.
+    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+        isCommutativeIntrinsic(I))
+      std::swap(LHS, RHS);
+
+    unsigned BaseOpc, CondOpc;
+    switch (I.getIntrinsicID()) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::sadd_with_overflow:
+      BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break;
+    case Intrinsic::uadd_with_overflow:
+      BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+    case Intrinsic::ssub_with_overflow:
+      BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break;
+    case Intrinsic::usub_with_overflow:
+      BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+    case Intrinsic::smul_with_overflow:
+      BaseOpc = ISD::MUL; CondOpc = X86::SETOr; break;
+    case Intrinsic::umul_with_overflow:
+      BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+    }
+
+    unsigned LHSReg = getRegForValue(LHS);
+    if (LHSReg == 0)
       return false;
+    bool LHSIsKill = hasTrivialKill(LHS);
 
-    unsigned OpC = 0;
-    if (VT == MVT::i32)
-      OpC = X86::ADD32rr;
-    else if (VT == MVT::i64)
-      OpC = X86::ADD64rr;
-    else
+    unsigned ResultReg = 0;
+    // Check if we have an immediate version.
+    if (auto const *C = dyn_cast<ConstantInt>(RHS)) {
+      ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+                              C->getZExtValue());
+    }
+
+    unsigned RHSReg;
+    bool RHSIsKill;
+    if (!ResultReg) {
+      RHSReg = getRegForValue(RHS);
+      if (RHSReg == 0)
+        return false;
+      RHSIsKill = hasTrivialKill(RHS);
+      ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+                              RHSIsKill);
+    }
+
+    // FastISel doesn't have a pattern for X86::MUL*r. Emit it manually.
+    if (BaseOpc == X86ISD::UMUL && !ResultReg) {
+      static const unsigned MULOpc[] =
+        { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
+      static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+      // First copy the first operand into RAX, which is an implicit input to
+      // the X86::MUL*r instruction.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
+        .addReg(LHSReg, getKillRegState(LHSIsKill));
+      ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+                                 TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+    }
+
+    if (!ResultReg)
       return false;
 
-    // The call to CreateRegs builds two sequential registers, to store the
-    // both the returned values.
-    unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg)
-      .addReg(Reg1).addReg(Reg2);
-
-    unsigned Opc = X86::SETBr;
-    if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
-      Opc = X86::SETOr;
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
-            ResultReg + 1);
+    unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
+    assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
+            ResultReg2);
 
     UpdateValueMap(&I, ResultReg, 2);
     return true;
diff --git a/llvm/test/CodeGen/X86/xaluo.ll b/llvm/test/CodeGen/X86/xaluo.ll
new file mode 100644
index 0000000..a9aa8fa
--- /dev/null
+++ b/llvm/test/CodeGen/X86/xaluo.ll
@@ -0,0 +1,337 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=DAG
+; RUN: llc -mtriple=x86_64-unknown-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
+
+;
+; Get the actual value of the overflow bit.
+;
+; SADDO reg, reg
+define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) {
+entry:
+; DAG-LABEL: saddo.i8
+; DAG: addb %sil, %dil
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i8
+; FAST: addb %sil, %dil
+; FAST-NEXT: seto %al
+  %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2)
+  %val = extractvalue {i8, i1} %t, 0
+  %obit = extractvalue {i8, i1} %t, 1
+  store i8 %val, i8* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) {
+entry:
+; DAG-LABEL: saddo.i16
+; DAG: addw %si, %di
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i16
+; FAST: addw %si, %di
+; FAST-NEXT: seto %al
+  %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2)
+  %val = extractvalue {i16, i1} %t, 0
+  %obit = extractvalue {i16, i1} %t, 1
+  store i16 %val, i16* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: saddo.i32
+; DAG: addl %esi, %edi
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i32
+; FAST: addl %esi, %edi
+; FAST-NEXT: seto %al
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64
+; DAG: addq %rsi, %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i64
+; FAST: addq %rsi, %rdi
+; FAST-NEXT: seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; SADDO reg, imm | imm, reg
+; FIXME: INC isn't supported in FastISel yet
+define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm1
+; DAG: incq %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i64imm1
+; FAST: addq $1, %rdi
+; FAST-NEXT: seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; FIXME: DAG doesn't optimize immediates on the LHS.
+define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm2
+; DAG: mov
+; DAG-NEXT: addq
+; DAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm2
+; FAST: addq $1, %rdi
+; FAST-NEXT: seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 1, i64 %v1)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; Check boundary conditions for large immediates.
+define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm3
+; DAG: addq $-2147483648, %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i64imm3
+; FAST: addq $-2147483648, %rdi
+; FAST-NEXT: seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm4
+; DAG: movabsq $-21474836489, %[[REG:[a-z]+]]
+; DAG-NEXT: addq %rdi, %[[REG]]
+; DAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm4
+; FAST: movabsq $-21474836489, %[[REG:[a-z]+]]
+; FAST-NEXT: addq %rdi, %[[REG]]
+; FAST-NEXT: seto
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm5
+; DAG: addq $2147483647, %rdi
+; DAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm5
+; FAST: addq $2147483647, %rdi
+; FAST-NEXT: seto
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; TODO: FastISel shouldn't use movabsq.
+define zeroext i1 @saddo.i64imm6(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm6
+; DAG: movl $2147483648, %ecx
+; DAG: addq %rdi, %rcx
+; DAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm6
+; FAST: movabsq $2147483648, %[[REG:[a-z]+]]
+; FAST: addq %rdi, %[[REG]]
+; FAST-NEXT: seto
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; UADDO
+define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: uaddo.i32
+; DAG: addl %esi, %edi
+; DAG-NEXT: setb %al
+; FAST-LABEL: uaddo.i32
+; FAST: addl %esi, %edi
+; FAST-NEXT: setb %al
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: uaddo.i64
+; DAG: addq %rsi, %rdi
+; DAG-NEXT: setb %al
+; FAST-LABEL: uaddo.i64
+; FAST: addq %rsi, %rdi
+; FAST-NEXT: setb %al
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; SSUBO
+define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: ssubo.i32
+; DAG: subl %esi, %edi
+; DAG-NEXT: seto %al
+; FAST-LABEL: ssubo.i32
+; FAST: subl %esi, %edi
+; FAST-NEXT: seto %al
+  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: ssubo.i64
+; DAG: subq %rsi, %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: ssubo.i64
+; FAST: subq %rsi, %rdi
+; FAST-NEXT: seto %al
+  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; USUBO
+define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: usubo.i32
+; DAG: subl %esi, %edi
+; DAG-NEXT: setb %al
+; FAST-LABEL: usubo.i32
+; FAST: subl %esi, %edi
+; FAST-NEXT: setb %al
+  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: usubo.i64
+; DAG: subq %rsi, %rdi
+; DAG-NEXT: setb %al
+; FAST-LABEL: usubo.i64
+; FAST: subq %rsi, %rdi
+; FAST-NEXT: setb %al
+  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; SMULO
+define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: smulo.i32
+; DAG: imull %esi, %edi
+; DAG-NEXT: seto %al
+; FAST-LABEL: smulo.i32
+; FAST: imull %esi, %edi
+; FAST-NEXT: seto %al
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: smulo.i64
+; DAG: imulq %rsi, %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: smulo.i64
+; FAST: imulq %rsi, %rdi
+; FAST-NEXT: seto %al
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; UMULO
+define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: umulo.i32
+; DAG: mull %esi
+; DAG-NEXT: seto
+; FAST-LABEL: umulo.i32
+; FAST: mull %esi
+; FAST-NEXT: seto
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: umulo.i64
+; DAG: mulq %rsi
+; DAG-NEXT: seto
+; FAST-LABEL: umulo.i64
+; FAST: mulq %rsi
+; FAST-NEXT: seto
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+declare {i8, i1} @llvm.sadd.with.overflow.i8(i8, i8) nounwind readnone
+declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+
-- 
2.7.4