[ARM] Enable SMLAW[B|T] and SMULW[B|T] instruction selection

author Sam Parker <sam.parker@arm.com>

Fri, 8 Apr 2016 16:02:53 +0000 (16:02 +0000)

committer Sam Parker <sam.parker@arm.com>

Fri, 8 Apr 2016 16:02:53 +0000 (16:02 +0000)
author Sam Parker <sam.parker@arm.com>
Fri, 8 Apr 2016 16:02:53 +0000 (16:02 +0000)
committer Sam Parker <sam.parker@arm.com>
Fri, 8 Apr 2016 16:02:53 +0000 (16:02 +0000)
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp

index afeee04..b4a438b 100644 (file)
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -252,6 +252,8 @@ private:
  
    SDNode *SelectConcatVector(SDNode *N);
  
+  SDNode *SelectSMLAWSMULW(SDNode *N);
+
    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
    /// inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
@@ -2467,6 +2469,135 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
    return nullptr;
  }
  
+static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
+                                 bool Accumulate) { // On a match sets *Opc and Src1 and returns true; otherwise returns false.
+  // For SM*WB, we need some form of sext: an i32 SIGN_EXTEND, SIGN_EXTEND_INREG or AssertSext.
+  // For SM*WT, we need to search for (sra X, 16).
+  // Src1 then gets set to X, i.e. operand 0 of the matched extension or shift.
+  if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
+       SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
+       SignExt.getOpcode() == ISD::AssertSext) &&
+       SignExt.getValueType() == MVT::i32) {
+
+    *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB; // Accumulate picks the SMLAW* form over SMULW*.
+    Src1 = SignExt.getOperand(0);
+    return true;
+  }
+
+  if (SignExt.getOpcode() != ISD::SRA)
+    return false;
+
+  ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
+  if (!SRASrc1 || SRASrc1->getZExtValue() != 16) // The shift amount must be the constant 16.
+    return false;
+
+  SDValue Op0 = SignExt.getOperand(0);
+
+  // The sign-extended operand for SM*WB could also be written as (sra (shl X, 16), 16).
+  if (Op0.getOpcode() == ISD::SHL) {
+    SDValue SHL = Op0;
+    ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+    if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
+      return false; // A shl by any other amount is rejected outright, not treated as SM*WT.
+
+    *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+    Src1 = Op0.getOperand(0);
+    return true;
+  }
+  *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT; // Any other (sra X, 16) selects the T variant.
+  Src1 = SignExt.getOperand(0);
+  return true;
+}
+
+static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
+                                SDValue &Src1, bool Accumulate) { // Returns true and sets *Opc, Src0 and Src1 on a match.
+  // First we look for the following (when accumulating, the caller passes an operand of the add):
+  // (or (srl ?, 16), (shl ?, 16))
+  if (OR.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue SRL = OR.getOperand(0);
+  SDValue SHL = OR.getOperand(1);
+
+  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { // Retry with the two operands of the or swapped.
+    SRL = OR.getOperand(1);
+    SHL = OR.getOperand(0);
+    if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
+      return false;
+  }
+
+  ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
+  ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+  if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
+      SHLSrc1->getZExtValue() != 16)
+    return false;
+
+  // The first operands to the shifts need to be the two results from the
+  // same smul_lohi node: result 0 feeding the srl and result 1 feeding the shl.
+  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+    return false;
+
+  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+    return false;
+
+  // Now we have:
+  // (or (srl (smul_lohi ?, ?):0, 16), (shl (smul_lohi ?, ?):1, 16))
+  // For SMLAW[B|T] smul_lohi will take one 32-bit and one 16-bit argument.
+  // For SMLAWB the 16-bit value will be sign extended somehow.
+  // For SMLAWT only the SRA is required.
+
+  // Check both operands of SMUL_LOHI: the one matching the 16-bit pattern yields Src1, the other becomes Src0.
+  if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
+    Src0 = SMULLOHI->getOperand(1);
+  } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
+                                  Accumulate)) {
+    Src0 = SMULLOHI->getOperand(0);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+SDNode *ARMDAGToDAGISel::SelectSMLAWSMULW(SDNode *N) { // Try to select N (an ADD or OR) as SMLAW[B|T] / SMULW[B|T]; nullptr if no match.
+  SDLoc dl(N);
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+  SDValue A, B; // Filled in by SearchSignedMulLong: A is its Src0, B is its Src1.
+  unsigned Opc = 0;
+
+  if (N->getOpcode() == ISD::ADD) {
+    if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
+      return nullptr; // Neither addend is an OR, so the multiply pattern cannot be present.
+
+    SDValue Acc; // The addend that is not the multiply pattern becomes the accumulator.
+    if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
+      Acc = Src1;
+    } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
+      Acc = Src0;
+    } else {
+      return nullptr;
+    }
+    if (Opc == 0) // Defensive: Opc starts at 0 and is expected to be set by a successful search.
+      return nullptr;
+
+    SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl), // NOTE(review): last two operands look like the predicate (condition + register) — confirm.
+                      CurDAG->getRegister(0, MVT::i32) };
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops); // Accumulating form is given an extra MVT::Other result.
+  } else if (N->getOpcode() == ISD::OR &&
+             SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) { // No add: N itself is the or, so select the SMULW* form.
+    if (Opc == 0)
+      return nullptr;
+
+    SDValue Ops[] = { A, B, getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32)};
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+  }
+  return nullptr;
+}
+
  SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
    // The only time a CONCAT_VECTORS operation can have legal types is when
    // two 64-bit vectors are concatenated to a 128-bit vector.
@@ -2486,6 +2617,13 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
  
    switch (N->getOpcode()) {
    default: break;
+  case ISD::ADD:
+  case ISD::OR: {
+    SDNode *ResNode = SelectSMLAWSMULW(N);
+    if (ResNode)
+      return ResNode;
+    break;
+  }
    case ISD::WRITE_REGISTER: {
      SDNode *ResNode = SelectWriteRegister(N);
      if (ResNode)
diff --git a/llvm/test/CodeGen/ARM/smul.ll b/llvm/test/CodeGen/ARM/smul.ll

index 13873f5..f5d21ac 100644 (file)
--- a/llvm/test/CodeGen/ARM/smul.ll
+++ b/llvm/test/CodeGen/ARM/smul.ll
@@ -1,5 +1,6 @@
  ; RUN: llc -mtriple=arm-eabi -mcpu=generic %s -o /dev/null
  ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
  
  @x = weak global i16 0          ; <i16*> [#uses=1]
  @y = weak global i16 0          ; <i16*> [#uses=0]
@@ -34,3 +35,107 @@ define i32 @f3(i32 %a, i16 %x, i32 %y) {
          ret i32 %tmp5
  }
  
+define i32 @f4(i32 %a, i32 %x, i32 %y) {
+; CHECK-LABEL: f4
+; CHECK: smlatt
+        %tmp1 = ashr i32 %x, 16
+        %tmp3 = ashr i32 %y, 16
+        %tmp4 = mul i32 %tmp3, %tmp1
+        %tmp5 = add i32 %tmp4, %a
+        ret i32 %tmp5
+}
+
+define i32 @f5(i32 %a, i16 %x, i16 %y) {
+; CHECK-LABEL: f5
+; CHECK: smlabb
+        %tmp1 = sext i16 %x to i32
+        %tmp3 = sext i16 %y to i32
+        %tmp4 = mul i32 %tmp3, %tmp1
+        %tmp5 = add i32 %tmp4, %a
+        ret i32 %tmp5
+}
+
+define i32 @f6(i32 %a, i16 %x, i32 %y) {
+; CHECK-LABEL: f6
+; CHECK: smlabt
+        %tmp1 = sext i16 %x to i32
+        %tmp3 = ashr i32 %y, 16
+        %tmp4 = mul i32 %tmp3, %tmp1
+        %tmp5 = add i32 %tmp4, %a
+        ret i32 %tmp5
+}
+
+define i32 @f7(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f7
+; CHECK: smlawb
+        %shl = shl i32 %b, 16
+        %shr = ashr exact i32 %shl, 16
+        %conv = sext i32 %a to i64
+        %conv2 = sext i32 %shr to i64
+        %mul = mul nsw i64 %conv2, %conv
+        %shr49 = lshr i64 %mul, 16
+        %conv5 = trunc i64 %shr49 to i32
+        %add = add nsw i32 %conv5, %c
+        ret i32 %add
+}
+
+define i32 @f8(i32 %a, i16 signext %b, i32 %c) {
+; CHECK-LABEL: f8
+; CHECK: smlawb
+        %conv = sext i32 %a to i64
+        %conv1 = sext i16 %b to i64
+        %mul = mul nsw i64 %conv1, %conv
+        %shr5 = lshr i64 %mul, 16
+        %conv2 = trunc i64 %shr5 to i32
+        %add = add nsw i32 %conv2, %c
+        ret i32 %add
+}
+
+define i32 @f9(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f9
+; CHECK: smlawt
+        %conv = sext i32 %a to i64
+        %shr = ashr i32 %b, 16
+        %conv1 = sext i32 %shr to i64
+        %mul = mul nsw i64 %conv1, %conv
+        %shr26 = lshr i64 %mul, 16
+        %conv3 = trunc i64 %shr26 to i32
+        %add = add nsw i32 %conv3, %c
+        ret i32 %add
+}
+
+define i32 @f10(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f10
+; CHECK: smulwb
+        %shl = shl i32 %b, 16
+        %shr = ashr exact i32 %shl, 16
+        %conv = sext i32 %a to i64
+        %conv2 = sext i32 %shr to i64
+        %mul = mul nsw i64 %conv2, %conv
+        %shr37 = lshr i64 %mul, 16
+        %conv4 = trunc i64 %shr37 to i32
+        ret i32 %conv4
+}
+
+define i32 @f11(i32 %a, i16 signext %b, i32 %c) {
+; CHECK-LABEL: f11
+; CHECK: smulwb
+        %conv = sext i32 %a to i64
+        %conv1 = sext i16 %b to i64
+        %mul = mul nsw i64 %conv1, %conv
+        %shr4 = lshr i64 %mul, 16
+        %conv2 = trunc i64 %shr4 to i32
+        ret i32 %conv2
+}
+
+define i32 @f12(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f12
+; CHECK: smulwt
+        %conv = sext i32 %a to i64
+        %shr = ashr i32 %b, 16
+        %conv1 = sext i32 %shr to i64
+        %mul = mul nsw i64 %conv1, %conv
+        %shr25 = lshr i64 %mul, 16
+        %conv3 = trunc i64 %shr25 to i32
+        ret i32 %conv3
+}
diff --git a/llvm/test/CodeGen/ARM/smulw.ll b/llvm/test/CodeGen/ARM/smulw.ll

deleted file mode 100644 (file)

index 8653903..0000000
--- a/llvm/test/CodeGen/ARM/smulw.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc -mtriple=arm--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumb--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
-
-; We cannot codegen the smulw[bt] or smlaw[bt] instructions for these functions,
-; as the top 16 bits of the result would differ
-
-define i32 @f1(i32 %a, i16 %b) {
-; CHECK-LABEL: f1:
-; CHECK: mul
-; CHECK: asr
-  %tmp1 = sext i16 %b to i32
-  %tmp2 = mul i32 %a, %tmp1
-  %tmp3 = ashr i32 %tmp2, 16
-  ret i32 %tmp3
-}
-
-define i32 @f2(i32 %a, i16 %b, i32 %c) {
-; CHECK-LABEL: f2:
-; CHECK: mul
-; CHECK: add{{.*}}, asr #16
-  %tmp1 = sext i16 %b to i32
-  %tmp2 = mul i32 %a, %tmp1
-  %tmp3 = ashr i32 %tmp2, 16
-  %tmp4 = add i32 %tmp3, %c
-  ret i32 %tmp4
-}
author	Sam Parker <sam.parker@arm.com>
	Fri, 8 Apr 2016 16:02:53 +0000 (16:02 +0000)
committer	Sam Parker <sam.parker@arm.com>
	Fri, 8 Apr 2016 16:02:53 +0000 (16:02 +0000)
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp		patch \| blob \| history
llvm/test/CodeGen/ARM/smul.ll		patch \| blob \| history
llvm/test/CodeGen/ARM/smulw.ll	[deleted file]	patch \| blob \| history