let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
+// Vector load.
+def int_ppc_vsx_lxvw4x :
+ Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+def int_ppc_vsx_lxvd2x :
+ Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+
+// Vector store.
+def int_ppc_vsx_stxvw4x :
+ Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+def int_ppc_vsx_stxvd2x :
+ Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+
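For reference, the four intrinsics defined above take an opaque i8* pointer and surface in IR as the following declarations (these also appear verbatim in the instcombine test at the end of this patch):

  declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
  declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
  declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
  declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)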
// Vector maximum.
def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">;
def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">;
default: return false;
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
+ case Intrinsic::ppc_vsx_lxvw4x:
VT = MVT::v4i32;
break;
+ case Intrinsic::ppc_vsx_lxvd2x:
+ VT = MVT::v2f64;
+ break;
case Intrinsic::ppc_altivec_lvebx:
VT = MVT::i8;
break;
default: return false;
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
+ case Intrinsic::ppc_vsx_stxvw4x:
VT = MVT::v4i32;
break;
+ case Intrinsic::ppc_vsx_stxvd2x:
+ VT = MVT::v2f64;
+ break;
case Intrinsic::ppc_altivec_stvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
case Intrinsic::ppc_altivec_lvehx:
- case Intrinsic::ppc_altivec_lvewx: {
+ case Intrinsic::ppc_altivec_lvewx:
+ case Intrinsic::ppc_vsx_lxvd2x:
+ case Intrinsic::ppc_vsx_lxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_lvebx:
case Intrinsic::ppc_altivec_lvewx:
VT = MVT::i32;
break;
+ case Intrinsic::ppc_vsx_lxvd2x:
+ VT = MVT::v2f64;
+ break;
default:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_altivec_stvebx:
case Intrinsic::ppc_altivec_stvehx:
- case Intrinsic::ppc_altivec_stvewx: {
+ case Intrinsic::ppc_altivec_stvewx:
+ case Intrinsic::ppc_vsx_stxvd2x:
+ case Intrinsic::ppc_vsx_stxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_stvebx:
case Intrinsic::ppc_altivec_stvewx:
VT = MVT::i32;
break;
+ case Intrinsic::ppc_vsx_stxvd2x:
+ VT = MVT::v2f64;
+ break;
default:
VT = MVT::v4i32;
break;
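These VT switches feed PPCTargetLowering::getTgtMemIntrinsic, which describes each memory intrinsic to the SelectionDAG builder. The surrounding code is elided from the hunk; as a rough sketch based on the existing Altivec handling in this function (the conservative offset/size fields account for the legacy lvx/stvx forms ignoring the low-order address bits), the load case fills in something like:

  // Sketch only; not part of this hunk.
  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.memVT = VT;
  Info.ptrVal = I.getArgOperand(0);
  Info.offset = -VT.getStoreSize()+1;
  Info.size = 2*VT.getStoreSize()-1;
  Info.align = 1;
  Info.vol = false;
  Info.readMem = true;
  Info.writeMem = false;
  return true;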
def LXVD2X : XX1Form<31, 844,
(outs vsrc:$XT), (ins memrr:$src),
"lxvd2x $XT, $src", IIC_LdStLFD,
- [(set v2f64:$XT, (load xoaddr:$src))]>;
+ [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
def LXVDSX : XX1Form<31, 332,
(outs vsrc:$XT), (ins memrr:$src),
def LXVW4X : XX1Form<31, 780,
(outs vsrc:$XT), (ins memrr:$src),
"lxvw4x $XT, $src", IIC_LdStLFD,
- [(set v4i32:$XT, (load xoaddr:$src))]>;
+ [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
}
// Store indexed instructions
def STXVD2X : XX1Form<31, 972,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvd2x $XT, $dst", IIC_LdStSTFD,
- [(store v2f64:$XT, xoaddr:$dst)]>;
+ [(int_ppc_vsx_stxvd2x v2f64:$XT, xoaddr:$dst)]>;
def STXVW4X : XX1Form<31, 908,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvw4x $XT, $dst", IIC_LdStSTFD,
- [(store v4i32:$XT, xoaddr:$dst)]>;
+ [(int_ppc_vsx_stxvw4x v4i32:$XT, xoaddr:$dst)]>;
}
// Add/Mul Instructions
(XVCVSXWDP (XXSLDWI $C, $C, 1))>;
// Loads.
+def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
// Stores.
-def : Pat<(store v4i32:$rS, xoaddr:$dst),
- (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
// Selects.
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
return new LoadInst(Ptr);
}
break;
+ case Intrinsic::ppc_vsx_lxvw4x:
+ case Intrinsic::ppc_vsx_lxvd2x: {
+ // Turn PPC VSX loads into normal loads.
+ Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+ PointerType::getUnqual(II->getType()));
+ return new LoadInst(Ptr, Twine(""), false, 1);
+ }
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
// Turn stvx -> store if the pointer is known aligned.
return new StoreInst(II->getArgOperand(0), Ptr);
}
break;
+ case Intrinsic::ppc_vsx_stxvw4x:
+ case Intrinsic::ppc_vsx_stxvd2x: {
+ // Turn PPC VSX stores into normal stores.
+ Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
+ Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
+ }
case Intrinsic::x86_sse_storeu_ps:
case Intrinsic::x86_sse2_storeu_pd:
case Intrinsic::x86_sse2_storeu_dq:
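The effect of this fold, which the instcombine test at the end of the patch checks: a VSX load or store intrinsic through an i8* becomes an ordinary vector load or store with alignment 1, which is safe because the VSX instructions tolerate unaligned addresses. Sketched in IR, with hypothetical values %p and %v:

  %v = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %p)

becomes

  %q = bitcast i8* %p to <2 x double>*
  %v = load <2 x double>* %q, align 1

and stxvd2x likewise becomes an align-1 store through a bitcast pointer.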
store <2 x double> %1, <2 x double>* %arrayidx3, align 8
ret void
+; Note: There is some unavoidable variability in this variant. If the
+; FMAs are reordered differently, the algorithm can pick a different
+; multiplicand to destroy, changing the register assignment. There isn't
+; a good way to express this possibility, so hopefully this doesn't change
+; too often.
+
; CHECK-LABEL: @testv3
; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
-; CHECK-DAG: xvmaddmdp 37, 35, 34
; CHECK-DAG: li [[C1:[0-9]+]], 48
; CHECK-DAG: li [[C2:[0-9]+]], 32
-; CHECK-DAG: xvmaddadp 34, 35, 38
+; CHECK-DAG: xvmaddmdp 37, 35, 34
; CHECK-DAG: li [[C3:[0-9]+]], 16
; Note: We could convert this next FMA to M-type as well, but it would require
; re-ordering the instructions.
; CHECK-DAG: xvmaddadp [[V1]], 35, 36
-; CHECK-DAG: xvmaddmdp 35, 36, 37
+; CHECK-DAG: xvmaddmdp 36, 35, 37
+; CHECK-DAG: xvmaddadp 34, 35, 38
; CHECK-DAG: stxvd2x 32, 0, 3
-; CHECK-DAG: stxvd2x 35, 3, [[C1]]
+; CHECK-DAG: stxvd2x 36, 3, [[C1]]
; CHECK-DAG: stxvd2x 34, 3, [[C2]]
; CHECK-DAG: stxvd2x 37, 3, [[C3]]
; CHECK: blr
--- /dev/null
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64-unknown-linux-gnu < %s > %t
+; RUN: grep lxvw4x < %t | count 3
+; RUN: grep lxvd2x < %t | count 3
+; RUN: grep stxvw4x < %t | count 3
+; RUN: grep stxvd2x < %t | count 3
+
+@vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
+@vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vsll = global <2 x i64> <i64 255, i64 -937>, align 16
+@vull = global <2 x i64> <i64 1447, i64 2894>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@res_vsi = common global <4 x i32> zeroinitializer, align 16
+@res_vui = common global <4 x i32> zeroinitializer, align 16
+@res_vf = common global <4 x float> zeroinitializer, align 16
+@res_vsll = common global <2 x i64> zeroinitializer, align 16
+@res_vull = common global <2 x i64> zeroinitializer, align 16
+@res_vd = common global <2 x double> zeroinitializer, align 16
+
+; Function Attrs: nounwind
+define void @test1() {
+entry:
+ %0 = load <4 x i32>* @vsi, align 16
+ %1 = load <4 x i32>* @vui, align 16
+ %2 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 16
+ %3 = load <2 x double>* bitcast (<2 x i64>* @vsll to <2 x double>*), align 16
+ %4 = load <2 x double>* bitcast (<2 x i64>* @vull to <2 x double>*), align 16
+ %5 = load <2 x double>* @vd, align 16
+ store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
+ store <4 x i32> %1, <4 x i32>* @res_vui, align 16
+ store <4 x i32> %2, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 16
+ store <2 x double> %3, <2 x double>* bitcast (<2 x i64>* @res_vsll to <2 x double>*), align 16
+ store <2 x double> %4, <2 x double>* bitcast (<2 x i64>* @res_vull to <2 x double>*), align 16
+ store <2 x double> %5, <2 x double>* @res_vd, align 16
+ ret void
+}
--- /dev/null
+; Verify that we can create unaligned loads and stores from VSX intrinsics.
+
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+@vf = common global <4 x float> zeroinitializer, align 1
+@res_vf = common global <4 x float> zeroinitializer, align 1
+@vd = common global <2 x double> zeroinitializer, align 1
+@res_vd = common global <2 x double> zeroinitializer, align 1
+
+define void @test1() {
+entry:
+ %t1 = alloca <4 x float>*, align 8
+ %t2 = alloca <2 x double>*, align 8
+ store <4 x float>* @vf, <4 x float>** %t1, align 8
+ %0 = load <4 x float>** %t1, align 8
+ %1 = bitcast <4 x float>* %0 to i8*
+ %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %1)
+ store <4 x float>* @res_vf, <4 x float>** %t1, align 8
+ %3 = load <4 x float>** %t1, align 8
+ %4 = bitcast <4 x float>* %3 to i8*
+ call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %2, i8* %4)
+ store <2 x double>* @vd, <2 x double>** %t2, align 8
+ %5 = load <2 x double>** %t2, align 8
+ %6 = bitcast <2 x double>* %5 to i8*
+ %7 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %6)
+ store <2 x double>* @res_vd, <2 x double>** %t2, align 8
+ %8 = load <2 x double>** %t2, align 8
+ %9 = bitcast <2 x double>* %8 to i8*
+ call void @llvm.ppc.vsx.stxvd2x(<2 x double> %7, i8* %9)
+ ret void
+}
+
+; CHECK-LABEL: @test1
+; CHECK: %0 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 1
+; CHECK: store <4 x i32> %0, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 1
+; CHECK: %1 = load <2 x double>* @vd, align 1
+; CHECK: store <2 x double> %1, <2 x double>* @res_vd, align 1
+
+declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
+declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
+declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
+declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)