let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
+// Vector load.
+def int_ppc_vsx_lxvw4x :
+ Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+def int_ppc_vsx_lxvd2x :
+ Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+
+// Vector store.
+def int_ppc_vsx_stxvw4x :
+ Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+def int_ppc_vsx_stxvd2x :
+ Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+
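For reference, the four intrinsics defined above take an opaque i8* pointer and surface in IR as the following declarations (these also appear verbatim in the instcombine test at the end of this patch):

  declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
  declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
  declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
  declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)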
// Vector maximum.
def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">;
def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">;
default: return false;
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
+ case Intrinsic::ppc_vsx_lxvw4x:
VT = MVT::v4i32;
break;
+ case Intrinsic::ppc_vsx_lxvd2x:
+ VT = MVT::v2f64;
+ break;
case Intrinsic::ppc_altivec_lvebx:
VT = MVT::i8;
break;
default: return false;
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
+ case Intrinsic::ppc_vsx_stxvw4x:
VT = MVT::v4i32;
break;
+ case Intrinsic::ppc_vsx_stxvd2x:
+ VT = MVT::v2f64;
+ break;
case Intrinsic::ppc_altivec_stvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
case Intrinsic::ppc_altivec_lvehx:
- case Intrinsic::ppc_altivec_lvewx: {
+ case Intrinsic::ppc_altivec_lvewx:
+ case Intrinsic::ppc_vsx_lxvd2x:
+ case Intrinsic::ppc_vsx_lxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_lvebx:
case Intrinsic::ppc_altivec_lvewx:
VT = MVT::i32;
break;
+ case Intrinsic::ppc_vsx_lxvd2x:
+ VT = MVT::v2f64;
+ break;
default:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_altivec_stvebx:
case Intrinsic::ppc_altivec_stvehx:
- case Intrinsic::ppc_altivec_stvewx: {
+ case Intrinsic::ppc_altivec_stvewx:
+ case Intrinsic::ppc_vsx_stxvd2x:
+ case Intrinsic::ppc_vsx_stxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_stvebx:
case Intrinsic::ppc_altivec_stvewx:
VT = MVT::i32;
break;
+ case Intrinsic::ppc_vsx_stxvd2x:
+ VT = MVT::v2f64;
+ break;
default:
VT = MVT::v4i32;
break;
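These VT switches feed PPCTargetLowering::getTgtMemIntrinsic, which describes each memory intrinsic to the SelectionDAG builder. The surrounding code is elided from the hunk; as a rough sketch based on the existing Altivec handling in this function (the conservative offset/size fields account for the legacy lvx/stvx forms ignoring the low-order address bits), the load case fills in something like:

  // Sketch only; not part of this hunk.
  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.memVT = VT;
  Info.ptrVal = I.getArgOperand(0);
  Info.offset = -VT.getStoreSize()+1;
  Info.size = 2*VT.getStoreSize()-1;
  Info.align = 1;
  Info.vol = false;
  Info.readMem = true;
  Info.writeMem = false;
  return true;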
def LXVD2X : XX1Form<31, 844,
(outs vsrc:$XT), (ins memrr:$src),
"lxvd2x $XT, $src", IIC_LdStLFD,
- [(set v2f64:$XT, (load xoaddr:$src))]>;
+ [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
def LXVDSX : XX1Form<31, 332,
(outs vsrc:$XT), (ins memrr:$src),
def LXVW4X : XX1Form<31, 780,
(outs vsrc:$XT), (ins memrr:$src),
"lxvw4x $XT, $src", IIC_LdStLFD,
- [(set v4i32:$XT, (load xoaddr:$src))]>;
+ [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
}
// Store indexed instructions
def STXVD2X : XX1Form<31, 972,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvd2x $XT, $dst", IIC_LdStSTFD,
- [(store v2f64:$XT, xoaddr:$dst)]>;
+ [(int_ppc_vsx_stxvd2x v2f64:$XT, xoaddr:$dst)]>;
def STXVW4X : XX1Form<31, 908,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvw4x $XT, $dst", IIC_LdStSTFD,
- [(store v4i32:$XT, xoaddr:$dst)]>;
+ [(int_ppc_vsx_stxvw4x v4i32:$XT, xoaddr:$dst)]>;
}
// Add/Mul Instructions
(XVCVSXWDP (XXSLDWI $C, $C, 1))>;
// Loads.
+def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
// Stores.
-def : Pat<(store v4i32:$rS, xoaddr:$dst),
- (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
// Selects.
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
return new LoadInst(Ptr);
}
break;
+ case Intrinsic::ppc_vsx_lxvw4x:
+ case Intrinsic::ppc_vsx_lxvd2x: {
+ // Turn PPC VSX loads into normal loads.
+ Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+ PointerType::getUnqual(II->getType()));
+ return new LoadInst(Ptr, Twine(""), false, 1);
+ }
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
// Turn stvx -> store if the pointer is known aligned.
return new StoreInst(II->getArgOperand(0), Ptr);
}
break;
+ case Intrinsic::ppc_vsx_stxvw4x:
+ case Intrinsic::ppc_vsx_stxvd2x: {
+ // Turn PPC VSX stores into normal stores.
+ Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
+ Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
+ }
case Intrinsic::x86_sse_storeu_ps:
case Intrinsic::x86_sse2_storeu_pd:
case Intrinsic::x86_sse2_storeu_dq:
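The effect of this fold, which the instcombine test at the end of the patch checks: a VSX load or store intrinsic through an i8* becomes an ordinary vector load or store with alignment 1, which is safe because the VSX instructions tolerate unaligned addresses. Sketched in IR, with hypothetical values %p and %v:

  %v = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %p)

becomes

  %q = bitcast i8* %p to <2 x double>*
  %v = load <2 x double>* %q, align 1

and stxvd2x likewise becomes an align-1 store through a bitcast pointer.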
store <2 x double> %1, <2 x double>* %arrayidx3, align 8
ret void
+; Note: There is some unavoidable variability in this variant. If the
+; FMAs are reordered differently, the algorithm can pick a different
+; multiplicand to destroy, changing the register assignment. There isn't
+; a good way to express this possibility, so hopefully this doesn't change
+; too often.
+
; CHECK-LABEL: @testv3
; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
-; CHECK-DAG: xvmaddmdp 37, 35, 34
; CHECK-DAG: li [[C1:[0-9]+]], 48
; CHECK-DAG: li [[C2:[0-9]+]], 32
-; CHECK-DAG: xvmaddadp 34, 35, 38
+; CHECK-DAG: xvmaddmdp 37, 35, 34
; CHECK-DAG: li [[C3:[0-9]+]], 16
; Note: We could convert this next FMA to M-type as well, but it would require
; re-ordering the instructions.
; CHECK-DAG: xvmaddadp [[V1]], 35, 36
-; CHECK-DAG: xvmaddmdp 35, 36, 37
+; CHECK-DAG: xvmaddmdp 36, 35, 37
+; CHECK-DAG: xvmaddadp 34, 35, 38
; CHECK-DAG: stxvd2x 32, 0, 3
-; CHECK-DAG: stxvd2x 35, 3, [[C1]]
+; CHECK-DAG: stxvd2x 36, 3, [[C1]]
; CHECK-DAG: stxvd2x 34, 3, [[C2]]
; CHECK-DAG: stxvd2x 37, 3, [[C3]]
; CHECK: blr
--- /dev/null
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64-unknown-linux-gnu < %s > %t
+; RUN: grep lxvw4x < %t | count 3
+; RUN: grep lxvd2x < %t | count 3
+; RUN: grep stxvw4x < %t | count 3
+; RUN: grep stxvd2x < %t | count 3
+
+@vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
+@vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vsll = global <2 x i64> <i64 255, i64 -937>, align 16
+@vull = global <2 x i64> <i64 1447, i64 2894>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@res_vsi = common global <4 x i32> zeroinitializer, align 16
+@res_vui = common global <4 x i32> zeroinitializer, align 16
+@res_vf = common global <4 x float> zeroinitializer, align 16
+@res_vsll = common global <2 x i64> zeroinitializer, align 16
+@res_vull = common global <2 x i64> zeroinitializer, align 16
+@res_vd = common global <2 x double> zeroinitializer, align 16
+
+; Function Attrs: nounwind
+define void @test1() {
+entry:
+ %0 = load <4 x i32>* @vsi, align 16
+ %1 = load <4 x i32>* @vui, align 16
+ %2 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 16
+ %3 = load <2 x double>* bitcast (<2 x i64>* @vsll to <2 x double>*), align 16
+ %4 = load <2 x double>* bitcast (<2 x i64>* @vull to <2 x double>*), align 16
+ %5 = load <2 x double>* @vd, align 16
+ store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
+ store <4 x i32> %1, <4 x i32>* @res_vui, align 16
+ store <4 x i32> %2, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 16
+ store <2 x double> %3, <2 x double>* bitcast (<2 x i64>* @res_vsll to <2 x double>*), align 16
+ store <2 x double> %4, <2 x double>* bitcast (<2 x i64>* @res_vull to <2 x double>*), align 16
+ store <2 x double> %5, <2 x double>* @res_vd, align 16
+ ret void
+}
--- /dev/null
+; Verify that we can create unaligned loads and stores from VSX intrinsics.
+
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+@vf = common global <4 x float> zeroinitializer, align 1
+@res_vf = common global <4 x float> zeroinitializer, align 1
+@vd = common global <2 x double> zeroinitializer, align 1
+@res_vd = common global <2 x double> zeroinitializer, align 1
+
+define void @test1() {
+entry:
+ %t1 = alloca <4 x float>*, align 8
+ %t2 = alloca <2 x double>*, align 8
+ store <4 x float>* @vf, <4 x float>** %t1, align 8
+ %0 = load <4 x float>** %t1, align 8
+ %1 = bitcast <4 x float>* %0 to i8*
+ %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %1)
+ store <4 x float>* @res_vf, <4 x float>** %t1, align 8
+ %3 = load <4 x float>** %t1, align 8
+ %4 = bitcast <4 x float>* %3 to i8*
+ call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %2, i8* %4)
+ store <2 x double>* @vd, <2 x double>** %t2, align 8
+ %5 = load <2 x double>** %t2, align 8
+ %6 = bitcast <2 x double>* %5 to i8*
+ %7 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %6)
+ store <2 x double>* @res_vd, <2 x double>** %t2, align 8
+ %8 = load <2 x double>** %t2, align 8
+ %9 = bitcast <2 x double>* %8 to i8*
+ call void @llvm.ppc.vsx.stxvd2x(<2 x double> %7, i8* %9)
+ ret void
+}
+
+; CHECK-LABEL: @test1
+; CHECK: %0 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 1
+; CHECK: store <4 x i32> %0, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 1
+; CHECK: %1 = load <2 x double>* @vd, align 1
+; CHECK: store <2 x double> %1, <2 x double>* @res_vd, align 1
+
+declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
+declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
+declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
+declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)