   // If this is a split VSELECT that was previously handled, do nothing.
-  if (Cond->getValueType(0).getScalarSizeInBits() != 1)
+  EVT CondVT = Cond->getValueType(0);
+  if (CondVT.getScalarSizeInBits() != 1)
     return SDValue();
   EVT VSelVT = N->getValueType(0);
     EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
     if (SetCCResVT.getScalarSizeInBits() == 1)
       return SDValue();
+  } else if (CondVT.getScalarType() == MVT::i1) {
+    // If there is support for an i1 vector mask (or only scalar i1
+    // conditions), don't touch.
+    while (TLI.getTypeAction(Ctx, CondVT) != TargetLowering::TypeLegal)
+      CondVT = TLI.getTypeToTransformTo(Ctx, CondVT);
+
+    if (CondVT.getScalarType() == MVT::i1)
+      return SDValue();
   }
   // Get the VT and operands for VSELECT, and widen if needed.
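
In effect, the new guard asks: after type legalization, will this condition still be an i1-per-lane mask? If so, the target either supports i1 vector masks directly or ends up with plain scalar i1 conditions, and widening the mask to the element width would only create nodes the target cannot select. A minimal standalone sketch of that check (the helper name is hypothetical, not from the patch; `TLI` and `Ctx` are the TargetLowering and LLVMContext already in scope in the surrounding function):

// Illustrative sketch only -- targetKeepsI1Mask is a hypothetical helper,
// not part of the patch. Assumes the usual LLVM CodeGen includes, e.g.
//   #include "llvm/CodeGen/TargetLowering.h"
// and `using namespace llvm;`.
static bool targetKeepsI1Mask(const TargetLowering &TLI, LLVMContext &Ctx,
                              EVT CondVT) {
  // Follow the same chain the type legalizer will follow: each step maps
  // an illegal type to the type it is transformed into.
  while (TLI.getTypeAction(Ctx, CondVT) != TargetLowering::TypeLegal)
    CondVT = TLI.getTypeToTransformTo(Ctx, CondVT);
  // If the fixed point still has i1 elements, the target keeps an
  // i1-per-lane mask and the widening should back off.
  return CondVT.getScalarType() == MVT::i1;
}

Walking getTypeToTransformTo to a fixed point mirrors what the type legalizer itself does, so the answer matches the mask type that actually reaches instruction selection.
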
--- /dev/null
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; Check that DAGTypeLegalizer::WidenVSELECTAndMask doesn't try to
+; create vselects with i64 condition masks.
+
+; FIXME: Should be able to avoid intermediate vselect
+; GCN-LABEL: {{^}}widen_vselect_and_mask_v4f64:
+; GCN: v_cmp_u_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
+; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]]
+; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]]
+; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}}
+define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 {
+bb:
+ %tmp = extractelement <4 x double> %arg, i64 0
+ %tmp1 = fcmp uno double %tmp, 0.000000e+00
+ %tmp2 = sext i1 %tmp1 to i64
+ %tmp3 = insertelement <4 x i64> undef, i64 %tmp2, i32 0
+ %tmp4 = insertelement <4 x i64> %tmp3, i64 undef, i32 1
+ %tmp5 = insertelement <4 x i64> %tmp4, i64 undef, i32 2
+ %tmp6 = insertelement <4 x i64> %tmp5, i64 undef, i32 3
+ %tmp7 = fcmp une <4 x double> %arg, zeroinitializer
+ %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %tmp9 = and <4 x i1> %tmp8, %tmp7
+ %tmp10 = select <4 x i1> %tmp9, <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, <4 x double> zeroinitializer
+ store <4 x double> %tmp10, <4 x double> addrspace(1)* null, align 32
+ ret void
+}
+
+; GCN-LABEL: {{^}}widen_vselect_and_mask_v4i64:
+; GCN: v_cmp_eq_u64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
+; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]]
+; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]]
+; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}}
+define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 {
+bb:
+ %tmp = extractelement <4 x i64> %arg, i64 0
+ %tmp1 = icmp eq i64 %tmp, 0
+ %tmp2 = sext i1 %tmp1 to i64
+ %tmp3 = insertelement <4 x i64> undef, i64 %tmp2, i32 0
+ %tmp4 = insertelement <4 x i64> %tmp3, i64 undef, i32 1
+ %tmp5 = insertelement <4 x i64> %tmp4, i64 undef, i32 2
+ %tmp6 = insertelement <4 x i64> %tmp5, i64 undef, i32 3
+ %tmp7 = icmp ne <4 x i64> %arg, zeroinitializer
+ %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %tmp9 = and <4 x i1> %tmp8, %tmp7
+ %tmp10 = select <4 x i1> %tmp9, <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i64> zeroinitializer
+ store <4 x i64> %tmp10, <4 x i64> addrspace(1)* null, align 32
+ ret void
+}
+
+attributes #0 = { nounwind }
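
The two kernels above pin down the failure mode named in the test header: the `and <4 x i1>` of the two compares feeds the select, and the old widening turned that mask into an integer vector as wide as the selected elements, producing a VSELECT with a v4i64 condition that AMDGPU cannot use as a per-lane mask. For illustration, a hypothetical SelectionDAG sketch of one way such a node shape could be built (`buildWideCondVSelect` is not from the patch; assumes `#include "llvm/CodeGen/SelectionDAG.h"` and `using namespace llvm;`):

// Illustrative only: the VSELECT-with-wide-condition shape that the new
// guard refuses to create when the legal mask type is still i1-per-lane.
static SDValue buildWideCondVSelect(SelectionDAG &DAG, const SDLoc &DL,
                                    SDValue Mask /*v4i1*/, SDValue LHS,
                                    SDValue RHS /*v4i64*/) {
  // Sign-extension is just one way to produce the wide mask; what matters
  // is the resulting VSELECT condition type.
  SDValue WideMask = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v4i64, Mask);
  // The condition operand is now v4i64 rather than an i1 vector mask.
  return DAG.getNode(ISD::VSELECT, DL, MVT::v4i64, WideMask, LHS, RHS);
}

The X86 diff that follows shows the payoff of leaving the i1 mask alone: the korw result feeds the vblendmpd instructions directly through k1, where the old output first expanded the mask to a vector (vpmovm2q on SKX, vpternlogq on KNL) and immediately tested it back into a mask with vptestmq.
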
; both formulations of vselect. All of this trickery is because we can't
; directly form an SDAG input to the lowering.
define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
-; CHECK-SKX-LABEL: test2:
-; CHECK-SKX: # BB#0: # %entry
-; CHECK-SKX-NEXT: vxorps %xmm6, %xmm6, %xmm6
-; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0
-; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1
-; CHECK-SKX-NEXT: korw %k1, %k0, %k0
-; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1
-; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1
-; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0
-; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1
-; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
-; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
-; CHECK-SKX-NEXT: retq
-;
-; CHECK-KNL-LABEL: test2:
-; CHECK-KNL: # BB#0: # %entry
-; CHECK-KNL-NEXT: vxorps %xmm6, %xmm6, %xmm6
-; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0
-; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1
-; CHECK-KNL-NEXT: korw %k1, %k0, %k1
-; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2
-; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
-; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
-; CHECK-KNL-NEXT: retq
+; CHECK-LABEL: test2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6
+; CHECK-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-NEXT: korw %k1, %k0, %k1
+; CHECK-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-NEXT: kshiftrw $8, %k1, %k1
+; CHECK-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-NEXT: retq
entry:
%gt.m = fcmp ogt <16 x float> %x, zeroinitializer
%lt.m = fcmp olt <16 x float> %y, zeroinitializer