From: Chandler Carruth Date: Thu, 11 May 2017 10:52:16 +0000 (+0000) Subject: [x86] Fix a failure to select with AVX-512 when the type legalizer X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=97500a9918d7de5282bc3739a2a907d8e0f50562;p=platform%2Fupstream%2Fllvm.git [x86] Fix a failure to select with AVX-512 when the type legalizer manages to form a VSELECT with a non-i1 element type condition. Those are technically allowed in SDAG (at least, the generic type legalization logic will form them and I wouldn't want to try to audit everything te preclude forming them) so we need to be able to lower them. This isn't too hard to implement. We mark VSELECT as custom so we get a chance in C++, add a fast path for i1 conditions to get directly handled by the patterns, and a fallback when we need to manually force the condition to be an i1 that uses the vptestm instruction to turn a non-mask into a mask. This, unsurprisingly, generates awful code. But it at least doesn't crash. This was actually impacting open source packages built with LLVM for AVX-512 in the wild, so quickly landing a patch that at least stops the immediate bleeding. I think I've found where to fix the codegen quality issue, but less confident of that change so separating it out from the thing that doesn't change the result of any existing test case but causes mine to not crash. llvm-svn: 302785 --- diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9cac59b..607ba9b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1381,7 +1381,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); @@ -1445,8 +1445,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); @@ -1479,7 +1477,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); @@ -13817,6 +13815,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); + // If this VSELECT has a vector if i1 as a mask, it will be directly matched + // with patterns on the mask registers on AVX-512. + if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) + return Op; + // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) @@ -13826,10 +13829,31 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget.hasSSE41()) return SDValue(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition + // into an i1 condition so that we can use the mask-based 512-bit blend + // instructions. + if (VT.getSizeInBits() == 512) { + SDValue Cond = Op.getOperand(0); + // The vNi1 condition case should be handled above as it can be trivially + // lowered. + assert(Cond.getValueType().getScalarSizeInBits() == + VT.getScalarSizeInBits() && + "Should have a size-matched integer condition!"); + // Build a mask by testing the condition against itself (tests for zero). + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond); + // Now return a new VSELECT using the mask. + return DAG.getNode(ISD::VSELECT, dl, VT, Mask, Op.getOperand(1), + Op.getOperand(2)); + } + // Only some types will be legal on some subtargets. If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. - switch (Op.getSimpleValueType().SimpleTy) { + switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; diff --git a/llvm/test/CodeGen/X86/avx512-vselect.ll b/llvm/test/CodeGen/X86/avx512-vselect.ll new file mode 100644 index 0000000..1940864 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-vselect.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mcpu=skx | FileCheck %s --check-prefixes=CHECK,CHECK-SKX +; RUN: llc < %s -mcpu=knl | FileCheck %s --check-prefixes=CHECK,CHECK-KNL + +target triple = "x86_64-unknown-unknown" + +define <8 x i64> @test1(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test1: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0 +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1 +; CHECK-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq +entry: + %m.trunc = trunc <8 x i64> %m to <8 x i1> + %ret = select <8 x i1> %m.trunc, <8 x i64> %a, <8 x i64> %b + ret <8 x i64> %ret +} + +; This is a very contrived test case to trick the legalizer into splitting the +; v16i1 masks in the select during type legalization, and in so doing extend them +; into two v8i64 types. This lets us ensure that the lowering code can handle +; both formulations of vselect. All of this trickery is because we can't +; directly form an SDAG input to the lowering. +define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) { +; CHECK-SKX-LABEL: test2: +; CHECK-SKX: # BB#0: # %entry +; CHECK-SKX-NEXT: vxorps %zmm6, %zmm6, %zmm6 +; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0 +; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1 +; CHECK-SKX-NEXT: korw %k1, %k0, %k0 +; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1 +; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0 +; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1 +; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1} +; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test2: +; CHECK-KNL: # BB#0: # %entry +; CHECK-KNL-NEXT: vpxord %zmm6, %zmm6, %zmm6 +; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0 +; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1 +; CHECK-KNL-NEXT: korw %k1, %k0, %k1 +; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2 +; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1} +; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq +entry: + %gt.m = fcmp ogt <16 x float> %x, zeroinitializer + %lt.m = fcmp olt <16 x float> %y, zeroinitializer + %m.or = or <16 x i1> %gt.m, %lt.m + %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b + ret <16 x double> %ret +}