From: Craig Topper Date: Sat, 22 Feb 2020 01:56:35 +0000 (-0800) Subject: [X86] Teach combineCVTPH2PS to shrink v8i16 loads when the output type is v4f32.... X-Git-Tag: llvmorg-12-init~13904 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=228a2bc9b70c3d93bd28f0038a8664ef8dac042e;p=platform%2Fupstream%2Fllvm.git [X86] Teach combineCVTPH2PS to shrink v8i16 loads when the output type is v4f32. Remove extra isel patterns. Similar to what we do for other operations that use a subset of bits. This allows us to remove an isel pattern that shrank a load, which was incorrect if the load was volatile. --- diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 64cae53..a7e4bc7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43728,6 +43728,26 @@ static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, DCI)) return SDValue(N, 0); + + if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); + // Unless the load is volatile or atomic. 
+ if (LN->isSimple()) { + SDLoc dl(N); + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, + LN->getPointerInfo(), + LN->getAlignment(), + LN->getMemOperand()->getFlags()); + SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32, + DAG.getBitcast(MVT::v8i16, VZLoad)); + DCI.CombineTo(N, Convert); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + return SDValue(N, 0); + } + } } return SDValue(); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index de3fe20..09d1ea6 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -8566,7 +8566,7 @@ let Predicates = [HasDQI, HasVLX] in { let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass avx512_cvtph2ps { defm rr : AVX512_maskable_split<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), "vcvtph2ps", "$src", "$src", @@ -8575,8 +8575,8 @@ multiclass avx512_cvtph2ps; defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), "vcvtph2ps", "$src", "$src", - (X86any_cvtph2ps (_src.VT (ld_frag addr:$src))), - (X86cvtph2ps (_src.VT (ld_frag addr:$src)))>, + (X86any_cvtph2ps (_src.VT ld_dag)), + (X86cvtph2ps (_src.VT ld_dag))>, T8PD, Sched<[sched.Folded]>; } @@ -8591,22 +8591,21 @@ multiclass avx512_cvtph2ps_sae, + defm VCVTPH2PSZ : avx512_cvtph2ps, avx512_cvtph2ps_sae, EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; let Predicates = [HasVLX] in { defm VCVTPH2PSZ256 : avx512_cvtph2ps, EVEX, EVEX_V256, + (load addr:$src), WriteCvtPH2PSY>, EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; defm VCVTPH2PSZ128 : avx512_cvtph2ps, EVEX, EVEX_V128, + (bitconvert (v2i64 (X86vzload64 addr:$src))), + WriteCvtPH2PS>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; // Pattern match vcvtph2ps of a scalar i64 load. 
- def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), - (VCVTPH2PSZ128rm addr:$src)>; def : Pat<(v4f32 (X86any_cvtph2ps (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), (VCVTPH2PSZ128rm addr:$src)>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index a8c285c..abbd513 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -7339,10 +7339,10 @@ multiclass f16c_ph2ps, T8PD, VEX, Sched<[sched]>; + let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (X86any_cvtph2ps (loadv8i16 addr:$src)))]>, - T8PD, VEX, Sched<[sched.Folded]>; + []>, T8PD, VEX, Sched<[sched.Folded]>; } multiclass f16c_ps2ph; + def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))), + (VCVTPH2PSYrm addr:$src)>; def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),