From b311a040ef9cda6d52fdfdd4968f2dc59ccf596e Mon Sep 17 00:00:00 2001
From: Thomas Lively
Date: Thu, 19 Aug 2021 15:24:28 -0700
Subject: [PATCH] [WebAssembly] Pattern match SIMD convert_low and promote_low
 during ISel

Since the simplest DAG patterns for convert_low and promote_low instructions
involved v2i32, v2f32, v4i64, and v4f64 types, which are not legal in the
WebAssembly backend and would be eliminated by type legalization, we were
previously matching those patterns in a DAG combine before the type
legalization stage. However, in cases where the vectors were wider than 128
bits, the patterns we matched were not created until the type legalization
stage, when the wide vectors were split up. Type legalization would continue
to eliminate the illegal types we were matching as well, so the code ended up
scalarized.

To make the ISel for these instructions more robust, match the scalarized
patterns rather than the patterns containing illegal types. Add tests with
double-wide vectors to show that this works as intended.
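
For reference, the scalarized pattern now matched in LowerBUILD_VECTOR has
this shape (an illustrative sketch in the notation of the removed combine's
comments, not a verbatim DAG dump):

  (v2f64 (build_vector
    (f64 ({s,u}int_to_fp (extract_vector_elt (v4i32 $x), 0)))
    (f64 ({s,u}int_to_fp (extract_vector_elt (v4i32 $x), 1)))))

which is selected to (f64x2.convert_low_i32x4_{s,u} $x); lanes built from
fp_extend of f32 extracts are matched the same way and selected to
(f64x2.promote_low_f32x4 $x). As the new tests check, the low half of a
double-wide conversion now becomes a single convert_low or promote_low, and
only the remaining lanes are converted by scalar instructions.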

Fixes PR51098.

Depends on D107502.

Differential Revision: https://reviews.llvm.org/D108266
---
 .../Target/WebAssembly/WebAssemblyISelLowering.cpp | 182 +++++++--------------
 llvm/test/CodeGen/WebAssembly/simd-conversions.ll  | 147 +++++++++++++++++
 2 files changed, 210 insertions(+), 119 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 507895e..c418a38 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1778,8 +1778,71 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                      Op.getOperand(1));
 }
 
+static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getValueType() != MVT::v2f64)
+    return SDValue();
+
+  auto GetConvertedLane = [](SDValue Op, unsigned &Opcode, SDValue &SrcVec,
+                             unsigned &Index) -> bool {
+    switch (Op.getOpcode()) {
+    case ISD::SINT_TO_FP:
+      Opcode = WebAssemblyISD::CONVERT_LOW_S;
+      break;
+    case ISD::UINT_TO_FP:
+      Opcode = WebAssemblyISD::CONVERT_LOW_U;
+      break;
+    case ISD::FP_EXTEND:
+      Opcode = WebAssemblyISD::PROMOTE_LOW;
+      break;
+    default:
+      return false;
+    }
+
+    auto ExtractVector = Op.getOperand(0);
+    if (ExtractVector.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return false;
+
+    if (!isa<ConstantSDNode>(ExtractVector.getOperand(1).getNode()))
+      return false;
+
+    SrcVec = ExtractVector.getOperand(0);
+    Index = ExtractVector.getConstantOperandVal(1);
+    return true;
+  };
+
+  unsigned LHSOpcode, RHSOpcode, LHSIndex, RHSIndex;
+  SDValue LHSSrcVec, RHSSrcVec;
+  if (!GetConvertedLane(Op.getOperand(0), LHSOpcode, LHSSrcVec, LHSIndex) ||
+      !GetConvertedLane(Op.getOperand(1), RHSOpcode, RHSSrcVec, RHSIndex))
+    return SDValue();
+
+  if (LHSOpcode != RHSOpcode || LHSSrcVec != RHSSrcVec)
+    return SDValue();
+
+  if (LHSIndex != 0 || RHSIndex != 1)
+    return SDValue();
+
+  MVT ExpectedSrcVT;
+  switch (LHSOpcode) {
+  case WebAssemblyISD::CONVERT_LOW_S:
+  case WebAssemblyISD::CONVERT_LOW_U:
+    ExpectedSrcVT = MVT::v4i32;
+    break;
+  case WebAssemblyISD::PROMOTE_LOW:
+    ExpectedSrcVT = MVT::v4f32;
+    break;
+  }
+  if (LHSSrcVec.getValueType() != ExpectedSrcVT)
+    return SDValue();
+
+  return DAG.getNode(LHSOpcode, SDLoc(Op), MVT::v2f64, LHSSrcVec);
+}
+
 SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
+  if (auto ConvertLow = LowerConvertLow(Op, DAG))
+    return ConvertLow;
+
   SDLoc DL(Op);
   const EVT VecT = Op.getValueType();
   const EVT LaneT = Op.getOperand(0).getValueType();
@@ -2231,120 +2294,6 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
 }
 
 static SDValue
-performVectorConvertLowCombine(SDNode *N,
-                               TargetLowering::DAGCombinerInfo &DCI) {
-  auto &DAG = DCI.DAG;
-
-  EVT ResVT = N->getValueType(0);
-  if (ResVT != MVT::v2f64)
-    return SDValue();
-
-  auto GetWasmConversionOp = [](unsigned Op) {
-    switch (Op) {
-    case ISD::SINT_TO_FP:
-      return WebAssemblyISD::CONVERT_LOW_S;
-    case ISD::UINT_TO_FP:
-      return WebAssemblyISD::CONVERT_LOW_U;
-    case ISD::FP_EXTEND:
-      return WebAssemblyISD::PROMOTE_LOW;
-    }
-    llvm_unreachable("unexpected op");
-  };
-
-  if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-    // Combine this:
-    //
-    //   (v2f64 (extract_subvector
-    //     (v4f64 ({s,u}int_to_fp (v4i32 $x))), 0))
-    //
-    // into (f64x2.convert_low_i32x4_{s,u} $x).
-    //
-    // Or this:
-    //
-    //   (v2f64 (extract_subvector
-    //     (v4f64 (fp_extend (v4f32 $x))), 0))
-    //
-    // into (f64x2.promote_low_f32x4 $x).
-    auto Conversion = N->getOperand(0);
-    auto ConversionOp = Conversion.getOpcode();
-    MVT ExpectedSourceType;
-    switch (ConversionOp) {
-    case ISD::SINT_TO_FP:
-    case ISD::UINT_TO_FP:
-      ExpectedSourceType = MVT::v4i32;
-      break;
-    case ISD::FP_EXTEND:
-      ExpectedSourceType = MVT::v4f32;
-      break;
-    default:
-      return SDValue();
-    }
-
-    if (Conversion.getValueType() != MVT::v4f64)
-      return SDValue();
-
-    auto Source = Conversion.getOperand(0);
-    if (Source.getValueType() != ExpectedSourceType)
-      return SDValue();
-
-    auto IndexNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
-    if (IndexNode == nullptr || IndexNode->getZExtValue() != 0)
-      return SDValue();
-
-    auto Op = GetWasmConversionOp(ConversionOp);
-    return DAG.getNode(Op, SDLoc(N), ResVT, Source);
-  }
-
-  // Combine this:
-  //
-  //   (v2f64 ({s,u}int_to_fp
-  //     (v2i32 (extract_subvector (v4i32 $x), 0))))
-  //
-  // into (f64x2.convert_low_i32x4_{s,u} $x).
-  //
-  // Or this:
-  //
-  //   (v2f64 (fp_extend
-  //     (v2f32 (extract_subvector (v4f32 $x), 0))))
-  //
-  // into (f64x2.promote_low_f32x4 $x).
-  auto ConversionOp = N->getOpcode();
-  MVT ExpectedExtractType;
-  MVT ExpectedSourceType;
-  switch (ConversionOp) {
-  case ISD::SINT_TO_FP:
-  case ISD::UINT_TO_FP:
-    ExpectedExtractType = MVT::v2i32;
-    ExpectedSourceType = MVT::v4i32;
-    break;
-  case ISD::FP_EXTEND:
-    ExpectedExtractType = MVT::v2f32;
-    ExpectedSourceType = MVT::v4f32;
-    break;
-  default:
-    llvm_unreachable("unexpected opcode");
-  }
-
-  auto Extract = N->getOperand(0);
-  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
-    return SDValue();
-
-  if (Extract.getValueType() != ExpectedExtractType)
-    return SDValue();
-
-  auto Source = Extract.getOperand(0);
-  if (Source.getValueType() != ExpectedSourceType)
-    return SDValue();
-
-  auto *IndexNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
-  if (IndexNode == nullptr || IndexNode->getZExtValue() != 0)
-    return SDValue();
-
-  unsigned Op = GetWasmConversionOp(ConversionOp);
-  return DAG.getNode(Op, SDLoc(N), ResVT, Source);
-}
-
-static SDValue
 performVectorTruncZeroCombine(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI) {
   auto &DAG = DCI.DAG;
@@ -2474,11 +2423,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
     return performVectorExtendCombine(N, DCI);
-  case ISD::SINT_TO_FP:
-  case ISD::UINT_TO_FP:
-  case ISD::FP_EXTEND:
-  case ISD::EXTRACT_SUBVECTOR:
-    return performVectorConvertLowCombine(N, DCI);
   case ISD::FP_TO_SINT_SAT:
   case ISD::FP_TO_UINT_SAT:
   case ISD::FP_ROUND:
diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
index 1aa0ccc..c624058 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
@@ -304,3 +304,150 @@ define <2 x double> @promote_low_v2f64_2(<4 x float> %x) {
   %a = shufflevector <4 x double> %v, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %a
 }
+
+;; Also check with illegally wide vectors
+
+define <4 x double> @convert_low_s_v4f64(<8 x i32> %x) {
+; CHECK-LABEL: convert_low_s_v4f64:
+; CHECK:         .functype convert_low_s_v4f64 (i32, v128, v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.convert_low_i32x4_s
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extract_lane 2
+; CHECK-NEXT:    f64.convert_i32_s
+; CHECK-NEXT:    f64x2.splat
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extract_lane 3
+; CHECK-NEXT:    f64.convert_i32_s
+; CHECK-NEXT:    f64x2.replace_lane 1
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
+  %v = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = sitofp <4 x i32> %v to <4 x double>
+  ret <4 x double> %a
+}
+
+define <4 x double> @convert_low_u_v4f64(<8 x i32> %x) {
+; CHECK-LABEL: convert_low_u_v4f64:
+; CHECK:         .functype convert_low_u_v4f64 (i32, v128, v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.convert_low_i32x4_u
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extract_lane 2
+; CHECK-NEXT:    f64.convert_i32_u
+; CHECK-NEXT:    f64x2.splat
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extract_lane 3
+; CHECK-NEXT:    f64.convert_i32_u
+; CHECK-NEXT:    f64x2.replace_lane 1
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
+  %v = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = uitofp <4 x i32> %v to <4 x double>
+  ret <4 x double> %a
+}
+
+
+define <4 x double> @convert_low_s_v4f64_2(<8 x i32> %x) {
+; CHECK-LABEL: convert_low_s_v4f64_2:
+; CHECK:         .functype convert_low_s_v4f64_2 (i32, v128, v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.convert_low_i32x4_s
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extract_lane 2
+; CHECK-NEXT:    f64.convert_i32_s
+; CHECK-NEXT:    f64x2.splat
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extract_lane 3
+; CHECK-NEXT:    f64.convert_i32_s
+; CHECK-NEXT:    f64x2.replace_lane 1
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
+  %v = sitofp <8 x i32> %x to <8 x double>
+  %a = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %a
+}
+
+define <4 x double> @convert_low_u_v4f64_2(<8 x i32> %x) {
+; CHECK-LABEL: convert_low_u_v4f64_2:
+; CHECK:         .functype convert_low_u_v4f64_2 (i32, v128, v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.convert_low_i32x4_u
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extract_lane 2
+; CHECK-NEXT:    f64.convert_i32_u
+; CHECK-NEXT:    f64x2.splat
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extract_lane 3
+; CHECK-NEXT:    f64.convert_i32_u
+; CHECK-NEXT:    f64x2.replace_lane 1
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
+  %v = uitofp <8 x i32> %x to <8 x double>
+  %a = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %a
+}
+
+define <4 x double> @promote_low_v4f64(<8 x float> %x) {
+; CHECK-LABEL: promote_low_v4f64:
+; CHECK:         .functype promote_low_v4f64 (i32, v128, v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f32x4.extract_lane 2
+; CHECK-NEXT:    f64.promote_f32
+; CHECK-NEXT:    f64x2.splat
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f32x4.extract_lane 3
+; CHECK-NEXT:    f64.promote_f32
+; CHECK-NEXT:    f64x2.replace_lane 1
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
+  %v = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = fpext <4 x float> %v to <4 x double>
+  ret <4 x double> %a
+}
+
+define <4 x double> @promote_low_v4f64_2(<8 x float> %x) {
+; CHECK-LABEL: promote_low_v4f64_2:
+; CHECK:         .functype promote_low_v4f64_2 (i32, v128, v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f32x4.extract_lane 2
+; CHECK-NEXT:    f64.promote_f32
+; CHECK-NEXT:    f64x2.splat
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f32x4.extract_lane 3
+; CHECK-NEXT:    f64.promote_f32
+; CHECK-NEXT:    f64x2.replace_lane 1
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
+  %v = fpext <8 x float> %x to <8 x double>
+  %a = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %a
+}
-- 
2.7.4