From 27c77fe4cefb0ee1482871bb8339aa101fec5901 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 10 Jul 2018 22:23:54 +0000
Subject: [PATCH] [X86] Remove AddedComplexity from all patterns that use
 X86vzmovl as their root.

Some added 20 and some added 15. It's unclear when to use which value and
whether they are required at all. This patch removes them all. If we start
finding real-world issues we may need to add them back with proper tests.

llvm-svn: 336735
---
 llvm/lib/Target/X86/X86InstrAVX512.td | 133 +++++++++++-------
 llvm/lib/Target/X86/X86InstrFMA.td    |   1 -
 llvm/lib/Target/X86/X86InstrMMX.td    |   2 -
 llvm/lib/Target/X86/X86InstrSSE.td    | 186 +++++++++++++++-------------
 4 files changed, 139 insertions(+), 183 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index c425dc4..85fc440 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -4317,12 +4317,10 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
                 VR128X:$src1, VR128X:$src2), 0>;
 
 let Predicates = [HasAVX512] in {
-  let AddedComplexity = 15 in {
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
             (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
             (VMOVSSZrr (v4i32 (AVX512_128_SET0)), VR128X:$src)>;
-  }
 
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
@@ -4342,7 +4340,6 @@ let Predicates = [HasAVX512] in {
             (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
                        (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
             sub_xmm)>;
-  let AddedComplexity = 20 in {
   // MOVSSrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
@@ -4398,7 +4395,7 @@ let Predicates = [HasAVX512] in {
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
   def : Pat<(v8f64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-  }
+
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                    (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
@@ -4442,7 +4439,6 @@ let Predicates = [HasAVX512] in {
 }
 
 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
-let AddedComplexity = 15 in
 def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
                                   (ins VR128X:$src),
                                   "vmovq\t{$src, $dst|$dst, $src}",
@@ -4452,42 +4448,39 @@ def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
 }
 
 let Predicates = [HasAVX512] in {
-  let AddedComplexity = 15 in {
-    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
-              (VMOVDI2PDIZrr GR32:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+            (VMOVDI2PDIZrr GR32:$src)>;
 
-    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
-              (VMOV64toPQIZrr GR64:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+            (VMOV64toPQIZrr GR64:$src)>;
 
-    def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                     (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-              (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+
+  def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
 
-    def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
-                     (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-              (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
-  }
   // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
-  let AddedComplexity = 20 in {
-    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
-              (VMOVDI2PDIZrm addr:$src)>;
-    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
-              (VMOVDI2PDIZrm addr:$src)>;
-    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
-              (VMOVDI2PDIZrm addr:$src)>;
-    def : Pat<(v4i32 (X86vzload addr:$src)),
-              (VMOVDI2PDIZrm addr:$src)>;
-    def : Pat<(v8i32 (X86vzload addr:$src)),
-              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
-    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-              (VMOVQI2PQIZrm addr:$src)>;
-    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
-              (VMOVZPQILo2PQIZrr VR128X:$src)>;
-    def : Pat<(v2i64 (X86vzload addr:$src)),
-              (VMOVQI2PQIZrm addr:$src)>;
-    def : Pat<(v4i64 (X86vzload addr:$src)),
-              (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
-  }
+  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+            (VMOVDI2PDIZrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+            (VMOVDI2PDIZrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+            (VMOVDI2PDIZrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzload addr:$src)),
+            (VMOVDI2PDIZrm addr:$src)>;
+  def : Pat<(v8i32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (VMOVQI2PQIZrm addr:$src)>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+            (VMOVZPQILo2PQIZrr VR128X:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)),
+            (VMOVQI2PQIZrm addr:$src)>;
+  def : Pat<(v4i64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
 
   // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
@@ -7721,14 +7714,12 @@ def : Pat<(v8f64 (extloadv8f32 addr:$src)),
             (VCVTPS2PDZrm addr:$src)>;
 
 let Predicates = [HasVLX] in {
-  let AddedComplexity = 15 in {
-    def : Pat<(X86vzmovl (v2f64 (bitconvert
-                                 (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
-              (VCVTPD2PSZ128rr VR128X:$src)>;
-    def : Pat<(X86vzmovl (v2f64 (bitconvert
-                                 (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
-              (VCVTPD2PSZ128rm addr:$src)>;
-  }
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
+            (VCVTPD2PSZ128rr VR128X:$src)>;
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
+            (VCVTPD2PSZ128rm addr:$src)>;
   def : Pat<(v2f64 (extloadv2f32 addr:$src)),
             (VCVTPS2PDZ128rm addr:$src)>;
   def : Pat<(v4f64 (extloadv4f32 addr:$src)),
@@ -8224,26 +8215,24 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
 }
 
 let Predicates = [HasAVX512, HasVLX] in {
-  let AddedComplexity = 15 in {
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
-              (VCVTPD2DQZ128rr VR128X:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
-              (VCVTPD2DQZ128rm addr:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))),
-              (VCVTPD2UDQZ128rr VR128X:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
-              (VCVTTPD2DQZ128rr VR128X:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
-              (VCVTTPD2DQZ128rm addr:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))),
-              (VCVTTPD2UDQZ128rr VR128X:$src)>;
-  }
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
+            (VCVTPD2DQZ128rr VR128X:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
+            (VCVTPD2DQZ128rm addr:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))),
+            (VCVTPD2UDQZ128rr VR128X:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
+            (VCVTTPD2DQZ128rr VR128X:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
+            (VCVTTPD2DQZ128rm addr:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))),
+            (VCVTTPD2UDQZ128rr VR128X:$src)>;
   def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector
                                             (loadi64 addr:$src)))))),
             (VCVTDQ2PDZ128rm addr:$src)>;
@@ -8264,14 +8253,12 @@ let Predicates = [HasAVX512] in {
 }
 
 let Predicates = [HasDQI, HasVLX] in {
-  let AddedComplexity = 15 in {
-    def : Pat<(X86vzmovl (v2f64 (bitconvert
-                                 (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
-              (VCVTQQ2PSZ128rr VR128X:$src)>;
-    def : Pat<(X86vzmovl (v2f64 (bitconvert
-                                 (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
-              (VCVTUQQ2PSZ128rr VR128X:$src)>;
-  }
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
+            (VCVTQQ2PSZ128rr VR128X:$src)>;
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
+            (VCVTUQQ2PSZ128rr VR128X:$src)>;
 }
 
 let Predicates = [HasDQI, NoVLX] in {
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 594eb3b..376f643 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -589,7 +589,6 @@ multiclass scalar_fma4_patterns {
 
   let Predicates = [HasFMA4] in {
-    let AddedComplexity = 15 in
     def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
                               (Op RC:$src1, RC:$src2, RC:$src3))))),
               (!cast<Instruction>(Name#"rr_Int")
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index e9dc4f6..aefeffe 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -273,11 +273,9 @@ def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                         Sched<[SchedWriteVecMoveLSNT.MMX.MR]>;
 
 let Predicates = [HasMMX] in {
-  let AddedComplexity = 15 in
   // movd to MMX register zero-extends
   def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
             (MMX_MOVD64rr GR32:$src)>;
-  let AddedComplexity = 20 in
   def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
             (MMX_MOVD64rm addr:$src)>;
 }
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 407b37c..74b843d9 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -248,7 +248,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
 
 // Patterns
 let Predicates = [UseAVX] in {
-  let AddedComplexity = 20 in {
   // MOVSSrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
@@ -285,7 +284,6 @@ let Predicates = [UseAVX] in {
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
-  }
 
   // Extract and store.
   def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
@@ -308,7 +306,7 @@ let Predicates = [UseAVX] in {
 }
 
 let Predicates = [UseSSE1] in {
-  let Predicates = [NoSSE41], AddedComplexity = 15 in {
+  let Predicates = [NoSSE41] in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -317,7 +315,6 @@ let Predicates = [UseSSE1] in {
             (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
   }
 
-  let AddedComplexity = 20 in {
   // MOVSSrm already zeros the high parts of the register.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
             (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
@@ -327,7 +324,6 @@ let Predicates = [UseSSE1] in {
             (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
   def : Pat<(v4f32 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
-  }
 
   // Extract and store.
   def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
@@ -343,7 +339,6 @@ let Predicates = [UseSSE1] in {
 }
 
 let Predicates = [UseSSE2] in {
-  let AddedComplexity = 20 in {
   // MOVSDrm already zeros the high parts of the register.
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
@@ -355,7 +350,6 @@ let Predicates = [UseSSE2] in {
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
-  }
 
   // Shuffle with MOVSD
   def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
@@ -1637,20 +1631,18 @@ let Predicates = [HasAVX, NoVLX] in {
 }
 
 let Predicates = [HasAVX, NoVLX] in {
-  let AddedComplexity = 15 in {
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
-              (VCVTPD2DQrr VR128:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
-              (VCVTPD2DQrm addr:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
-              (VCVTTPD2DQrr VR128:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
-              (VCVTTPD2DQrm addr:$src)>;
-  }
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+            (VCVTPD2DQrr VR128:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
+            (VCVTPD2DQrm addr:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+            (VCVTTPD2DQrr VR128:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
+            (VCVTTPD2DQrm addr:$src)>;
 } // Predicates = [HasAVX, NoVLX]
 
 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -1665,20 +1657,18 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                        Sched<[WriteCvtPD2ILd]>;
 
 let Predicates = [UseSSE2] in {
-  let AddedComplexity = 15 in {
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
-              (CVTPD2DQrr VR128:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
-              (CVTPD2DQrm addr:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
-              (CVTTPD2DQrr VR128:$src)>;
-    def : Pat<(X86vzmovl (v2i64 (bitconvert
-                                 (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
-              (CVTTPD2DQrm addr:$src)>;
-  }
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+            (CVTPD2DQrr VR128:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
+            (CVTPD2DQrm addr:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+            (CVTTPD2DQrr VR128:$src)>;
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
+            (CVTTPD2DQrm addr:$src)>;
 } // Predicates = [UseSSE2]
 
 // Convert packed single to packed double
@@ -1819,26 +1809,22 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 
 let Predicates = [HasAVX, NoVLX] in {
   // Match fpround and fpextend for 128/256-bit conversions
-  let AddedComplexity = 15 in {
-    def : Pat<(X86vzmovl (v2f64 (bitconvert
-                                 (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
-              (VCVTPD2PSrr VR128:$src)>;
-    def : Pat<(X86vzmovl (v2f64 (bitconvert
-                                 (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
-              (VCVTPD2PSrm addr:$src)>;
-  }
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+            (VCVTPD2PSrr VR128:$src)>;
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
+            (VCVTPD2PSrm addr:$src)>;
 }
 
 let Predicates = [UseSSE2] in {
   // Match fpround and fpextend for 128 conversions
-  let AddedComplexity = 15 in {
-    def : Pat<(X86vzmovl (v2f64 (bitconvert
-                                 (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
-              (CVTPD2PSrr VR128:$src)>;
-    def : Pat<(X86vzmovl (v2f64 (bitconvert
-                                 (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
-              (CVTPD2PSrm addr:$src)>;
-  }
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+            (CVTPD2PSrr VR128:$src)>;
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
+            (CVTPD2PSrm addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4165,34 +4151,30 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
 
 let Predicates = [UseAVX] in {
-  let AddedComplexity = 15 in {
-    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
-              (VMOVDI2PDIrr GR32:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+            (VMOVDI2PDIrr GR32:$src)>;
 
-    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
-              (VMOV64toPQIrr GR64:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+            (VMOV64toPQIrr GR64:$src)>;
 
-    def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                     (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-              (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
-  }
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
 
   // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
   // These instructions also write zeros in the high part of a 256-bit register.
-  let AddedComplexity = 20 in {
-    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
-              (VMOVDI2PDIrm addr:$src)>;
-    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
-              (VMOVDI2PDIrm addr:$src)>;
-    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
-              (VMOVDI2PDIrm addr:$src)>;
-    def : Pat<(v4i32 (X86vzload addr:$src)),
-              (VMOVDI2PDIrm addr:$src)>;
-    def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                     (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
-    def : Pat<(v8i32 (X86vzload addr:$src)),
-              (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
-  }
+  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+            (VMOVDI2PDIrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+            (VMOVDI2PDIrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+            (VMOVDI2PDIrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzload addr:$src)),
+            (VMOVDI2PDIrm addr:$src)>;
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+  def : Pat<(v8i32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
 
   // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                    (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
@@ -4200,23 +4182,19 @@ let Predicates = [UseAVX] in {
 }
 
 let Predicates = [UseSSE2] in {
-  let AddedComplexity = 15 in {
-    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
-              (MOVDI2PDIrr GR32:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+            (MOVDI2PDIrr GR32:$src)>;
 
-    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
-              (MOV64toPQIrr GR64:$src)>;
-  }
-  let AddedComplexity = 20 in {
-    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
-              (MOVDI2PDIrm addr:$src)>;
-    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
-              (MOVDI2PDIrm addr:$src)>;
-    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
-              (MOVDI2PDIrm addr:$src)>;
-    def : Pat<(v4i32 (X86vzload addr:$src)),
-              (MOVDI2PDIrm addr:$src)>;
-  }
+  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+            (MOV64toPQIrr GR64:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+            (MOVDI2PDIrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+            (MOVDI2PDIrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+            (MOVDI2PDIrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzload addr:$src)),
+            (MOVDI2PDIrm addr:$src)>;
 }
 
 // Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
@@ -4287,7 +4265,7 @@ def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
 def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
 
-let Predicates = [UseAVX], AddedComplexity = 20 in {
+let Predicates = [UseAVX] in {
   def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
             (VMOVQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
@@ -4299,7 +4277,7 @@ let Predicates = [UseAVX], AddedComplexity = 20 in {
             (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
 }
 
-let Predicates = [UseSSE2], AddedComplexity = 20 in {
+let Predicates = [UseSSE2] in {
   def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
             (MOVQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
@@ -4310,27 +4288,23 @@
 // IA32 document. movq xmm1, xmm2 does clear the high bits.
 //
 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
-let AddedComplexity = 15 in
 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "vmovq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                          XS, VEX, Requires<[UseAVX]>, VEX_WIG;
-let AddedComplexity = 15 in
 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                         XS, Requires<[UseSSE2]>;
 } // ExeDomain, SchedRW
 
-let AddedComplexity = 20 in {
-  let Predicates = [UseAVX] in {
-    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
-              (VMOVZPQILo2PQIrr VR128:$src)>;
-  }
-  let Predicates = [UseSSE2] in {
-    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
-              (MOVZPQILo2PQIrr VR128:$src)>;
-  }
+let Predicates = [UseAVX] in {
+  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+            (VMOVZPQILo2PQIrr VR128:$src)>;
+}
+let Predicates = [UseSSE2] in {
+  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+            (MOVZPQILo2PQIrr VR128:$src)>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -6438,7 +6412,6 @@ let Predicates = [HasAVX2] in {
 // blends because blends have better throughput on SandyBridge and Haswell, but
 // movs[s/d] are 1-2 byte shorter instructions.
 let Predicates = [UseAVX] in {
-  let AddedComplexity = 15 in {
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
             (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
@@ -6451,7 +6424,6 @@ let Predicates = [UseAVX] in {
   // Move low f64 and clear high bits.
   def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
             (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
-  }
 
   // These will incur an FP/int domain crossing penalty, but it may be the only
   // way without AVX2. Do not add any complexity because we may be able to match
@@ -6466,7 +6438,7 @@
 // on targets where they have equal performance. These were changed to use
 // blends because blends have better throughput on SandyBridge and Haswell, but
 // movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseSSE41], AddedComplexity = 15 in {
+let Predicates = [UseSSE41] in {
   // With SSE41 we can use blends for these patterns.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
             (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
-- 
2.7.4
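
Background for readers of the patch: AddedComplexity is a TableGen field that
instruction selection adds to a pattern's size-based complexity score when
ordering candidate patterns; patterns with higher totals are tried first. A
minimal sketch of the construct being deleted, using a pattern copied verbatim
from the AVX512 hunk above (only the wrapper differs; this is illustrative,
not part of the patch itself):

  // Before: the wrapper raises this pattern's selection priority by 15,
  // so it is tried ahead of same-size patterns matching the same node.
  let AddedComplexity = 15 in {
    def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
              (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
  }

  // After: the pattern is ordered by its natural complexity alone; the
  // patch's premise is that this already suffices for X86vzmovl roots.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
            (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;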