From: Craig Topper
Date: Wed, 7 Sep 2016 04:46:11 +0000 (+0000)
Subject: [AVX-512] Add support for commuting masked instructions in findCommutedOpIndices...
X-Git-Tag: llvmorg-4.0.0-rc1~10490
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b880ad3a71f288a0f5aef4ecac22faa4cd5e8389;p=platform%2Fupstream%2Fllvm.git

[AVX-512] Add support for commuting masked instructions in findCommutedOpIndices.

The default implementation doesn't skip the mask input or the preserved input.

llvm-svn: 280781
---

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index bb592e9..5e1a752 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3709,7 +3709,8 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(
 
 bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
                                          unsigned &SrcOpIdx2) const {
-  if (!MI.isCommutable())
+  const MCInstrDesc &Desc = MI.getDesc();
+  if (!Desc.isCommutable())
     return false;
 
   switch (MI.getOpcode()) {
@@ -3750,6 +3751,27 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
         X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
     if (FMA3Group)
       return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
+
+    // Handle masked instructions, since we need to skip over the mask input
+    // and the preserved input.
+    if (Desc.TSFlags & X86II::EVEX_K) {
+      unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
+      // Skip one more operand if there is also a preserved (tied) input.
+      if (MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
+                                            MCOI::TIED_TO) != -1)
+        ++CommutableOpIdx1;
+      unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1;
+      if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+                                CommutableOpIdx1, CommutableOpIdx2))
+        return false;
+
+      // Both commutable operands must be registers.
+      if (!MI.getOperand(SrcOpIdx1).isReg() ||
+          !MI.getOperand(SrcOpIdx2).isReg())
+        return false;
+      return true;
+    }
+    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
   }
   return false;
 
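[Editor's illustration -- not part of the commit. The index arithmetic above is easiest to see with concrete operand layouts. For an AVX-512 masked instruction, a zero-masked form is laid out as (dst, mask, src1, src2) and a merge-masked form as (dst, passthru tied to dst, mask, src1, src2). The sketch below restates the computation with plain integers; the function and parameter names are invented for this example, with NumDefs standing in for Desc.getNumDefs() and HasPreservedInput for the MCOI::TIED_TO constraint check.]

    #include <cassert>
    #include <utility>

    // Standalone sketch of the CommutableOpIdx computation in the patch.
    static std::pair<unsigned, unsigned>
    maskedCommutableOpIndices(unsigned NumDefs, bool HasPreservedInput) {
      // Step past the defs plus one input: the mask for a zero-masked op,
      // or the tied passthru for a merge-masked op.
      unsigned CommutableOpIdx1 = NumDefs + 1;
      // A merge-masked op also carries the mask before its sources.
      if (HasPreservedInput)
        ++CommutableOpIdx1;
      return {CommutableOpIdx1, CommutableOpIdx1 + 1};
    }

    int main() {
      // Zero-masking (the _kz tests below): (dst, mask, src1, src2)
      // -> the commutable sources are operands 2 and 3.
      assert(maskedCommutableOpIndices(1, false) == std::make_pair(2u, 3u));
      // Merge-masking: (dst, passthru, mask, src1, src2)
      // -> the commutable sources are operands 3 and 4.
      assert(maskedCommutableOpIndices(1, true) == std::make_pair(3u, 4u));
      return 0;
    }
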
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
index 52faee3..c0bd7ba 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -16,6 +16,16 @@ define <8 x double> @stack_fold_addpd_zmm(<8 x double> %a0, <8 x double> %a1) {
   ret <8 x double> %2
 }
 
+define <8 x double> @stack_fold_addpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_addpd_zmm_kz
+  ;CHECK: vaddpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = fadd <8 x double> %a1, %a0
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
 define <16 x float> @stack_fold_addps_zmm(<16 x float> %a0, <16 x float> %a1) {
   ;CHECK-LABEL: stack_fold_addps_zmm
   ;CHECK: vaddps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
@@ -24,6 +34,16 @@ define <16 x float> @stack_fold_addps_zmm(<16 x float> %a0, <16 x float> %a1) {
   ret <16 x float> %2
 }
 
+define <16 x float> @stack_fold_addps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_addps_zmm_kz
+  ;CHECK: vaddps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = fadd <16 x float> %a1, %a0
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+  ret <16 x float> %4
+}
+
 define double @stack_fold_addsd(double %a0, double %a1) {
   ;CHECK-LABEL: stack_fold_addsd
   ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
@@ -191,6 +211,14 @@ define <8 x double> @stack_fold_maxpd_zmm_commutable(<8 x double> %a0, <8 x doub
   ret <8 x double> %2
 }
 
+define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 {
+  ;CHECK-LABEL: stack_fold_maxpd_zmm_commutable_kz
+  ;CHECK: vmaxpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a1, <8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 4)
+  ret <8 x double> %2
+}
+
 define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
   ;CHECK-LABEL: stack_fold_maxps_zmm
   ;CHECK: vmaxps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
@@ -208,6 +236,14 @@ define <16 x float> @stack_fold_maxps_zmm_commutable(<16 x float> %a0, <16 x flo
   ret <16 x float> %2
 }
 
+define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 {
+  ;CHECK-LABEL: stack_fold_maxps_zmm_commutable_kz
+  ;CHECK: vmaxps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a1, <16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
+  ret <16 x float> %2
+}
+
 define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
   ;CHECK-LABEL: stack_fold_minpd_zmm
   ;CHECK: vminpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
@@ -225,6 +261,14 @@ define <8 x double> @stack_fold_minpd_zmm_commutable(<8 x double> %a0, <8 x doub
   ret <8 x double> %2
 }
 
+define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 {
+  ;CHECK-LABEL: stack_fold_minpd_zmm_commutable_kz
+  ;CHECK: vminpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a1, <8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 4)
+  ret <8 x double> %2
+}
+
 define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
   ;CHECK-LABEL: stack_fold_minps_zmm
   ;CHECK: vminps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
@@ -242,6 +286,14 @@ define <16 x float> @stack_fold_minps_zmm_commutable(<16 x float> %a0, <16 x flo
   ret <16 x float> %2
 }
 
+define <16 x float> @stack_fold_minps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 {
+  ;CHECK-LABEL: stack_fold_minps_zmm_commutable_kz
+  ;CHECK: vminps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a1, <16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
+  ret <16 x float> %2
+}
+
 define double @stack_fold_mulsd(double %a0, double %a1) {
   ;CHECK-LABEL: stack_fold_mulsd
   ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
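[Editor's note -- an illustration, not part of the commit. Each new test's inline-asm "nop" clobbers every vector register except xmm0/xmm1, which effectively forces one of the vector arguments to be spilled and reloaded from the stack, and the arithmetic is written with its sources swapped (e.g. fadd <8 x double> %a1, %a0), so the reload can only be folded into the instruction if the operands are commuted. The masking (a select in the add tests, a masked intrinsic with a zeroinitializer passthru in the max/min tests) steers compilation down the new EVEX_K path; attribute #1 presumably marks the fast-math variant that makes max/min commutable, matching the existing *_commutable tests in this file. A concrete instruction matching the new CHECK pattern would look like the following, with the offset, registers, and mask register chosen arbitrarily here:

    vaddpd -16(%rsp), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload

Without the X86InstrInfo.cpp change above, findCommutedOpIndices would not recognize the masked forms as commutable and the reload would remain a separate vector load.]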