From: Craig Topper
Date: Tue, 9 Aug 2016 05:26:07 +0000 (+0000)
Subject: [AVX-512] Add support for execution domain switching masked logical ops between float...
X-Git-Tag: llvmorg-4.0.0-rc1~12986
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=92a4ff1294ebbcf1276ef543d9a4f2929dcfefd3;p=platform%2Fupstream%2Fllvm.git

[AVX-512] Add support for execution domain switching masked logical ops between floating point and integer domain.

This switches PS<->D and PD<->Q.

llvm-svn: 278097
---
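Note on the change (editorial summary, not part of the original commit message): the execution domain fix-up pass rewrites a vector instruction into an equivalent opcode in the domain of its neighbours (packed single, packed double, or packed integer) so values do not cross domains and pay a bypass delay. Until now only the unmasked AVX-512 logical operations were in the replacement tables, so a masked or zero-masked VPANDD/VPORQ/etc. always stayed in the integer domain even when surrounded by FP code. The new ReplaceableInstrsAVX512DQMasked table gives each masked logical op four columns (PS, PD, 64-bit integer, 32-bit integer), and the sketch just below illustrates how such a row is consulted. It is a minimal, self-contained illustration only: the pickColumn helper and the numeric opcodes are invented for the example and are not LLVM APIs.

#include <array>
#include <cstdint>
#include <iostream>

// Domain numbering used by the X86 backend: 1 = PackedSingle,
// 2 = PackedDouble, 3 = PackedInt (0 means "no SSE domain").
enum Domain { PackedSingle = 1, PackedDouble = 2, PackedInt = 3 };

// One table row: { PS opcode, PD opcode, 64-bit integer opcode,
// 32-bit integer opcode }, mirroring a ReplaceableInstrsAVX512DQMasked row.
using Row = std::array<uint16_t, 4>;

// Return the opcode from Row that performs the same operation in the
// requested domain.  The patch never changes a Q (64-bit) form into a
// D (32-bit) form or vice versa, so the caller says which integer
// element width it started with.
uint16_t pickColumn(const Row &R, Domain Dom, bool Started64BitInt) {
  switch (Dom) {
  case PackedSingle: return R[0];
  case PackedDouble: return R[1];
  case PackedInt:    return Started64BitInt ? R[2] : R[3];
  }
  return 0;
}

int main() {
  // Invented opcode numbers standing in for
  // VANDPSZrrk / VANDPDZrrk / VPANDQZrrk / VPANDDZrrk.
  Row MaskedAnd = {101, 102, 103, 104};
  std::cout << pickColumn(MaskedAnd, PackedSingle, false) << '\n'; // 101
  std::cout << pickColumn(MaskedAnd, PackedInt, false) << '\n';    // 104
  return 0;
}

With that mapping in mind, the patch below teaches getExecutionDomain which columns are legal for a given subtarget and setExecutionDomain how to pick the replacement opcode.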
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index f3d7c30..7894993 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7474,6 +7474,108 @@ static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
   { X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr },
 };
 
+static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
+  // Two integer columns for 64-bit and 32-bit elements.
+  //PackedSingle          PackedDouble
+  //PackedInt             PackedInt
+  { X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk,
+    X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk },
+  { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
+    X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
+  { X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk,
+    X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk },
+  { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
+    X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
+  { X86::VANDPSZ128rmk, X86::VANDPDZ128rmk,
+    X86::VPANDQZ128rmk, X86::VPANDDZ128rmk },
+  { X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz,
+    X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz },
+  { X86::VANDPSZ128rrk, X86::VANDPDZ128rrk,
+    X86::VPANDQZ128rrk, X86::VPANDDZ128rrk },
+  { X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz,
+    X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz },
+  { X86::VORPSZ128rmk, X86::VORPDZ128rmk,
+    X86::VPORQZ128rmk, X86::VPORDZ128rmk },
+  { X86::VORPSZ128rmkz, X86::VORPDZ128rmkz,
+    X86::VPORQZ128rmkz, X86::VPORDZ128rmkz },
+  { X86::VORPSZ128rrk, X86::VORPDZ128rrk,
+    X86::VPORQZ128rrk, X86::VPORDZ128rrk },
+  { X86::VORPSZ128rrkz, X86::VORPDZ128rrkz,
+    X86::VPORQZ128rrkz, X86::VPORDZ128rrkz },
+  { X86::VXORPSZ128rmk, X86::VXORPDZ128rmk,
+    X86::VPXORQZ128rmk, X86::VPXORDZ128rmk },
+  { X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz,
+    X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz },
+  { X86::VXORPSZ128rrk, X86::VXORPDZ128rrk,
+    X86::VPXORQZ128rrk, X86::VPXORDZ128rrk },
+  { X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz,
+    X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz },
+  { X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk,
+    X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk },
+  { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
+    X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
+  { X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk,
+    X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk },
+  { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
+    X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
+  { X86::VANDPSZ256rmk, X86::VANDPDZ256rmk,
+    X86::VPANDQZ256rmk, X86::VPANDDZ256rmk },
+  { X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz,
+    X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz },
+  { X86::VANDPSZ256rrk, X86::VANDPDZ256rrk,
+    X86::VPANDQZ256rrk, X86::VPANDDZ256rrk },
+  { X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz,
+    X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz },
+  { X86::VORPSZ256rmk, X86::VORPDZ256rmk,
+    X86::VPORQZ256rmk, X86::VPORDZ256rmk },
+  { X86::VORPSZ256rmkz, X86::VORPDZ256rmkz,
+    X86::VPORQZ256rmkz, X86::VPORDZ256rmkz },
+  { X86::VORPSZ256rrk, X86::VORPDZ256rrk,
+    X86::VPORQZ256rrk, X86::VPORDZ256rrk },
+  { X86::VORPSZ256rrkz, X86::VORPDZ256rrkz,
+    X86::VPORQZ256rrkz, X86::VPORDZ256rrkz },
+  { X86::VXORPSZ256rmk, X86::VXORPDZ256rmk,
+    X86::VPXORQZ256rmk, X86::VPXORDZ256rmk },
+  { X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz,
+    X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz },
+  { X86::VXORPSZ256rrk, X86::VXORPDZ256rrk,
+    X86::VPXORQZ256rrk, X86::VPXORDZ256rrk },
+  { X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz,
+    X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz },
+  { X86::VANDNPSZrmk, X86::VANDNPDZrmk,
+    X86::VPANDNQZrmk, X86::VPANDNDZrmk },
+  { X86::VANDNPSZrmkz, X86::VANDNPDZrmkz,
+    X86::VPANDNQZrmkz, X86::VPANDNDZrmkz },
+  { X86::VANDNPSZrrk, X86::VANDNPDZrrk,
+    X86::VPANDNQZrrk, X86::VPANDNDZrrk },
+  { X86::VANDNPSZrrkz, X86::VANDNPDZrrkz,
+    X86::VPANDNQZrrkz, X86::VPANDNDZrrkz },
+  { X86::VANDPSZrmk, X86::VANDPDZrmk,
+    X86::VPANDQZrmk, X86::VPANDDZrmk },
+  { X86::VANDPSZrmkz, X86::VANDPDZrmkz,
+    X86::VPANDQZrmkz, X86::VPANDDZrmkz },
+  { X86::VANDPSZrrk, X86::VANDPDZrrk,
+    X86::VPANDQZrrk, X86::VPANDDZrrk },
+  { X86::VANDPSZrrkz, X86::VANDPDZrrkz,
+    X86::VPANDQZrrkz, X86::VPANDDZrrkz },
+  { X86::VORPSZrmk, X86::VORPDZrmk,
+    X86::VPORQZrmk, X86::VPORDZrmk },
+  { X86::VORPSZrmkz, X86::VORPDZrmkz,
+    X86::VPORQZrmkz, X86::VPORDZrmkz },
+  { X86::VORPSZrrk, X86::VORPDZrrk,
+    X86::VPORQZrrk, X86::VPORDZrrk },
+  { X86::VORPSZrrkz, X86::VORPDZrrkz,
+    X86::VPORQZrrkz, X86::VPORDZrrkz },
+  { X86::VXORPSZrmk, X86::VXORPDZrmk,
+    X86::VPXORQZrmk, X86::VPXORDZrmk },
+  { X86::VXORPSZrmkz, X86::VXORPDZrmkz,
+    X86::VPXORQZrmkz, X86::VPXORDZrmkz },
+  { X86::VXORPSZrrk, X86::VXORPDZrrk,
+    X86::VPXORQZrrk, X86::VPXORDZrrk },
+  { X86::VXORPSZrrkz, X86::VXORPDZrrkz,
+    X86::VPXORQZrrkz, X86::VPXORDZrrkz },
+};
+
 // FIXME: Some shuffle and unpack instructions have equivalents in different
 // domains, but they require a bit more work than just switching opcodes.
 
@@ -7507,18 +7609,35 @@ static const uint16_t *lookupAVX512DQ(unsigned opcode, unsigned domain) {
   return nullptr;
 }
 
+static const uint16_t *lookupAVX512DQMasked(unsigned opcode, unsigned domain) {
+  // If this is the integer domain make sure to check both integer columns.
+  for (const uint16_t (&Row)[4] : ReplaceableInstrsAVX512DQMasked)
+    if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
+      return Row;
+  return nullptr;
+}
+
 std::pair<uint16_t, uint16_t>
 X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
   uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  unsigned opcode = MI.getOpcode();
   uint16_t validDomains = 0;
-  if (domain && lookup(MI.getOpcode(), domain))
-    validDomains = 0xe;
-  else if (domain && lookupAVX2(MI.getOpcode(), domain))
-    validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
-  else if (domain && lookupAVX512(MI.getOpcode(), domain))
-    validDomains = 0xe;
-  else if (domain && lookupAVX512DQ(MI.getOpcode(), domain))
-    validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
+  if (domain) {
+    if (lookup(MI.getOpcode(), domain)) {
+      validDomains = 0xe;
+    } else if (lookupAVX2(opcode, domain)) {
+      validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+    } else if (lookupAVX512(opcode, domain)) {
+      validDomains = 0xe;
+    } else if (lookupAVX512DQ(opcode, domain)) {
+      validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
+    } else if (const uint16_t *table = lookupAVX512DQMasked(opcode, domain)) {
+      if (domain == 1 || (domain == 3 && table[3] == opcode))
+        validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
+      else
+        validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
+    }
+  }
   return std::make_pair(domain, validDomains);
 }
 
@@ -7540,10 +7659,17 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
       Domain = 4;
   }
   if (!table) { // try the AVX512DQ table
-    assert((Subtarget.hasDQI() || Domain >=3) && "Requires AVX-512DQ");
+    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
     table = lookupAVX512DQ(MI.getOpcode(), dom);
-    // Don't change integer Q instructions to D instructions.
-    if (table && Domain == 3 && table[3] == MI.getOpcode())
+    // Don't change integer Q instructions to D instructions and
+    // use D instructions if we started with a PS instruction.
+    if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+      Domain = 4;
+  }
+  if (!table) { // try the AVX512DQMasked table
+    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+    table = lookupAVX512DQMasked(MI.getOpcode(), dom);
+    if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
       Domain = 4;
   }
   assert(table && "Cannot change domain");
diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index c14455b..438f6b1 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -278,3 +278,153 @@ define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) {
   %res = xor <32 x i16> %a, %b
   ret <32 x i16> %res
 }
+
+define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
+; KNL-LABEL: masked_and_v16f32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vpandd %zmm1, %zmm0, %zmm2 {%k1}
+; KNL-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: masked_and_v16f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    vandps %zmm1, %zmm0, %zmm2 {%k1}
+; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; SKX-NEXT:    retq
+  %a1 = bitcast <16 x float> %a to <16 x i32>
+  %b1 = bitcast <16 x float> %b to <16 x i32>
+  %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
+  %mask1 = bitcast i16 %mask to <16 x i1>
+  %op = and <16 x i32> %a1, %b1
+  %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
+  %cast = bitcast <16 x i32> %select to <16 x float>
+  %add = fadd <16 x float> %c, %cast
+  ret <16 x float> %add
+}
+
+define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
+; KNL-LABEL: masked_or_v16f32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vpord %zmm1, %zmm0, %zmm2 {%k1}
+; KNL-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: masked_or_v16f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    vorps %zmm1, %zmm0, %zmm2 {%k1}
+; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; SKX-NEXT:    retq
+  %a1 = bitcast <16 x float> %a to <16 x i32>
+  %b1 = bitcast <16 x float> %b to <16 x i32>
+  %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
+  %mask1 = bitcast i16 %mask to <16 x i1>
+  %op = or <16 x i32> %a1, %b1
+  %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
+  %cast = bitcast <16 x i32> %select to <16 x float>
+  %add = fadd <16 x float> %c, %cast
+  ret <16 x float> %add
+}
+
+define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
+; KNL-LABEL: masked_xor_v16f32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vpxord %zmm1, %zmm0, %zmm2 {%k1}
+; KNL-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: masked_xor_v16f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    vxorps %zmm1, %zmm0, %zmm2 {%k1}
+; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; SKX-NEXT:    retq
+  %a1 = bitcast <16 x float> %a to <16 x i32>
+  %b1 = bitcast <16 x float> %b to <16 x i32>
+  %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
+  %mask1 = bitcast i16 %mask to <16 x i1>
+  %op = xor <16 x i32> %a1, %b1
+  %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
+  %cast = bitcast <16 x i32> %select to <16 x float>
+  %add = fadd <16 x float> %c, %cast
+  ret <16 x float> %add
+}
+
+define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
+; KNL-LABEL: masked_and_v8f64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1}
+; KNL-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: masked_and_v8f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    vandpd %zmm1, %zmm0, %zmm2 {%k1}
+; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; SKX-NEXT:    retq
+  %a1 = bitcast <8 x double> %a to <8 x i64>
+  %b1 = bitcast <8 x double> %b to <8 x i64>
+  %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
+  %mask1 = bitcast i8 %mask to <8 x i1>
+  %op = and <8 x i64> %a1, %b1
+  %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
+  %cast = bitcast <8 x i64> %select to <8 x double>
+  %add = fadd <8 x double> %c, %cast
+  ret <8 x double> %add
+}
+
+define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
+; KNL-LABEL: masked_or_v8f64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1}
+; KNL-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: masked_or_v8f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    vorpd %zmm1, %zmm0, %zmm2 {%k1}
+; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; SKX-NEXT:    retq
+  %a1 = bitcast <8 x double> %a to <8 x i64>
+  %b1 = bitcast <8 x double> %b to <8 x i64>
+  %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
+  %mask1 = bitcast i8 %mask to <8 x i1>
+  %op = or <8 x i64> %a1, %b1
+  %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
+  %cast = bitcast <8 x i64> %select to <8 x double>
+  %add = fadd <8 x double> %c, %cast
+  ret <8 x double> %add
+}
+
+define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
+; KNL-LABEL: masked_xor_v8f64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1}
+; KNL-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: masked_xor_v8f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    vxorpd %zmm1, %zmm0, %zmm2 {%k1}
+; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; SKX-NEXT:    retq
+  %a1 = bitcast <8 x double> %a to <8 x i64>
+  %b1 = bitcast <8 x double> %b to <8 x i64>
+  %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
+  %mask1 = bitcast i8 %mask to <8 x i1>
+  %op = xor <8 x i64> %a1, %b1
+  %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
+  %cast = bitcast <8 x i64> %select to <8 x double>
+  %add = fadd <8 x double> %c, %cast
+  ret <8 x double> %add
+}
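
Note on the validDomains encoding and the tests (editorial, not part of the original commit): getExecutionDomain returns a bitmask in which bit i means "domain i is reachable", with 1 = PackedSingle, 2 = PackedDouble and 3 = PackedInt. For the masked logical ops the patch therefore reports 0x8 (integer only) when AVX-512DQ is unavailable, 0xa (PackedSingle plus integer) for PS/D forms, and 0xc (PackedDouble plus integer) for PD/Q forms; a masked operation can never change element width, because the mask register already fixes the element count, so only two columns of a row are ever legal at once. The tests show exactly that: KNL (no DQI) keeps the vpandd/vpandq style integer forms, while SKX (DQI) may use the vandps/vandpd style FP forms next to the FP adds. The snippet below restates that encoding as standalone code; the function name is invented and this is a sketch of one reading of the logic, not the LLVM implementation.

#include <cstdio>

// Bit i of the result means "domain i is reachable":
// 1 = PackedSingle, 2 = PackedDouble, 3 = PackedInt.
unsigned maskedLogicValidDomains(bool HasDQI, bool StartedAsPSorD) {
  if (!HasDQI)
    return 0x8;                    // only VPAND*/VPOR*/VPXOR* forms exist
  // With AVX-512DQ a masked logical op may flip between one FP domain
  // and the integer form of the same element width:
  //   PS <-> D  -> bits 1 and 3 -> 0xa
  //   PD <-> Q  -> bits 2 and 3 -> 0xc
  return StartedAsPSorD ? 0xa : 0xc;
}

int main() {
  std::printf("%#x %#x %#x\n",
              maskedLogicValidDomains(false, true),  // 0x8
              maskedLogicValidDomains(true, true),   // 0xa
              maskedLogicValidDomains(true, false)); // 0xc
  return 0;
}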