return NewMI;
}
-unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
- const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
- const X86InstrFMA3Group &FMA3Group) const {
-
- unsigned Opc = MI.getOpcode();
-
+/// This determines which of three possible cases of a three source commute
+/// the source indexes correspond to, taking into account any mask operands.
+/// All cases prevent commuting a passthru operand. Returns -1 if the commute
+/// isn't possible.
+/// Case 0 - Possible to commute the first and second operands.
+/// Case 1 - Possible to commute the first and third operands.
+/// Case 2 - Possible to commute the second and third operands.
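+/// For an unmasked instruction the three sources are operands 1, 2 and 3; for
+/// a k-masked instruction the k-mask is operand 2, so the last two sources
+/// shift to operands 3 and 4.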
+static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
// Put the lowest index to SrcOpIdx1 to simplify the checks below.
if (SrcOpIdx1 > SrcOpIdx2)
std::swap(SrcOpIdx1, SrcOpIdx2);
- // TODO: Commuting the 1st operand of FMA*_Int requires some additional
- // analysis. The commute optimization is legal only if all users of FMA*_Int
- // use only the lowest element of the FMA*_Int instruction. Such analysis are
- // not implemented yet. So, just return 0 in that case.
- // When such analysis are available this place will be the right place for
- // calling it.
- if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
- return 0;
-
- unsigned FMAOp1 = 1, FMAOp2 = 2, FMAOp3 = 3;
- if (FMA3Group.isKMasked()) {
+ unsigned Op1 = 1, Op2 = 2, Op3 = 3;
+ if (X86II::isKMasked(TSFlags)) {
// The k-mask operand cannot be commuted.
if (SrcOpIdx1 == 2)
- return 0;
+ return -1;
// For k-zero-masked operations it is Ok to commute the first vector
// operand.
// For regular k-masked operations a conservative choice is done as the
// elements of the first vector operand, for which the corresponding bit
- // in the k-mask operand is set to 0, are copied to the result of FMA.
+ // in the k-mask operand is set to 0, are copied to the result of the
+ // instruction.
// TODO/FIXME: The commute still may be legal if it is known that the
// k-mask operand is set to either all ones or all zeroes.
// It is also Ok to commute the 1st operand if all users of MI use only
// : v1[i];
// VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
// // Ok, to commute v1 in FMADD213PSZrk.
- if (FMA3Group.isKMergeMasked() && SrcOpIdx1 == FMAOp1)
- return 0;
- FMAOp2++;
- FMAOp3++;
+ if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1)
+ return -1;
+ Op2++;
+ Op3++;
}
- unsigned Case;
- if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp2)
- Case = 0;
- else if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp3)
- Case = 1;
- else if (SrcOpIdx1 == FMAOp2 && SrcOpIdx2 == FMAOp3)
- Case = 2;
- else
+ if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
+ return 0;
+ if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
+ return 1;
+ if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
+ return 2;
+ return -1;
+}
+
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+ const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const {
+
+ unsigned Opc = MI.getOpcode();
+
+ // Put the lowest index to SrcOpIdx1 to simplify the checks below.
+ if (SrcOpIdx1 > SrcOpIdx2)
+ std::swap(SrcOpIdx1, SrcOpIdx2);
+
+ // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+ // analysis. The commute optimization is legal only if all users of FMA*_Int
+ // use only the lowest element of the FMA*_Int instruction. Such analysis is
+ // not implemented yet, so just return 0 in that case.
+ // When such analysis is available, this will be the right place to call it.
+ if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
+ return 0;
+
+ // Determine which case this commute is, or whether it can't be done.
+ int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
+ if (Case < 0)
return 0;
// Define the FMA forms mapping array that helps to map input FMA form
return FMAForms[FormIndex];
}
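+
+/// Commute operands SrcOpIdx1 and SrcOpIdx2 of a VPTERNLOG instruction by
+/// rewriting its truth-table immediate. Returns false if the commute isn't
+/// possible.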
+static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+
+ // Determine which case this commute is, or whether it can't be done.
+ int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2);
+ if (Case < 0)
+ return false;
+
+ // For each case we need to swap two pairs of bits in the final immediate.
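+ // The VPTERNLOG immediate is a truth table indexed by one bit from each
+ // source: operand 1 supplies bit 2 of the index, operand 2 bit 1, and
+ // operand 3 bit 0. Commuting two operands exchanges the index bits they
+ // supply, so the truth table entries whose indices differ only in those two
+ // bits must be swapped, i.e. a fixed pair of bit swaps in the immediate.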
+ static const uint8_t SwapMasks[3][4] = {
+ { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
+ { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
+ { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
+ };
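+ // For example, with Imm = 0x21 (bits 0 and 5 set), Case 0 moves bit 5 to
+ // bit 3 giving 0x09, and Case 2 moves bit 5 to bit 6 giving 0x41; bit 0 is
+ // unaffected either way.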
+
+ uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
+ // Clear out the bits we are swapping.
+ uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
+ SwapMasks[Case][2] | SwapMasks[Case][3]);
+ // If the immediate had a bit of the pair set, then set the opposite bit.
+ if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
+ if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
+ if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
+ if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
+ MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
+
+ return true;
+}
+
MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
+ case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
+ case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
+ case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
+ case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
+ case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
+ auto &WorkingMI = cloneIfNew(MI);
+ if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
+ return nullptr;
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
default:
const X86InstrFMA3Group *FMA3Group =
X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
bool X86InstrInfo::findFMA3CommutedOpIndices(
const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
const X86InstrFMA3Group &FMA3Group) const {
+
+ if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2))
+ return false;
+
+ // Check if we can adjust the opcode to preserve the semantics when
+ // commuting the register operands.
+ return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
+}
+
+bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+
unsigned FirstCommutableVecOp = 1;
unsigned LastCommutableVecOp = 3;
unsigned KMaskOp = 0;
- if (FMA3Group.isKMasked()) {
+ if (X86II::isKMasked(TSFlags)) {
// The k-mask operand has index = 2 for masked and zero-masked operations.
KMaskOp = 2;
// The operand with index = 1 is used as a source for those elements for
// which the corresponding bit in the k-mask is set to 0.
- if (FMA3Group.isKMergeMasked())
+ if (X86II::isKMergeMasked(TSFlags))
FirstCommutableVecOp = 3;
LastCommutableVecOp++;
return false;
}
- // Check if we can adjust the opcode to preserve the semantics when
- // commute the register operands.
- return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
+ return true;
}
bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
}
return false;
}
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
+ case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
+ case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
+ case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
+ case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
+ case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
default:
const X86InstrFMA3Group *FMA3Group =
X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
define <16 x i32> @vpternlog_v16i32_102(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_102:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpternlogd $9, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
ret <16 x i32> %res
define <16 x i32> @vpternlog_v16i32_210(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_210:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm1, %zmm2
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $9, %zmm0, %zmm2, %zmm1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
ret <16 x i32> %res
define <16 x i32> @vpternlog_v16i32_012_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_012_load0:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm0, %zmm2
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
define <16 x i32> @vpternlog_v16i32_012_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_012_load1:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
define <16 x i32> @vpternlog_v16i32_102_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_102_load0:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
define <16 x i32> @vpternlog_v16i32_102_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_102_load1:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm0, %zmm2
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
define <16 x i32> @vpternlog_v16i32_102_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
; CHECK-LABEL: vpternlog_v16i32_102_load2:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpternlogd $33, (%rdi), %zmm0, %zmm1
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
define <16 x i32> @vpternlog_v16i32_210_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_210_load0:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpternlogd $33, (%rdi), %zmm0, %zmm1
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
define <16 x i32> @vpternlog_v16i32_210_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_210_load1:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm2, %zmm1
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
define <16 x i32> @vpternlog_v16i32_210_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
; CHECK-LABEL: vpternlog_v16i32_210_load2:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm1, %zmm2
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $33, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
define <16 x i32> @vpternlog_v16i32_021_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_021_load0:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm1, %zmm2
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $33, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 -1)
define <16 x i32> @vpternlog_v16i32_021_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
; CHECK-LABEL: vpternlog_v16i32_021_load2:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 -1)
; CHECK-LABEL: vpternlog_v16i32_012_load1_mask:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_102_load0_mask:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_210_load1_mask:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
; CHECK-LABEL: vpternlog_v16i32_021_load2_mask:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_102_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm0, %zmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpternlogd $9, %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
ret <16 x i32> %res
; CHECK-LABEL: vpternlog_v16i32_210_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm1, %zmm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $9, %zmm0, %zmm2, %zmm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
ret <16 x i32> %res
; CHECK-LABEL: vpternlog_v16i32_012_load0_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_012_load1_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_102_load0_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_102_load1_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_102_load2_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpternlogd $33, (%rdi), %zmm0, %zmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_210_load0_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpternlogd $33, (%rdi), %zmm0, %zmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_210_load1_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm2, %zmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_210_load2_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm1, %zmm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_021_load0_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm0, %zmm1, %zmm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
; CHECK-LABEL: vpternlog_v16i32_021_load2_maskz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpternlogd $33, %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
%res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)