From e767bf446840e65c5e84fbc89454c3d7d04b771d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 8 Dec 2018 16:07:38 +0000
Subject: [PATCH] [DAGCombiner] re-enable truncation of binops

This is effectively re-committing the changes from:
rL347917 (D54640)
rL348195 (D55126)
...which were effectively reverted here:
rL348604
...because the code had a bug that could induce infinite looping or
eventual out-of-memory compilation.

The bug was that this code did not guard against transforming opaque
constants. More details are in the post-commit mailing list thread for
r347917. A reduced test for that is included in the x86 bool-math.ll
file. (I wasn't able to reduce a PPC backend test for this, but it was
almost the same pattern.)

Original commit message for r347917:

The motivating case for this is shown in:
https://bugs.llvm.org/show_bug.cgi?id=32023
and the corresponding rot16.ll regression tests.

Because x86 scalar shift amounts are i8 values, we can end up with
trunc-binop-trunc sequences that don't get folded in IR.

As the TODO comments suggest, there will be regressions if we extend
this (for x86, we mostly seem to be missing LEA opportunities, but
there are likely vector folds missing too). I think those should be
considered existing bugs because this is the same transform that we do
as an IR canonicalization in instcombine. We just need more tests to
make those visible independent of this patch.

llvm-svn: 348706
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp      |  19 +-
 llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll   |   4 +-
 llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll  |   2 +-
 .../CodeGen/X86/2010-08-04-MaskedSignedCompare.ll  |   2 +-
 llvm/test/CodeGen/X86/add-sub-nsw-nuw.ll           |   2 +-
 llvm/test/CodeGen/X86/bool-math.ll                 |  23 +-
 llvm/test/CodeGen/X86/clear-lowbits.ll             | 114 +++++----
 llvm/test/CodeGen/X86/cmov.ll                      |   2 +-
 llvm/test/CodeGen/X86/extract-bits.ll              | 250 ++++++++++----------
 llvm/test/CodeGen/X86/extract-lowbits.ll           | 216 +++++++++---------
 llvm/test/CodeGen/X86/fshl.ll                      |  74 +++---
 llvm/test/CodeGen/X86/fshr.ll                      |  62 ++---
 llvm/test/CodeGen/X86/funnel-shift-rot.ll          |  25 +-
 llvm/test/CodeGen/X86/funnel-shift.ll              |   8 +-
 llvm/test/CodeGen/X86/pr32284.ll                   |   3 +-
 llvm/test/CodeGen/X86/pr37879.ll                   |   2 -
 llvm/test/CodeGen/X86/rot16.ll                     |   4 +-
 llvm/test/CodeGen/X86/rotate.ll                    |  18 +-
 llvm/test/CodeGen/X86/rotate4.ll                   |  36 +--
 llvm/test/CodeGen/X86/schedule-x86-64-shld.ll      |  12 +-
 llvm/test/CodeGen/X86/scheduler-backtracking.ll    | 254 +++++++++++----------
 llvm/test/CodeGen/X86/test-shrink.ll               |  11 +-
 llvm/test/CodeGen/X86/vector-trunc-math-widen.ll   | 236 +++++++------------
 llvm/test/CodeGen/X86/vector-trunc-math.ll         | 236 +++++++------------
 llvm/test/CodeGen/X86/xchg-nofold.ll               |   2 +-
 25 files changed, 735 insertions(+), 882 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index bf47686..24531c3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -112,12 +112,6 @@ static cl::opt<bool>
 MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
                   cl::desc("DAG combiner may split indexing from loads"));
 
-// This is a temporary debug flag to disable a combine that is known to
-// conflict with another combine.
-static cl::opt<bool>
-NarrowTruncatedBinops("narrow-truncated-binops", cl::Hidden, cl::init(false),
-                      cl::desc("Move truncates ahead of binops"));
-
 namespace {
 
 class DAGCombiner {
@@ -9814,9 +9808,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
-  // Narrow a suitable binary operation with a constant operand by moving it
-  // ahead of the truncate. This is limited to pre-legalization because targets
-  // may prefer a wider type during later combines and invert this transform.
+  // Narrow a suitable binary operation with a non-opaque constant operand by
+  // moving it ahead of the truncate. This is limited to pre-legalization
+  // because targets may prefer a wider type during later combines and invert
+  // this transform.
   switch (N0.getOpcode()) {
   // TODO: Add case for ADD - that will likely require a change in logic here
   // or target-specific changes to avoid regressions.
@@ -9825,9 +9820,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
-    if (NarrowTruncatedBinops && !LegalOperations && N0.hasOneUse() &&
-        (isConstantOrConstantVector(N0.getOperand(0)) ||
-         isConstantOrConstantVector(N0.getOperand(1)))) {
+    if (!LegalOperations && N0.hasOneUse() &&
+        (isConstantOrConstantVector(N0.getOperand(0), true) ||
+         isConstantOrConstantVector(N0.getOperand(1), true))) {
       // TODO: We already restricted this to pre-legalization, but for vectors
       // we are extra cautious to not create an unsupported operation.
       // Target-specific changes are likely needed to avoid regressions here.
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 04ad3bc..f6c30f8 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -125,11 +125,11 @@ ret:
 ; GCN: s_cbranch_scc1
 
 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
-; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0xff
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0xff
 
 ; GCN: BB2_2:
 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
-; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0x7f
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0x7f
 
 ; GCN: BB2_3:
 ; GCN: buffer_store_short
diff --git a/llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll b/llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
index afda33b..cbccc90 100644
--- a/llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
+++ b/llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
@@ -17,7 +17,7 @@ define i32 @func_44(i16 signext %p_46) nounwind {
 ; SOURCE-SCHED-NEXT: setg %cl
 ; SOURCE-SCHED-NEXT: movb g_73, %dl
 ; SOURCE-SCHED-NEXT: xorl %eax, %eax
-; SOURCE-SCHED-NEXT: subl {{[0-9]+}}(%esp), %eax
+; SOURCE-SCHED-NEXT: subb {{[0-9]+}}(%esp), %al
 ; SOURCE-SCHED-NEXT: testb %dl, %dl
 ; SOURCE-SCHED-NEXT: jne .LBB0_2
 ; SOURCE-SCHED-NEXT: # %bb.1: # %bb11
diff --git a/llvm/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll b/llvm/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
index 4f6f0c9..0abe3cb 100644
--- a/llvm/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
+++ b/llvm/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
@@ -11,7 +11,7 @@ define i32 @main() nounwind {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: cmpq {{.*}}(%rip), %rax
-; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: sbbb %al, %al
 ; CHECK-NEXT: testb $-106, %al
 ; CHECK-NEXT: jle .LBB0_1
 ; CHECK-NEXT: # %bb.2: # %if.then
diff --git a/llvm/test/CodeGen/X86/add-sub-nsw-nuw.ll b/llvm/test/CodeGen/X86/add-sub-nsw-nuw.ll
index 
be6d20e..6ff37de 100644 --- a/llvm/test/CodeGen/X86/add-sub-nsw-nuw.ll +++ b/llvm/test/CodeGen/X86/add-sub-nsw-nuw.ll @@ -9,7 +9,7 @@ define i8 @PR30841(i64 %argc) { ; CHECK-LABEL: PR30841: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: negl %eax +; CHECK-NEXT: negb %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/bool-math.ll b/llvm/test/CodeGen/X86/bool-math.ll index 9aa2a61..3a7193b 100644 --- a/llvm/test/CodeGen/X86/bool-math.ll +++ b/llvm/test/CodeGen/X86/bool-math.ll @@ -48,17 +48,16 @@ define i8 @sub_zext_cmp_mask_narrower_result(i32 %x) { ; X64-LABEL: sub_zext_cmp_mask_narrower_result: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $1, %eax +; X64-NEXT: andb $1, %al ; X64-NEXT: orb $46, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: sub_zext_cmp_mask_narrower_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: andb $1, %al ; X32-NEXT: orb $46, %al -; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl %a = and i32 %x, 1 %c = icmp eq i32 %a, 0 @@ -114,17 +113,16 @@ define i8 @add_zext_cmp_mask_narrower_result(i32 %x) { ; X64-LABEL: add_zext_cmp_mask_narrower_result: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $1, %eax +; X64-NEXT: andb $1, %al ; X64-NEXT: xorb $43, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: add_zext_cmp_mask_narrower_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: andb $1, %al ; X32-NEXT: xorb $43, %al -; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl %a = and i32 %x, 1 %c = icmp eq i32 %a, 0 @@ -185,7 +183,7 @@ define i16 @low_bit_select_constants_bigger_false_narrower_result(i32 %x) { ; ; X32-LABEL: low_bit_select_constants_bigger_false_narrower_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $36, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax @@ -241,17 +239,16 @@ define i8 @low_bit_select_constants_bigger_true_narrower_result(i16 %x) { ; X64-LABEL: low_bit_select_constants_bigger_true_narrower_result: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $1, %eax +; X64-NEXT: andb $1, %al ; X64-NEXT: xorb $41, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: low_bit_select_constants_bigger_true_narrower_result: ; X32: # %bb.0: -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: andb $1, %al ; X32-NEXT: xorb $41, %al -; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl %a = and i16 %x, 1 %c = icmp eq i16 %a, 0 diff --git a/llvm/test/CodeGen/X86/clear-lowbits.ll b/llvm/test/CodeGen/X86/clear-lowbits.ll index f29717e..be251eb 100644 --- a/llvm/test/CodeGen/X86/clear-lowbits.ll +++ b/llvm/test/CodeGen/X86/clear-lowbits.ll @@ -866,10 +866,9 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind { ; X86-NOBMI2-LABEL: clear_lowbits16_ic0: ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NOBMI2-NEXT: movw $16, %cx -; X86-NOBMI2-NEXT: subw {{[0-9]+}}(%esp), %cx +; X86-NOBMI2-NEXT: movb $16, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; 
X86-NOBMI2-NEXT: shrl %cl, %eax -; X86-NOBMI2-NEXT: # kill: def $cl killed $cl killed $cx ; X86-NOBMI2-NEXT: shll %cl, %eax ; X86-NOBMI2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NOBMI2-NEXT: retl @@ -877,8 +876,8 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind { ; X86-BMI2-LABEL: clear_lowbits16_ic0: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movw $16, %cx -; X86-BMI2-NEXT: subw {{[0-9]+}}(%esp), %cx +; X86-BMI2-NEXT: movb $16, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI2-NEXT: shrxl %ecx, %eax, %eax ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: # kill: def $ax killed $ax killed $eax @@ -887,10 +886,9 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind { ; X64-NOBMI2-LABEL: clear_lowbits16_ic0: ; X64-NOBMI2: # %bb.0: ; X64-NOBMI2-NEXT: movzwl %di, %eax -; X64-NOBMI2-NEXT: movl $16, %ecx -; X64-NOBMI2-NEXT: subl %esi, %ecx +; X64-NOBMI2-NEXT: movb $16, %cl +; X64-NOBMI2-NEXT: subb %sil, %cl ; X64-NOBMI2-NEXT: shrl %cl, %eax -; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI2-NEXT: shll %cl, %eax ; X64-NOBMI2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NOBMI2-NEXT: retq @@ -898,8 +896,8 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind { ; X64-BMI2-LABEL: clear_lowbits16_ic0: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: movzwl %di, %eax -; X64-BMI2-NEXT: movl $16, %ecx -; X64-BMI2-NEXT: subl %esi, %ecx +; X64-BMI2-NEXT: movb $16, %cl +; X64-BMI2-NEXT: subb %sil, %cl ; X64-BMI2-NEXT: shrxl %ecx, %eax, %eax ; X64-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X64-BMI2-NEXT: # kill: def $ax killed $ax killed $eax @@ -962,10 +960,9 @@ define i16 @clear_lowbits16_ic2_load(i16* %w, i16 %numlowbits) nounwind { ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI2-NEXT: movzwl (%eax), %eax -; X86-NOBMI2-NEXT: movw $16, %cx -; X86-NOBMI2-NEXT: subw {{[0-9]+}}(%esp), %cx +; X86-NOBMI2-NEXT: movb $16, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI2-NEXT: shrl %cl, %eax -; X86-NOBMI2-NEXT: # kill: def $cl killed $cl killed $cx ; X86-NOBMI2-NEXT: shll %cl, %eax ; X86-NOBMI2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NOBMI2-NEXT: retl @@ -974,8 +971,8 @@ define i16 @clear_lowbits16_ic2_load(i16* %w, i16 %numlowbits) nounwind { ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzwl (%eax), %eax -; X86-BMI2-NEXT: movw $16, %cx -; X86-BMI2-NEXT: subw {{[0-9]+}}(%esp), %cx +; X86-BMI2-NEXT: movb $16, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI2-NEXT: shrxl %ecx, %eax, %eax ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: # kill: def $ax killed $ax killed $eax @@ -984,10 +981,9 @@ define i16 @clear_lowbits16_ic2_load(i16* %w, i16 %numlowbits) nounwind { ; X64-NOBMI2-LABEL: clear_lowbits16_ic2_load: ; X64-NOBMI2: # %bb.0: ; X64-NOBMI2-NEXT: movzwl (%rdi), %eax -; X64-NOBMI2-NEXT: movl $16, %ecx -; X64-NOBMI2-NEXT: subl %esi, %ecx +; X64-NOBMI2-NEXT: movb $16, %cl +; X64-NOBMI2-NEXT: subb %sil, %cl ; X64-NOBMI2-NEXT: shrl %cl, %eax -; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI2-NEXT: shll %cl, %eax ; X64-NOBMI2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NOBMI2-NEXT: retq @@ -995,8 +991,8 @@ define i16 @clear_lowbits16_ic2_load(i16* %w, i16 %numlowbits) nounwind { ; X64-BMI2-LABEL: clear_lowbits16_ic2_load: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: movzwl (%rdi), %eax -; X64-BMI2-NEXT: movl $16, %ecx -; X64-BMI2-NEXT: subl 
%esi, %ecx +; X64-BMI2-NEXT: movb $16, %cl +; X64-BMI2-NEXT: subb %sil, %cl ; X64-BMI2-NEXT: shrxl %ecx, %eax, %eax ; X64-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X64-BMI2-NEXT: # kill: def $ax killed $ax killed $eax @@ -1062,10 +1058,9 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind ; X86-NOBMI2-LABEL: clear_lowbits16_ic4_commutative: ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NOBMI2-NEXT: movw $16, %cx -; X86-NOBMI2-NEXT: subw {{[0-9]+}}(%esp), %cx +; X86-NOBMI2-NEXT: movb $16, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI2-NEXT: shrl %cl, %eax -; X86-NOBMI2-NEXT: # kill: def $cl killed $cl killed $cx ; X86-NOBMI2-NEXT: shll %cl, %eax ; X86-NOBMI2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NOBMI2-NEXT: retl @@ -1073,8 +1068,8 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind ; X86-BMI2-LABEL: clear_lowbits16_ic4_commutative: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movw $16, %cx -; X86-BMI2-NEXT: subw {{[0-9]+}}(%esp), %cx +; X86-BMI2-NEXT: movb $16, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI2-NEXT: shrxl %ecx, %eax, %eax ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: # kill: def $ax killed $ax killed $eax @@ -1083,10 +1078,9 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind ; X64-NOBMI2-LABEL: clear_lowbits16_ic4_commutative: ; X64-NOBMI2: # %bb.0: ; X64-NOBMI2-NEXT: movzwl %di, %eax -; X64-NOBMI2-NEXT: movl $16, %ecx -; X64-NOBMI2-NEXT: subl %esi, %ecx +; X64-NOBMI2-NEXT: movb $16, %cl +; X64-NOBMI2-NEXT: subb %sil, %cl ; X64-NOBMI2-NEXT: shrl %cl, %eax -; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI2-NEXT: shll %cl, %eax ; X64-NOBMI2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NOBMI2-NEXT: retq @@ -1094,8 +1088,8 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind ; X64-BMI2-LABEL: clear_lowbits16_ic4_commutative: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: movzwl %di, %eax -; X64-BMI2-NEXT: movl $16, %ecx -; X64-BMI2-NEXT: subl %esi, %ecx +; X64-BMI2-NEXT: movb $16, %cl +; X64-BMI2-NEXT: subb %sil, %cl ; X64-BMI2-NEXT: shrxl %ecx, %eax, %eax ; X64-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X64-BMI2-NEXT: # kill: def $ax killed $ax killed $eax @@ -1113,7 +1107,7 @@ define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind { ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI2-NEXT: xorl %ecx, %ecx -; X86-NOBMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI2-NEXT: shrl %cl, %eax ; X86-NOBMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI2-NEXT: shll %cl, %eax @@ -1122,7 +1116,7 @@ define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind { ; X86-BMI2-LABEL: clear_lowbits32_ic0: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: xorl %eax, %eax -; X86-BMI2-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %al ; X86-BMI2-NEXT: shrxl %eax, {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: shlxl %eax, %ecx, %eax ; X86-BMI2-NEXT: retl @@ -1131,7 +1125,7 @@ define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind { ; X64-NOBMI2: # %bb.0: ; X64-NOBMI2-NEXT: movl %esi, %ecx ; X64-NOBMI2-NEXT: movl %edi, %eax -; X64-NOBMI2-NEXT: negl %ecx +; X64-NOBMI2-NEXT: negb %cl ; X64-NOBMI2-NEXT: shrl %cl, %eax ; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI2-NEXT: shll %cl, %eax @@ -1139,7 
+1133,7 @@ define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind { ; ; X64-BMI2-LABEL: clear_lowbits32_ic0: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: negl %esi +; X64-BMI2-NEXT: negb %sil ; X64-BMI2-NEXT: shrxl %esi, %edi, %eax ; X64-BMI2-NEXT: shlxl %esi, %eax, %eax ; X64-BMI2-NEXT: retq @@ -1197,7 +1191,7 @@ define i32 @clear_lowbits32_ic2_load(i32* %w, i32 %numlowbits) nounwind { ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI2-NEXT: movl (%eax), %eax ; X86-NOBMI2-NEXT: xorl %ecx, %ecx -; X86-NOBMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI2-NEXT: shrl %cl, %eax ; X86-NOBMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI2-NEXT: shll %cl, %eax @@ -1207,7 +1201,7 @@ define i32 @clear_lowbits32_ic2_load(i32* %w, i32 %numlowbits) nounwind { ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: xorl %ecx, %ecx -; X86-BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI2-NEXT: shrxl %ecx, (%eax), %eax ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: retl @@ -1216,7 +1210,7 @@ define i32 @clear_lowbits32_ic2_load(i32* %w, i32 %numlowbits) nounwind { ; X64-NOBMI2: # %bb.0: ; X64-NOBMI2-NEXT: movl %esi, %ecx ; X64-NOBMI2-NEXT: movl (%rdi), %eax -; X64-NOBMI2-NEXT: negl %ecx +; X64-NOBMI2-NEXT: negb %cl ; X64-NOBMI2-NEXT: shrl %cl, %eax ; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI2-NEXT: shll %cl, %eax @@ -1224,7 +1218,7 @@ define i32 @clear_lowbits32_ic2_load(i32* %w, i32 %numlowbits) nounwind { ; ; X64-BMI2-LABEL: clear_lowbits32_ic2_load: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: negl %esi +; X64-BMI2-NEXT: negb %sil ; X64-BMI2-NEXT: shrxl %esi, (%rdi), %eax ; X64-BMI2-NEXT: shlxl %esi, %eax, %eax ; X64-BMI2-NEXT: retq @@ -1285,7 +1279,7 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI2-NEXT: xorl %ecx, %ecx -; X86-NOBMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI2-NEXT: shrl %cl, %eax ; X86-NOBMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI2-NEXT: shll %cl, %eax @@ -1294,7 +1288,7 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind ; X86-BMI2-LABEL: clear_lowbits32_ic4_commutative: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: xorl %eax, %eax -; X86-BMI2-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %al ; X86-BMI2-NEXT: shrxl %eax, {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: shlxl %eax, %ecx, %eax ; X86-BMI2-NEXT: retl @@ -1303,7 +1297,7 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind ; X64-NOBMI2: # %bb.0: ; X64-NOBMI2-NEXT: movl %esi, %ecx ; X64-NOBMI2-NEXT: movl %edi, %eax -; X64-NOBMI2-NEXT: negl %ecx +; X64-NOBMI2-NEXT: negb %cl ; X64-NOBMI2-NEXT: shrl %cl, %eax ; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI2-NEXT: shll %cl, %eax @@ -1311,7 +1305,7 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind ; ; X64-BMI2-LABEL: clear_lowbits32_ic4_commutative: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: negl %esi +; X64-BMI2-NEXT: negb %sil ; X64-BMI2-NEXT: shrxl %esi, %edi, %eax ; X64-BMI2-NEXT: shlxl %esi, %eax, %eax ; X64-BMI2-NEXT: retq @@ -1326,8 +1320,8 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind { ; 
X86-NOBMI2-LABEL: clear_lowbits64_ic0: ; X86-NOBMI2: # %bb.0: -; X86-NOBMI2-NEXT: movl $64, %ecx -; X86-NOBMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI2-NEXT: movb $64, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI2-NEXT: movl $-1, %edx ; X86-NOBMI2-NEXT: movl $-1, %eax ; X86-NOBMI2-NEXT: shll %cl, %eax @@ -1344,8 +1338,8 @@ define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind { ; ; X86-BMI2-LABEL: clear_lowbits64_ic0: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl $64, %ecx -; X86-BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movb $64, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI2-NEXT: movl $-1, %edx ; X86-BMI2-NEXT: shlxl %ecx, %edx, %eax ; X86-BMI2-NEXT: shldl %cl, %edx, %edx @@ -1363,7 +1357,7 @@ define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind { ; X64-NOBMI2: # %bb.0: ; X64-NOBMI2-NEXT: movq %rsi, %rcx ; X64-NOBMI2-NEXT: movq %rdi, %rax -; X64-NOBMI2-NEXT: negl %ecx +; X64-NOBMI2-NEXT: negb %cl ; X64-NOBMI2-NEXT: shrq %cl, %rax ; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI2-NEXT: shlq %cl, %rax @@ -1371,7 +1365,7 @@ define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind { ; ; X64-BMI2-LABEL: clear_lowbits64_ic0: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: negl %esi +; X64-BMI2-NEXT: negb %sil ; X64-BMI2-NEXT: shrxq %rsi, %rdi, %rax ; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax ; X64-BMI2-NEXT: retq @@ -1446,8 +1440,8 @@ define i64 @clear_lowbits64_ic2_load(i64* %w, i64 %numlowbits) nounwind { ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: pushl %esi ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI2-NEXT: movl $64, %ecx -; X86-NOBMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI2-NEXT: movb $64, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI2-NEXT: movl $-1, %edx ; X86-NOBMI2-NEXT: movl $-1, %eax ; X86-NOBMI2-NEXT: shll %cl, %eax @@ -1467,8 +1461,8 @@ define i64 @clear_lowbits64_ic2_load(i64* %w, i64 %numlowbits) nounwind { ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: movl $64, %ecx -; X86-BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movb $64, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI2-NEXT: movl $-1, %edx ; X86-BMI2-NEXT: shlxl %ecx, %edx, %eax ; X86-BMI2-NEXT: shldl %cl, %edx, %edx @@ -1487,7 +1481,7 @@ define i64 @clear_lowbits64_ic2_load(i64* %w, i64 %numlowbits) nounwind { ; X64-NOBMI2: # %bb.0: ; X64-NOBMI2-NEXT: movq %rsi, %rcx ; X64-NOBMI2-NEXT: movq (%rdi), %rax -; X64-NOBMI2-NEXT: negl %ecx +; X64-NOBMI2-NEXT: negb %cl ; X64-NOBMI2-NEXT: shrq %cl, %rax ; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI2-NEXT: shlq %cl, %rax @@ -1495,7 +1489,7 @@ define i64 @clear_lowbits64_ic2_load(i64* %w, i64 %numlowbits) nounwind { ; ; X64-BMI2-LABEL: clear_lowbits64_ic2_load: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: negl %esi +; X64-BMI2-NEXT: negb %sil ; X64-BMI2-NEXT: shrxq %rsi, (%rdi), %rax ; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax ; X64-BMI2-NEXT: retq @@ -1576,8 +1570,8 @@ define i64 @clear_lowbits64_ic3_load_indexzext(i64* %w, i8 %numlowbits) nounwind define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind { ; X86-NOBMI2-LABEL: clear_lowbits64_ic4_commutative: ; X86-NOBMI2: # %bb.0: -; X86-NOBMI2-NEXT: movl $64, %ecx -; X86-NOBMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI2-NEXT: movb $64, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI2-NEXT: movl $-1, %edx ; X86-NOBMI2-NEXT: movl $-1, %eax ; 
X86-NOBMI2-NEXT: shll %cl, %eax @@ -1594,8 +1588,8 @@ define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind ; ; X86-BMI2-LABEL: clear_lowbits64_ic4_commutative: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl $64, %ecx -; X86-BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movb $64, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI2-NEXT: movl $-1, %edx ; X86-BMI2-NEXT: shlxl %ecx, %edx, %eax ; X86-BMI2-NEXT: shldl %cl, %edx, %edx @@ -1613,7 +1607,7 @@ define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind ; X64-NOBMI2: # %bb.0: ; X64-NOBMI2-NEXT: movq %rsi, %rcx ; X64-NOBMI2-NEXT: movq %rdi, %rax -; X64-NOBMI2-NEXT: negl %ecx +; X64-NOBMI2-NEXT: negb %cl ; X64-NOBMI2-NEXT: shrq %cl, %rax ; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI2-NEXT: shlq %cl, %rax @@ -1621,7 +1615,7 @@ define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind ; ; X64-BMI2-LABEL: clear_lowbits64_ic4_commutative: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: negl %esi +; X64-BMI2-NEXT: negb %sil ; X64-BMI2-NEXT: shrxq %rsi, %rdi, %rax ; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax ; X64-BMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll index 2e92d8e..a504538 100644 --- a/llvm/test/CodeGen/X86/cmov.ll +++ b/llvm/test/CodeGen/X86/cmov.ll @@ -81,7 +81,7 @@ define i1 @test4() nounwind { ; CHECK-NEXT: movsbl {{.*}}(%rip), %edx ; CHECK-NEXT: movzbl %dl, %ecx ; CHECK-NEXT: shrl $7, %ecx -; CHECK-NEXT: xorl $1, %ecx +; CHECK-NEXT: xorb $1, %cl ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: sarl %cl, %edx ; CHECK-NEXT: movb {{.*}}(%rip), %al diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll index 2f1b286..b69c0c1 100644 --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -2983,7 +2983,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOBMI-NEXT: shrl %cl, %edi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl $-1, %esi ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI-NEXT: shrl %cl, %esi @@ -3005,7 +3005,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI1NOTBM-NEXT: shrl %cl, %edi ; X86-BMI1NOTBM-NEXT: xorl %ecx, %ecx -; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl $-1, %esi ; X86-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-BMI1NOTBM-NEXT: shrl %cl, %esi @@ -3020,22 +3020,22 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; ; X86-BMI1BMI2-LABEL: bextr32_c0: ; X86-BMI1BMI2: # %bb.0: -; X86-BMI1BMI2-NEXT: pushl %edi +; X86-BMI1BMI2-NEXT: pushl %ebx ; X86-BMI1BMI2-NEXT: pushl %esi ; X86-BMI1BMI2-NEXT: pushl %eax -; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %bl ; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-BMI1BMI2-NEXT: shrxl %eax, {{[0-9]+}}(%esp), %edi -; X86-BMI1BMI2-NEXT: movl %esi, %eax -; X86-BMI1BMI2-NEXT: negl %eax +; X86-BMI1BMI2-NEXT: shrxl %eax, {{[0-9]+}}(%esp), %esi +; X86-BMI1BMI2-NEXT: movl %ebx, %eax +; X86-BMI1BMI2-NEXT: negb %al ; X86-BMI1BMI2-NEXT: movl $-1, %ecx ; X86-BMI1BMI2-NEXT: shrxl %eax, 
%ecx, %eax ; X86-BMI1BMI2-NEXT: movl %eax, (%esp) ; X86-BMI1BMI2-NEXT: calll use32 -; X86-BMI1BMI2-NEXT: bzhil %esi, %edi, %eax +; X86-BMI1BMI2-NEXT: bzhil %ebx, %esi, %eax ; X86-BMI1BMI2-NEXT: addl $4, %esp ; X86-BMI1BMI2-NEXT: popl %esi -; X86-BMI1BMI2-NEXT: popl %edi +; X86-BMI1BMI2-NEXT: popl %ebx ; X86-BMI1BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: bextr32_c0: @@ -3047,7 +3047,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X64-NOBMI-NEXT: movl %edi, %ebx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl $-1, %ebp ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp @@ -3069,7 +3069,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X64-BMI1NOTBM-NEXT: movl %edi, %ebx ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %ebx -; X64-BMI1NOTBM-NEXT: negl %edx +; X64-BMI1NOTBM-NEXT: negb %dl ; X64-BMI1NOTBM-NEXT: movl $-1, %ebp ; X64-BMI1NOTBM-NEXT: movl %edx, %ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %ebp @@ -3089,8 +3089,8 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X64-BMI1BMI2-NEXT: pushq %rax ; X64-BMI1BMI2-NEXT: movl %edx, %ebx ; X64-BMI1BMI2-NEXT: shrxl %esi, %edi, %ebp -; X64-BMI1BMI2-NEXT: movl %edx, %eax -; X64-BMI1BMI2-NEXT: negl %eax +; X64-BMI1BMI2-NEXT: movl %ebx, %eax +; X64-BMI1BMI2-NEXT: negb %al ; X64-BMI1BMI2-NEXT: movl $-1, %ecx ; X64-BMI1BMI2-NEXT: shrxl %eax, %ecx, %edi ; X64-BMI1BMI2-NEXT: callq use32 @@ -3254,7 +3254,7 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X86-NOBMI-NEXT: movl (%eax), %edi ; X86-NOBMI-NEXT: shrl %cl, %edi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl $-1, %esi ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI-NEXT: shrl %cl, %esi @@ -3277,7 +3277,7 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X86-BMI1NOTBM-NEXT: movl (%eax), %edi ; X86-BMI1NOTBM-NEXT: shrl %cl, %edi ; X86-BMI1NOTBM-NEXT: xorl %ecx, %ecx -; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl $-1, %esi ; X86-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-BMI1NOTBM-NEXT: shrl %cl, %esi @@ -3292,23 +3292,23 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; ; X86-BMI1BMI2-LABEL: bextr32_c2_load: ; X86-BMI1BMI2: # %bb.0: -; X86-BMI1BMI2-NEXT: pushl %edi +; X86-BMI1BMI2-NEXT: pushl %ebx ; X86-BMI1BMI2-NEXT: pushl %esi ; X86-BMI1BMI2-NEXT: pushl %eax -; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %bl ; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-BMI1BMI2-NEXT: shrxl %ecx, (%eax), %edi -; X86-BMI1BMI2-NEXT: movl %esi, %eax -; X86-BMI1BMI2-NEXT: negl %eax +; X86-BMI1BMI2-NEXT: shrxl %ecx, (%eax), %esi +; X86-BMI1BMI2-NEXT: movl %ebx, %eax +; X86-BMI1BMI2-NEXT: negb %al ; X86-BMI1BMI2-NEXT: movl $-1, %ecx ; X86-BMI1BMI2-NEXT: shrxl %eax, %ecx, %eax ; X86-BMI1BMI2-NEXT: movl %eax, (%esp) ; X86-BMI1BMI2-NEXT: calll use32 -; X86-BMI1BMI2-NEXT: bzhil %esi, %edi, %eax +; X86-BMI1BMI2-NEXT: bzhil %ebx, %esi, %eax ; X86-BMI1BMI2-NEXT: addl $4, %esp ; X86-BMI1BMI2-NEXT: popl %esi -; 
X86-BMI1BMI2-NEXT: popl %edi +; X86-BMI1BMI2-NEXT: popl %ebx ; X86-BMI1BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: bextr32_c2_load: @@ -3320,7 +3320,7 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X64-NOBMI-NEXT: movl (%rdi), %ebp ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl $-1, %ebx ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx @@ -3342,7 +3342,7 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X64-BMI1NOTBM-NEXT: movl (%rdi), %ebp ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %ebp -; X64-BMI1NOTBM-NEXT: negl %edx +; X64-BMI1NOTBM-NEXT: negb %dl ; X64-BMI1NOTBM-NEXT: movl $-1, %ebx ; X64-BMI1NOTBM-NEXT: movl %edx, %ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %ebx @@ -3362,8 +3362,8 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X64-BMI1BMI2-NEXT: pushq %rax ; X64-BMI1BMI2-NEXT: movl %edx, %ebx ; X64-BMI1BMI2-NEXT: shrxl %esi, (%rdi), %ebp -; X64-BMI1BMI2-NEXT: movl %edx, %eax -; X64-BMI1BMI2-NEXT: negl %eax +; X64-BMI1BMI2-NEXT: movl %ebx, %eax +; X64-BMI1BMI2-NEXT: negb %al ; X64-BMI1BMI2-NEXT: movl $-1, %ecx ; X64-BMI1BMI2-NEXT: shrxl %eax, %ecx, %edi ; X64-BMI1BMI2-NEXT: callq use32 @@ -3531,7 +3531,7 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOBMI-NEXT: shrl %cl, %edi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl $-1, %esi ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI-NEXT: shrl %cl, %esi @@ -3553,7 +3553,7 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI1NOTBM-NEXT: shrl %cl, %edi ; X86-BMI1NOTBM-NEXT: xorl %ecx, %ecx -; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl $-1, %esi ; X86-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-BMI1NOTBM-NEXT: shrl %cl, %esi @@ -3568,22 +3568,22 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; ; X86-BMI1BMI2-LABEL: bextr32_c4_commutative: ; X86-BMI1BMI2: # %bb.0: -; X86-BMI1BMI2-NEXT: pushl %edi +; X86-BMI1BMI2-NEXT: pushl %ebx ; X86-BMI1BMI2-NEXT: pushl %esi ; X86-BMI1BMI2-NEXT: pushl %eax -; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %bl ; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-BMI1BMI2-NEXT: shrxl %eax, {{[0-9]+}}(%esp), %edi -; X86-BMI1BMI2-NEXT: movl %esi, %eax -; X86-BMI1BMI2-NEXT: negl %eax +; X86-BMI1BMI2-NEXT: shrxl %eax, {{[0-9]+}}(%esp), %esi +; X86-BMI1BMI2-NEXT: movl %ebx, %eax +; X86-BMI1BMI2-NEXT: negb %al ; X86-BMI1BMI2-NEXT: movl $-1, %ecx ; X86-BMI1BMI2-NEXT: shrxl %eax, %ecx, %eax ; X86-BMI1BMI2-NEXT: movl %eax, (%esp) ; X86-BMI1BMI2-NEXT: calll use32 -; X86-BMI1BMI2-NEXT: bzhil %esi, %edi, %eax +; X86-BMI1BMI2-NEXT: bzhil %ebx, %esi, %eax ; X86-BMI1BMI2-NEXT: addl $4, %esp ; X86-BMI1BMI2-NEXT: popl %esi -; X86-BMI1BMI2-NEXT: popl %edi +; X86-BMI1BMI2-NEXT: popl %ebx ; X86-BMI1BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: bextr32_c4_commutative: @@ -3595,7 +3595,7 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; 
X64-NOBMI-NEXT: movl %edi, %ebx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl $-1, %ebp ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp @@ -3617,7 +3617,7 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; X64-BMI1NOTBM-NEXT: movl %edi, %ebx ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %ebx -; X64-BMI1NOTBM-NEXT: negl %edx +; X64-BMI1NOTBM-NEXT: negb %dl ; X64-BMI1NOTBM-NEXT: movl $-1, %ebp ; X64-BMI1NOTBM-NEXT: movl %edx, %ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %ebp @@ -3637,8 +3637,8 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; X64-BMI1BMI2-NEXT: pushq %rax ; X64-BMI1BMI2-NEXT: movl %edx, %ebx ; X64-BMI1BMI2-NEXT: shrxl %esi, %edi, %ebp -; X64-BMI1BMI2-NEXT: movl %edx, %eax -; X64-BMI1BMI2-NEXT: negl %eax +; X64-BMI1BMI2-NEXT: movl %ebx, %eax +; X64-BMI1BMI2-NEXT: negb %al ; X64-BMI1BMI2-NEXT: movl $-1, %ecx ; X64-BMI1BMI2-NEXT: shrxl %eax, %ecx, %edi ; X64-BMI1BMI2-NEXT: callq use32 @@ -3667,7 +3667,7 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-NOBMI-NEXT: movl %ebx, %ecx ; X86-NOBMI-NEXT: shrl %cl, %edi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl $-1, %esi ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI-NEXT: shrl %cl, %esi @@ -3694,7 +3694,7 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-BMI1NOTBM-NEXT: movl %ebx, %ecx ; X86-BMI1NOTBM-NEXT: shrl %cl, %edi ; X86-BMI1NOTBM-NEXT: xorl %ecx, %ecx -; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl $-1, %esi ; X86-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-BMI1NOTBM-NEXT: shrl %cl, %esi @@ -3716,16 +3716,16 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-BMI1BMI2-NEXT: pushl %edi ; X86-BMI1BMI2-NEXT: pushl %esi ; X86-BMI1BMI2-NEXT: subl $16, %esp -; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %bl ; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1BMI2-NEXT: shrxl %edi, {{[0-9]+}}(%esp), %ebx -; X86-BMI1BMI2-NEXT: movl %esi, %eax -; X86-BMI1BMI2-NEXT: negl %eax +; X86-BMI1BMI2-NEXT: shrxl %edi, {{[0-9]+}}(%esp), %esi +; X86-BMI1BMI2-NEXT: movl %ebx, %eax +; X86-BMI1BMI2-NEXT: negb %al ; X86-BMI1BMI2-NEXT: movl $-1, %ecx ; X86-BMI1BMI2-NEXT: shrxl %eax, %ecx, %eax ; X86-BMI1BMI2-NEXT: movl %eax, (%esp) ; X86-BMI1BMI2-NEXT: calll use32 -; X86-BMI1BMI2-NEXT: bzhil %esi, %ebx, %esi +; X86-BMI1BMI2-NEXT: bzhil %ebx, %esi, %esi ; X86-BMI1BMI2-NEXT: movl %edi, (%esp) ; X86-BMI1BMI2-NEXT: calll use32 ; X86-BMI1BMI2-NEXT: movl %esi, %eax @@ -3744,7 +3744,7 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X64-NOBMI-NEXT: movl %edi, %ebp ; X64-NOBMI-NEXT: movl %r14d, %ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl $-1, %ebx ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx @@ -3768,7 +3768,7 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X64-BMI1NOTBM-NEXT: movl %edi, %ebp ; X64-BMI1NOTBM-NEXT: movl %r14d, %ecx ; X64-BMI1NOTBM-NEXT: 
shrl %cl, %ebp -; X64-BMI1NOTBM-NEXT: negl %edx +; X64-BMI1NOTBM-NEXT: negb %dl ; X64-BMI1NOTBM-NEXT: movl $-1, %ebx ; X64-BMI1NOTBM-NEXT: movl %edx, %ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %ebx @@ -3791,8 +3791,8 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X64-BMI1BMI2-NEXT: movl %edx, %ebx ; X64-BMI1BMI2-NEXT: movl %esi, %ebp ; X64-BMI1BMI2-NEXT: shrxl %esi, %edi, %r14d -; X64-BMI1BMI2-NEXT: movl %edx, %eax -; X64-BMI1BMI2-NEXT: negl %eax +; X64-BMI1BMI2-NEXT: movl %ebx, %eax +; X64-BMI1BMI2-NEXT: negb %al ; X64-BMI1BMI2-NEXT: movl $-1, %ecx ; X64-BMI1BMI2-NEXT: shrxl %eax, %ecx, %edi ; X64-BMI1BMI2-NEXT: callq use32 @@ -3835,8 +3835,8 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %edi, %esi ; X86-NOBMI-NEXT: xorl %edi, %edi ; X86-NOBMI-NEXT: .LBB32_2: -; X86-NOBMI-NEXT: movl $64, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movb $64, %cl +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl $-1, %ebp ; X86-NOBMI-NEXT: movl $-1, %ebx ; X86-NOBMI-NEXT: shrl %cl, %ebx @@ -3882,8 +3882,8 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI1NOTBM-NEXT: movl %edi, %esi ; X86-BMI1NOTBM-NEXT: xorl %edi, %edi ; X86-BMI1NOTBM-NEXT: .LBB32_2: -; X86-BMI1NOTBM-NEXT: movl $64, %ecx -; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movb $64, %cl +; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl $-1, %ebp ; X86-BMI1NOTBM-NEXT: movl $-1, %ebx ; X86-BMI1NOTBM-NEXT: shrl %cl, %ebx @@ -3928,8 +3928,8 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI1BMI2-NEXT: movl %edi, %esi ; X86-BMI1BMI2-NEXT: xorl %edi, %edi ; X86-BMI1BMI2-NEXT: .LBB32_2: -; X86-BMI1BMI2-NEXT: movl $64, %ecx -; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1BMI2-NEXT: movb $64, %cl +; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1BMI2-NEXT: movl $-1, %ebx ; X86-BMI1BMI2-NEXT: shrxl %ecx, %ebx, %ebp ; X86-BMI1BMI2-NEXT: shrdl %cl, %ebx, %ebx @@ -3964,7 +3964,7 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X64-NOBMI-NEXT: movq %rdi, %r14 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %r14 -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movq $-1, %rbx ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shrq %cl, %rbx @@ -3986,7 +3986,7 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X64-BMI1NOTBM-NEXT: movq %rdi, %r14 ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1NOTBM-NEXT: shrq %cl, %r14 -; X64-BMI1NOTBM-NEXT: negl %edx +; X64-BMI1NOTBM-NEXT: negb %dl ; X64-BMI1NOTBM-NEXT: movq $-1, %rbx ; X64-BMI1NOTBM-NEXT: movl %edx, %ecx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rbx @@ -4007,7 +4007,7 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X64-BMI1BMI2-NEXT: movq %rdx, %rbx ; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %r14 ; X64-BMI1BMI2-NEXT: movl %ebx, %eax -; X64-BMI1BMI2-NEXT: negl %eax +; X64-BMI1BMI2-NEXT: negb %al ; X64-BMI1BMI2-NEXT: movq $-1, %rcx ; X64-BMI1BMI2-NEXT: shrxq %rax, %rcx, %rdi ; X64-BMI1BMI2-NEXT: callq use64 @@ -4257,8 +4257,8 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-NOBMI-NEXT: movl %edi, %esi ; X86-NOBMI-NEXT: xorl %edi, %edi ; X86-NOBMI-NEXT: .LBB34_2: -; X86-NOBMI-NEXT: movl $64, %ecx -; 
X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movb $64, %cl +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl $-1, %ebp ; X86-NOBMI-NEXT: movl $-1, %ebx ; X86-NOBMI-NEXT: shrl %cl, %ebx @@ -4305,8 +4305,8 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI1NOTBM-NEXT: movl %edi, %esi ; X86-BMI1NOTBM-NEXT: xorl %edi, %edi ; X86-BMI1NOTBM-NEXT: .LBB34_2: -; X86-BMI1NOTBM-NEXT: movl $64, %ecx -; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movb $64, %cl +; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl $-1, %ebp ; X86-BMI1NOTBM-NEXT: movl $-1, %ebx ; X86-BMI1NOTBM-NEXT: shrl %cl, %ebx @@ -4352,8 +4352,8 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI1BMI2-NEXT: movl %edi, %esi ; X86-BMI1BMI2-NEXT: xorl %edi, %edi ; X86-BMI1BMI2-NEXT: .LBB34_2: -; X86-BMI1BMI2-NEXT: movl $64, %ecx -; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1BMI2-NEXT: movb $64, %cl +; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1BMI2-NEXT: movl $-1, %ebx ; X86-BMI1BMI2-NEXT: shrxl %ecx, %ebx, %ebp ; X86-BMI1BMI2-NEXT: shrdl %cl, %ebx, %ebx @@ -4388,7 +4388,7 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-NOBMI-NEXT: movq (%rdi), %r14 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %r14 -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movq $-1, %rbx ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shrq %cl, %rbx @@ -4410,7 +4410,7 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-BMI1NOTBM-NEXT: movq (%rdi), %r14 ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1NOTBM-NEXT: shrq %cl, %r14 -; X64-BMI1NOTBM-NEXT: negl %edx +; X64-BMI1NOTBM-NEXT: negb %dl ; X64-BMI1NOTBM-NEXT: movq $-1, %rbx ; X64-BMI1NOTBM-NEXT: movl %edx, %ecx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rbx @@ -4431,7 +4431,7 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-BMI1BMI2-NEXT: movq %rdx, %rbx ; X64-BMI1BMI2-NEXT: shrxq %rsi, (%rdi), %r14 ; X64-BMI1BMI2-NEXT: movl %ebx, %eax -; X64-BMI1BMI2-NEXT: negl %eax +; X64-BMI1BMI2-NEXT: negb %al ; X64-BMI1BMI2-NEXT: movq $-1, %rcx ; X64-BMI1BMI2-NEXT: shrxq %rax, %rcx, %rdi ; X64-BMI1BMI2-NEXT: callq use64 @@ -4685,8 +4685,8 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-NOBMI-NEXT: movl %edi, %esi ; X86-NOBMI-NEXT: xorl %edi, %edi ; X86-NOBMI-NEXT: .LBB36_2: -; X86-NOBMI-NEXT: movl $64, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movb $64, %cl +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl $-1, %ebp ; X86-NOBMI-NEXT: movl $-1, %ebx ; X86-NOBMI-NEXT: shrl %cl, %ebx @@ -4732,8 +4732,8 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-BMI1NOTBM-NEXT: movl %edi, %esi ; X86-BMI1NOTBM-NEXT: xorl %edi, %edi ; X86-BMI1NOTBM-NEXT: .LBB36_2: -; X86-BMI1NOTBM-NEXT: movl $64, %ecx -; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movb $64, %cl +; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl $-1, %ebp ; X86-BMI1NOTBM-NEXT: movl $-1, %ebx ; X86-BMI1NOTBM-NEXT: shrl %cl, %ebx @@ -4778,8 +4778,8 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-BMI1BMI2-NEXT: movl %edi, %esi ; X86-BMI1BMI2-NEXT: xorl %edi, 
%edi ; X86-BMI1BMI2-NEXT: .LBB36_2: -; X86-BMI1BMI2-NEXT: movl $64, %ecx -; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1BMI2-NEXT: movb $64, %cl +; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1BMI2-NEXT: movl $-1, %ebx ; X86-BMI1BMI2-NEXT: shrxl %ecx, %ebx, %ebp ; X86-BMI1BMI2-NEXT: shrdl %cl, %ebx, %ebx @@ -4814,7 +4814,7 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X64-NOBMI-NEXT: movq %rdi, %r14 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %r14 -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movq $-1, %rbx ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shrq %cl, %rbx @@ -4836,7 +4836,7 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X64-BMI1NOTBM-NEXT: movq %rdi, %r14 ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1NOTBM-NEXT: shrq %cl, %r14 -; X64-BMI1NOTBM-NEXT: negl %edx +; X64-BMI1NOTBM-NEXT: negb %dl ; X64-BMI1NOTBM-NEXT: movq $-1, %rbx ; X64-BMI1NOTBM-NEXT: movl %edx, %ecx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rbx @@ -4857,7 +4857,7 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X64-BMI1BMI2-NEXT: movq %rdx, %rbx ; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %r14 ; X64-BMI1BMI2-NEXT: movl %ebx, %eax -; X64-BMI1BMI2-NEXT: negl %eax +; X64-BMI1BMI2-NEXT: negb %al ; X64-BMI1BMI2-NEXT: movq $-1, %rcx ; X64-BMI1BMI2-NEXT: shrxq %rax, %rcx, %rdi ; X64-BMI1BMI2-NEXT: callq use64 @@ -4894,8 +4894,8 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-NOBMI-NEXT: movl %edi, %esi ; X86-NOBMI-NEXT: xorl %edi, %edi ; X86-NOBMI-NEXT: .LBB37_2: -; X86-NOBMI-NEXT: movl $64, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movb $64, %cl +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl $-1, %ebx ; X86-NOBMI-NEXT: movl $-1, %ebp ; X86-NOBMI-NEXT: shrl %cl, %ebp @@ -4946,8 +4946,8 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-BMI1NOTBM-NEXT: movl %edi, %esi ; X86-BMI1NOTBM-NEXT: xorl %edi, %edi ; X86-BMI1NOTBM-NEXT: .LBB37_2: -; X86-BMI1NOTBM-NEXT: movl $64, %ecx -; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movb $64, %cl +; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl $-1, %ebx ; X86-BMI1NOTBM-NEXT: movl $-1, %ebp ; X86-BMI1NOTBM-NEXT: shrl %cl, %ebp @@ -4997,8 +4997,8 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-BMI1BMI2-NEXT: movl %edi, %esi ; X86-BMI1BMI2-NEXT: xorl %edi, %edi ; X86-BMI1BMI2-NEXT: .LBB37_2: -; X86-BMI1BMI2-NEXT: movl $64, %ecx -; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1BMI2-NEXT: movb $64, %cl +; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1BMI2-NEXT: movl $-1, %ebp ; X86-BMI1BMI2-NEXT: shrxl %ecx, %ebp, %ebx ; X86-BMI1BMI2-NEXT: shrdl %cl, %ebp, %ebp @@ -5038,7 +5038,7 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X64-NOBMI-NEXT: movq %rdi, %r15 ; X64-NOBMI-NEXT: movl %r14d, %ecx ; X64-NOBMI-NEXT: shrq %cl, %r15 -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movq $-1, %rbx ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shrq %cl, %rbx @@ -5062,7 +5062,7 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X64-BMI1NOTBM-NEXT: movq %rdi, %r15 ; X64-BMI1NOTBM-NEXT: movl %r14d, %ecx ; X64-BMI1NOTBM-NEXT: shrq 
%cl, %r15 -; X64-BMI1NOTBM-NEXT: negl %edx +; X64-BMI1NOTBM-NEXT: negb %dl ; X64-BMI1NOTBM-NEXT: movq $-1, %rbx ; X64-BMI1NOTBM-NEXT: movl %edx, %ecx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rbx @@ -5086,7 +5086,7 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X64-BMI1BMI2-NEXT: movq %rsi, %r14 ; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %r15 ; X64-BMI1BMI2-NEXT: movl %ebx, %eax -; X64-BMI1BMI2-NEXT: negl %eax +; X64-BMI1BMI2-NEXT: negb %al ; X64-BMI1BMI2-NEXT: movq $-1, %rcx ; X64-BMI1BMI2-NEXT: shrxq %rax, %rcx, %rdi ; X64-BMI1BMI2-NEXT: callq use64 @@ -5118,7 +5118,7 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: shrl %cl, %eax ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI-NEXT: shrl %cl, %eax @@ -5126,16 +5126,16 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; ; X86-BMI1NOTBM-LABEL: bextr32_d0: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1NOTBM-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-BMI1NOTBM-NEXT: shll $8, %eax -; X86-BMI1NOTBM-NEXT: orl %ecx, %eax -; X86-BMI1NOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBM-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: orl %eax, %ecx +; X86-BMI1NOTBM-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bextr32_d0: ; X86-BMI1BMI2: # %bb.0: -; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1BMI2-NEXT: shrxl %ecx, {{[0-9]+}}(%esp), %ecx ; X86-BMI1BMI2-NEXT: bzhil %eax, %ecx, %eax @@ -5147,7 +5147,7 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X64-NOBMI-NEXT: movl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shll %cl, %eax ; X64-NOBMI-NEXT: shrl %cl, %eax @@ -5245,7 +5245,7 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X86-NOBMI-NEXT: movl (%eax), %eax ; X86-NOBMI-NEXT: shrl %cl, %eax ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI-NEXT: shrl %cl, %eax @@ -5254,16 +5254,16 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X86-BMI1NOTBM-LABEL: bextr32_d2_load: ; X86-BMI1NOTBM: # %bb.0: ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI1NOTBM-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: shll $8, %ecx -; X86-BMI1NOTBM-NEXT: orl %edx, %ecx -; X86-BMI1NOTBM-NEXT: bextrl %ecx, (%eax), %eax +; X86-BMI1NOTBM-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-BMI1NOTBM-NEXT: orl %ecx, %edx +; X86-BMI1NOTBM-NEXT: bextrl %edx, (%eax), %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bextr32_d2_load: ; X86-BMI1BMI2: # %bb.0: -; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %dl ; X86-BMI1BMI2-NEXT: shrxl %edx, (%ecx), %ecx @@ -5276,7 +5276,7 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X64-NOBMI-NEXT: movl (%rdi), %eax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shll %cl, %eax ; X64-NOBMI-NEXT: shrl %cl, %eax @@ -5381,7 +5381,7 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-NOBMI-NEXT: movl %eax, %ecx ; X86-NOBMI-NEXT: shrl %cl, %esi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: shll %cl, %esi ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI-NEXT: shrl %cl, %esi @@ -5396,7 +5396,7 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-BMI1NOTBM: # %bb.0: ; X86-BMI1NOTBM-NEXT: pushl %esi ; X86-BMI1NOTBM-NEXT: subl $8, %esp -; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1NOTBM-NEXT: shll $8, %ecx ; X86-BMI1NOTBM-NEXT: movzbl %al, %edx @@ -5413,7 +5413,7 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-BMI1BMI2: # %bb.0: ; X86-BMI1BMI2-NEXT: pushl %esi ; X86-BMI1BMI2-NEXT: subl $8, %esp -; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI1BMI2-NEXT: shrxl %ecx, {{[0-9]+}}(%esp), %edx ; X86-BMI1BMI2-NEXT: bzhil %eax, %edx, %esi @@ -5430,7 +5430,7 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X64-NOBMI-NEXT: movl %edi, %ebx ; X64-NOBMI-NEXT: movl %esi, %ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx -; X64-NOBMI-NEXT: negl %edx +; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shll %cl, %ebx ; X64-NOBMI-NEXT: shrl %cl, %ebx @@ -5492,8 +5492,8 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %eax, %edi ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: .LBB43_2: -; X86-NOBMI-NEXT: movl $64, %ecx -; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movb $64, %cl +; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: shldl %cl, %edi, %eax ; X86-NOBMI-NEXT: shll %cl, %edi ; X86-NOBMI-NEXT: testb $32, %cl @@ -5540,8 +5540,8 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI1NOTBM-NEXT: movl %eax, %edi ; X86-BMI1NOTBM-NEXT: xorl %eax, %eax ; X86-BMI1NOTBM-NEXT: .LBB43_2: -; X86-BMI1NOTBM-NEXT: movl $64, %ecx -; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movb $64, %cl +; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: shldl %cl, %edi, %eax ; X86-BMI1NOTBM-NEXT: shll %cl, %edi ; X86-BMI1NOTBM-NEXT: testb $32, %cl @@ -5586,8 +5586,8 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI1BMI2-NEXT: movl %esi, %eax ; X86-BMI1BMI2-NEXT: xorl %esi, %esi ; X86-BMI1BMI2-NEXT: .LBB43_2: -; X86-BMI1BMI2-NEXT: movl $64, %ecx -; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-BMI1BMI2-NEXT: movb $64, %cl +; X86-BMI1BMI2-NEXT: subb 
{{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT: shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT: shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT: testb $32, %cl
@@ -5617,7 +5617,7 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT: movq %rdi, %rax
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT: shrq %cl, %rax
-; X64-NOBMI-NEXT: negl %edx
+; X64-NOBMI-NEXT: negb %dl
 ; X64-NOBMI-NEXT: movl %edx, %ecx
 ; X64-NOBMI-NEXT: shlq %cl, %rax
 ; X64-NOBMI-NEXT: shrq %cl, %rax
@@ -5838,8 +5838,8 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT: movl %eax, %edi
 ; X86-NOBMI-NEXT: xorl %eax, %eax
 ; X86-NOBMI-NEXT: .LBB45_2:
-; X86-NOBMI-NEXT: movl $64, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movb $64, %cl
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT: shll %cl, %edi
 ; X86-NOBMI-NEXT: testb $32, %cl
@@ -5887,8 +5887,8 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT: movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT: xorl %eax, %eax
 ; X86-BMI1NOTBM-NEXT: .LBB45_2:
-; X86-BMI1NOTBM-NEXT: movl $64, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: movb $64, %cl
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT: shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT: testb $32, %cl
@@ -5934,8 +5934,8 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT: movl %esi, %eax
 ; X86-BMI1BMI2-NEXT: xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT: .LBB45_2:
-; X86-BMI1BMI2-NEXT: movl $64, %ecx
-; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT: movb $64, %cl
+; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT: shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT: shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT: testb $32, %cl
@@ -5965,7 +5965,7 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-NOBMI-NEXT: movq (%rdi), %rax
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT: shrq %cl, %rax
-; X64-NOBMI-NEXT: negl %edx
+; X64-NOBMI-NEXT: negb %dl
 ; X64-NOBMI-NEXT: movl %edx, %ecx
 ; X64-NOBMI-NEXT: shlq %cl, %rax
 ; X64-NOBMI-NEXT: shrq %cl, %rax
@@ -6193,8 +6193,8 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT: movl %esi, %ebx
 ; X86-NOBMI-NEXT: xorl %esi, %esi
 ; X86-NOBMI-NEXT: .LBB47_2:
-; X86-NOBMI-NEXT: movl $64, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movb $64, %cl
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: shldl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT: shll %cl, %ebx
 ; X86-NOBMI-NEXT: testb $32, %cl
@@ -6254,8 +6254,8 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT: movl %esi, %ebx
 ; X86-BMI1NOTBM-NEXT: xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT: .LBB47_2:
-; X86-BMI1NOTBM-NEXT: movl $64, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: movb $64, %cl
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: shldl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT: shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT: testb $32, %cl
@@ -6312,8 +6312,8 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT: movl %edx, %edi
 ; X86-BMI1BMI2-NEXT: xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT: .LBB47_2:
-; X86-BMI1BMI2-NEXT: movl $64, %ecx
-; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT: movb $64, %cl
+; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT: shldl %cl, %edi, %edx
 ; X86-BMI1BMI2-NEXT: shlxl %ecx, %edi, %ebx
 ; X86-BMI1BMI2-NEXT: testb $32, %cl
@@ -6352,7 +6352,7 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-NOBMI-NEXT: movq %rdi, %rbx
 ; X64-NOBMI-NEXT: movl %esi, %ecx
 ; X64-NOBMI-NEXT: shrq %cl, %rbx
-; X64-NOBMI-NEXT: negl %edx
+; X64-NOBMI-NEXT: negb %dl
 ; X64-NOBMI-NEXT: movl %edx, %ecx
 ; X64-NOBMI-NEXT: shlq %cl, %rbx
 ; X64-NOBMI-NEXT: shrq %cl, %rbx
diff --git a/llvm/test/CodeGen/X86/extract-lowbits.ll b/llvm/test/CodeGen/X86/extract-lowbits.ll
index 8d18f29..ac85edd 100644
--- a/llvm/test/CodeGen/X86/extract-lowbits.ll
+++ b/llvm/test/CodeGen/X86/extract-lowbits.ll
@@ -1436,7 +1436,7 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT: pushl %esi
 ; X86-NOBMI-NEXT: subl $8, %esp
 ; X86-NOBMI-NEXT: xorl %ecx, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: movl $-1, %esi
 ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT: shrl %cl, %esi
@@ -1453,7 +1453,7 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT: pushl %esi
 ; X86-BMI1NOTBM-NEXT: subl $8, %esp
 ; X86-BMI1NOTBM-NEXT: xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT: shrl %cl, %esi
@@ -1467,18 +1467,18 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c0:
 ; X86-BMI1BMI2: # %bb.0:
-; X86-BMI1BMI2-NEXT: pushl %esi
+; X86-BMI1BMI2-NEXT: pushl %ebx
 ; X86-BMI1BMI2-NEXT: subl $8, %esp
-; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1BMI2-NEXT: movl %esi, %eax
-; X86-BMI1BMI2-NEXT: negl %eax
+; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT: movl %ebx, %eax
+; X86-BMI1BMI2-NEXT: negb %al
 ; X86-BMI1BMI2-NEXT: movl $-1, %ecx
 ; X86-BMI1BMI2-NEXT: shrxl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT: movl %eax, (%esp)
 ; X86-BMI1BMI2-NEXT: calll use32
-; X86-BMI1BMI2-NEXT: bzhil %esi, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT: bzhil %ebx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT: addl $8, %esp
-; X86-BMI1BMI2-NEXT: popl %esi
+; X86-BMI1BMI2-NEXT: popl %ebx
 ; X86-BMI1BMI2-NEXT: retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c0:
@@ -1488,7 +1488,7 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT: pushq %rax
 ; X64-NOBMI-NEXT: movl %esi, %ecx
 ; X64-NOBMI-NEXT: movl %edi, %ebx
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: movl $-1, %ebp
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT: shrl %cl, %ebp
@@ -1508,7 +1508,7 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT: pushq %rax
 ; X64-BMI1NOTBM-NEXT: movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT: movl %edi, %ebx
-; X64-BMI1NOTBM-NEXT: negl %ecx
+; X64-BMI1NOTBM-NEXT: negb %cl
 ; X64-BMI1NOTBM-NEXT: movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT: shrl %cl, %ebp
@@ -1528,8 +1528,8 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT: pushq %rax
 ; X64-BMI1BMI2-NEXT: movl %esi, %ebx
 ; X64-BMI1BMI2-NEXT: movl %edi, %ebp
-; X64-BMI1BMI2-NEXT: movl %esi, %eax
-; X64-BMI1BMI2-NEXT: negl %eax
+; X64-BMI1BMI2-NEXT: movl %ebx, %eax
+; X64-BMI1BMI2-NEXT: negb %al
 ; X64-BMI1BMI2-NEXT: movl $-1, %ecx
 ; X64-BMI1BMI2-NEXT: shrxl %eax, %ecx, %edi
 ; X64-BMI1BMI2-NEXT: callq use32
@@ -1668,7 +1668,7 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT: subl $8, %esp
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: xorl %ecx, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: movl $-1, %edx
 ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT: shrl %cl, %edx
@@ -1687,7 +1687,7 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT: subl $8, %esp
 ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT: xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT: shrl %cl, %edx
@@ -1705,9 +1705,10 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT: pushl %esi
 ; X86-BMI1BMI2-NEXT: subl $8, %esp
 ; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT: bzhil %ecx, (%eax), %esi
-; X86-BMI1BMI2-NEXT: negl %ecx
+; X86-BMI1BMI2-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-BMI1BMI2-NEXT: negb %cl
 ; X86-BMI1BMI2-NEXT: movl $-1, %eax
 ; X86-BMI1BMI2-NEXT: shrxl %ecx, %eax, %eax
 ; X86-BMI1BMI2-NEXT: movl %eax, (%esp)
@@ -1721,7 +1722,7 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X64-NOBMI: # %bb.0:
 ; X64-NOBMI-NEXT: pushq %rbx
 ; X64-NOBMI-NEXT: movl %esi, %ecx
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: movl $-1, %eax
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT: shrl %cl, %eax
@@ -1737,7 +1738,7 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM: # %bb.0:
 ; X64-BMI1NOTBM-NEXT: pushq %rbx
 ; X64-BMI1NOTBM-NEXT: movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT: negl %ecx
+; X64-BMI1NOTBM-NEXT: negb %cl
 ; X64-BMI1NOTBM-NEXT: movl $-1, %eax
 ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT: shrl %cl, %eax
@@ -1753,7 +1754,8 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X64-BMI1BMI2: # %bb.0:
 ; X64-BMI1BMI2-NEXT: pushq %rbx
 ; X64-BMI1BMI2-NEXT: bzhil %esi, (%rdi), %ebx
-; X64-BMI1BMI2-NEXT: negl %esi
+; X64-BMI1BMI2-NEXT: # kill: def $sil killed $sil killed $esi def $esi
+; X64-BMI1BMI2-NEXT: negb %sil
 ; X64-BMI1BMI2-NEXT: movl $-1, %eax
 ; X64-BMI1BMI2-NEXT: shrxl %esi, %eax, %edi
 ; X64-BMI1BMI2-NEXT: callq use32
@@ -1884,7 +1886,7 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT: pushl %esi
 ; X86-NOBMI-NEXT: subl $8, %esp
 ; X86-NOBMI-NEXT: xorl %ecx, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: movl $-1, %esi
 ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT: shrl %cl, %esi
@@ -1901,7 +1903,7 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT: pushl %esi
 ; X86-BMI1NOTBM-NEXT: subl $8, %esp
 ; X86-BMI1NOTBM-NEXT: xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT: shrl %cl, %esi
@@ -1915,18 +1917,18 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1BMI2: # %bb.0:
-; X86-BMI1BMI2-NEXT: pushl %esi
+; X86-BMI1BMI2-NEXT: pushl %ebx
 ; X86-BMI1BMI2-NEXT: subl $8, %esp
-; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1BMI2-NEXT: movl %esi, %eax
-; X86-BMI1BMI2-NEXT: negl %eax
+; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT: movl %ebx, %eax
+; X86-BMI1BMI2-NEXT: negb %al
 ; X86-BMI1BMI2-NEXT: movl $-1, %ecx
 ; X86-BMI1BMI2-NEXT: shrxl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT: movl %eax, (%esp)
 ; X86-BMI1BMI2-NEXT: calll use32
-; X86-BMI1BMI2-NEXT: bzhil %esi, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT: bzhil %ebx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT: addl $8, %esp
-; X86-BMI1BMI2-NEXT: popl %esi
+; X86-BMI1BMI2-NEXT: popl %ebx
 ; X86-BMI1BMI2-NEXT: retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c4_commutative:
@@ -1936,7 +1938,7 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT: pushq %rax
 ; X64-NOBMI-NEXT: movl %esi, %ecx
 ; X64-NOBMI-NEXT: movl %edi, %ebx
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: movl $-1, %ebp
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT: shrl %cl, %ebp
@@ -1956,7 +1958,7 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT: pushq %rax
 ; X64-BMI1NOTBM-NEXT: movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT: movl %edi, %ebx
-; X64-BMI1NOTBM-NEXT: negl %ecx
+; X64-BMI1NOTBM-NEXT: negb %cl
 ; X64-BMI1NOTBM-NEXT: movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT: shrl %cl, %ebp
@@ -1976,8 +1978,8 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT: pushq %rax
 ; X64-BMI1BMI2-NEXT: movl %esi, %ebx
 ; X64-BMI1BMI2-NEXT: movl %edi, %ebp
-; X64-BMI1BMI2-NEXT: movl %esi, %eax
-; X64-BMI1BMI2-NEXT: negl %eax
+; X64-BMI1BMI2-NEXT: movl %ebx, %eax
+; X64-BMI1BMI2-NEXT: negb %al
 ; X64-BMI1BMI2-NEXT: movl $-1, %ecx
 ; X64-BMI1BMI2-NEXT: shrxl %eax, %ecx, %edi
 ; X64-BMI1BMI2-NEXT: callq use32
@@ -2003,8 +2005,8 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT: pushl %edi
 ; X86-NOBMI-NEXT: pushl %esi
 ; X86-NOBMI-NEXT: pushl %eax
-; X86-NOBMI-NEXT: movl $64, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movb $64, %cl
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: movl $-1, %esi
 ; X86-NOBMI-NEXT: movl $-1, %edi
 ; X86-NOBMI-NEXT: shrl %cl, %edi
@@ -2034,8 +2036,8 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT: pushl %edi
 ; X86-BMI1NOTBM-NEXT: pushl %esi
 ; X86-BMI1NOTBM-NEXT: pushl %eax
-; X86-BMI1NOTBM-NEXT: movl $64, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: movb $64, %cl
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT: movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT: shrl %cl, %edi
@@ -2065,8 +2067,8 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT: pushl %edi
 ; X86-BMI1BMI2-NEXT: pushl %esi
 ; X86-BMI1BMI2-NEXT: pushl %eax
-; X86-BMI1BMI2-NEXT: movl $64, %ecx
-; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT: movb $64, %cl
+; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT: movl $-1, %esi
 ; X86-BMI1BMI2-NEXT: shrxl %ecx, %esi, %edi
 ; X86-BMI1BMI2-NEXT: shrdl %cl, %esi, %esi
@@ -2097,7 +2099,7 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT: pushq %rax
 ; X64-NOBMI-NEXT: movq %rsi, %rcx
 ; X64-NOBMI-NEXT: movq %rdi, %r14
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: movq $-1, %rbx
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT: shrq %cl, %rbx
@@ -2117,7 +2119,7 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT: pushq %rax
 ; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT: movq %rdi, %r14
-; X64-BMI1NOTBM-NEXT: negl %ecx
+; X64-BMI1NOTBM-NEXT: negb %cl
 ; X64-BMI1NOTBM-NEXT: movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT: shrq %cl, %rbx
@@ -2138,7 +2140,7 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT: movq %rsi, %rbx
 ; X64-BMI1BMI2-NEXT: movq %rdi, %r14
 ; X64-BMI1BMI2-NEXT: movl %ebx, %eax
-; X64-BMI1BMI2-NEXT: negl %eax
+; X64-BMI1BMI2-NEXT: negb %al
 ; X64-BMI1BMI2-NEXT: movq $-1, %rcx
 ; X64-BMI1BMI2-NEXT: shrxq %rax, %rcx, %rdi
 ; X64-BMI1BMI2-NEXT: callq use64
@@ -2318,26 +2320,26 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT: pushl %ebx
 ; X86-NOBMI-NEXT: pushl %edi
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT: movl $64, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT: movl $-1, %edx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT: movb $64, %cl
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT: movl $-1, %eax
 ; X86-NOBMI-NEXT: movl $-1, %ebx
 ; X86-NOBMI-NEXT: shrl %cl, %ebx
-; X86-NOBMI-NEXT: shrdl %cl, %edx, %edx
+; X86-NOBMI-NEXT: shrdl %cl, %eax, %eax
 ; X86-NOBMI-NEXT: testb $32, %cl
 ; X86-NOBMI-NEXT: je .LBB27_2
 ; X86-NOBMI-NEXT: # %bb.1:
-; X86-NOBMI-NEXT: movl %ebx, %edx
+; X86-NOBMI-NEXT: movl %ebx, %eax
 ; X86-NOBMI-NEXT: xorl %ebx, %ebx
 ; X86-NOBMI-NEXT: .LBB27_2:
-; X86-NOBMI-NEXT: movl (%eax), %esi
-; X86-NOBMI-NEXT: andl %edx, %esi
-; X86-NOBMI-NEXT: movl 4(%eax), %edi
+; X86-NOBMI-NEXT: movl (%edx), %esi
+; X86-NOBMI-NEXT: andl %eax, %esi
+; X86-NOBMI-NEXT: movl 4(%edx), %edi
 ; X86-NOBMI-NEXT: andl %ebx, %edi
 ; X86-NOBMI-NEXT: subl $8, %esp
 ; X86-NOBMI-NEXT: pushl %ebx
-; X86-NOBMI-NEXT: pushl %edx
+; X86-NOBMI-NEXT: pushl %eax
 ; X86-NOBMI-NEXT: calll use64
 ; X86-NOBMI-NEXT: addl $16, %esp
 ; X86-NOBMI-NEXT: movl %esi, %eax
@@ -2352,26 +2354,26 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT: pushl %ebx
 ; X86-BMI1NOTBM-NEXT: pushl %edi
 ; X86-BMI1NOTBM-NEXT: pushl %esi
-; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT: movl $64, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT: movl $-1, %edx
+; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT: movb $64, %cl
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT: movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT: movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT: shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT: shrdl %cl, %edx, %edx
+; X86-BMI1NOTBM-NEXT: shrdl %cl, %eax, %eax
 ; X86-BMI1NOTBM-NEXT: testb $32, %cl
 ; X86-BMI1NOTBM-NEXT: je .LBB27_2
 ; X86-BMI1NOTBM-NEXT: # %bb.1:
-; X86-BMI1NOTBM-NEXT: movl %ebx, %edx
+; X86-BMI1NOTBM-NEXT: movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT: xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT: .LBB27_2:
-; X86-BMI1NOTBM-NEXT: movl (%eax), %esi
-; X86-BMI1NOTBM-NEXT: andl %edx, %esi
-; X86-BMI1NOTBM-NEXT: movl 4(%eax), %edi
+; X86-BMI1NOTBM-NEXT: movl (%edx), %esi
+; X86-BMI1NOTBM-NEXT: andl %eax, %esi
+; X86-BMI1NOTBM-NEXT: movl 4(%edx), %edi
 ; X86-BMI1NOTBM-NEXT: andl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT: subl $8, %esp
 ; X86-BMI1NOTBM-NEXT: pushl %ebx
-; X86-BMI1NOTBM-NEXT: pushl %edx
+; X86-BMI1NOTBM-NEXT: pushl %eax
 ; X86-BMI1NOTBM-NEXT: calll use64
 ; X86-BMI1NOTBM-NEXT: addl $16, %esp
 ; X86-BMI1NOTBM-NEXT: movl %esi, %eax
@@ -2386,25 +2388,25 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT: pushl %ebx
 ; X86-BMI1BMI2-NEXT: pushl %edi
 ; X86-BMI1BMI2-NEXT: pushl %esi
-; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT: movl $64, %ecx
-; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT: movl $-1, %edx
-; X86-BMI1BMI2-NEXT: shrxl %ecx, %edx, %ebx
-; X86-BMI1BMI2-NEXT: shrdl %cl, %edx, %edx
+; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1BMI2-NEXT: movb $64, %cl
+; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT: movl $-1, %eax
+; X86-BMI1BMI2-NEXT: shrxl %ecx, %eax, %ebx
+; X86-BMI1BMI2-NEXT: shrdl %cl, %eax, %eax
 ; X86-BMI1BMI2-NEXT: testb $32, %cl
 ; X86-BMI1BMI2-NEXT: je .LBB27_2
 ; X86-BMI1BMI2-NEXT: # %bb.1:
-; X86-BMI1BMI2-NEXT: movl %ebx, %edx
+; X86-BMI1BMI2-NEXT: movl %ebx, %eax
 ; X86-BMI1BMI2-NEXT: xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT: .LBB27_2:
-; X86-BMI1BMI2-NEXT: movl (%eax), %esi
-; X86-BMI1BMI2-NEXT: andl %edx, %esi
-; X86-BMI1BMI2-NEXT: movl 4(%eax), %edi
+; X86-BMI1BMI2-NEXT: movl (%edx), %esi
+; X86-BMI1BMI2-NEXT: andl %eax, %esi
+; X86-BMI1BMI2-NEXT: movl 4(%edx), %edi
 ; X86-BMI1BMI2-NEXT: andl %ebx, %edi
 ; X86-BMI1BMI2-NEXT: subl $8, %esp
 ; X86-BMI1BMI2-NEXT: pushl %ebx
-; X86-BMI1BMI2-NEXT: pushl %edx
+; X86-BMI1BMI2-NEXT: pushl %eax
 ; X86-BMI1BMI2-NEXT: calll use64
 ; X86-BMI1BMI2-NEXT: addl $16, %esp
 ; X86-BMI1BMI2-NEXT: movl %esi, %eax
@@ -2418,7 +2420,7 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X64-NOBMI: # %bb.0:
 ; X64-NOBMI-NEXT: pushq %rbx
 ; X64-NOBMI-NEXT: movq %rsi, %rcx
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: movq $-1, %rax
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT: shrq %cl, %rax
@@ -2434,7 +2436,7 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM: # %bb.0:
 ; X64-BMI1NOTBM-NEXT: pushq %rbx
 ; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT: negl %ecx
+; X64-BMI1NOTBM-NEXT: negb %cl
 ; X64-BMI1NOTBM-NEXT: movq $-1, %rax
 ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT: shrq %cl, %rax
@@ -2450,8 +2452,8 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X64-BMI1BMI2: # %bb.0:
 ; X64-BMI1BMI2-NEXT: pushq %rbx
 ; X64-BMI1BMI2-NEXT: bzhiq %rsi, (%rdi), %rbx
-; X64-BMI1BMI2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
-; X64-BMI1BMI2-NEXT: negl %esi
+; X64-BMI1BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI1BMI2-NEXT: negb %sil
 ; X64-BMI1BMI2-NEXT: movq $-1, %rax
 ; X64-BMI1BMI2-NEXT: shrxq %rsi, %rax, %rdi
 ; X64-BMI1BMI2-NEXT: callq use64
@@ -2628,8 +2630,8 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT: pushl %edi
 ; X86-NOBMI-NEXT: pushl %esi
 ; X86-NOBMI-NEXT: pushl %eax
-; X86-NOBMI-NEXT: movl $64, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movb $64, %cl
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: movl $-1, %esi
 ; X86-NOBMI-NEXT: movl $-1, %edi
 ; X86-NOBMI-NEXT: shrl %cl, %edi
@@ -2659,8 +2661,8 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT: pushl %edi
 ; X86-BMI1NOTBM-NEXT: pushl %esi
 ; X86-BMI1NOTBM-NEXT: pushl %eax
-; X86-BMI1NOTBM-NEXT: movl $64, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: movb $64, %cl
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT: movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT: shrl %cl, %edi
@@ -2690,8 +2692,8 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT: pushl %edi
 ; X86-BMI1BMI2-NEXT: pushl %esi
 ; X86-BMI1BMI2-NEXT: pushl %eax
-; X86-BMI1BMI2-NEXT: movl $64, %ecx
-; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT: movb $64, %cl
+; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT: movl $-1, %esi
 ; X86-BMI1BMI2-NEXT: shrxl %ecx, %esi, %edi
 ; X86-BMI1BMI2-NEXT: shrdl %cl, %esi, %esi
@@ -2722,7 +2724,7 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT: pushq %rax
 ; X64-NOBMI-NEXT: movq %rsi, %rcx
 ; X64-NOBMI-NEXT: movq %rdi, %r14
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: movq $-1, %rbx
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT: shrq %cl, %rbx
@@ -2742,7 +2744,7 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT: pushq %rax
 ; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT: movq %rdi, %r14
-; X64-BMI1NOTBM-NEXT: negl %ecx
+; X64-BMI1NOTBM-NEXT: negb %cl
 ; X64-BMI1NOTBM-NEXT: movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT: shrq %cl, %rbx
@@ -2763,7 +2765,7 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT: movq %rsi, %rbx
 ; X64-BMI1BMI2-NEXT: movq %rdi, %r14
 ; X64-BMI1BMI2-NEXT: movl %ebx, %eax
-; X64-BMI1BMI2-NEXT: negl %eax
+; X64-BMI1BMI2-NEXT: negb %al
 ; X64-BMI1BMI2-NEXT: movq $-1, %rcx
 ; X64-BMI1BMI2-NEXT: shrxq %rax, %rcx, %rdi
 ; X64-BMI1BMI2-NEXT: callq use64
@@ -2788,7 +2790,7 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: xorl %ecx, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: shll %cl, %eax
 ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT: shrl %cl, %eax
@@ -2796,14 +2798,14 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X86-BMI1NOTBM: # %bb.0:
-; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT: shll $8, %eax
 ; X86-BMI1NOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT: retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d0:
 ; X86-BMI1BMI2: # %bb.0:
-; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1BMI2-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT: retl
 ;
@@ -2811,7 +2813,7 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; X64-NOBMI: # %bb.0:
 ; X64-NOBMI-NEXT: movl %esi, %ecx
 ; X64-NOBMI-NEXT: movl %edi, %eax
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: shll %cl, %eax
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT: shrl %cl, %eax
@@ -2890,7 +2892,7 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: movl (%eax), %eax
 ; X86-NOBMI-NEXT: xorl %ecx, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: shll %cl, %eax
 ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT: shrl %cl, %eax
@@ -2899,7 +2901,7 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X86-BMI1NOTBM: # %bb.0:
 ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: shll $8, %ecx
 ; X86-BMI1NOTBM-NEXT: bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT: retl
@@ -2907,15 +2909,15 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1BMI2-LABEL: bzhi32_d2_load:
 ; X86-BMI1BMI2: # %bb.0:
 ; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT: bzhil %eax, (%ecx), %eax
+; X86-BMI1BMI2-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT: bzhil %ecx, (%eax), %eax
 ; X86-BMI1BMI2-NEXT: retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_d2_load:
 ; X64-NOBMI: # %bb.0:
 ; X64-NOBMI-NEXT: movl %esi, %ecx
 ; X64-NOBMI-NEXT: movl (%rdi), %eax
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: shll %cl, %eax
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT: shrl %cl, %eax
@@ -3003,8 +3005,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT: pushl %esi
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT: movl $64, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movb $64, %cl
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: movl %edx, %esi
 ; X86-NOBMI-NEXT: shll %cl, %esi
 ; X86-NOBMI-NEXT: shldl %cl, %edx, %eax
@@ -3042,8 +3044,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT: pushl %esi
 ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT: movl $64, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: movb $64, %cl
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT: shll %cl, %esi
 ; X86-BMI1NOTBM-NEXT: shldl %cl, %edx, %eax
@@ -3080,8 +3082,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT: pushl %esi
 ; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1BMI2-NEXT: movl $64, %ecx
-; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT: movb $64, %cl
+; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT: shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT: shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT: xorl %edx, %edx
@@ -3110,7 +3112,7 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X64-NOBMI: # %bb.0:
 ; X64-NOBMI-NEXT: movq %rsi, %rcx
 ; X64-NOBMI-NEXT: movq %rdi, %rax
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: shlq %cl, %rax
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT: shrq %cl, %rax
@@ -3281,8 +3283,8 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: movl (%eax), %edx
 ; X86-NOBMI-NEXT: movl 4(%eax), %eax
-; X86-NOBMI-NEXT: movl $64, %ecx
-; X86-NOBMI-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movb $64, %cl
+; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT: movl %edx, %esi
 ; X86-NOBMI-NEXT: shll %cl, %esi
 ; X86-NOBMI-NEXT: shldl %cl, %edx, %eax
@@ -3321,8 +3323,8 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT: movl (%eax), %edx
 ; X86-BMI1NOTBM-NEXT: movl 4(%eax), %eax
-; X86-BMI1NOTBM-NEXT: movl $64, %ecx
-; X86-BMI1NOTBM-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT: movb $64, %cl
+; X86-BMI1NOTBM-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT: movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT: shll %cl, %esi
 ; X86-BMI1NOTBM-NEXT: shldl %cl, %edx, %eax
@@ -3360,8 +3362,8 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT: movl (%eax), %edx
 ; X86-BMI1BMI2-NEXT: movl 4(%eax), %esi
-; X86-BMI1BMI2-NEXT: movl $64, %ecx
-; X86-BMI1BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT: movb $64, %cl
+; X86-BMI1BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT: shldl %cl, %edx, %esi
 ; X86-BMI1BMI2-NEXT: shlxl %ecx, %edx, %edi
 ; X86-BMI1BMI2-NEXT: xorl %edx, %edx
@@ -3390,7 +3392,7 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X64-NOBMI: # %bb.0:
 ; X64-NOBMI-NEXT: movq %rsi, %rcx
 ; X64-NOBMI-NEXT: movq (%rdi), %rax
-; X64-NOBMI-NEXT: negl %ecx
+; X64-NOBMI-NEXT: negb %cl
 ; X64-NOBMI-NEXT: shlq %cl, %rax
 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT: shrq %cl, %rax
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 0ee936f..b161763 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -203,8 +203,8 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST-NEXT: movl %edi, %ebp
 ; X86-FAST-NEXT: xorl %edi, %edi
 ; X86-FAST-NEXT: .LBB3_2:
-; X86-FAST-NEXT: movl $64, %ecx
-; X86-FAST-NEXT: subl %ebx, %ecx
+; X86-FAST-NEXT: movb $64, %cl
+; X86-FAST-NEXT: subb %bl, %cl
 ; X86-FAST-NEXT: movl %edx, %esi
 ; X86-FAST-NEXT: shrl %cl, %esi
 ; X86-FAST-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
@@ -242,59 +242,58 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT: pushl %edi
 ; X86-SLOW-NEXT: pushl %esi
 ; X86-SLOW-NEXT: subl $8, %esp
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-SLOW-NEXT: andl $63, %ebx
-; X86-SLOW-NEXT: movl $64, %ecx
-; X86-SLOW-NEXT: subl %ebx, %ecx
+; X86-SLOW-NEXT: movb $64, %dh
+; X86-SLOW-NEXT: subb %bl, %dh
 ; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movb %dh, %cl
 ; X86-SLOW-NEXT: shrl %cl, %eax
-; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movb %cl, %ch
-; X86-SLOW-NEXT: andb $31, %ch
-; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: movb %dh, %dl
+; X86-SLOW-NEXT: andb $31, %dl
+; X86-SLOW-NEXT: movl %edx, %ecx
 ; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %edi, %esi
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testb %ch, %ch
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movl %esi, %ebp
+; X86-SLOW-NEXT: shll %cl, %ebp
+; X86-SLOW-NEXT: testb %dl, %dl
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SLOW-NEXT: je .LBB3_2
 ; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %eax, %esi
-; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: orl %eax, %ebp
+; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill
 ; X86-SLOW-NEXT: .LBB3_2:
-; X86-SLOW-NEXT: movl %edx, %eax
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT: movl %ebp, %eax
 ; X86-SLOW-NEXT: movl %ebx, %ecx
 ; X86-SLOW-NEXT: shll %cl, %eax
 ; X86-SLOW-NEXT: movb %bl, %ch
 ; X86-SLOW-NEXT: andb $31, %ch
 ; X86-SLOW-NEXT: movb %ch, %cl
 ; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %edx, %esi
-; X86-SLOW-NEXT: movl %ebp, %edx
-; X86-SLOW-NEXT: shrl %cl, %ebp
+; X86-SLOW-NEXT: shrl %cl, %edi
 ; X86-SLOW-NEXT: testb %ch, %ch
 ; X86-SLOW-NEXT: je .LBB3_4
 ; X86-SLOW-NEXT: # %bb.3:
-; X86-SLOW-NEXT: orl %ebp, %eax
-; X86-SLOW-NEXT: movl %eax, %esi
+; X86-SLOW-NEXT: orl %edi, %eax
+; X86-SLOW-NEXT: movl %eax, %ebp
 ; X86-SLOW-NEXT: .LBB3_4:
-; X86-SLOW-NEXT: movl %edx, %eax
-; X86-SLOW-NEXT: movl %edx, %ebp
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movl %eax, %edi
 ; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shll %cl, %ebp
+; X86-SLOW-NEXT: shll %cl, %edi
 ; X86-SLOW-NEXT: testb $32, %bl
 ; X86-SLOW-NEXT: je .LBB3_6
 ; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: movl %ebp, %esi
-; X86-SLOW-NEXT: xorl %ebp, %ebp
+; X86-SLOW-NEXT: movl %edi, %ebp
+; X86-SLOW-NEXT: xorl %edi, %edi
 ; X86-SLOW-NEXT: .LBB3_6:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: testb $32, %cl
+; X86-SLOW-NEXT: movb %dh, %cl
+; X86-SLOW-NEXT: shrl %cl, %esi
+; X86-SLOW-NEXT: testb $32, %dh
 ; X86-SLOW-NEXT: jne .LBB3_7
 ; X86-SLOW-NEXT: # %bb.8:
 ; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload
@@ -302,16 +301,17 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT: jne .LBB3_10
 ; X86-SLOW-NEXT: jmp .LBB3_11
 ; X86-SLOW-NEXT: .LBB3_7:
-; X86-SLOW-NEXT: movl %edi, %ecx
-; X86-SLOW-NEXT: xorl %edi, %edi
+; X86-SLOW-NEXT: movl %esi, %ecx
+; X86-SLOW-NEXT: xorl %esi, %esi
 ; X86-SLOW-NEXT: testl %ebx, %ebx
 ; X86-SLOW-NEXT: je .LBB3_11
 ; X86-SLOW-NEXT: .LBB3_10:
-; X86-SLOW-NEXT: orl %edi, %esi
-; X86-SLOW-NEXT: orl %ecx, %ebp
-; X86-SLOW-NEXT: movl %esi, %edx
-; X86-SLOW-NEXT: movl %ebp, %eax
+; X86-SLOW-NEXT: orl %esi, %ebp
+; X86-SLOW-NEXT: orl %ecx, %edi
+; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %edi, %eax
 ; X86-SLOW-NEXT: .LBB3_11:
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-SLOW-NEXT: addl $8, %esp
 ; X86-SLOW-NEXT: popl %esi
 ; X86-SLOW-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index f8968f3..10bf28d 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -187,17 +187,17 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST-NEXT: pushl %eax
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-FAST-NEXT: andl $63, %ebx
-; X86-FAST-NEXT: movl $64, %ecx
-; X86-FAST-NEXT: subl %ebx, %ecx
+; X86-FAST-NEXT: movb $64, %cl
+; X86-FAST-NEXT: subb %bl, %cl
 ; X86-FAST-NEXT: movl %eax, %edi
 ; X86-FAST-NEXT: shll %cl, %edi
 ; X86-FAST-NEXT: shldl %cl, %eax, %esi
 ; X86-FAST-NEXT: testb $32, %cl
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-FAST-NEXT: je .LBB3_2
 ; X86-FAST-NEXT: # %bb.1:
 ; X86-FAST-NEXT: movl %edi, %esi
@@ -237,12 +237,14 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT: pushl %edi
 ; X86-SLOW-NEXT: pushl %esi
 ; X86-SLOW-NEXT: subl $8, %esp
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-SLOW-NEXT: andl $63, %ebx
-; X86-SLOW-NEXT: movl $64, %eax
-; X86-SLOW-NEXT: subl %ebx, %eax
+; X86-SLOW-NEXT: movb $64, %al
+; X86-SLOW-NEXT: subb %bl, %al
 ; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X86-SLOW-NEXT: movl %eax, %ecx
 ; X86-SLOW-NEXT: shll %cl, %edx
@@ -250,45 +252,43 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT: andb $31, %ch
 ; X86-SLOW-NEXT: movb %ch, %cl
 ; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %edi, %ebp
-; X86-SLOW-NEXT: shrl %cl, %ebp
+; X86-SLOW-NEXT: movl %esi, %edi
+; X86-SLOW-NEXT: shrl %cl, %edi
 ; X86-SLOW-NEXT: testb %ch, %ch
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-SLOW-NEXT: je .LBB3_2
 ; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %ebp, %edx
+; X86-SLOW-NEXT: orl %edi, %edx
 ; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X86-SLOW-NEXT: .LBB3_2:
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT: movl %ecx, %edx
 ; X86-SLOW-NEXT: movl %ebx, %ecx
 ; X86-SLOW-NEXT: shrl %cl, %edx
-; X86-SLOW-NEXT: movb %bl, %ch
-; X86-SLOW-NEXT: andb $31, %ch
-; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: movb %bl, %ah
+; X86-SLOW-NEXT: andb $31, %ah
+; X86-SLOW-NEXT: movb %ah, %cl
 ; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %esi, %ebp
-; X86-SLOW-NEXT: shll %cl, %ebp
-; X86-SLOW-NEXT: testb %ch, %ch
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movl %ebp, %edi
+; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: testb %ah, %ah
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-SLOW-NEXT: je .LBB3_4
 ; X86-SLOW-NEXT: # %bb.3:
-; X86-SLOW-NEXT: orl %edx, %ebp
-; X86-SLOW-NEXT: movl %ebp, %esi
+; X86-SLOW-NEXT: orl %edx, %edi
+; X86-SLOW-NEXT: movl %edi, %ebp
 ; X86-SLOW-NEXT: .LBB3_4:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shrl %cl, %ebp
+; X86-SLOW-NEXT: shrl %cl, %edi
 ; X86-SLOW-NEXT: testb $32, %bl
 ; X86-SLOW-NEXT: je .LBB3_6
 ; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: movl %ebp, %esi
-; X86-SLOW-NEXT: xorl %ebp, %ebp
+; X86-SLOW-NEXT: movl %edi, %ebp
+; X86-SLOW-NEXT: xorl %edi, %edi
 ; X86-SLOW-NEXT: .LBB3_6:
 ; X86-SLOW-NEXT: movl %eax, %ecx
-; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: shll %cl, %esi
 ; X86-SLOW-NEXT: testb $32, %al
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT: jne .LBB3_7
@@ -298,14 +298,14 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT: jne .LBB3_10
 ; X86-SLOW-NEXT: jmp .LBB3_11
 ; X86-SLOW-NEXT: .LBB3_7:
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: xorl %edi, %edi
+; X86-SLOW-NEXT: movl %esi, %eax
+; X86-SLOW-NEXT: xorl %esi, %esi
 ; X86-SLOW-NEXT: testl %ebx, %ebx
 ; X86-SLOW-NEXT: je .LBB3_11
 ; X86-SLOW-NEXT: .LBB3_10:
-; X86-SLOW-NEXT: orl %esi, %edi
-; X86-SLOW-NEXT: orl %ebp, %eax
-; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: orl %ebp, %esi
+; X86-SLOW-NEXT: orl %edi, %eax
+; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SLOW-NEXT: movl %eax, %edx
 ; X86-SLOW-NEXT: .LBB3_11:
 ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/funnel-shift-rot.ll b/llvm/test/CodeGen/X86/funnel-shift-rot.ll
index 19e75ab..e07d34f9 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-rot.ll
@@ -231,30 +231,25 @@ define i64 @rotr_i64(i64 %x, i64 %z) nounwind {
 ; X32-SSE2-NEXT: pushl %ebx
 ; X32-SSE2-NEXT: pushl %edi
 ; X32-SSE2-NEXT: pushl %esi
+; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-SSE2-NEXT: movl %ebx, %ecx
-; X32-SSE2-NEXT: andl $63, %ecx
 ; X32-SSE2-NEXT: movl %edx, %edi
 ; X32-SSE2-NEXT: shrl %cl, %edi
-; X32-SSE2-NEXT: movl %esi, %ebp
-; X32-SSE2-NEXT: shrdl %cl, %edx, %ebp
-; X32-SSE2-NEXT: xorl %eax, %eax
+; X32-SSE2-NEXT: movl %esi, %ebx
+; X32-SSE2-NEXT: shrdl %cl, %edx, %ebx
+; X32-SSE2-NEXT: xorl %ebp, %ebp
 ; X32-SSE2-NEXT: testb $32, %cl
-; X32-SSE2-NEXT: cmovnel %edi, %ebp
-; X32-SSE2-NEXT: cmovnel %eax, %edi
-; X32-SSE2-NEXT: negl %ebx
-; X32-SSE2-NEXT: andl $63, %ebx
+; X32-SSE2-NEXT: cmovnel %edi, %ebx
+; X32-SSE2-NEXT: cmovnel %ebp, %edi
+; X32-SSE2-NEXT: negb %cl
 ; X32-SSE2-NEXT: movl %esi, %eax
-; X32-SSE2-NEXT: movl %ebx, %ecx
 ; X32-SSE2-NEXT: shll %cl, %eax
 ; X32-SSE2-NEXT: shldl %cl, %esi, %edx
-; X32-SSE2-NEXT: testb $32, %bl
+; X32-SSE2-NEXT: testb $32, %cl
 ; X32-SSE2-NEXT: cmovnel %eax, %edx
-; X32-SSE2-NEXT: movl $0, %ecx
-; X32-SSE2-NEXT: cmovnel %ecx, %eax
-; X32-SSE2-NEXT: orl %ebp, %eax
+; X32-SSE2-NEXT: cmovnel %ebp, %eax
+; X32-SSE2-NEXT: orl %ebx, %eax
 ; X32-SSE2-NEXT: orl %edi, %edx
 ; X32-SSE2-NEXT: popl %esi
 ; X32-SSE2-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index b421725..0969d6d 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -70,8 +70,8 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X32-SSE2-NEXT: cmovnel %ebp, %eax
 ; X32-SSE2-NEXT: cmovnel %ecx, %ebp
 ; X32-SSE2-NEXT: xorl %edx, %edx
-; X32-SSE2-NEXT: movl $37, %ecx
-; X32-SSE2-NEXT: subl %ebx, %ecx
+; X32-SSE2-NEXT: movb $37, %cl
+; X32-SSE2-NEXT: subb %bl, %cl
 ; X32-SSE2-NEXT: shrdl %cl, %esi, %edi
 ; X32-SSE2-NEXT: shrl %cl, %esi
 ; X32-SSE2-NEXT: testb $32, %cl
@@ -248,8 +248,8 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X32-SSE2-NEXT: calll __umoddi3
 ; X32-SSE2-NEXT: addl $16, %esp
 ; X32-SSE2-NEXT: movl %eax, %ebx
-; X32-SSE2-NEXT: movl $37, %ecx
-; X32-SSE2-NEXT: subl %eax, %ecx
+; X32-SSE2-NEXT: movb $37, %cl
+; X32-SSE2-NEXT: subb %bl, %cl
 ; X32-SSE2-NEXT: movl %ebp, %eax
 ; X32-SSE2-NEXT: shll %cl, %ebp
 ; X32-SSE2-NEXT: shldl %cl, %eax, %edi
diff --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll
index 3998fce..acbdf0e 100644
--- a/llvm/test/CodeGen/X86/pr32284.ll
+++ b/llvm/test/CodeGen/X86/pr32284.ll
@@ -81,9 +81,8 @@ define void @foo() {
 ; 686-NEXT: movzbl c, %eax
 ; 686-NEXT: xorl %ecx, %ecx
 ; 686-NEXT: testl %eax, %eax
-; 686-NEXT: setne %cl
-; 686-NEXT: testb %al, %al
 ; 686-NEXT: setne {{[0-9]+}}(%esp)
+; 686-NEXT: setne %cl
 ; 686-NEXT: xorl %edx, %edx
 ; 686-NEXT: cmpl %eax, %ecx
 ; 686-NEXT: setle %dl
diff --git a/llvm/test/CodeGen/X86/pr37879.ll b/llvm/test/CodeGen/X86/pr37879.ll
index 1dc5e325..64cf3ea 100644
--- a/llvm/test/CodeGen/X86/pr37879.ll
+++ b/llvm/test/CodeGen/X86/pr37879.ll
@@ -6,8 +6,6 @@ define double @foo(i32** nocapture readonly) #0 {
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: movq (%rax), %rax
 ; CHECK-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
-; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax
-; CHECK-NEXT: andl $1, %eax
 ; CHECK-NEXT: kmovd %eax, %k1
 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
diff --git a/llvm/test/CodeGen/X86/rot16.ll b/llvm/test/CodeGen/X86/rot16.ll
index 3b2a01b..69ea701 100644
--- a/llvm/test/CodeGen/X86/rot16.ll
+++ b/llvm/test/CodeGen/X86/rot16.ll
@@ -15,7 +15,7 @@ define i16 @foo(i16 %x, i16 %y, i16 %z) nounwind {
 ; X64-NEXT: movl %edx, %ecx
 ; X64-NEXT: movl %edi, %eax
 ; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shldw %cl, %ax, %ax
+; X64-NEXT: rolw %cl, %ax
 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
 ; X64-NEXT: retq
 %t0 = shl i16 %x, %z
@@ -62,7 +62,7 @@ define i16 @un(i16 %x, i16 %y, i16 %z) nounwind {
 ; X64-NEXT: movl %edx, %ecx
 ; X64-NEXT: movl %edi, %eax
 ; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shrdw %cl, %ax, %ax
+; X64-NEXT: rorw %cl, %ax
 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
 ; X64-NEXT: retq
 %t0 = lshr i16 %x, %z
diff --git a/llvm/test/CodeGen/X86/rotate.ll b/llvm/test/CodeGen/X86/rotate.ll
index 7f287eb..0d92e26 100644
--- a/llvm/test/CodeGen/X86/rotate.ll
+++ b/llvm/test/CodeGen/X86/rotate.ll
@@ -653,29 +653,29 @@ define i64 @truncated_rot(i64 %x, i32 %amt) nounwind {
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: shll %cl, %eax
 ; X86-NEXT: testb $32, %cl
-; X86-NEXT: movl $0, %ebx
+; X86-NEXT: movl $0, %edi
 ; X86-NEXT: jne .LBB28_2
 ; X86-NEXT: # %bb.1: # %entry
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %edi
 ; X86-NEXT: .LBB28_2: # %entry
-; X86-NEXT: movl $64, %edx
-; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movb $64, %dl
+; X86-NEXT: subb %cl, %dl
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebx, %eax
 ; X86-NEXT: movl %edx, %ecx
 ; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: shrdl %cl, %ebx, %esi
 ; X86-NEXT: testb $32, %dl
 ; X86-NEXT: jne .LBB28_4
 ; X86-NEXT: # %bb.3: # %entry
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: .LBB28_4: # %entry
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl %edi, %eax
 ; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/rotate4.ll b/llvm/test/CodeGen/X86/rotate4.ll
index 5347fec..92a98c9 100644
--- a/llvm/test/CodeGen/X86/rotate4.ll
+++ b/llvm/test/CodeGen/X86/rotate4.ll
@@ -65,9 +65,9 @@ define i64 @rotate_left_64(i64 %a, i64 %b) {
 ; X86-NEXT: .cfi_offset %esi, -16
 ; X86-NEXT: .cfi_offset %edi, -12
 ; X86-NEXT: .cfi_offset %ebx, -8
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: shll %cl, %eax
 ; X86-NEXT: movl %edi, %edx
@@ -78,7 +78,7 @@ define i64 @rotate_left_64(i64 %a, i64 %b) {
 ; X86-NEXT: movl %eax, %edx
 ; X86-NEXT: xorl %eax, %eax
 ; X86-NEXT: .LBB2_2:
-; X86-NEXT: negl %ecx
+; X86-NEXT: negb %cl
 ; X86-NEXT: movl %edi, %ebx
 ; X86-NEXT: shrl %cl, %ebx
 ; X86-NEXT: shrdl %cl, %edi, %esi
@@ -126,9 +126,9 @@ define i64 @rotate_right_64(i64 %a, i64 %b) {
 ; X86-NEXT: .cfi_offset %esi, -16
 ; X86-NEXT: .cfi_offset %edi, -12
 ; X86-NEXT: .cfi_offset %ebx, -8
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl %esi, %edx
 ; X86-NEXT: shrl %cl, %edx
 ; X86-NEXT: movl %edi, %eax
@@ -139,7 +139,7 @@ define i64 @rotate_right_64(i64 %a, i64 %b) {
 ; X86-NEXT: movl %edx, %eax
 ; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: .LBB3_2:
-; X86-NEXT: negl %ecx
+; X86-NEXT: negb %cl
 ; X86-NEXT: movl %edi, %ebx
 ; X86-NEXT: shll %cl, %ebx
 ; X86-NEXT: shldl %cl, %edi, %esi
@@ -242,7 +242,7 @@ define void @rotate_left_m64(i64 *%pa, i64 %b) {
 ; X86-NEXT: .cfi_offset %edi, -16
 ; X86-NEXT: .cfi_offset %ebx, -12
 ; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl (%eax), %edx
 ; X86-NEXT: movl 4(%eax), %ebx
@@ -256,7 +256,7 @@ define void @rotate_left_m64(i64 *%pa, i64 %b) {
 ; X86-NEXT: movl %esi, %edi
 ; X86-NEXT: xorl %esi, %esi
 ; X86-NEXT: .LBB6_2:
-; X86-NEXT: negl %ecx
+; X86-NEXT: negb %cl
 ; X86-NEXT: movl %ebx, %ebp
 ; X86-NEXT: shrl %cl, %ebp
 ; X86-NEXT: shrdl %cl, %ebx, %edx
@@ -312,33 +312,33 @@ define void @rotate_right_m64(i64 *%pa, i64 %b) {
 ; X86-NEXT: .cfi_offset %edi, -16
 ; X86-NEXT: .cfi_offset %ebx, -12
 ; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl (%eax), %ebx
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: movl 4(%eax), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrl %cl, %edx
 ; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: shrdl %cl, %esi, %edi
 ; X86-NEXT: testb $32, %cl
 ; X86-NEXT: je .LBB7_2
 ; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: .LBB7_2:
-; X86-NEXT: negl %ecx
+; X86-NEXT: negb %cl
 ; X86-NEXT: movl %ebx, %ebp
 ; X86-NEXT: shll %cl, %ebp
-; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: shldl %cl, %ebx, %esi
 ; X86-NEXT: testb $32, %cl
 ; X86-NEXT: je .LBB7_4
 ; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %ebp, %edx
+; X86-NEXT: movl %ebp, %esi
 ; X86-NEXT: xorl %ebp, %ebp
 ; X86-NEXT: .LBB7_4:
-; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %esi, %edx
 ; X86-NEXT: orl %ebp, %edi
-; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
 ; X86-NEXT: movl %edi, (%eax)
 ; X86-NEXT: popl %esi
 ; X86-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/schedule-x86-64-shld.ll b/llvm/test/CodeGen/X86/schedule-x86-64-shld.ll
index 541a3fa..b0faf4c 100644
--- a/llvm/test/CodeGen/X86/schedule-x86-64-shld.ll
+++ b/llvm/test/CodeGen/X86/schedule-x86-64-shld.ll
@@ -170,7 +170,7 @@ define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BDVER12-NEXT: movq %rdx, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT: movq %rsi, %rax # sched: [1:0.50]
 ; BDVER12-NEXT: shlq %cl, %rdi # sched: [1:0.50]
-; BDVER12-NEXT: negl %ecx # sched: [1:0.50]
+; BDVER12-NEXT: negb %cl # sched: [1:0.50]
 ; BDVER12-NEXT: # kill: def $cl killed $cl killed $rcx
 ; BDVER12-NEXT: shrq %cl, %rax # sched: [1:0.50]
 ; BDVER12-NEXT: orq %rdi, %rax # sched: [1:0.50]
@@ -181,7 +181,7 @@ define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BTVER2-NEXT: movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT: movq %rsi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT: shlq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT: negl %ecx # sched: [1:0.50]
+; BTVER2-NEXT: negb %cl # sched: [1:0.50]
 ; BTVER2-NEXT: # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT: shrq %cl, %rax # sched: [1:0.50]
 ; BTVER2-NEXT: orq %rdi, %rax # sched: [1:0.50]
@@ -246,7 +246,7 @@ define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BDVER12-NEXT: movq %rdx, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT: movq %rsi, %rax # sched: [1:0.50]
 ; BDVER12-NEXT: shrq %cl, %rdi # sched: [1:0.50]
-; BDVER12-NEXT: negl %ecx # sched: [1:0.50]
+; BDVER12-NEXT: negb %cl # sched: [1:0.50]
 ; BDVER12-NEXT: # kill: def $cl killed $cl killed $rcx
 ; BDVER12-NEXT: shlq %cl, %rax # sched: [1:0.50]
 ; BDVER12-NEXT: orq %rdi, %rax # sched: [1:0.50]
@@ -257,7 +257,7 @@ define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BTVER2-NEXT: movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT: movq %rsi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT: shrq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT: negl %ecx # sched: [1:0.50]
+; BTVER2-NEXT: negb %cl # sched: [1:0.50]
 ; BTVER2-NEXT: # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT: shlq %cl, %rax # sched: [1:0.50]
 ; BTVER2-NEXT: orq %rdi, %rax # sched: [1:0.50]
@@ -321,7 +321,7 @@ define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ; BDVER12-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50]
 ; BDVER12-NEXT: movq %rsi, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT: shlq %cl, %rax # sched: [1:0.50]
-; BDVER12-NEXT: negl %ecx # sched: [1:0.50]
+; BDVER12-NEXT: negb %cl # sched: [1:0.50]
 ; BDVER12-NEXT: # kill: def $cl killed $cl killed $rcx
 ; BDVER12-NEXT: shrq %cl, %rdi # sched: [1:0.50]
 ; BDVER12-NEXT: orq %rax, %rdi # sched: [1:0.50]
@@ -333,7 +333,7 @@ define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ; BTVER2-NEXT: movq {{.*}}(%rip), %rax # sched: [5:1.00]
 ; BTVER2-NEXT: movq %rsi, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT: shlq %cl, %rax # sched: [1:0.50]
-; BTVER2-NEXT: negl %ecx # sched: [1:0.50]
+; BTVER2-NEXT: negb %cl # sched: [1:0.50]
 ; BTVER2-NEXT: # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT: shrq %cl, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT: orq %rax, %rdi # sched: [1:0.50]
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index f726aaf..c08bfbf 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -16,43 +16,45 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT: pushq %rbx
 ; ILP-NEXT: movq %rdi, %rax
 ; ILP-NEXT: xorl %r8d, %r8d
-; ILP-NEXT: leal 3(%rsi,%rsi), %r11d
+; ILP-NEXT: incl %esi
+; ILP-NEXT: addb %sil, %sil
+; ILP-NEXT: orb $1, %sil
 ; ILP-NEXT: movl $1, %r9d
 ; ILP-NEXT: xorl %r14d, %r14d
-; ILP-NEXT: movl %r11d, %ecx
+; ILP-NEXT: movl %esi, %ecx
 ; ILP-NEXT: shldq %cl, %r9, %r14
+; ILP-NEXT: movl $1, %edx
+; ILP-NEXT: shlq %cl, %rdx
+; ILP-NEXT: movl %esi, %r11d
+; ILP-NEXT: addb $-128, %r11b
+; ILP-NEXT: movb $-128, %r10b
+; ILP-NEXT: xorl %ebx, %ebx
+; ILP-NEXT: movl %r11d, %ecx
+; ILP-NEXT: shldq %cl, %r9, %rbx
+; ILP-NEXT: testb $64, %sil
+; ILP-NEXT: cmovneq %rdx, %r14
+; ILP-NEXT: cmovneq %r8, %rdx
 ; ILP-NEXT: movl $1, %edi
 ; ILP-NEXT: shlq %cl, %rdi
-; ILP-NEXT: movb $-128, %r10b
-; ILP-NEXT: subb %r11b, %r10b
-; ILP-NEXT: movl %r11d, %edx
-; ILP-NEXT: addb $-128, %dl
-; ILP-NEXT: xorl %esi, %esi
-; ILP-NEXT: movl %edx, %ecx
-; ILP-NEXT: shldq %cl, %r9, %rsi
-; ILP-NEXT: movl $1, %ebx
-; ILP-NEXT: shlq %cl, %rbx
+; ILP-NEXT: subb %sil, %r10b
 ; ILP-NEXT: movl %r10d, %ecx
 ; ILP-NEXT: shrdq %cl, %r8, %r9
-; ILP-NEXT: testb $64, %r11b
-; ILP-NEXT: cmovneq %rdi, %r14
-; ILP-NEXT: cmovneq %r8, %rdi
 ; ILP-NEXT: testb $64, %r10b
 ; ILP-NEXT: cmovneq %r8, %r9
-; ILP-NEXT: testb $64, %dl
-; ILP-NEXT: cmovneq %rbx, %rsi
-; ILP-NEXT: cmovneq %r8, %rbx
-; ILP-NEXT: testb %r11b, %r11b
+; ILP-NEXT: testb $64, %r11b
+; ILP-NEXT: cmovneq %rdi, %rbx
+; ILP-NEXT: cmovneq %r8, %rdi
+; ILP-NEXT: testb %sil, %sil
 ; ILP-NEXT: cmovsq %r8, %r14
-; ILP-NEXT: cmovsq %r8, %rdi
+; ILP-NEXT: cmovsq %r8, %rdx
 ; ILP-NEXT: movq %r14, 8(%rax)
-; ILP-NEXT: movq %rdi, (%rax)
-; ILP-NEXT: cmovnsq %r8, %rsi
-; ILP-NEXT: cmoveq %r8, %rsi
-; ILP-NEXT: movq %rsi, 24(%rax)
-; ILP-NEXT: cmovnsq %r9, %rbx
+; ILP-NEXT: movq %rdx, (%rax)
+; ILP-NEXT: cmovnsq %r8, %rbx
 ; ILP-NEXT: cmoveq %r8, %rbx
-; ILP-NEXT: movq %rbx, 16(%rax)
+; ILP-NEXT: movq %rbx, 24(%rax)
+; ILP-NEXT: cmovnsq %r9, %rdi
+; ILP-NEXT: cmoveq %r8, %rdi
+; ILP-NEXT: movq %rdi, 16(%rax)
 ; ILP-NEXT: popq %rbx
 ; ILP-NEXT: popq %r14
 ; ILP-NEXT: retq
@@ -60,126 +62,132 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-LABEL: test1:
 ; HYBRID: # %bb.0:
 ; HYBRID-NEXT: movq %rdi, %rax
-; HYBRID-NEXT: leal 3(%rsi,%rsi), %r10d
+; HYBRID-NEXT: incl %esi
+; HYBRID-NEXT: addb %sil, %sil
+; HYBRID-NEXT: orb $1, %sil
 ; HYBRID-NEXT: movb $-128, %cl
-; HYBRID-NEXT: subb %r10b, %cl
+; HYBRID-NEXT: subb %sil, %cl
 ; HYBRID-NEXT: xorl %r8d, %r8d
-; HYBRID-NEXT: movl $1, %esi
+; HYBRID-NEXT: movl $1, %r11d
 ; HYBRID-NEXT: movl $1, %r9d
 ; HYBRID-NEXT: shrdq %cl, %r8, %r9
 ; HYBRID-NEXT: testb $64, %cl
 ; HYBRID-NEXT: cmovneq %r8, %r9
-; HYBRID-NEXT: xorl %r11d, %r11d
-; HYBRID-NEXT: movl %r10d, %ecx
-; HYBRID-NEXT: shldq %cl, %rsi, %r11
+; HYBRID-NEXT: xorl %r10d, %r10d
+; HYBRID-NEXT: movl %esi, %ecx
+; HYBRID-NEXT: shldq %cl, %r11, %r10
 ; HYBRID-NEXT: addb $-128, %cl
-; HYBRID-NEXT: xorl %edx, %edx
-; HYBRID-NEXT: shldq %cl, %rsi, %rdx
-; HYBRID-NEXT: movl $1, %edi
-; HYBRID-NEXT: shlq %cl, %rdi
+; HYBRID-NEXT: xorl %edi, %edi
+; HYBRID-NEXT: shldq %cl, %r11, %rdi
+; HYBRID-NEXT: movl $1, %edx
+; HYBRID-NEXT: shlq %cl, %rdx
 ; HYBRID-NEXT: testb $64, %cl
-; HYBRID-NEXT: cmovneq %rdi, %rdx
-; HYBRID-NEXT: cmovneq %r8, %rdi
-; HYBRID-NEXT: movl %r10d, %ecx
-; HYBRID-NEXT: shlq %cl, %rsi
-; HYBRID-NEXT: testb $64, %r10b
-; HYBRID-NEXT: cmovneq %rsi, %r11
-; HYBRID-NEXT: cmovneq %r8, %rsi
-; HYBRID-NEXT: testb %r10b, %r10b
+; HYBRID-NEXT: cmovneq %rdx, %rdi
+; HYBRID-NEXT: cmovneq %r8, %rdx
+; HYBRID-NEXT: movl %esi, %ecx
+; HYBRID-NEXT: shlq %cl, %r11
+; HYBRID-NEXT: testb $64, %sil
+; HYBRID-NEXT: cmovneq %r11, %r10
+; HYBRID-NEXT: cmovneq %r8, %r11
+; HYBRID-NEXT: testb %sil, %sil
+; HYBRID-NEXT: cmovsq %r8, %r10
+; HYBRID-NEXT: movq %r10, 8(%rax)
 ; HYBRID-NEXT: cmovsq %r8, %r11
-; HYBRID-NEXT: movq %r11, 8(%rax)
-; HYBRID-NEXT: cmovsq %r8, %rsi
-; HYBRID-NEXT: movq %rsi, (%rax)
-; HYBRID-NEXT: cmovnsq %r8, %rdx
-; HYBRID-NEXT: cmoveq %r8, %rdx
-; HYBRID-NEXT: movq %rdx, 24(%rax)
-; HYBRID-NEXT: cmovnsq %r9, %rdi
+; HYBRID-NEXT: movq %r11, (%rax)
+; HYBRID-NEXT: cmovnsq %r8, %rdi
 ; HYBRID-NEXT: cmoveq %r8, %rdi
-; HYBRID-NEXT: movq %rdi, 16(%rax)
+; HYBRID-NEXT: movq %rdi, 24(%rax)
+; HYBRID-NEXT: cmovnsq %r9, %rdx
+; HYBRID-NEXT: cmoveq %r8, %rdx
+; HYBRID-NEXT: movq %rdx, 16(%rax)
 ; HYBRID-NEXT: retq
 ;
 ; BURR-LABEL: test1:
 ; BURR: # %bb.0:
 ; BURR-NEXT: movq %rdi, %rax
-; BURR-NEXT: leal 3(%rsi,%rsi), %r10d
+; BURR-NEXT: incl %esi
+; BURR-NEXT: addb %sil, %sil
+; BURR-NEXT: orb $1, %sil
 ; BURR-NEXT: movb $-128, %cl
-; BURR-NEXT: subb %r10b, %cl
+; BURR-NEXT: subb %sil, %cl
 ; BURR-NEXT: xorl %r8d, %r8d
-; BURR-NEXT: movl $1, %esi
+; BURR-NEXT: movl $1, %r11d
 ; BURR-NEXT: movl $1, %r9d
 ; BURR-NEXT: shrdq %cl, %r8, %r9
 ; BURR-NEXT: testb $64, %cl
 ; BURR-NEXT: cmovneq %r8, %r9
-; BURR-NEXT: xorl %r11d, %r11d
-; BURR-NEXT: movl %r10d, %ecx
-; BURR-NEXT: shldq %cl, %rsi, %r11
+; BURR-NEXT: xorl %r10d, %r10d
+; BURR-NEXT: movl %esi, %ecx
+; BURR-NEXT: shldq %cl, %r11, %r10
 ; BURR-NEXT: addb $-128, %cl
-; BURR-NEXT: xorl %edx, %edx
-; BURR-NEXT: shldq %cl, %rsi, %rdx
-; BURR-NEXT: movl $1, %edi
-; BURR-NEXT: shlq %cl, %rdi
+; BURR-NEXT: xorl %edi, %edi
+; BURR-NEXT: shldq %cl, %r11, %rdi
+; BURR-NEXT: movl $1, %edx
+; BURR-NEXT: shlq %cl, %rdx
 ; BURR-NEXT: testb $64, %cl
-; BURR-NEXT: cmovneq %rdi, %rdx
-; BURR-NEXT: cmovneq %r8, %rdi
-; BURR-NEXT: movl %r10d, %ecx
-; BURR-NEXT: shlq %cl, %rsi
-; BURR-NEXT: testb $64, %r10b
-; BURR-NEXT: cmovneq %rsi, %r11
-; BURR-NEXT: cmovneq %r8, %rsi
-; BURR-NEXT: testb %r10b, %r10b
+; BURR-NEXT: cmovneq %rdx, %rdi
+; BURR-NEXT: cmovneq %r8, %rdx
+; BURR-NEXT: movl %esi, %ecx
+; BURR-NEXT: shlq %cl, %r11
+; BURR-NEXT: testb $64, %sil
+; BURR-NEXT: cmovneq %r11, %r10
+; BURR-NEXT: cmovneq %r8, %r11
+; BURR-NEXT: testb %sil, %sil
+; BURR-NEXT: cmovsq %r8, %r10
+; BURR-NEXT: movq %r10, 8(%rax)
 ; BURR-NEXT: cmovsq %r8, %r11
-; BURR-NEXT: movq %r11, 8(%rax)
-; BURR-NEXT: cmovsq %r8, %rsi
-; BURR-NEXT: movq %rsi, (%rax)
-; BURR-NEXT: cmovnsq %r8, %rdx
-; BURR-NEXT: cmoveq %r8, %rdx
-; BURR-NEXT: movq %rdx, 24(%rax)
-; BURR-NEXT: cmovnsq %r9, %rdi
+; BURR-NEXT: movq %r11, (%rax)
+; BURR-NEXT: cmovnsq %r8, %rdi
 ; BURR-NEXT: cmoveq %r8, %rdi
-; BURR-NEXT: movq %rdi, 16(%rax)
+; BURR-NEXT: movq %rdi, 24(%rax)
+; BURR-NEXT: cmovnsq %r9, %rdx
+; BURR-NEXT: cmoveq %r8, %rdx
+; BURR-NEXT: movq %rdx, 16(%rax)
 ; BURR-NEXT: retq
 ;
 ; SRC-LABEL: test1:
 ; SRC: # %bb.0:
 ; SRC-NEXT: pushq %rbx
 ; SRC-NEXT: movq %rdi, %rax
-; SRC-NEXT: leal 3(%rsi,%rsi), %r9d
+; SRC-NEXT: incl %esi
+; SRC-NEXT: addb %sil, %sil
+; SRC-NEXT: orb $1, %sil
 ; SRC-NEXT: movb $-128, %cl
-; SRC-NEXT: subb %r9b, %cl
+; SRC-NEXT: subb %sil, %cl
 ; SRC-NEXT: xorl %r8d, %r8d
 ; SRC-NEXT: movl $1, %edi
 ; SRC-NEXT: movl $1, %r10d
 ; SRC-NEXT: shrdq %cl, %r8, %r10
 ; SRC-NEXT: testb $64, %cl
 ; SRC-NEXT: cmovneq %r8, %r10
-; SRC-NEXT: movl %r9d, %r11d
-; SRC-NEXT: addb $-128, %r11b
-; SRC-NEXT: xorl %esi, %esi
-; SRC-NEXT: movl %r11d, %ecx
-; SRC-NEXT: shldq %cl, %rdi, %rsi
+; SRC-NEXT: movl %esi, %r9d
+; SRC-NEXT: addb $-128, %r9b
 ; SRC-NEXT: xorl %edx, %edx
 ; SRC-NEXT: movl %r9d, %ecx
 ; SRC-NEXT: shldq %cl, %rdi, %rdx
+; SRC-NEXT: xorl %r11d, %r11d
+; SRC-NEXT: movl %esi, %ecx
+; SRC-NEXT: shldq %cl, %rdi, %r11
 ; SRC-NEXT: movl $1, %ebx
 ; SRC-NEXT: shlq %cl, %rbx
-; SRC-NEXT: testb $64, %r9b
-; SRC-NEXT: cmovneq %rbx, %rdx
+; SRC-NEXT: testb $64, %sil
+; SRC-NEXT: cmovneq %rbx, %r11
 ; SRC-NEXT: cmovneq %r8, %rbx
-; SRC-NEXT: movl %r11d, %ecx
+; SRC-NEXT: movl %r9d, %ecx
 ; SRC-NEXT: shlq %cl, %rdi
-; SRC-NEXT: testb $64, %r11b
-; SRC-NEXT: cmovneq %rdi, %rsi
+; SRC-NEXT: testb $64, %r9b
+; SRC-NEXT: cmovneq %rdi, %rdx
 ; SRC-NEXT: cmovneq %r8, %rdi
-; SRC-NEXT: testb %r9b, %r9b
+; SRC-NEXT: testb %sil, %sil
 ; SRC-NEXT: cmovnsq %r10, %rdi
 ; SRC-NEXT: cmoveq %r8, %rdi
-; SRC-NEXT: cmovnsq %r8, %rsi
-; SRC-NEXT: cmoveq %r8, %rsi
-; SRC-NEXT: cmovsq %r8, %rdx
+; SRC-NEXT: cmovnsq %r8, %rdx
+; SRC-NEXT: cmoveq %r8, %rdx
+; SRC-NEXT: cmovsq %r8, %r11
 ; SRC-NEXT: cmovsq %r8, %rbx
-; SRC-NEXT: movq %rdx, 8(%rax)
+; SRC-NEXT: movq %r11, 8(%rax)
 ; SRC-NEXT: movq %rbx, (%rax)
-; SRC-NEXT: movq %rsi, 24(%rax)
+; SRC-NEXT: movq %rdx, 24(%rax)
 ; SRC-NEXT: movq %rdi, 16(%rax)
 ; SRC-NEXT: popq %rbx
 ; SRC-NEXT: retq
@@ -189,46 +197,48 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT: movq %rdi, %rax
 ; LIN-NEXT: xorl %r9d, %r9d
 ; LIN-NEXT: movl $1, %r8d
-; LIN-NEXT: leal 3(%rsi,%rsi), %edx
-; LIN-NEXT: movl $1, %esi
-; LIN-NEXT: movl %edx, %ecx
-; LIN-NEXT: shlq %cl, %rsi
-; LIN-NEXT: testb $64, %dl
-; LIN-NEXT: movq %rsi, %rcx
+; LIN-NEXT: incl %esi
+; LIN-NEXT: addb %sil, %sil
+; LIN-NEXT: orb $1, %sil
+; LIN-NEXT: movl $1, %edx
+; LIN-NEXT: movl %esi, %ecx
+; LIN-NEXT: shlq %cl, %rdx
+; LIN-NEXT: testb $64, %sil
+; LIN-NEXT: movq %rdx, %rcx
 ; LIN-NEXT: cmovneq %r9, %rcx
-; LIN-NEXT: testb %dl, %dl
+; LIN-NEXT: testb %sil, %sil
 ; LIN-NEXT: cmovsq %r9, %rcx
 ; LIN-NEXT: movq %rcx, (%rdi)
 ; LIN-NEXT: xorl %edi, %edi
-; LIN-NEXT: movl %edx, %ecx
+; LIN-NEXT: movl %esi, %ecx
 ; LIN-NEXT: shldq %cl, %r8, %rdi
-; LIN-NEXT: cmovneq %rsi, %rdi
+; LIN-NEXT: cmovneq %rdx, %rdi
 ; LIN-NEXT: cmovsq %r9, %rdi
 ; LIN-NEXT: movq %rdi, 8(%rax)
-; LIN-NEXT: movl %edx, %esi
-; LIN-NEXT: addb $-128, %sil
+; LIN-NEXT: movl %esi, %edx
+; LIN-NEXT: addb $-128, %dl
 ; LIN-NEXT: movl $1, %r10d
-; LIN-NEXT: movl %esi, %ecx
+; LIN-NEXT: movl %edx, %ecx
 ; LIN-NEXT: shlq %cl, %r10
-; LIN-NEXT: testb $64, %sil
+; LIN-NEXT: testb $64, %dl
 ; LIN-NEXT: movq %r10, %rdi
 ; LIN-NEXT: cmovneq %r9, %rdi
 ; LIN-NEXT: movb $-128, %cl
-; LIN-NEXT: subb %dl, %cl
-; LIN-NEXT: movl $1, %edx
-; LIN-NEXT: shrdq %cl, %r9, %rdx
+; LIN-NEXT: subb %sil, %cl
+; LIN-NEXT: movl $1, %esi
+; LIN-NEXT: shrdq %cl, %r9, %rsi
 ; LIN-NEXT: testb $64, %cl
-; LIN-NEXT: cmovneq %r9, %rdx
-; LIN-NEXT: cmovsq %rdi, %rdx
-; LIN-NEXT: cmoveq %r9, %rdx
-; LIN-NEXT: movq %rdx, 16(%rax)
-; LIN-NEXT: xorl %edx, %edx
-; LIN-NEXT: movl %esi, %ecx
-; LIN-NEXT: shldq %cl, %r8, %rdx
-; LIN-NEXT: cmovneq %r10, %rdx
-; LIN-NEXT: cmovnsq %r9, %rdx
-; LIN-NEXT: cmoveq %r9, %rdx
-; LIN-NEXT: movq %rdx, 24(%rax)
+; LIN-NEXT: cmovneq %r9, %rsi
+; LIN-NEXT: cmovsq %rdi, %rsi
+; LIN-NEXT: cmoveq %r9, %rsi
+; LIN-NEXT: movq %rsi, 16(%rax)
+; LIN-NEXT: xorl %esi, %esi
+; LIN-NEXT: movl %edx, %ecx
+; LIN-NEXT: shldq %cl, %r8, %rsi
+; LIN-NEXT: cmovneq %r10, %rsi
+; LIN-NEXT: cmovnsq %r9, %rsi
+; LIN-NEXT: cmoveq %r9, %rsi
+; LIN-NEXT: movq %rsi, 24(%rax)
 ; LIN-NEXT: retq
 %b = add i256 %a, 1
 %m = shl i256 %b, 1
diff --git a/llvm/test/CodeGen/X86/test-shrink.ll b/llvm/test/CodeGen/X86/test-shrink.ll
index 5e9f495..993b78c 100644
--- a/llvm/test/CodeGen/X86/test-shrink.ll
+++ b/llvm/test/CodeGen/X86/test-shrink.ll
@@ -590,7 +590,6 @@ define void @and16_trunc_8_sign(i16 %x) nounwind {
 ; CHECK-WIN32-64-LABEL: and16_trunc_8_sign:
 ; CHECK-WIN32-64: # %bb.0:
 ; CHECK-WIN32-64-NEXT: subq $40, %rsp
-; CHECK-WIN32-64-NEXT: # kill: def $cx killed $cx def $ecx
 ; CHECK-WIN32-64-NEXT: testb $-128, %cl
 ; CHECK-WIN32-64-NEXT: jg .LBB13_2
 ; CHECK-WIN32-64-NEXT: # %bb.1: # %yes
@@ -601,8 +600,7 @@ define void @and16_trunc_8_sign(i16 %x) nounwind {
 ;
 ; CHECK-X86-LABEL: and16_trunc_8_sign:
 ; CHECK-X86: # %bb.0:
-; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-X86-NEXT: testb $-128, %al
+; CHECK-X86-NEXT: testb $-128, {{[0-9]+}}(%esp)
 ; CHECK-X86-NEXT: jg .LBB13_2
 ; CHECK-X86-NEXT: # %bb.1: # %yes
 ; CHECK-X86-NEXT: calll bar
@@ -733,8 +731,8 @@ define void @and32_trunc_16_sign(i32 %x) nounwind {
 ;
 ; CHECK-X86-LABEL: and32_trunc_16_sign:
 ; CHECK-X86: # %bb.0:
-; CHECK-X86-NEXT: movl $32768, %eax # imm = 0x8000
-; CHECK-X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT: andl $32768, %eax # imm = 0x8000
 ; CHECK-X86-NEXT: testw %ax, %ax
 ; CHECK-X86-NEXT: jg .LBB16_2
 ; CHECK-X86-NEXT: # %bb.1: # %yes
@@ -778,7 +776,8 @@ define void @and32_trunc_16_sign_minsize(i32 %x) minsize nounwind {
 ;
 ; CHECK-X86-LABEL: and32_trunc_16_sign_minsize:
 ; CHECK-X86: # %bb.0:
-; CHECK-X86-NEXT: testw $-32768, {{[0-9]+}}(%esp) # imm = 0x8000
+; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT: testw $-32768, %ax # imm = 0x8000
 ; CHECK-X86-NEXT: jg .LBB17_2
 ; CHECK-X86-NEXT: # %bb.1: # %yes
 ; CHECK-X86-NEXT: calll bar
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll
index 38e2a6b..1cda401 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll
@@ -1340,49 +1340,39 @@ define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
 define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movq %rax, %xmm2
-; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT: psubq %xmm2, %xmm0
-; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT: vzeroupper
 ; AVX2-SLOW-NEXT: retq
 ;
 ; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT: vzeroupper
 ; AVX2-FAST-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %1 = sub <4 x i64> %a0,
@@ -1393,53 +1383,38 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movq %rax, %xmm4
-; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE-NEXT: psubq %xmm4, %xmm0
-; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
-; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
-; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
 ;
AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -1447,28 +1422,26 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i64> %a0, @@ -1479,41 +1452,38 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v8i32_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 -; SSE-NEXT: psubd {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 
killed $xmm0 killed $ymm0 +; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i32> %a0, @@ -1524,17 +1494,6 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v16i64_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movq %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: psubq %xmm8, %xmm0 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm2 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm3 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm4 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm5 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm6 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -1551,51 +1510,38 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm4 -; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321] -; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 -; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; 
AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -1615,15 +1561,12 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 @@ -1640,17 +1583,17 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <16 x i64> %a0, @@ -1661,10 +1604,6 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v16i32_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 -; SSE-NEXT: psubd {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubd {{.*}}(%rip), %xmm2 -; SSE-NEXT: psubd {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -1673,31 +1612,27 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, 
%xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -1707,13 +1642,14 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <16 x i32> %a0, @@ -1724,56 +1660,52 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v16i16_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 -; SSE-NEXT: psubw {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpsubb {{.*}}(%rip), %xmm0, 
%xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i16> %a0, diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index f358a69..b06f329 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -1340,49 +1340,39 @@ define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) { define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: psubq %xmm2, %xmm0 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: 
retq %1 = sub <4 x i64> %a0, @@ -1393,53 +1383,38 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movq %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: psubq %xmm4, %xmm0 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm2 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -1447,28 +1422,26 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i64> %a0, @@ -1479,41 +1452,38 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v8i32_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 -; SSE-NEXT: psubd {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: 
vpsubw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i32> %a0, @@ -1524,17 +1494,6 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v16i64_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movq %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: psubq %xmm8, %xmm0 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm2 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm3 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm4 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm5 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm6 -; SSE-NEXT: psubq {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -1551,51 +1510,38 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm4 -; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321] -; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 -; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; 
AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -1615,15 +1561,12 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 @@ -1640,17 +1583,17 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <16 x i64> %a0, @@ -1661,10 +1604,6 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v16i32_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 -; SSE-NEXT: psubd {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubd {{.*}}(%rip), %xmm2 -; SSE-NEXT: psubd {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -1673,31 +1612,27 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm0, 
%xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -1707,13 +1642,14 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <16 x i32> %a0, @@ -1724,56 +1660,52 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v16i16_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 -; SSE-NEXT: psubw {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: 
retq ; ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i16> %a0, diff --git a/llvm/test/CodeGen/X86/xchg-nofold.ll b/llvm/test/CodeGen/X86/xchg-nofold.ll index 2e24f8b..673fa74d 100644 --- a/llvm/test/CodeGen/X86/xchg-nofold.ll +++ b/llvm/test/CodeGen/X86/xchg-nofold.ll @@ -17,7 +17,7 @@ define zeroext i1 @_Z3fooRSt6atomicIbEb(%"struct.std::atomic"* nocapture derefer ; CHECK-NEXT: je .LBB0_3 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: andl $7, %edx +; CHECK-NEXT: andb $7, %dl ; CHECK-NEXT: cmpb %cl, %dl ; CHECK-NEXT: jge .LBB0_2 ; CHECK-NEXT: .LBB0_3: -- 2.7.4
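
Note on the test churn above: the vector-trunc deltas all follow one shape. Previously the constant subtract was performed in the wide type (e.g. a pair of psubq on each <4 x i64> half) and the result truncated; with the combine re-enabled, the truncate is emitted first and a single narrow op (psubd/psubw/psubb) does the arithmetic, as in the shufps + psubd sequence at the top of vector-trunc-math-widen.ll. A minimal standalone reproducer follows; it is a sketch, not copied from the test files, and the constant vector is illustrative (the checked-in tests elide it here):

; build with something like: llc -mtriple=x86_64-unknown-unknown reduced.ll -o -
; trunc (sub %a0, C) is now combined to sub (trunc %a0), (trunc C) before
; legalization, so SSE codegen is one shufps plus one psubd instead of
; materializing C as <4 x i64> and doing two wide psubq ops.
define <4 x i32> @trunc_sub_const(<4 x i64> %a0) nounwind {
  %s = sub <4 x i64> %a0, <i64 1, i64 2, i64 3, i64 4>
  %t = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %t
}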