I was looking at a potential DAGCombiner fix for one of the regressions in D60278, and it caused severe regression test pain because x86 TLI lies about the desirability of 8-bit shift ops.
We've hinted at making all 8-bit ops undesirable for the reason in the code comment:
// TODO: Almost no 8-bit ops are desirable because they have no actual
// size/speed advantages vs. 32-bit ops, but they do have a major
// potential disadvantage by causing partial register stalls.
...but that leads to massive diffs and exposes all kinds of optimization holes itself.
Differential Revision: https://reviews.llvm.org/D60286
llvm-svn: 357912
if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
return false;
- // 8-bit multiply is probably not much cheaper than 32-bit multiply, and
- // we have specializations to turn 32-bit multiply into LEA or other ops.
+ // TODO: Almost no 8-bit ops are desirable because they have no actual
+ // size/speed advantages vs. 32-bit ops, but they do have a major
+ // potential disadvantage by causing partial register stalls.
+ //
+ // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
+ // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
// Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
// check for a constant operand to the multiply.
- if (Opc == ISD::MUL && VT == MVT::i8)
+ if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
return false;
// i16 instruction encodings are longer and some i16 instructions are slow,
define zeroext i1 @demanded_with_known_zeroes(i32 %bit, i32 %bits) {
; X86-LABEL: demanded_with_known_zeroes:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT: shlb $2, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: shlb $2, %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: btl %eax, %ecx
; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: demanded_with_known_zeroes:
; X64: # %bb.0: # %entry
-; X64-NEXT: shlb $2, %dil
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: btl %eax, %esi
+; X64-NEXT: shll $2, %edi
+; X64-NEXT: btl %edi, %esi
; X64-NEXT: setb %al
; X64-NEXT: retq
entry:
; X64-LABEL: btr_32_mask_zeros:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shlb $2, %sil
+; X64-NEXT: shll $2, %esi
; X64-NEXT: btrl %esi, %eax
; X64-NEXT: retq
;
; X86-LABEL: btr_32_mask_zeros:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: shlb $2, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: btrl %ecx, %eax
; X86-NEXT: retl
%1 = shl i32 %n, 2
; X64-LABEL: bts_32_mask_zeros:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shlb $2, %sil
+; X64-NEXT: shll $2, %esi
; X64-NEXT: btsl %esi, %eax
; X64-NEXT: retq
;
; X86-LABEL: bts_32_mask_zeros:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: shlb $2, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: btsl %ecx, %eax
; X86-NEXT: retl
%1 = shl i32 %n, 2
; X64-LABEL: btc_32_mask_zeros:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shlb $2, %sil
+; X64-NEXT: shll $2, %esi
; X64-NEXT: btcl %esi, %eax
; X64-NEXT: retq
;
; X86-LABEL: btc_32_mask_zeros:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: shlb $2, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: btcl %ecx, %eax
; X86-NEXT: retl
%1 = shl i32 %n, 2
; X64-LABEL: btr_64_mask_zeros:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shlb $2, %sil
+; X64-NEXT: shlq $2, %rsi
; X64-NEXT: btrq %rsi, %rax
; X64-NEXT: retq
;
; X86-LABEL: btr_64_mask_zeros:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT: shlb $2, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $2, %ecx
; X86-NEXT: movl $1, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: shldl %cl, %eax, %edx
; X64-LABEL: bts_64_mask_zeros:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shlb $2, %sil
+; X64-NEXT: shlq $2, %rsi
; X64-NEXT: btsq %rsi, %rax
; X64-NEXT: retq
;
; X86-LABEL: bts_64_mask_zeros:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT: shlb $2, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $2, %ecx
; X86-NEXT: movl $1, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: shldl %cl, %eax, %edx
; X64-LABEL: btc_64_mask_zeros:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shlb $2, %sil
+; X64-NEXT: shlq $2, %rsi
; X64-NEXT: btcq %rsi, %rax
; X64-NEXT: retq
;
; X86-LABEL: btc_64_mask_zeros:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT: shlb $2, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $2, %ecx
; X86-NEXT: movl $1, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: shldl %cl, %eax, %edx
define i32 @rotate_demanded_bits_3(i32, i32) {
; X86-LABEL: rotate_demanded_bits_3:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: addb %cl, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: roll %cl, %eax
; X86-NEXT: retl
;
; ILP-NEXT: pushq %rbx
; ILP-NEXT: movq %rdi, %rax
; ILP-NEXT: xorl %r8d, %r8d
-; ILP-NEXT: addb %sil, %sil
+; ILP-NEXT: addq %rsi, %rsi
; ILP-NEXT: addb $2, %sil
; ILP-NEXT: orb $1, %sil
; ILP-NEXT: movl $1, %r10d
; HYBRID-LABEL: test1:
; HYBRID: # %bb.0:
; HYBRID-NEXT: movq %rdi, %rax
-; HYBRID-NEXT: addb %sil, %sil
+; HYBRID-NEXT: addq %rsi, %rsi
; HYBRID-NEXT: addb $2, %sil
; HYBRID-NEXT: orb $1, %sil
; HYBRID-NEXT: movb $-128, %cl
; BURR-LABEL: test1:
; BURR: # %bb.0:
; BURR-NEXT: movq %rdi, %rax
-; BURR-NEXT: addb %sil, %sil
+; BURR-NEXT: addq %rsi, %rsi
; BURR-NEXT: addb $2, %sil
; BURR-NEXT: orb $1, %sil
; BURR-NEXT: movb $-128, %cl
; SRC: # %bb.0:
; SRC-NEXT: pushq %rbx
; SRC-NEXT: movq %rdi, %rax
-; SRC-NEXT: addb %sil, %sil
+; SRC-NEXT: addq %rsi, %rsi
; SRC-NEXT: addb $2, %sil
; SRC-NEXT: orb $1, %sil
; SRC-NEXT: movb $-128, %cl
; LIN-NEXT: movq %rdi, %rax
; LIN-NEXT: xorl %r9d, %r9d
; LIN-NEXT: movl $1, %r8d
-; LIN-NEXT: addb %sil, %sil
+; LIN-NEXT: addq %rsi, %rsi
; LIN-NEXT: addb $2, %sil
; LIN-NEXT: orb $1, %sil
; LIN-NEXT: movl $1, %edx
define i32 @select_pow2_diff_neg(i1 zeroext %cond) {
; CHECK-LABEL: select_pow2_diff_neg:
; CHECK: # %bb.0:
-; CHECK-NEXT: shlb $4, %dil
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: orl $-25, %eax
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: shll $4, %edi
+; CHECK-NEXT: leal -25(%rdi), %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 -9, i32 -25
ret i32 %sel