From 50c3b290ed8749b568439908adcfca85df33535d Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Mon, 8 Apr 2019 13:58:50 +0000
Subject: [PATCH] [x86] make 8-bit shl undesirable

I was looking at a potential DAGCombiner fix for one of the regressions
in D60278, and it caused severe regression test pain because x86 TLI
lies about the desirability of 8-bit shift ops.

We've hinted at making all 8-bit ops undesirable for the reason in the
code comment:

  // TODO: Almost no 8-bit ops are desirable because they have no actual
  //       size/speed advantages vs. 32-bit ops, but they do have a major
  //       potential disadvantage by causing partial register stalls.

...but that leads to massive diffs and exposes all kinds of optimization
holes itself.

Differential Revision: https://reviews.llvm.org/D60286

llvm-svn: 357912
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         | 10 ++++++---
 llvm/test/CodeGen/X86/bt.ll                     | 15 ++++++-------
 llvm/test/CodeGen/X86/btc_bts_btr.ll            | 30 ++++++++++++-------------
 llvm/test/CodeGen/X86/rotate4.ll                |  2 +-
 llvm/test/CodeGen/X86/scheduler-backtracking.ll | 10 ++++-----
 llvm/test/CodeGen/X86/select_const.ll           |  6 ++---
 6 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8c12ab4..4cc2bf4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42798,11 +42798,15 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
     return false;
 
-  // 8-bit multiply is probably not much cheaper than 32-bit multiply, and
-  // we have specializations to turn 32-bit multiply into LEA or other ops.
+  // TODO: Almost no 8-bit ops are desirable because they have no actual
+  //       size/speed advantages vs. 32-bit ops, but they do have a major
+  //       potential disadvantage by causing partial register stalls.
+  //
+  // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
+  // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
   // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
   // check for a constant operand to the multiply.
-  if (Opc == ISD::MUL && VT == MVT::i8)
+  if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
     return false;
 
   // i16 instruction encodings are longer and some i16 instructions are slow,
diff --git a/llvm/test/CodeGen/X86/bt.ll b/llvm/test/CodeGen/X86/bt.ll
index c3aea3d..59df366 100644
--- a/llvm/test/CodeGen/X86/bt.ll
+++ b/llvm/test/CodeGen/X86/bt.ll
@@ -1150,19 +1150,18 @@ define void @demanded_i32(i32* nocapture readonly, i32* nocapture, i32) nounwind
 define zeroext i1 @demanded_with_known_zeroes(i32 %bit, i32 %bits) {
 ; X86-LABEL: demanded_with_known_zeroes:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    shlb $2, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    shlb $2, %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: demanded_with_known_zeroes:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    shlb $2, %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    btl %eax, %esi
+; X64-NEXT:    shll $2, %edi
+; X64-NEXT:    btl %edi, %esi
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll
index 5e64be9..79c745d 100644
--- a/llvm/test/CodeGen/X86/btc_bts_btr.ll
+++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll
@@ -954,15 +954,15 @@ define i32 @btr_32_mask_zeros(i32 %x, i32 %n) {
 ; X64-LABEL: btr_32_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shll $2, %esi
 ; X64-NEXT:    btrl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btr_32_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btrl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = shl i32 %n, 2
@@ -977,15 +977,15 @@ define i32 @bts_32_mask_zeros(i32 %x, i32 %n) {
 ; X64-LABEL: bts_32_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shll $2, %esi
 ; X64-NEXT:    btsl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_32_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btsl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = shl i32 %n, 2
@@ -999,15 +999,15 @@ define i32 @btc_32_mask_zeros(i32 %x, i32 %n) {
 ; X64-LABEL: btc_32_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shll $2, %esi
 ; X64-NEXT:    btcl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_32_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btcl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = shl i32 %n, 2
@@ -1021,14 +1021,14 @@ define i64 @btr_64_mask_zeros(i64 %x, i64 %n) {
 ; X64-LABEL: btr_64_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shlq $2, %rsi
 ; X64-NEXT:    btrq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btr_64_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $2, %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    shldl %cl, %eax, %edx
@@ -1056,14 +1056,14 @@ define i64 @bts_64_mask_zeros(i64 %x, i64 %n) {
 ; X64-LABEL: bts_64_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shlq $2, %rsi
 ; X64-NEXT:    btsq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_64_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $2, %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    shldl %cl, %eax, %edx
@@ -1088,14 +1088,14 @@ define i64 @btc_64_mask_zeros(i64 %x, i64 %n) {
 ; X64-LABEL: btc_64_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shlq $2, %rsi
 ; X64-NEXT:    btcq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_64_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $2, %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    shldl %cl, %eax, %edx
diff --git a/llvm/test/CodeGen/X86/rotate4.ll b/llvm/test/CodeGen/X86/rotate4.ll
index fa7f550..e58451b 100644
--- a/llvm/test/CodeGen/X86/rotate4.ll
+++ b/llvm/test/CodeGen/X86/rotate4.ll
@@ -633,9 +633,9 @@ define i32 @rotate_demanded_bits_2(i32, i32) {
 define i32 @rotate_demanded_bits_3(i32, i32) {
 ; X86-LABEL: rotate_demanded_bits_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    roll %cl, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 95eb8e7..3fcc5a3 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -16,7 +16,7 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    pushq %rbx
 ; ILP-NEXT:    movq %rdi, %rax
 ; ILP-NEXT:    xorl %r8d, %r8d
-; ILP-NEXT:    addb %sil, %sil
+; ILP-NEXT:    addq %rsi, %rsi
 ; ILP-NEXT:    addb $2, %sil
 ; ILP-NEXT:    orb $1, %sil
 ; ILP-NEXT:    movl $1, %r10d
@@ -61,7 +61,7 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-LABEL: test1:
 ; HYBRID:       # %bb.0:
 ; HYBRID-NEXT:    movq %rdi, %rax
-; HYBRID-NEXT:    addb %sil, %sil
+; HYBRID-NEXT:    addq %rsi, %rsi
 ; HYBRID-NEXT:    addb $2, %sil
 ; HYBRID-NEXT:    orb $1, %sil
 ; HYBRID-NEXT:    movb $-128, %cl
@@ -104,7 +104,7 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-LABEL: test1:
 ; BURR:       # %bb.0:
 ; BURR-NEXT:    movq %rdi, %rax
-; BURR-NEXT:    addb %sil, %sil
+; BURR-NEXT:    addq %rsi, %rsi
 ; BURR-NEXT:    addb $2, %sil
 ; BURR-NEXT:    orb $1, %sil
 ; BURR-NEXT:    movb $-128, %cl
@@ -148,7 +148,7 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC:       # %bb.0:
 ; SRC-NEXT:    pushq %rbx
 ; SRC-NEXT:    movq %rdi, %rax
-; SRC-NEXT:    addb %sil, %sil
+; SRC-NEXT:    addq %rsi, %rsi
 ; SRC-NEXT:    addb $2, %sil
 ; SRC-NEXT:    orb $1, %sil
 ; SRC-NEXT:    movb $-128, %cl
@@ -195,7 +195,7 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    movq %rdi, %rax
 ; LIN-NEXT:    xorl %r9d, %r9d
 ; LIN-NEXT:    movl $1, %r8d
-; LIN-NEXT:    addb %sil, %sil
+; LIN-NEXT:    addq %rsi, %rsi
 ; LIN-NEXT:    addb $2, %sil
 ; LIN-NEXT:    orb $1, %sil
 ; LIN-NEXT:    movl $1, %edx
diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll
index 0f10649..852032a 100644
--- a/llvm/test/CodeGen/X86/select_const.ll
+++ b/llvm/test/CodeGen/X86/select_const.ll
@@ -353,9 +353,9 @@ define i16 @select_pow2_diff_invert(i1 zeroext %cond) {
 define i32 @select_pow2_diff_neg(i1 zeroext %cond) {
 ; CHECK-LABEL: select_pow2_diff_neg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shlb $4, %dil
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    orl $-25, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    shll $4, %edi
+; CHECK-NEXT:    leal -25(%rdi), %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 -9, i32 -25
   ret i32 %sel
-- 
2.7.4
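
For illustration, here is a minimal, self-contained C++ sketch of the hook
this patch changes. This is not LLVM's real interface: the actual hook is
X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) and compares
ISD opcodes against MVT::i8; the Opcode and SimpleVT enums and the main()
harness below are illustrative stand-ins that model only the scalar-i8
MUL/SHL logic touched by this patch.

  // Sketch of the predicate after this patch (stand-in types, not LLVM's).
  #include <cassert>

  enum class Opcode { MUL, SHL, ADD };
  enum class SimpleVT { i8, i16, i32 };

  // Returning false signals that the op is better promoted to a wider type.
  bool isTypeDesirableForOp(Opcode Opc, SimpleVT VT) {
    // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl,
    // and 32-bit multiply/shl can become LEA or other ops. Before this
    // patch, only MUL was rejected here; SHL is the new case.
    if ((Opc == Opcode::MUL || Opc == Opcode::SHL) && VT == SimpleVT::i8)
      return false;
    return true;
  }

  int main() {
    assert(!isTypeDesirableForOp(Opcode::SHL, SimpleVT::i8));  // new in this patch
    assert(!isTypeDesirableForOp(Opcode::MUL, SimpleVT::i8));  // unchanged
    assert(isTypeDesirableForOp(Opcode::SHL, SimpleVT::i32));  // still desirable
    return 0;
  }

The payoff is visible in the test diffs above: shlb on %sil, %dil, or %cl
becomes shll/shlq on the full register, which avoids the partial-register
write named in the TODO comment, and the widened value can then feed btl/btrq
directly or fold into an LEA as in select_const.ll.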