From 50c3b290ed8749b568439908adcfca85df33535d Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Mon, 8 Apr 2019 13:58:50 +0000
Subject: [PATCH] [x86] make 8-bit shl undesirable

I was looking at a potential DAGCombiner fix for one of the regressions
in D60278, and it caused severe regression test pain because x86 TLI
lies about the desirability of 8-bit shift ops.

We've hinted at making all 8-bit ops undesirable for the reason in the
code comment:

  // TODO: Almost no 8-bit ops are desirable because they have no actual
  //       size/speed advantages vs. 32-bit ops, but they do have a major
  //       potential disadvantage by causing partial register stalls.

...but that leads to massive diffs and exposes all kinds of optimization
holes itself.

Differential Revision: https://reviews.llvm.org/D60286

llvm-svn: 357912
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         | 10 ++++++---
 llvm/test/CodeGen/X86/bt.ll                     | 15 ++++++-------
 llvm/test/CodeGen/X86/btc_bts_btr.ll            | 30 ++++++++++++-------------
 llvm/test/CodeGen/X86/rotate4.ll                |  2 +-
 llvm/test/CodeGen/X86/scheduler-backtracking.ll | 10 ++++-----
 llvm/test/CodeGen/X86/select_const.ll           |  6 ++---
 6 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8c12ab4..4cc2bf4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42798,11 +42798,15 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
     return false;
 
-  // 8-bit multiply is probably not much cheaper than 32-bit multiply, and
-  // we have specializations to turn 32-bit multiply into LEA or other ops.
+  // TODO: Almost no 8-bit ops are desirable because they have no actual
+  //       size/speed advantages vs. 32-bit ops, but they do have a major
+  //       potential disadvantage by causing partial register stalls.
+  //
+  // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
+  // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
   // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
   // check for a constant operand to the multiply.
-  if (Opc == ISD::MUL && VT == MVT::i8)
+  if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
     return false;
 
   // i16 instruction encodings are longer and some i16 instructions are slow,
diff --git a/llvm/test/CodeGen/X86/bt.ll b/llvm/test/CodeGen/X86/bt.ll
index c3aea3d..59df366 100644
--- a/llvm/test/CodeGen/X86/bt.ll
+++ b/llvm/test/CodeGen/X86/bt.ll
@@ -1150,19 +1150,18 @@ define void @demanded_i32(i32* nocapture readonly, i32* nocapture, i32) nounwind
 define zeroext i1 @demanded_with_known_zeroes(i32 %bit, i32 %bits) {
 ; X86-LABEL: demanded_with_known_zeroes:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    shlb $2, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    shlb $2, %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: demanded_with_known_zeroes:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    shlb $2, %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    btl %eax, %esi
+; X64-NEXT:    shll $2, %edi
+; X64-NEXT:    btl %edi, %esi
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll
index 5e64be9..79c745d 100644
--- a/llvm/test/CodeGen/X86/btc_bts_btr.ll
+++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll
@@ -954,15 +954,15 @@ define i32 @btr_32_mask_zeros(i32 %x, i32 %n) {
 ; X64-LABEL: btr_32_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shll $2, %esi
 ; X64-NEXT:    btrl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btr_32_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btrl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = shl i32 %n, 2
@@ -977,15 +977,15 @@ define i32 @bts_32_mask_zeros(i32 %x, i32 %n) {
 ; X64-LABEL: bts_32_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shll $2, %esi
 ; X64-NEXT:    btsl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_32_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btsl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = shl i32 %n, 2
@@ -999,15 +999,15 @@ define i32 @btc_32_mask_zeros(i32 %x, i32 %n) {
 ; X64-LABEL: btc_32_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shll $2, %esi
 ; X64-NEXT:    btcl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_32_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btcl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = shl i32 %n, 2
@@ -1021,14 +1021,14 @@ define i64 @btr_64_mask_zeros(i64 %x, i64 %n) {
 ; X64-LABEL: btr_64_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shlq $2, %rsi
 ; X64-NEXT:    btrq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btr_64_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $2, %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    shldl %cl, %eax, %edx
@@ -1056,14 +1056,14 @@ define i64 @bts_64_mask_zeros(i64 %x, i64 %n) {
 ; X64-LABEL: bts_64_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shlq $2, %rsi
 ; X64-NEXT:    btsq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_64_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $2, %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    shldl %cl, %eax, %edx
@@ -1088,14 +1088,14 @@ define i64 @btc_64_mask_zeros(i64 %x, i64 %n) {
 ; X64-LABEL: btc_64_mask_zeros:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlb $2, %sil
+; X64-NEXT:    shlq $2, %rsi
 ; X64-NEXT:    btcq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_64_mask_zeros:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $2, %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    shldl %cl, %eax, %edx
diff --git a/llvm/test/CodeGen/X86/rotate4.ll b/llvm/test/CodeGen/X86/rotate4.ll
index fa7f550..e58451b 100644
--- a/llvm/test/CodeGen/X86/rotate4.ll
+++ b/llvm/test/CodeGen/X86/rotate4.ll
@@ -633,9 +633,9 @@ define i32 @rotate_demanded_bits_2(i32, i32) {
 define i32 @rotate_demanded_bits_3(i32, i32) {
 ; X86-LABEL: rotate_demanded_bits_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    roll %cl, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 95eb8e7..3fcc5a3 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -16,7 +16,7 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    pushq %rbx
 ; ILP-NEXT:    movq %rdi, %rax
 ; ILP-NEXT:    xorl %r8d, %r8d
-; ILP-NEXT:    addb %sil, %sil
+; ILP-NEXT:    addq %rsi, %rsi
 ; ILP-NEXT:    addb $2, %sil
 ; ILP-NEXT:    orb $1, %sil
 ; ILP-NEXT:    movl $1, %r10d
@@ -61,7 +61,7 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-LABEL: test1:
 ; HYBRID:       # %bb.0:
 ; HYBRID-NEXT:    movq %rdi, %rax
-; HYBRID-NEXT:    addb %sil, %sil
+; HYBRID-NEXT:    addq %rsi, %rsi
 ; HYBRID-NEXT:    addb $2, %sil
 ; HYBRID-NEXT:    orb $1, %sil
 ; HYBRID-NEXT:    movb $-128, %cl
@@ -104,7 +104,7 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-LABEL: test1:
 ; BURR:       # %bb.0:
 ; BURR-NEXT:    movq %rdi, %rax
-; BURR-NEXT:    addb %sil, %sil
+; BURR-NEXT:    addq %rsi, %rsi
 ; BURR-NEXT:    addb $2, %sil
 ; BURR-NEXT:    orb $1, %sil
 ; BURR-NEXT:    movb $-128, %cl
@@ -148,7 +148,7 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC:       # %bb.0:
 ; SRC-NEXT:    pushq %rbx
 ; SRC-NEXT:    movq %rdi, %rax
-; SRC-NEXT:    addb %sil, %sil
+; SRC-NEXT:    addq %rsi, %rsi
 ; SRC-NEXT:    addb $2, %sil
 ; SRC-NEXT:    orb $1, %sil
 ; SRC-NEXT:    movb $-128, %cl
@@ -195,7 +195,7 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    movq %rdi, %rax
 ; LIN-NEXT:    xorl %r9d, %r9d
 ; LIN-NEXT:    movl $1, %r8d
-; LIN-NEXT:    addb %sil, %sil
+; LIN-NEXT:    addq %rsi, %rsi
 ; LIN-NEXT:    addb $2, %sil
 ; LIN-NEXT:    orb $1, %sil
 ; LIN-NEXT:    movl $1, %edx
diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll
index 0f10649..852032a 100644
--- a/llvm/test/CodeGen/X86/select_const.ll
+++ b/llvm/test/CodeGen/X86/select_const.ll
@@ -353,9 +353,9 @@ define i16 @select_pow2_diff_invert(i1 zeroext %cond) {
 define i32 @select_pow2_diff_neg(i1 zeroext %cond) {
 ; CHECK-LABEL: select_pow2_diff_neg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shlb $4, %dil
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    orl $-25, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    shll $4, %edi
+; CHECK-NEXT:    leal -25(%rdi), %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 -9, i32 -25
   ret i32 %sel
-- 
2.7.4
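
For illustration, here is a minimal, self-contained C++ sketch of the hook
this patch changes. This is not LLVM's real interface: the actual hook is
X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) and compares
ISD opcodes against MVT::i8; the Opcode and SimpleVT enums and the main()
harness below are illustrative stand-ins that model only the scalar-i8
MUL/SHL logic touched by this patch.

  // Sketch of the predicate after this patch (stand-in types, not LLVM's).
  #include <cassert>

  enum class Opcode { MUL, SHL, ADD };
  enum class SimpleVT { i8, i16, i32 };

  // Returning false signals that the op is better promoted to a wider type.
  bool isTypeDesirableForOp(Opcode Opc, SimpleVT VT) {
    // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl,
    // and 32-bit multiply/shl can become LEA or other ops. Before this
    // patch, only MUL was rejected here; SHL is the new case.
    if ((Opc == Opcode::MUL || Opc == Opcode::SHL) && VT == SimpleVT::i8)
      return false;
    return true;
  }

  int main() {
    assert(!isTypeDesirableForOp(Opcode::SHL, SimpleVT::i8));  // new in this patch
    assert(!isTypeDesirableForOp(Opcode::MUL, SimpleVT::i8));  // unchanged
    assert(isTypeDesirableForOp(Opcode::SHL, SimpleVT::i32));  // still desirable
    return 0;
  }

The payoff is visible in the test diffs above: shlb on %sil, %dil, or %cl
becomes shll/shlq on the full register, which avoids the partial-register
write named in the TODO comment, and the widened value can then feed btl/btrq
directly or fold into an LEA as in select_const.ll.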