From bfb900d36328469422683dd5e0bd0320532c13d3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 19 Jul 2018 09:27:34 +0000 Subject: [PATCH] [DAGCombiner] Add rotate-extract tests Add new tests from D47681 to current codegen. Also added i686 codegen tests. llvm-svn: 337445 --- llvm/test/CodeGen/AArch64/rotate-extract.ll | 148 ++++++++++++ llvm/test/CodeGen/X86/rotate-extract-vector.ll | 317 +++++++++++++++++++++++++ llvm/test/CodeGen/X86/rotate-extract.ll | 285 ++++++++++++++++++++++ 3 files changed, 750 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/rotate-extract.ll create mode 100644 llvm/test/CodeGen/X86/rotate-extract-vector.ll create mode 100644 llvm/test/CodeGen/X86/rotate-extract.ll diff --git a/llvm/test/CodeGen/AArch64/rotate-extract.ll b/llvm/test/CodeGen/AArch64/rotate-extract.ll new file mode 100644 index 0000000..4f5313c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/rotate-extract.ll @@ -0,0 +1,148 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s + +; Check that under certain conditions we can factor out a rotate +; from the following idioms: +; (a*c0) >> s1 | (a*c1) +; (a/c0) << s1 | (a/c1) +; This targets cases where instcombine has folded a shl/srl/mul/udiv +; with one of the shifts from the rotate idiom + +define i64 @ror_extract_shl(i64 %i) nounwind { +; CHECK-LABEL: ror_extract_shl: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl x8, x0, #10 +; CHECK-NEXT: bfxil x8, x0, #54, #7 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret + %lhs_mul = shl i64 %i, 3 + %rhs_mul = shl i64 %i, 10 + %lhs_shift = lshr i64 %lhs_mul, 57 + %out = or i64 %lhs_shift, %rhs_mul + ret i64 %out +} + +define i32 @ror_extract_shrl(i32 %i) nounwind { +; CHECK-LABEL: ror_extract_shrl: +; CHECK: // %bb.0: +; CHECK-NEXT: ror w8, w0, #7 +; CHECK-NEXT: and w0, w8, #0xf1ffffff +; CHECK-NEXT: ret + %lhs_div = lshr i32 %i, 7 + %rhs_div = lshr i32 %i, 3 + %rhs_shift = shl i32 %rhs_div, 28 + %out = or i32 %lhs_div, %rhs_shift + ret i32 %out +} + +define i32 @ror_extract_mul(i32 %i) nounwind { +; CHECK-LABEL: ror_extract_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w0, lsl #3 +; CHECK-NEXT: ror w0, w8, #25 +; CHECK-NEXT: ret + %lhs_mul = mul i32 %i, 9 + %rhs_mul = mul i32 %i, 1152 + %lhs_shift = lshr i32 %lhs_mul, 25 + %out = or i32 %lhs_shift, %rhs_mul + ret i32 %out +} + +define i64 @ror_extract_udiv(i64 %i) nounwind { +; CHECK-LABEL: ror_extract_udiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #-6148914691236517206 +; CHECK-NEXT: movk x8, #43691 +; CHECK-NEXT: umulh x8, x0, x8 +; CHECK-NEXT: ror x8, x8, #5 +; CHECK-NEXT: and x0, x8, #0xf7ffffffffffffff +; CHECK-NEXT: ret + %lhs_div = udiv i64 %i, 3 + %rhs_div = udiv i64 %i, 48 + %lhs_shift = shl i64 %lhs_div, 60 + %out = or i64 %lhs_shift, %rhs_div + ret i64 %out +} + +define i64 @ror_extract_mul_with_mask(i64 %i) nounwind { +; CHECK-LABEL: ror_extract_mul_with_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w0, lsl #3 +; CHECK-NEXT: lsl w8, w8, #7 +; CHECK-NEXT: add x9, x0, x0, lsl #3 +; CHECK-NEXT: and x0, x8, #0x80 +; CHECK-NEXT: bfxil x0, x9, #57, #7 +; CHECK-NEXT: ret + %lhs_mul = mul i64 %i, 1152 + %rhs_mul = mul i64 %i, 9 + %lhs_and = and i64 %lhs_mul, 160 + %rhs_shift = lshr i64 %rhs_mul, 57 + %out = or i64 %lhs_and, %rhs_shift + ret i64 %out +} + +; Result would undershift +define i64 @no_extract_shl(i64 %i) nounwind { +; CHECK-LABEL: no_extract_shl: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl x8, x0, #10 +; CHECK-NEXT: bfxil x8, x0, #52, #7 
+; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret + %lhs_mul = shl i64 %i, 5 + %rhs_mul = shl i64 %i, 10 + %lhs_shift = lshr i64 %lhs_mul, 57 + %out = or i64 %lhs_shift, %rhs_mul + ret i64 %out +} + +; Result would overshift +define i32 @no_extract_shrl(i32 %i) nounwind { +; CHECK-LABEL: no_extract_shrl: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr w8, w0, #3 +; CHECK-NEXT: lsr w0, w0, #9 +; CHECK-NEXT: bfi w0, w8, #28, #4 +; CHECK-NEXT: ret + %lhs_div = lshr i32 %i, 3 + %rhs_div = lshr i32 %i, 9 + %lhs_shift = shl i32 %lhs_div, 28 + %out = or i32 %lhs_shift, %rhs_div + ret i32 %out +} + +; Can factor 128 from 2304, but result is 18 instead of 9 +define i64 @no_extract_mul(i64 %i) nounwind { +; CHECK-LABEL: no_extract_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x0, lsl #3 +; CHECK-NEXT: lsr x0, x8, #57 +; CHECK-NEXT: bfi x0, x8, #8, #56 +; CHECK-NEXT: ret + %lhs_mul = mul i64 %i, 2304 + %rhs_mul = mul i64 %i, 9 + %rhs_shift = lshr i64 %rhs_mul, 57 + %out = or i64 %lhs_mul, %rhs_shift + ret i64 %out +} + +; Can't evenly factor 16 from 49 +define i32 @no_extract_udiv(i32 %i) nounwind { +; CHECK-LABEL: no_extract_udiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: mov w9, #33437 +; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: movk w9, #21399, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: umull x9, w0, w9 +; CHECK-NEXT: lsr x8, x8, #33 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: extr w0, w8, w9, #4 +; CHECK-NEXT: ret + %lhs_div = udiv i32 %i, 3 + %rhs_div = udiv i32 %i, 49 + %lhs_shift = shl i32 %lhs_div, 28 + %out = or i32 %lhs_shift, %rhs_div + ret i32 %out +} diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll new file mode 100644 index 0000000..6059a76 --- /dev/null +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -0,0 +1,317 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64 + +; Check that under certain conditions we can factor out a rotate +; from the following idioms: +; (a*c0) >> s1 | (a*c1) +; (a/c0) << s1 | (a/c1) +; This targets cases where instcombine has folded a shl/srl/mul/udiv +; with one of the shifts from the rotate idiom + +define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) { +; CHECK-LABEL: vroll_v4i32_extract_shl: +; CHECK: # %bb.0: +; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 +; CHECK-NEXT: vpslld $10, %xmm0, %xmm0 +; CHECK-NEXT: vpsrld $25, %xmm1, %xmm1 +; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %lhs_mul = shl <4 x i32> %i, + %rhs_mul = shl <4 x i32> %i, + %lhs_shift = lshr <4 x i32> %lhs_mul, + %out = or <4 x i32> %lhs_shift, %rhs_mul + ret <4 x i32> %out +} + +define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind { +; X86-LABEL: vrolq_v4i64_extract_shrl: +; X86: # %bb.0: +; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: vprolq $24, %zmm0, %zmm0 +; X86-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: vrolq_v4i64_extract_shrl: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: vprolq $24, %zmm0, %zmm0 +; X64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073189457919,18446744073189457919,18446744073189457919,18446744073189457919] +; X64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq + %lhs_div = lshr <4 x i64> %i, + %rhs_div = lshr <4 x 
i64> %i, + %rhs_shift = shl <4 x i64> %rhs_div, + %out = or <4 x i64> %lhs_div, %rhs_shift + ret <4 x i64> %out +} + +define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind { +; CHECK-LABEL: vroll_extract_mul: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [640,640,640,640,640,640,640,640] +; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10] +; CHECK-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpsrld $26, %ymm0, %ymm0 +; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %lhs_mul = mul <8 x i32> %i, + %rhs_mul = mul <8 x i32> %i, + %rhs_shift = lshr <8 x i32> %rhs_mul, + %out = or <8 x i32> %lhs_mul, %rhs_shift + ret <8 x i32> %out +} + +define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind { +; X86-LABEL: vrolq_extract_udiv: +; X86: # %bb.0: +; X86-NEXT: subl $60, %esp +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll __udivdi3 +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vextractps $2, %xmm0, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __udivdi3 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $384, {{[0-9]+}}(%esp) # imm = 0x180 +; X86-NEXT: calll __udivdi3 +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vextractps $2, %xmm0, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $384, {{[0-9]+}}(%esp) # imm = 0x180 +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __udivdi3 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpsllq $57, %xmm1, %xmm1 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: addl $60, %esp +; X86-NEXT: retl +; +; X64-LABEL: vrolq_extract_udiv: +; X64: # %bb.0: +; X64-NEXT: vpextrq $1, %xmm0, %rax +; X64-NEXT: movabsq $-6148914691236517205, %rsi # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: vmovq %rax, %xmm1 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: vmovq %rax, %xmm0 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: shrq $8, %rcx +; X64-NEXT: vmovq %rcx, %xmm1 +; X64-NEXT: shrq $8, %rdx +; X64-NEXT: vmovq %rdx, %xmm2 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; X64-NEXT: vpsllq $57, %xmm0, 
%xmm0 +; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %lhs_div = udiv <2 x i64> %i, + %rhs_div = udiv <2 x i64> %i, + %lhs_shift = shl <2 x i64> %lhs_div, + %out = or <2 x i64> %lhs_shift, %rhs_div + ret <2 x i64> %out +} + +define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind { +; CHECK-LABEL: vrolw_extract_mul_with_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1152,1152,1152,1152] +; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9] +; CHECK-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [160,160,160,160] +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpsrld $25, %xmm0, %xmm0 +; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %lhs_mul = mul <4 x i32> %i, + %rhs_mul = mul <4 x i32> %i, + %lhs_and = and <4 x i32> %lhs_mul, + %rhs_shift = lshr <4 x i32> %rhs_mul, + %out = or <4 x i32> %lhs_and, %rhs_shift + ret <4 x i32> %out +} + +define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind { +; X86-LABEL: illegal_no_extract_mul: +; X86: # %bb.0: +; X86-NEXT: vpmullw {{\.LCPI.*}}, %zmm0, %zmm1 +; X86-NEXT: vpmullw {{\.LCPI.*}}, %zmm0, %zmm0 +; X86-NEXT: vpsrlw $10, %zmm0, %zmm0 +; X86-NEXT: vporq %zmm0, %zmm1, %zmm0 +; X86-NEXT: retl +; +; X64-LABEL: illegal_no_extract_mul: +; X64: # %bb.0: +; X64-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm1 +; X64-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; X64-NEXT: vpsrlw $10, %zmm0, %zmm0 +; X64-NEXT: vporq %zmm0, %zmm1, %zmm0 +; X64-NEXT: retq + %lhs_mul = mul <32 x i16> %i, + %rhs_mul = mul <32 x i16> %i, + %rhs_shift = lshr <32 x i16> %rhs_mul, + %out = or <32 x i16> %lhs_mul, %rhs_shift + ret <32 x i16> %out +} + +; Result would undershift +define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind { +; CHECK-LABEL: no_extract_shl: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsllq $11, %ymm0, %ymm1 +; CHECK-NEXT: vpsllq $24, %ymm0, %ymm0 +; CHECK-NEXT: vpsrlq $50, %ymm1, %ymm1 +; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %lhs_mul = shl <4 x i64> %i, + %rhs_mul = shl <4 x i64> %i, + %lhs_shift = lshr <4 x i64> %lhs_mul, + %out = or <4 x i64> %lhs_shift, %rhs_mul + ret <4 x i64> %out +} + +; Result would overshift +define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind { +; CHECK-LABEL: no_extract_shrl: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4026531840,4026531840,4026531840,4026531840] +; CHECK-NEXT: vpslld $25, %xmm0, %xmm2 +; CHECK-NEXT: vpand %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vpsrld $9, %xmm0, %xmm0 +; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %lhs_div = lshr <4 x i32> %i, + %rhs_div = lshr <4 x i32> %i, + %lhs_shift = shl <4 x i32> %lhs_div, + %out = or <4 x i32> %lhs_shift, %rhs_div + ret <4 x i32> %out +} + +; Can factor 512 from 1536, but result is 3 instead of 9 +define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind { +; CHECK-LABEL: no_extract_mul: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1536,1536,1536,1536,1536,1536,1536,1536] +; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9] +; CHECK-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpsrld $23, %ymm0, %ymm0 +; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %lhs_mul = mul <8 x i32> %i, + %rhs_mul = mul <8 x i32> %i, + %rhs_shift = lshr <8 x i32> %rhs_mul, + %out = or <8 x i32> %lhs_mul, %rhs_shift + ret <8 x i32> %out +} + +; Can't evenly factor 256 from 770 +define <2 x i64> 
@no_extract_udiv(<2 x i64> %i) nounwind { +; X86-LABEL: no_extract_udiv: +; X86: # %bb.0: +; X86-NEXT: subl $60, %esp +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll __udivdi3 +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vextractps $2, %xmm0, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __udivdi3 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302 +; X86-NEXT: calll __udivdi3 +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vextractps $2, %xmm0, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302 +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __udivdi3 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpsllq $56, %xmm1, %xmm1 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: addl $60, %esp +; X86-NEXT: retl +; +; X64-LABEL: no_extract_udiv: +; X64: # %bb.0: +; X64-NEXT: vpextrq $1, %xmm0, %rcx +; X64-NEXT: movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: shrq %rdx +; X64-NEXT: vmovq %rdx, %xmm1 +; X64-NEXT: vmovq %xmm0, %rsi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: shrq %rdx +; X64-NEXT: vmovq %rdx, %xmm0 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: shrq $9, %rdx +; X64-NEXT: vmovq %rdx, %xmm1 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: shrq $9, %rdx +; X64-NEXT: vmovq %rdx, %xmm2 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; X64-NEXT: vpsllq $56, %xmm0, %xmm0 +; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %lhs_div = udiv <2 x i64> %i, + %rhs_div = udiv <2 x i64> %i, + %lhs_shift = shl <2 x i64> %lhs_div, + %out = or <2 x i64> %lhs_shift, %rhs_div + ret <2 x i64> %out +} diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll new file mode 100644 index 0000000..6ce3db1 --- /dev/null +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -0,0 +1,285 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X86 +; 
RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X64 + +; Check that under certain conditions we can factor out a rotate +; from the following idioms: +; (a*c0) >> s1 | (a*c1) +; (a/c0) << s1 | (a/c1) +; This targets cases where instcombine has folded a shl/srl/mul/udiv +; with one of the shifts from the rotate idiom + +define i64 @rolq_extract_shl(i64 %i) nounwind { +; X86-LABEL: rolq_extract_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: leal (,%edx,8), %eax +; X86-NEXT: shldl $10, %ecx, %edx +; X86-NEXT: shll $10, %ecx +; X86-NEXT: shrl $25, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: rolq_extract_shl: +; X64: # %bb.0: +; X64-NEXT: leaq (,%rdi,8), %rax +; X64-NEXT: shlq $10, %rdi +; X64-NEXT: shrq $57, %rax +; X64-NEXT: orq %rdi, %rax +; X64-NEXT: retq + %lhs_mul = shl i64 %i, 3 + %rhs_mul = shl i64 %i, 10 + %lhs_shift = lshr i64 %lhs_mul, 57 + %out = or i64 %lhs_shift, %rhs_mul + ret i64 %out +} + +define i16 @rolw_extract_shrl(i16 %i) nounwind { +; X86-LABEL: rolw_extract_shrl: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rolw $9, %ax +; X86-NEXT: andl $61951, %eax # imm = 0xF1FF +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: rolw_extract_shrl: +; X64: # %bb.0: +; X64-NEXT: rolw $9, %di +; X64-NEXT: andl $61951, %edi # imm = 0xF1FF +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq + %lhs_div = lshr i16 %i, 7 + %rhs_div = lshr i16 %i, 3 + %rhs_shift = shl i16 %rhs_div, 12 + %out = or i16 %lhs_div, %rhs_shift + ret i16 %out +} + +define i32 @roll_extract_mul(i32 %i) nounwind { +; X86-LABEL: roll_extract_mul: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %eax +; X86-NEXT: shll $7, %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %ecx +; X86-NEXT: shrl $25, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: roll_extract_mul: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: shll $7, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: shrl $25, %eax +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: retq + %lhs_mul = mul i32 %i, 9 + %rhs_mul = mul i32 %i, 1152 + %lhs_shift = lshr i32 %lhs_mul, 25 + %out = or i32 %lhs_shift, %rhs_mul + ret i32 %out +} + +define i8 @rolb_extract_udiv(i8 %i) nounwind { +; X86-LABEL: rolb_extract_udiv: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $171, %eax, %eax +; X86-NEXT: movb %ah, %cl +; X86-NEXT: shlb $3, %cl +; X86-NEXT: andb $-16, %cl +; X86-NEXT: shrl $13, %eax +; X86-NEXT: orb %cl, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: rolb_extract_udiv: +; X64: # %bb.0: +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: imull $171, %eax, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: shrl $8, %ecx +; X64-NEXT: shlb $3, %cl +; X64-NEXT: andb $-16, %cl +; X64-NEXT: shrl $13, %eax +; X64-NEXT: orb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %lhs_div = udiv i8 %i, 3 + %rhs_div = udiv i8 %i, 48 + %lhs_shift = shl i8 %lhs_div, 4 + %out = or i8 %lhs_shift, %rhs_div + ret i8 %out +} + +define i64 @rolq_extract_mul_with_mask(i64 %i) nounwind { +; X86-LABEL: rolq_extract_mul_with_mask: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll $7, 
%ecx +; X86-NEXT: leal (%ecx,%ecx,8), %ecx +; X86-NEXT: movl $9, %edx +; X86-NEXT: mull %edx +; X86-NEXT: leal (%esi,%esi,8), %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: shrl $25, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: rolq_extract_mul_with_mask: +; X64: # %bb.0: +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: shll $7, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: shrq $57, %rax +; X64-NEXT: orq %rcx, %rax +; X64-NEXT: retq + %lhs_mul = mul i64 %i, 1152 + %rhs_mul = mul i64 %i, 9 + %lhs_and = and i64 %lhs_mul, 160 + %rhs_shift = lshr i64 %rhs_mul, 57 + %out = or i64 %lhs_and, %rhs_shift + ret i64 %out +} + +; Result would undershift +define i64 @no_extract_shl(i64 %i) nounwind { +; X86-LABEL: no_extract_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shll $5, %eax +; X86-NEXT: shldl $10, %ecx, %edx +; X86-NEXT: shll $10, %ecx +; X86-NEXT: shrl $25, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: no_extract_shl: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shlq $5, %rax +; X64-NEXT: shlq $10, %rdi +; X64-NEXT: shrq $57, %rax +; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: retq + %lhs_mul = shl i64 %i, 5 + %rhs_mul = shl i64 %i, 10 + %lhs_shift = lshr i64 %lhs_mul, 57 + %out = or i64 %lhs_shift, %rhs_mul + ret i64 %out +} + +; Result would overshift +define i32 @no_extract_shrl(i32 %i) nounwind { +; X86-LABEL: no_extract_shrl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $-8, %ecx +; X86-NEXT: shll $25, %ecx +; X86-NEXT: shrl $9, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: no_extract_shrl: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl $-8, %eax +; X64-NEXT: shll $25, %eax +; X64-NEXT: shrl $9, %edi +; X64-NEXT: leal (%rdi,%rax), %eax +; X64-NEXT: retq + %lhs_div = lshr i32 %i, 3 + %rhs_div = lshr i32 %i, 9 + %lhs_shift = shl i32 %lhs_div, 28 + %out = or i32 %lhs_shift, %rhs_div + ret i32 %out +} + +; Can factor 128 from 2304, but result is 18 instead of 9 +define i16 @no_extract_mul(i16 %i) nounwind { +; X86-LABEL: no_extract_mul: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: shll $8, %eax +; X86-NEXT: leal (%eax,%eax,8), %edx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: shrl $9, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: no_extract_mul: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: shll $8, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: shrl $9, %eax +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %lhs_mul = mul i16 %i, 2304 + %rhs_mul = mul i16 %i, 9 + %rhs_shift = lshr i16 %rhs_mul, 9 + %out = or i16 %lhs_mul, %rhs_shift + ret i16 %out +} + +; Can't evenly factor 16 from 49 +define i8 @no_extract_udiv(i8 %i) nounwind { +; X86-LABEL: no_extract_udiv: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $171, 
%eax, %ecx +; X86-NEXT: shlb $3, %ch +; X86-NEXT: andb $-16, %ch +; X86-NEXT: imull $79, %eax, %edx +; X86-NEXT: subb %dh, %al +; X86-NEXT: shrb %al +; X86-NEXT: addb %dh, %al +; X86-NEXT: shrb $5, %al +; X86-NEXT: orb %ch, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: no_extract_udiv: +; X64: # %bb.0: +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: imull $171, %eax, %ecx +; X64-NEXT: shrl $8, %ecx +; X64-NEXT: shlb $3, %cl +; X64-NEXT: andb $-16, %cl +; X64-NEXT: imull $79, %eax, %edx +; X64-NEXT: shrl $8, %edx +; X64-NEXT: subb %dl, %al +; X64-NEXT: shrb %al +; X64-NEXT: addb %dl, %al +; X64-NEXT: shrb $5, %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %lhs_div = udiv i8 %i, 3 + %rhs_div = udiv i8 %i, 49 + %lhs_shift = shl i8 %lhs_div,4 + %out = or i8 %lhs_shift, %rhs_div + ret i8 %out +} -- 2.7.4
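
As a worked example of the idiom these tests exercise: in ror_extract_mul above, the two multiplier constants share a factor that lines up with the shift, since 1152 = 9 << 7 and 25 + 7 = 32, so (i*9 >> 25) | (i*1152) is simply i*9 rotated left by 7 (equivalently ror by 25), which is exactly what the AArch64 CHECK lines show (add w8, w0, w0, lsl #3; ror w0, w8, #25). A minimal IR sketch of that shape follows; the function name is illustrative and not part of the patch:

define i32 @ror_extract_mul_sketch(i32 %i) {
  ; unfolded idiom: (i*9) >> 25 combined with i*1152, where 1152 = 9 << 7 and 25 + 7 = 32
  %mul = mul i32 %i, 9
  %hi = lshr i32 %mul, 25
  %lo = mul i32 %i, 1152
  %out = or i32 %hi, %lo
  ; equivalent to rotating (i*9) left by 7, i.e. a single ror of the shared product
  ret i32 %out
}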