From 2d1390efbe610ff15a8cfc6d40f6e8eaa74355b6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 31 Jan 2022 12:00:51 +0000 Subject: [PATCH] [DAG] SimplifyDemandedBits - mul(x,x) - if only demand bit[1] then fold to zero --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 6 +++++- llvm/test/CodeGen/X86/combine-mul.ll | 15 ++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a98c21f..ba6cae0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2247,8 +2247,12 @@ bool TargetLowering::SimplifyDemandedBits( } break; } - case ISD::ADD: case ISD::MUL: + // 'Quadratic Reciprocity': mul(x,x) -> 0 if we're only demanding bit[1] + if (DemandedBits == 2 && Op.getOperand(0) == Op.getOperand(1)) + return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT)); + LLVM_FALLTHROUGH; + case ISD::ADD: case ISD::SUB: { // Add, Sub, and Mul don't demand any bits in positions beyond that // of the highest bit demanded of them. diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll index f0254e7..403443d 100644 --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -366,16 +366,12 @@ define <2 x i64> @combine_mul_to_abs_v2i64(<2 x i64> %x) { define i64 @combine_mul_self_knownbits(i64 %x) { ; SSE-LABEL: combine_mul_self_knownbits: ; SSE: # %bb.0: -; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: imull %eax, %eax -; SSE-NEXT: andl $2, %eax +; SSE-NEXT: xorl %eax, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: combine_mul_self_knownbits: ; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: imull %eax, %eax -; AVX-NEXT: andl $2, %eax +; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: retq %1 = mul i64 %x, %x %2 = and i64 %1, 2 @@ -385,15 +381,12 @@ define i64 @combine_mul_self_knownbits(i64 %x) { define <4 x i32> @combine_mul_self_knownbits_vector(<4 x i32> %x) { ; SSE-LABEL: combine_mul_self_knownbits_vector: ; SSE: # %bb.0: -; SSE-NEXT: pmulld %xmm0, %xmm0 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_mul_self_knownbits_vector: ; AVX: # %bb.0: -; AVX-NEXT: vpmulld %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = mul <4 x i32> %x, %x %2 = and <4 x i32> %1, -- 2.7.4