From 2d1390efbe610ff15a8cfc6d40f6e8eaa74355b6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 31 Jan 2022 12:00:51 +0000
Subject: [PATCH] [DAG] SimplifyDemandedBits - mul(x,x) - if only demand bit[1]
 then fold to zero

---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp |  6 +++++-
 llvm/test/CodeGen/X86/combine-mul.ll             | 15 ++++-----------
 2 files changed, 9 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a98c21f..ba6cae0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2247,8 +2247,12 @@ bool TargetLowering::SimplifyDemandedBits(
     }
     break;
   }
-  case ISD::ADD:
   case ISD::MUL:
+    // mul(x,x) -> 0 if only bit[1] is demanded: x*x mod 4 is always 0 or 1.
+    if (DemandedBits == 2 && Op.getOperand(0) == Op.getOperand(1))
+      return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT));
+    LLVM_FALLTHROUGH;
+  case ISD::ADD:
   case ISD::SUB: {
     // Add, Sub, and Mul don't demand any bits in positions beyond that
     // of the highest bit demanded of them.
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index f0254e7..403443d 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -366,16 +366,12 @@ define <2 x i64> @combine_mul_to_abs_v2i64(<2 x i64> %x) {
 define i64 @combine_mul_self_knownbits(i64 %x) {
 ; SSE-LABEL: combine_mul_self_knownbits:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movq %rdi, %rax
-; SSE-NEXT:    imull %eax, %eax
-; SSE-NEXT:    andl $2, %eax
+; SSE-NEXT:    xorl %eax, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_mul_self_knownbits:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movq %rdi, %rax
-; AVX-NEXT:    imull %eax, %eax
-; AVX-NEXT:    andl $2, %eax
+; AVX-NEXT:    xorl %eax, %eax
 ; AVX-NEXT:    retq
   %1 = mul i64 %x, %x
   %2 = and i64 %1, 2
@@ -385,15 +381,12 @@ define i64 @combine_mul_self_knownbits(i64 %x) {
 define <4 x i32> @combine_mul_self_knownbits_vector(<4 x i32> %x) {
 ; SSE-LABEL: combine_mul_self_knownbits_vector:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pmulld %xmm0, %xmm0
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_mul_self_knownbits_vector:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpmulld %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = mul <4 x i32> %x, %x
   %2 = and <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
-- 
2.7.4