From 46a13a0ef847dbe3226f4b3d8726ef25c8909293 Mon Sep 17 00:00:00 2001
From: Clement Courbet
Date: Fri, 15 Apr 2022 11:11:26 +0200
Subject: [PATCH] [ExpandMemCmp] Properly expand `bcmp` to an equality pattern.

Before this change, constant-size `bcmp` would miss the opportunity to
generate a more efficient equality pattern and would generate a -1/0/1
pattern instead.

Differential Revision: https://reviews.llvm.org/D123849
---
 llvm/lib/CodeGen/ExpandMemCmp.cpp             |  7 ++++---
 llvm/test/CodeGen/X86/memcmp-optsize-x32.ll   | 11 ++++-------
 llvm/test/CodeGen/X86/memcmp-optsize.ll       | 11 ++++-------
 llvm/test/CodeGen/X86/memcmp-pgso-x32.ll      | 11 ++++-------
 llvm/test/CodeGen/X86/memcmp-pgso.ll          | 11 ++++-------
 llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll | 11 +++--------
 6 files changed, 23 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index bd6486b..17bf6b0 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -740,7 +740,7 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
                          const TargetLowering *TLI, const DataLayout *DL,
                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
-                         DomTreeUpdater *DTU) {
+                         DomTreeUpdater *DTU, const bool IsBCmp) {
   NumMemCmpCalls++;
 
   // Early exit from expansion if -Oz.
@@ -760,7 +760,8 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
   }
   // TTI call to check if target would like to expand memcmp. Also, get the
   // available load sizes.
-  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+  const bool IsUsedForZeroCmp =
+      IsBCmp || isOnlyUsedInZeroEqualityComparison(CI);
   bool OptForSize = CI->getFunction()->hasOptSize() ||
                     llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
   auto Options = TTI->enableMemCmpExpansion(OptForSize,
@@ -864,7 +865,7 @@ bool ExpandMemCmpPass::runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
     LibFunc Func;
     if (TLI->getLibFunc(*CI, Func) &&
         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
-        expandMemCmp(CI, TTI, TL, &DL, PSI, BFI, DTU)) {
+        expandMemCmp(CI, TTI, TL, &DL, PSI, BFI, DTU, Func == LibFunc_bcmp)) {
       return true;
     }
   }
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 2c45b85..db205e5 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -569,15 +569,12 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize {
 define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind optsize {
 ; X86-LABEL: bcmp_length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpw (%ecx), %dx
+; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @bcmp(i8* %X, i8* %Y, i32 2) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
index 4c5b339..ef0be61 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -584,13 +584,10 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize {
 define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind optsize {
 ; X64-LABEL: bcmp_length2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    movzwl (%rdi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpw (%rsi), %cx
+; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
   %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index 0953e35..85290c3 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -569,15 +569,12 @@ define i1 @length64_eq_const(i8* %X) nounwind !prof !14 {
 define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind !prof !14 {
 ; X86-LABEL: bcmp_length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpw (%ecx), %dx
+; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @bcmp(i8* %X, i8* %Y, i32 2) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
index f763d91..28e9f36 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -584,13 +584,10 @@ define i1 @length64_eq_const(i8* %X) nounwind !prof !14 {
 define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind !prof !14 {
 ; X64-LABEL: bcmp_length2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    movzwl (%rdi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpw (%rsi), %cx
+; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
   %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind
   ret i32 %m
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll
index 72d049d..6f29dc7 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll
@@ -9,14 +9,9 @@ define i32 @bcmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 ; X64-NEXT:    [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
 ; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1
 ; X64-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP2]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X64-NEXT:    ret i32 [[TMP11]]
+; X64-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    ret i32 [[TMP6]]
 ;
   %call = tail call i32 @bcmp(i8* %x, i8* %y, i64 8)
   ret i32 %call
--
2.7.4
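
Note (not part of the patch): a minimal C++ sketch of the equality
pattern the expansion now emits for a constant-size `bcmp`; the function
name is hypothetical. It mirrors the new IR in bcmp.ll above (two wide
loads, one `icmp ne`, one `zext`) and is valid because bcmp's contract
only distinguishes zero from non-zero:

    #include <cstdint>
    #include <cstring>

    // Equality-only expansion of an 8-byte bcmp: one wide load per side
    // and a single comparison. Returns 0 iff the two blocks are equal.
    int bcmp8_equality_pattern(const void *x, const void *y) {
      uint64_t a, b;
      std::memcpy(&a, x, sizeof(a)); // unaligned-safe 8-byte loads,
      std::memcpy(&b, y, sizeof(b)); // like the `load i64, ... align 1` pairs
      return a != b;                 // icmp ne + zext; no bswap needed
    }

By contrast, memcmp must order its operands lexicographically, which is
why the previous expansion byte-swapped each load (to compare in
big-endian byte order) before computing the -1/0/1 result.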