Use v16i8 rather than v2i64 as the VT for memset expansion on AArch64.
authorOwen Anderson <resistor@mac.com>
Thu, 19 Aug 2021 08:00:29 +0000 (08:00 +0000)
committerOwen Anderson <resistor@mac.com>
Thu, 19 Aug 2021 16:54:07 +0000 (16:54 +0000)
This allows the instruction selector to realize that it can directly
broadcast the low byte of the memset value, rather than replicating
it to a 64-bit GPR before broadcasting.

This fixes PR50985.

Differential Revision: https://reviews.llvm.org/D108354

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/memset.ll [new file with mode: 0644]

index 0e5e704..2ec9f84 100644 (file)
@@ -12091,8 +12091,8 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
   };
 
   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
-      AlignmentIsAcceptable(MVT::v2i64, Align(16)))
-    return MVT::v2i64;
+      AlignmentIsAcceptable(MVT::v16i8, Align(16)))
+    return MVT::v16i8;
   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
     return MVT::f128;
   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
diff --git a/llvm/test/CodeGen/AArch64/memset.ll b/llvm/test/CodeGen/AArch64/memset.ll
new file mode 100644 (file)
index 0000000..4d1d224
--- /dev/null
@@ -0,0 +1,18 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK: memset_call:
+; CHECK-NOT: and
+; CHECK: dup
+; CHECK-NEXT: stp
+; CHECK-NEXT: stp
+; CHECK-NEXT: ret
+define void @memset_call(i8* %0, i32 %1) {
+  %3 = trunc i32 %1 to i8
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 %3, i64 64, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i1 immarg)
+