In the same spirit as D73543 and in reply to https://reviews.llvm.org/D126768#3549920, this patch adds support for `__builtin_memset_inline`.
The idea is to get support from the compiler to easily write efficient memory function implementations.
This patch could be split in two:
- one for the LLVM part adding the `llvm.memset.inline.*` intrinsics.
- and another one for the Clang part providing the intrinsic as a builtin.
Note that this intrinsic cannot yet be called in a ``constexpr`` context.
Differential Revision: https://reviews.llvm.org/D126903
+Guaranteed inlined memset
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+ void __builtin_memset_inline(void *dst, int value, size_t size);
+
+
+``__builtin_memset_inline`` has been designed as a building block for efficient
+``memset`` implementations. It is identical to ``__builtin_memset`` but also
+guarantees not to call any external functions. See LLVM IR `llvm.memset.inline
+<https://llvm.org/docs/LangRef.html#llvm-memset-inline-intrinsic>`_ intrinsic
+for more information.
+
+This is useful to implement a custom version of ``memset``, to provide a
+``libc`` ``memset``, or to work around the absence of a ``libc``.
+
+Note that the ``size`` argument must be a compile-time constant.
+
+Note that this intrinsic cannot yet be called in a ``constexpr`` context.
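+For illustration, a fixed-size zeroing helper for a hypothetical ``libc``
+could be written as follows (the function name is purely illustrative):
+
+.. code-block:: c
+
+  // Zero a 32-byte block; guaranteed not to call an external memset.
+  static void zero_block_32(char *dst) {
+    __builtin_memset_inline(dst, 0, 32);
+  }
+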
Atomic Min/Max builtins with memory ordering
--------------------------------------------
BUILTIN(__builtin_memmove, "v*v*vC*z", "nF")
BUILTIN(__builtin_mempcpy, "v*v*vC*z", "nF")
BUILTIN(__builtin_memset, "v*v*iz", "nF")
+BUILTIN(__builtin_memset_inline, "vv*iIz", "n")
BUILTIN(__builtin_printf, "icC*.", "Fp:0:")
BUILTIN(__builtin_stpcpy, "c*c*cC*", "nF")
BUILTIN(__builtin_stpncpy, "c*c*cC*z", "nF")
Dest.getAlignment().getAsAlign(), IsVolatile);
}
+ using CGBuilderBaseTy::CreateMemSetInline;
+ llvm::CallInst *CreateMemSetInline(Address Dest, llvm::Value *Value,
+ uint64_t Size) {
+ return CreateMemSetInline(Dest.getPointer(),
+ Dest.getAlignment().getAsAlign(), Value,
+ getInt64(Size));
+ }
+
using CGBuilderBaseTy::CreatePreserveStructAccessIndex;
Address CreatePreserveStructAccessIndex(Address Addr, unsigned Index,
unsigned FieldIndex,
Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
return RValue::get(Dest.getPointer());
}
+ case Builtin::BI__builtin_memset_inline: {
+ Address Dest = EmitPointerWithAlignment(E->getArg(0));
+ Value *ByteVal =
+ Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)), Builder.getInt8Ty());
+ uint64_t Size =
+ E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
+ EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
+ E->getArg(0)->getExprLoc(), FD, 0);
+ Builder.CreateMemSetInline(Dest, ByteVal, Size);
+ return RValue::get(nullptr);
+ }
case Builtin::BI__builtin___memset_chk: {
// fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
Expr::EvalResult SizeResult, DstSizeResult;
}
break;
}
+ case Builtin::BI__builtin_memset_inline: {
+ clang::Expr *SizeOp = TheCall->getArg(2);
+ // We warn about filling a `nullptr` pointer when `size` is greater than
+ // zero. When `size` is value dependent we cannot evaluate its value, so we
+ // bail out.
+ if (SizeOp->isValueDependent())
+ break;
+ if (!SizeOp->EvaluateKnownConstInt(Context).isZero())
+ CheckNonNullArgument(*this, TheCall->getArg(0), TheCall->getExprLoc());
+ break;
+ }
#define BUILTIN(ID, TYPE, ATTRS)
#define ATOMIC_BUILTIN(ID, TYPE, ATTRS) \
case Builtin::BI##ID: \
--- /dev/null
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -no-opaque-pointers -triple x86_64-unknown-linux -emit-llvm %s -o - | FileCheck %s
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_0(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_0(void *dst, char value) {
+ // CHECK: call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 0, i1 false)
+ __builtin_memset_inline(dst, value, 0);
+}
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_1(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_1(void *dst, char value) {
+ // CHECK: call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 1, i1 false)
+ __builtin_memset_inline(dst, value, 1);
+}
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_4(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_4(void *dst, char value) {
+ // CHECK: call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 4, i1 false)
+ __builtin_memset_inline(dst, value, 4);
+}
--- /dev/null
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define NULL ((char *)0)
+
+#if __has_builtin(__builtin_memset_inline)
+#warning defined as expected
+// expected-warning@-1 {{defined as expected}}
+#endif
+
+void test_memset_inline_invalid_arg_types() {
+ __builtin_memset_inline(1, 2, 3); // expected-error {{cannot initialize a parameter of type 'void *' with an rvalue of type 'int'}}
+}
+
+void test_memset_inline_null_dst(void *ptr) {
+ __builtin_memset_inline(NULL, 1, 4); // expected-warning {{null passed to a callee that requires a non-null argument}}
+}
+
+void test_memset_inline_null_buffer_is_ok_if_size_is_zero(void *ptr, char value) {
+ __builtin_memset_inline(NULL, value, /*size */ 0);
+}
+
+void test_memset_inline_non_constant_size(void *dst, char value, unsigned size) {
+ __builtin_memset_inline(dst, value, size); // expected-error {{argument to '__builtin_memset_inline' must be a constant integer}}
+}
+
+template <unsigned size>
+void test_memset_inline_template(void *dst, char value) {
+ // We do not try to evaluate `size` in non-instantiated templates.
+ __builtin_memset_inline(dst, value, size);
+}
+
+void test_memset_inline_implicit_conversion(void *ptr, char value) {
+ char a[5];
+ __builtin_memset_inline(a, value, 5);
+}
+
+void test_memset_inline_num_args(void *dst, char value) {
+ __builtin_memset_inline(); // expected-error {{too few arguments to function call}}
+ __builtin_memset_inline(dst, value, 4, NULL); // expected-error {{too many arguments to function call}}
+}
If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the
behavior is undefined.
+.. _int_memset_inline:
+
+'``llvm.memset.inline``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memset.inline`` on any
+integer bit width and for different address spaces. Not all targets support
+all bit widths, however.
+
+::
+
+ declare void @llvm.memset.inline.p0i8.i32(i8* <dest>, i8 <val>,
+ i32 <len>,
+ i1 <isvolatile>)
+ declare void @llvm.memset.inline.p0i8.i64(i8* <dest>, i8 <val>,
+ i64 <len>,
+ i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.memset.inline.*``' intrinsics fill a block of memory with a
+particular byte value and guarantee that no external functions are called.
+
+Note that, unlike the standard libc function, the ``llvm.memset.inline.*``
+intrinsics do not return a value, take an extra ``isvolatile`` argument, and
+the destination pointer can be in a specified address space.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination to fill, the second is the
+byte value with which to fill it, the third is a constant integer specifying
+the number of bytes to fill, and the fourth is a boolean indicating whether
+the access is volatile.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the first argument.
+
+If the ``isvolatile`` parameter is ``true``, the ``llvm.memset.inline`` call is
+a :ref:`volatile operation <volatile>`. The detailed access behavior is not
+very cleanly specified and it is unwise to depend on it.
+
+Semantics:
+""""""""""
+
+The '``llvm.memset.inline.*``' intrinsics fill "len" bytes of memory starting
+at the destination location. If the argument is known to be
+aligned to some boundary, this can be specified as an attribute on
+the argument.
+
+``<len>`` must be a constant expression.
+If ``<len>`` is 0, it is a no-op modulo the behavior of attributes attached to
+the arguments.
+If ``<len>`` is not a well-defined value, the behavior is undefined.
+If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the
+behavior is undefined.
+
+The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of
+'``llvm.memset.*``', but the generated code is guaranteed not to call any
+external functions.
+
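+For illustration, a call that fills 32 bytes with zero through a 16-byte
+aligned pointer ``%dst`` (using the ``i64`` length overload declared above)
+could look like this:
+
+::
+
+      call void @llvm.memset.inline.p0i8.i64(i8* align 16 %dst, i8 0, i64 32, i1 false)
+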
'``llvm.sqrt.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
const AAMDNodes &AAInfo = AAMDNodes());
SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
- SDValue Size, Align Alignment, bool isVol, bool isTailCall,
+ SDValue Size, Align Alignment, bool isVol,
+ bool AlwaysInline, bool isTailCall,
MachinePointerInfo DstPtrInfo,
const AAMDNodes &AAInfo = AAMDNodes());
/// that don't fit the target's parameters for simple stores and can be more
/// efficient than using a library call. This function can return a null
/// SDValue if the target declines to use custom code and a different
- /// lowering strategy should be used.
+ /// lowering strategy should be used. Note that if AlwaysInline is true the
+ /// function has to return a valid SDValue.
virtual SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1,
SDValue Op2, SDValue Op3,
Align Alignment, bool isVolatile,
+ bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const {
return SDValue();
}
/// Determines the optimal series of memory ops to replace the memset / memcpy.
/// Return true if the number of memory ops is below the threshold (Limit).
+ /// Note that this is always the case when Limit is ~0.
/// It returns the types of the sequence of memory ops to perform
/// memset / memcpy by reference.
virtual bool
MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr);
+ CallInst *CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Val,
+ Value *Size, bool IsVolatile = false,
+ MDNode *TBAATag = nullptr,
+ MDNode *ScopeTag = nullptr,
+ MDNode *NoAliasTag = nullptr);
+
/// Create and insert an element unordered-atomic memset of the region of
/// memory starting at the given pointer to the given value.
///
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset:
+ case Intrinsic::memset_inline:
case Intrinsic::memcpy_inline:
return true;
default:
}
};
-/// This class wraps the llvm.memset intrinsic.
+/// This class wraps the llvm.memset and llvm.memset.inline intrinsics.
class MemSetInst : public MemSetBase<MemIntrinsic> {
public:
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const IntrinsicInst *I) {
- return I->getIntrinsicID() == Intrinsic::memset;
+ switch (I->getIntrinsicID()) {
+ case Intrinsic::memset:
+ case Intrinsic::memset_inline:
+ return true;
+ default:
+ return false;
+ }
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This class wraps the llvm.memset.inline intrinsic.
+class MemSetInlineInst : public MemSetInst {
+public:
+ ConstantInt *getLength() const {
+ return cast<ConstantInt>(MemSetInst::getLength());
+ }
+ // Methods for support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::memset_inline;
}
static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
case Intrinsic::memcpy_inline:
case Intrinsic::memmove:
case Intrinsic::memset:
+ case Intrinsic::memset_inline:
case Intrinsic::memcpy_element_unordered_atomic:
case Intrinsic::memmove_element_unordered_atomic:
case Intrinsic::memset_element_unordered_atomic:
static bool classof(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
case Intrinsic::memset:
+ case Intrinsic::memset_inline:
case Intrinsic::memset_element_unordered_atomic:
return true;
default:
NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
ImmArg<ArgIndex<3>>]>;
+// Memset version that is guaranteed to be inlined.
+// In particular this means that the generated code is not allowed to call any
+// external function.
+// The third argument (specifying the size) must be a constant.
+def int_memset_inline
+ : Intrinsic<[],
+ [llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i1_ty],
+ [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree,
+ NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+ ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
// FIXME: Add version of these floating point intrinsics which allow non-default
// rounding modes and FP exception handling.
MSI->getDestAlign(), nullptr, MemRef::Write);
break;
}
+ case Intrinsic::memset_inline: {
+ MemSetInlineInst *MSII = cast<MemSetInlineInst>(&I);
+ visitMemoryReference(I, MemoryLocation::getForDest(MSII),
+ MSII->getDestAlign(), nullptr, MemRef::Write);
+ break;
+ }
case Intrinsic::vastart:
Check(I.getParent()->getParent()->isVarArg(),
/// \param Size Number of bytes to write.
/// \param Alignment Alignment of the destination in bytes.
/// \param isVol True if destination is volatile.
+/// \param AlwaysInline Makes sure no function call is generated.
/// \param DstPtrInfo IR information on the memory pointer.
/// \returns New head in the control flow, if lowering was successful, empty
/// SDValue otherwise.
///
/// The function tries to replace 'llvm.memset' intrinsic with several store
/// operations and value calculation code. This is usually profitable for small
-/// memory size.
+/// memory size or when the semantics require inlining.
static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
uint64_t Size, Align Alignment, bool isVol,
- MachinePointerInfo DstPtrInfo,
+ bool AlwaysInline, MachinePointerInfo DstPtrInfo,
const AAMDNodes &AAInfo) {
// Turn a memset of undef to nop.
// FIXME: We need to honor volatile even if Src is undef.
DstAlignCanChange = true;
bool IsZeroVal =
isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isZero();
+ unsigned Limit = AlwaysInline ? ~0 : TLI.getMaxStoresPerMemset(OptSize);
+
if (!TLI.findOptimalMemOpLowering(
- MemOps, TLI.getMaxStoresPerMemset(OptSize),
+ MemOps, Limit,
MemOp::Set(Size, DstAlignCanChange, Alignment, IsZeroVal, isVol),
DstPtrInfo.getAddrSpace(), ~0u, MF.getFunction().getAttributes()))
return SDValue();
SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
SDValue Src, SDValue Size, Align Alignment,
- bool isVol, bool isTailCall,
+ bool isVol, bool AlwaysInline, bool isTailCall,
MachinePointerInfo DstPtrInfo,
const AAMDNodes &AAInfo) {
// Check to see if we should lower the memset to stores first.
SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
ConstantSize->getZExtValue(), Alignment,
- isVol, DstPtrInfo, AAInfo);
+ isVol, false, DstPtrInfo, AAInfo);
if (Result.getNode())
return Result;
// code. If the target chooses to do this, this is the next best.
if (TSI) {
SDValue Result = TSI->EmitTargetCodeForMemset(
- *this, dl, Chain, Dst, Src, Size, Alignment, isVol, DstPtrInfo);
+ *this, dl, Chain, Dst, Src, Size, Alignment, isVol, AlwaysInline, DstPtrInfo);
if (Result.getNode())
return Result;
}
+ // If we really need inline code and the target declined to provide it,
+ // use a (potentially long) sequence of stores.
+ if (AlwaysInline) {
+ assert(ConstantSize && "AlwaysInline requires a constant size!");
+ SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(), Alignment,
+ isVol, true, DstPtrInfo, AAInfo);
+ assert(Result &&
+ "getMemsetStores must return a valid sequence when AlwaysInline");
+ return Result;
+ }
+
checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
// Emit a library call.
bool isVol = MSI.isVolatile();
bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Alignment, isVol, isTC,
+ SDValue MS = DAG.getMemset(
+ Root, sdl, Op1, Op2, Op3, Alignment, isVol, /* AlwaysInline */ false,
+ isTC, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata());
+ updateDAGForMaybeTailCall(MS);
+ return;
+ }
+ case Intrinsic::memset_inline: {
+ const auto &MSII = cast<MemSetInlineInst>(I);
+ SDValue Dst = getValue(I.getArgOperand(0));
+ SDValue Value = getValue(I.getArgOperand(1));
+ SDValue Size = getValue(I.getArgOperand(2));
+ assert(isa<ConstantSDNode>(Size) && "memset_inline needs constant size");
+ // @llvm.memset defines 0 and 1 to both mean no alignment.
+ Align DstAlign = MSII.getDestAlign().valueOrOne();
+ bool isVol = MSII.isVolatile();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ SDValue Root = isVol ? getRoot() : getMemoryRoot();
+ SDValue MC = DAG.getMemset(Root, sdl, Dst, Value, Size, DstAlign, isVol,
+ /* AlwaysInline */ true, isTC,
MachinePointerInfo(I.getArgOperand(0)),
I.getAAMetadata());
- updateDAGForMaybeTailCall(MS);
+ updateDAGForMaybeTailCall(MC);
return;
}
case Intrinsic::memmove: {
bool TargetLowering::findOptimalMemOpLowering(
std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
unsigned SrcAS, const AttributeList &FuncAttributes) const {
- if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
+ if (Limit != ~unsigned(0) && Op.isMemcpyWithFixedDstAlign() &&
+ Op.getSrcAlign() < Op.getDstAlign())
return false;
EVT VT = getOptimalMemOpType(Op, FuncAttributes);
return CI;
}
+CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign,
+ Value *Val, Value *Size,
+ bool IsVolatile, MDNode *TBAATag,
+ MDNode *ScopeTag,
+ MDNode *NoAliasTag) {
+ Dst = getCastedInt8PtrValue(Dst);
+ Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)};
+ Type *Tys[] = {Dst->getType(), Size->getType()};
+ Module *M = BB->getParent()->getParent();
+ Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset_inline, Tys);
+
+ CallInst *CI = createCallHelper(TheFn, Ops, this);
+
+ if (DstAlign)
+ cast<MemSetInlineInst>(CI)->setDestAlignment(*DstAlign);
+
+ // Set the TBAA info if present.
+ if (TBAATag)
+ CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+ if (ScopeTag)
+ CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+
+ if (NoAliasTag)
+ CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+
+ return CI;
+}
+
CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet(
Value *Ptr, Value *Val, Value *Size, Align Alignment, uint32_t ElementSize,
MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) {
case Intrinsic::memcpy:
case Intrinsic::memcpy_inline:
case Intrinsic::memmove:
- case Intrinsic::memset: {
+ case Intrinsic::memset:
+ case Intrinsic::memset_inline: {
const auto *MI = cast<MemIntrinsic>(&Call);
auto IsValidAlignment = [&](unsigned Alignment) -> bool {
return Alignment == 0 || isPowerOf2_32(Alignment);
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, Align Alignment, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const {
const AArch64Subtarget &STI =
DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment,
- bool isVolatile,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const override;
SDValue
EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, Align Alignment, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const {
const ARMSubtarget &Subtarget =
DAG.getZExtOrTrunc(Size, dl, MVT::i32));
}
- return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
- Alignment.value(), RTLIB::MEMSET);
+ if (!AlwaysInline)
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+ Alignment.value(), RTLIB::MEMSET);
+
+ return SDValue();
}
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1, SDValue Op2,
SDValue Op3, Align Alignment, bool isVolatile,
+ bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const override;
SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl,
unsigned SrcAS, const AttributeList &FuncAttributes) const {
const int MVCFastLen = 16;
- // Don't expand Op into scalar loads/stores in these cases:
- if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
- return false; // Small memcpy: Use MVC
- if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
- return false; // Small memset (first byte with STC/MVI): Use MVC
- if (Op.isZeroMemset())
- return false; // Memset zero: Use XC
+ if (Limit != ~unsigned(0)) {
+ // Don't expand Op into scalar loads/stores in these cases:
+ if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
+ return false; // Small memcpy: Use MVC
+ if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
+ return false; // Small memset (first byte with STC/MVI): Use MVC
+ if (Op.isZeroMemset())
+ return false; // Memset zero: Use XC
+ }
return TargetLowering::findOptimalMemOpLowering(MemOps, Limit, Op, DstAS,
SrcAS, FuncAttributes);
SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst,
SDValue Byte, SDValue Size, Align Alignment, bool IsVolatile,
- MachinePointerInfo DstPtrInfo) const {
+ bool AlwaysInline, MachinePointerInfo DstPtrInfo) const {
EVT PtrVT = Dst.getValueType();
if (IsVolatile)
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
SDValue Chain, SDValue Dst, SDValue Byte,
SDValue Size, Align Alignment,
- bool IsVolatile,
+ bool IsVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const override;
std::pair<SDValue, SDValue>
SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Val,
- SDValue Size, Align Alignment, bool IsVolatile,
+ SDValue Size, Align Alignment, bool IsVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const {
auto &ST = DAG.getMachineFunction().getSubtarget<WebAssemblySubtarget>();
if (!ST.hasBulkMemory())
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
SDValue Chain, SDValue Op1, SDValue Op2,
SDValue Op3, Align Alignment, bool IsVolatile,
+ bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const override;
};
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
- SDValue Size, Align Alignment, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
const X86Subtarget &Subtarget =
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
DAG.getConstant(Offset, dl, AddrVT)),
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
- isVolatile, false, DstPtrInfo.getWithOffset(Offset));
+ isVolatile, AlwaysInline,
+ /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
}
// TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment,
- bool isVolatile,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const override;
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=-neon | FileCheck %s --check-prefixes=ALL,GPR
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=neon | FileCheck %s --check-prefixes=ALL,NEON
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @memset_1(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_1:
+; ALL: // %bb.0:
+; ALL-NEXT: strb w1, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1, i1 0)
+ ret void
+}
+
+define void @memset_2(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_2:
+; ALL: // %bb.0:
+; ALL-NEXT: bfi w1, w1, #8, #24
+; ALL-NEXT: strh w1, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 2, i1 0)
+ ret void
+}
+
+define void @memset_4(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_4:
+; ALL: // %bb.0:
+; ALL-NEXT: mov w8, #16843009
+; ALL-NEXT: and w9, w1, #0xff
+; ALL-NEXT: mul w8, w9, w8
+; ALL-NEXT: str w8, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 4, i1 0)
+ ret void
+}
+
+define void @memset_8(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_8:
+; ALL: // %bb.0:
+; ALL-NEXT: // kill: def $w1 killed $w1 def $x1
+; ALL-NEXT: mov x8, #72340172838076673
+; ALL-NEXT: and x9, x1, #0xff
+; ALL-NEXT: mul x8, x9, x8
+; ALL-NEXT: str x8, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0)
+ ret void
+}
+
+define void @memset_16(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_16:
+; ALL: // %bb.0:
+; ALL-NEXT: // kill: def $w1 killed $w1 def $x1
+; ALL-NEXT: mov x8, #72340172838076673
+; ALL-NEXT: and x9, x1, #0xff
+; ALL-NEXT: mul x8, x9, x8
+; ALL-NEXT: stp x8, x8, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 16, i1 0)
+ ret void
+}
+
+define void @memset_32(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_32:
+; GPR: // %bb.0:
+; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT: mov x8, #72340172838076673
+; GPR-NEXT: and x9, x1, #0xff
+; GPR-NEXT: mul x8, x9, x8
+; GPR-NEXT: stp x8, x8, [x0, #16]
+; GPR-NEXT: stp x8, x8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_32:
+; NEON: // %bb.0:
+; NEON-NEXT: dup v0.16b, w1
+; NEON-NEXT: stp q0, q0, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 32, i1 0)
+ ret void
+}
+
+define void @memset_64(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_64:
+; GPR: // %bb.0:
+; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT: mov x8, #72340172838076673
+; GPR-NEXT: and x9, x1, #0xff
+; GPR-NEXT: mul x8, x9, x8
+; GPR-NEXT: stp x8, x8, [x0, #48]
+; GPR-NEXT: stp x8, x8, [x0, #32]
+; GPR-NEXT: stp x8, x8, [x0, #16]
+; GPR-NEXT: stp x8, x8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_64:
+; NEON: // %bb.0:
+; NEON-NEXT: dup v0.16b, w1
+; NEON-NEXT: stp q0, q0, [x0]
+; NEON-NEXT: stp q0, q0, [x0, #32]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 64, i1 0)
+ ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @aligned_memset_16(i8* align 16 %a, i8 %value) nounwind {
+; ALL-LABEL: aligned_memset_16:
+; ALL: // %bb.0:
+; ALL-NEXT: // kill: def $w1 killed $w1 def $x1
+; ALL-NEXT: mov x8, #72340172838076673
+; ALL-NEXT: and x9, x1, #0xff
+; ALL-NEXT: mul x8, x9, x8
+; ALL-NEXT: stp x8, x8, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 %value, i64 16, i1 0)
+ ret void
+}
+
+define void @aligned_memset_32(i8* align 32 %a, i8 %value) nounwind {
+; GPR-LABEL: aligned_memset_32:
+; GPR: // %bb.0:
+; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT: mov x8, #72340172838076673
+; GPR-NEXT: and x9, x1, #0xff
+; GPR-NEXT: mul x8, x9, x8
+; GPR-NEXT: stp x8, x8, [x0, #16]
+; GPR-NEXT: stp x8, x8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: aligned_memset_32:
+; NEON: // %bb.0:
+; NEON-NEXT: dup v0.16b, w1
+; NEON-NEXT: stp q0, q0, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 %value, i64 32, i1 0)
+ ret void
+}
+
+define void @aligned_memset_64(i8* align 64 %a, i8 %value) nounwind {
+; GPR-LABEL: aligned_memset_64:
+; GPR: // %bb.0:
+; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT: mov x8, #72340172838076673
+; GPR-NEXT: and x9, x1, #0xff
+; GPR-NEXT: mul x8, x9, x8
+; GPR-NEXT: stp x8, x8, [x0, #48]
+; GPR-NEXT: stp x8, x8, [x0, #32]
+; GPR-NEXT: stp x8, x8, [x0, #16]
+; GPR-NEXT: stp x8, x8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: aligned_memset_64:
+; NEON: // %bb.0:
+; NEON-NEXT: dup v0.16b, w1
+; NEON-NEXT: stp q0, q0, [x0]
+; NEON-NEXT: stp q0, q0, [x0, #32]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 %value, i64 64, i1 0)
+ ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @bzero_1(i8* %a) nounwind {
+; ALL-LABEL: bzero_1:
+; ALL: // %bb.0:
+; ALL-NEXT: strb wzr, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 1, i1 0)
+ ret void
+}
+
+define void @bzero_2(i8* %a) nounwind {
+; ALL-LABEL: bzero_2:
+; ALL: // %bb.0:
+; ALL-NEXT: strh wzr, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 2, i1 0)
+ ret void
+}
+
+define void @bzero_4(i8* %a) nounwind {
+; ALL-LABEL: bzero_4:
+; ALL: // %bb.0:
+; ALL-NEXT: str wzr, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 4, i1 0)
+ ret void
+}
+
+define void @bzero_8(i8* %a) nounwind {
+; ALL-LABEL: bzero_8:
+; ALL: // %bb.0:
+; ALL-NEXT: str xzr, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 8, i1 0)
+ ret void
+}
+
+define void @bzero_16(i8* %a) nounwind {
+; ALL-LABEL: bzero_16:
+; ALL: // %bb.0:
+; ALL-NEXT: stp xzr, xzr, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 16, i1 0)
+ ret void
+}
+
+define void @bzero_32(i8* %a) nounwind {
+; GPR-LABEL: bzero_32:
+; GPR: // %bb.0:
+; GPR-NEXT: adrp x8, .LCPI15_0
+; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI15_0]
+; GPR-NEXT: stp q0, q0, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: bzero_32:
+; NEON: // %bb.0:
+; NEON-NEXT: movi v0.2d, #0000000000000000
+; NEON-NEXT: stp q0, q0, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 32, i1 0)
+ ret void
+}
+
+define void @bzero_64(i8* %a) nounwind {
+; GPR-LABEL: bzero_64:
+; GPR: // %bb.0:
+; GPR-NEXT: adrp x8, .LCPI16_0
+; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI16_0]
+; GPR-NEXT: stp q0, q0, [x0]
+; GPR-NEXT: stp q0, q0, [x0, #32]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: bzero_64:
+; NEON: // %bb.0:
+; NEON-NEXT: movi v0.2d, #0000000000000000
+; NEON-NEXT: stp q0, q0, [x0]
+; NEON-NEXT: stp q0, q0, [x0, #32]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 64, i1 0)
+ ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @aligned_bzero_16(i8* %a) nounwind {
+; ALL-LABEL: aligned_bzero_16:
+; ALL: // %bb.0:
+; ALL-NEXT: stp xzr, xzr, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 0, i64 16, i1 0)
+ ret void
+}
+
+define void @aligned_bzero_32(i8* %a) nounwind {
+; GPR-LABEL: aligned_bzero_32:
+; GPR: // %bb.0:
+; GPR-NEXT: adrp x8, .LCPI18_0
+; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI18_0]
+; GPR-NEXT: stp q0, q0, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: aligned_bzero_32:
+; NEON: // %bb.0:
+; NEON-NEXT: movi v0.2d, #0000000000000000
+; NEON-NEXT: stp q0, q0, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 0)
+ ret void
+}
+
+define void @aligned_bzero_64(i8* %a) nounwind {
+; GPR-LABEL: aligned_bzero_64:
+; GPR: // %bb.0:
+; GPR-NEXT: adrp x8, .LCPI19_0
+; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI19_0]
+; GPR-NEXT: stp q0, q0, [x0]
+; GPR-NEXT: stp q0, q0, [x0, #32]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: aligned_bzero_64:
+; NEON: // %bb.0:
+; NEON-NEXT: movi v0.2d, #0000000000000000
+; NEON-NEXT: stp q0, q0, [x0]
+; NEON-NEXT: stp q0, q0, [x0, #32]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 0)
+ ret void
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
+define void @test1(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: mov x8, #72340172838076673
+; CHECK-NEXT: and x9, x1, #0xff
+; CHECK-NEXT: mul x8, x9, x8
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0)
+ ret void
+}
+
+define void @regular_memset_calls_external_function(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: regular_memset_calls_external_function:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w2, #1024
+; CHECK-NEXT: b memset
+ tail call void @llvm.memset.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0)
+ ret void
+}
+
+define void @inlined_set_doesnt_call_external_function(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: inlined_set_doesnt_call_external_function:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.16b, w1
+; CHECK-NEXT: stp q0, q0, [x0]
+; CHECK-NEXT: stp q0, q0, [x0, #32]
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0)
+ ret void
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2,-sse4.2 | FileCheck %s --check-prefixes=GPR,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.2,-avx | FileCheck %s --check-prefixes=GPR,SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx,-avx512f | FileCheck %s --check-prefixes=GPR,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=GPR,AVX512
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @memset_1(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_1:
+; GPR: # %bb.0:
+; GPR-NEXT: movb %sil, (%rdi)
+; GPR-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1, i1 0)
+ ret void
+}
+
+define void @memset_2(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_2:
+; GPR: # %bb.0:
+; GPR-NEXT: movzbl %sil, %eax
+; GPR-NEXT: shll $8, %esi
+; GPR-NEXT: orl %esi, %eax
+; GPR-NEXT: movw %ax, (%rdi)
+; GPR-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 2, i1 0)
+ ret void
+}
+
+define void @memset_4(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_4:
+; GPR: # %bb.0:
+; GPR-NEXT: movzbl %sil, %eax
+; GPR-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
+; GPR-NEXT: movl %eax, (%rdi)
+; GPR-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 4, i1 0)
+ ret void
+}
+
+define void @memset_8(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_8:
+; GPR: # %bb.0:
+; GPR-NEXT: # kill: def $esi killed $esi def $rsi
+; GPR-NEXT: movzbl %sil, %eax
+; GPR-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; GPR-NEXT: imulq %rax, %rcx
+; GPR-NEXT: movq %rcx, (%rdi)
+; GPR-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0)
+ ret void
+}
+
+define void @memset_16(i8* %a, i8 %value) nounwind {
+; SSE2-LABEL: memset_16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def $esi killed $esi def $rsi
+; SSE2-NEXT: movzbl %sil, %eax
+; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE2-NEXT: imulq %rax, %rcx
+; SSE2-NEXT: movq %rcx, 8(%rdi)
+; SSE2-NEXT: movq %rcx, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: memset_16:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movd %esi, %xmm0
+; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pshufb %xmm1, %xmm0
+; SSE4-NEXT: movdqu %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: memset_16:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd %esi, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: memset_16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd %esi, %xmm0
+; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 16, i1 0)
+ ret void
+}
+
+define void @memset_32(i8* %a, i8 %value) nounwind {
+; SSE2-LABEL: memset_32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def $esi killed $esi def $rsi
+; SSE2-NEXT: movzbl %sil, %eax
+; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE2-NEXT: imulq %rax, %rcx
+; SSE2-NEXT: movq %rcx, 24(%rdi)
+; SSE2-NEXT: movq %rcx, 16(%rdi)
+; SSE2-NEXT: movq %rcx, 8(%rdi)
+; SSE2-NEXT: movq %rcx, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: memset_32:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movd %esi, %xmm0
+; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pshufb %xmm1, %xmm0
+; SSE4-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE4-NEXT: movdqu %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: memset_32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd %esi, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqu %xmm0, 16(%rdi)
+; AVX-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: memset_32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd %esi, %xmm0
+; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 32, i1 0)
+ ret void
+}
+
+define void @memset_64(i8* %a, i8 %value) nounwind {
+; SSE2-LABEL: memset_64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def $esi killed $esi def $rsi
+; SSE2-NEXT: movzbl %sil, %eax
+; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE2-NEXT: imulq %rax, %rcx
+; SSE2-NEXT: movq %rcx, 56(%rdi)
+; SSE2-NEXT: movq %rcx, 48(%rdi)
+; SSE2-NEXT: movq %rcx, 40(%rdi)
+; SSE2-NEXT: movq %rcx, 32(%rdi)
+; SSE2-NEXT: movq %rcx, 24(%rdi)
+; SSE2-NEXT: movq %rcx, 16(%rdi)
+; SSE2-NEXT: movq %rcx, 8(%rdi)
+; SSE2-NEXT: movq %rcx, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: memset_64:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movd %esi, %xmm0
+; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pshufb %xmm1, %xmm0
+; SSE4-NEXT: movdqu %xmm0, 48(%rdi)
+; SSE4-NEXT: movdqu %xmm0, 32(%rdi)
+; SSE4-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE4-NEXT: movdqu %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: memset_64:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd %esi, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: memset_64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movzbl %sil, %eax
+; AVX512-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
+; AVX512-NEXT: vpbroadcastd %eax, %zmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 64, i1 0)
+ ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @aligned_memset_16(i8* align 16 %a, i8 %value) nounwind {
+; SSE2-LABEL: aligned_memset_16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: aligned_memset_16:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movd %esi, %xmm0
+; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pshufb %xmm1, %xmm0
+; SSE4-NEXT: movdqa %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: aligned_memset_16:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd %esi, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: aligned_memset_16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd %esi, %xmm0
+; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 %value, i64 16, i1 0)
+ ret void
+}
+
+define void @aligned_memset_32(i8* align 32 %a, i8 %value) nounwind {
+; SSE2-LABEL: aligned_memset_32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: aligned_memset_32:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movd %esi, %xmm0
+; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pshufb %xmm1, %xmm0
+; SSE4-NEXT: movdqa %xmm0, 16(%rdi)
+; SSE4-NEXT: movdqa %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: aligned_memset_32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd %esi, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdi)
+; AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: aligned_memset_32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd %esi, %xmm0
+; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512-NEXT: vmovdqa %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 %value, i64 32, i1 0)
+ ret void
+}
+
+define void @aligned_memset_64(i8* align 64 %a, i8 %value) nounwind {
+; SSE2-LABEL: aligned_memset_64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, 48(%rdi)
+; SSE2-NEXT: movdqa %xmm0, 32(%rdi)
+; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: aligned_memset_64:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movd %esi, %xmm0
+; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pshufb %xmm1, %xmm0
+; SSE4-NEXT: movdqa %xmm0, 48(%rdi)
+; SSE4-NEXT: movdqa %xmm0, 32(%rdi)
+; SSE4-NEXT: movdqa %xmm0, 16(%rdi)
+; SSE4-NEXT: movdqa %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: aligned_memset_64:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd %esi, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vmovaps %ymm0, 32(%rdi)
+; AVX-NEXT: vmovaps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: aligned_memset_64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movzbl %sil, %eax
+; AVX512-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
+; AVX512-NEXT: vpbroadcastd %eax, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 %value, i64 64, i1 0)
+ ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @bzero_1(i8* %a) nounwind {
+; GPR-LABEL: bzero_1:
+; GPR: # %bb.0:
+; GPR-NEXT: movb $0, (%rdi)
+; GPR-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 1, i1 0)
+ ret void
+}
+
+define void @bzero_2(i8* %a) nounwind {
+; GPR-LABEL: bzero_2:
+; GPR: # %bb.0:
+; GPR-NEXT: movw $0, (%rdi)
+; GPR-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 2, i1 0)
+ ret void
+}
+
+define void @bzero_4(i8* %a) nounwind {
+; GPR-LABEL: bzero_4:
+; GPR: # %bb.0:
+; GPR-NEXT: movl $0, (%rdi)
+; GPR-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 4, i1 0)
+ ret void
+}
+
+define void @bzero_8(i8* %a) nounwind {
+; GPR-LABEL: bzero_8:
+; GPR: # %bb.0:
+; GPR-NEXT: movq $0, (%rdi)
+; GPR-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 8, i1 0)
+ ret void
+}
+
+define void @bzero_16(i8* %a) nounwind {
+; SSE2-LABEL: bzero_16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq $0, 8(%rdi)
+; SSE2-NEXT: movq $0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: bzero_16:
+; SSE4: # %bb.0:
+; SSE4-NEXT: xorps %xmm0, %xmm0
+; SSE4-NEXT: movups %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: bzero_16:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: bzero_16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 16, i1 0)
+ ret void
+}
+
+define void @bzero_32(i8* %a) nounwind {
+; SSE2-LABEL: bzero_32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq $0, 24(%rdi)
+; SSE2-NEXT: movq $0, 16(%rdi)
+; SSE2-NEXT: movq $0, 8(%rdi)
+; SSE2-NEXT: movq $0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: bzero_32:
+; SSE4: # %bb.0:
+; SSE4-NEXT: xorps %xmm0, %xmm0
+; SSE4-NEXT: movups %xmm0, 16(%rdi)
+; SSE4-NEXT: movups %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: bzero_32:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: bzero_32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 32, i1 0)
+ ret void
+}
+
+define void @bzero_64(i8* %a) nounwind {
+; SSE2-LABEL: bzero_64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq $0, 56(%rdi)
+; SSE2-NEXT: movq $0, 48(%rdi)
+; SSE2-NEXT: movq $0, 40(%rdi)
+; SSE2-NEXT: movq $0, 32(%rdi)
+; SSE2-NEXT: movq $0, 24(%rdi)
+; SSE2-NEXT: movq $0, 16(%rdi)
+; SSE2-NEXT: movq $0, 8(%rdi)
+; SSE2-NEXT: movq $0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: bzero_64:
+; SSE4: # %bb.0:
+; SSE4-NEXT: xorps %xmm0, %xmm0
+; SSE4-NEXT: movups %xmm0, 48(%rdi)
+; SSE4-NEXT: movups %xmm0, 32(%rdi)
+; SSE4-NEXT: movups %xmm0, 16(%rdi)
+; SSE4-NEXT: movups %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: bzero_64:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: bzero_64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 64, i1 0)
+ ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @aligned_bzero_16(i8* %a) nounwind {
+; SSE2-LABEL: aligned_bzero_16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: aligned_bzero_16:
+; SSE4: # %bb.0:
+; SSE4-NEXT: xorps %xmm0, %xmm0
+; SSE4-NEXT: movaps %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: aligned_bzero_16:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: aligned_bzero_16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 0, i64 16, i1 0)
+ ret void
+}
+
+define void @aligned_bzero_32(i8* %a) nounwind {
+; SSE2-LABEL: aligned_bzero_32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, 16(%rdi)
+; SSE2-NEXT: movaps %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: aligned_bzero_32:
+; SSE4: # %bb.0:
+; SSE4-NEXT: xorps %xmm0, %xmm0
+; SSE4-NEXT: movaps %xmm0, 16(%rdi)
+; SSE4-NEXT: movaps %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: aligned_bzero_32:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovaps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: aligned_bzero_32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 0)
+ ret void
+}
+
+define void @aligned_bzero_64(i8* %a) nounwind {
+; SSE2-LABEL: aligned_bzero_64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, 48(%rdi)
+; SSE2-NEXT: movaps %xmm0, 32(%rdi)
+; SSE2-NEXT: movaps %xmm0, 16(%rdi)
+; SSE2-NEXT: movaps %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: aligned_bzero_64:
+; SSE4: # %bb.0:
+; SSE4-NEXT: xorps %xmm0, %xmm0
+; SSE4-NEXT: movaps %xmm0, 48(%rdi)
+; SSE4-NEXT: movaps %xmm0, 32(%rdi)
+; SSE4-NEXT: movaps %xmm0, 16(%rdi)
+; SSE4-NEXT: movaps %xmm0, (%rdi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: aligned_bzero_64:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovaps %ymm0, 32(%rdi)
+; AVX-NEXT: vmovaps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: aligned_bzero_64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 0)
+ ret void
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
+define void @test1(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; CHECK-NEXT: imulq %rax, %rcx
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: retq
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0)
+ ret void
+}
+
+define void @regular_memset_calls_external_function(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: regular_memset_calls_external_function:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl $1024, %edx # imm = 0x400
+; CHECK-NEXT: jmp memset@PLT # TAILCALL
+ tail call void @llvm.memset.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0)
+ ret void
+}
+
+define void @inlined_set_doesnt_call_external_function(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: inlined_set_doesnt_call_external_function:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: movzbl %sil, %ecx
+; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; CHECK-NEXT: imulq %rcx, %rax
+; CHECK-NEXT: movq %rax, 1016(%rdi)
+; CHECK-NEXT: movq %rax, 1008(%rdi)
+ tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0)
+ ret void
+}
declare void @llvm.stackrestore(i8*)
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+declare void @llvm.memset.p0i8.i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i8.i64(i8* nocapture, i8, i64, i1) nounwind
declare void @has_sret(i8* sret(i8) %p)
declare void @has_noaliases(i32* noalias %p, i32* %q)
declare void @one_arg(i32)
; CHECK: Unusual: noalias argument aliases another argument
call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (i32* @CG to i8*), i8* bitcast (i32* @CG to i8*), i64 1, i1 0)
+; CHECK: Write to read-only memory
+call void @llvm.memset.p0i8.i8.i64(i8* bitcast (i32* @CG to i8*), i8 1, i64 1, i1 0)
+; CHECK: Write to read-only memory
+call void @llvm.memset.inline.p0i8.i8.i64(i8* bitcast (i32* @CG to i8*), i8 1, i64 1, i1 0)
+
; CHECK: Undefined behavior: Buffer overflow
%wider = bitcast i8* %buf to i16*
store i16 0, i16* %wider
ret void
}
+declare void @llvm.memset.inline.p0i8.i32(i8* nocapture, i8, i32, i1)
+define void @memset_inline_is_volatile(i8* %dest, i8 %value, i1 %is.volatile) {
+ ; CHECK: immarg operand has non-immediate parameter
+ ; CHECK-NEXT: i1 %is.volatile
+ ; CHECK-NEXT: call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 8, i1 %is.volatile)
+ call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 8, i1 %is.volatile)
+ ret void
+}
+
+define void @memset_inline_variable_size(i8* %dest, i8 %value, i32 %size) {
+ ; CHECK: immarg operand has non-immediate parameter
+ ; CHECK-NEXT: i32 %size
+ ; CHECK-NEXT: call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 %size, i1 true)
+ call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 %size, i1 true)
+ ret void
+}
+
declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1, i1)
define void @objectsize(i8* %ptr, i1 %a, i1 %b, i1 %c) {
--- /dev/null
+; RUN: not opt -verify < %s 2>&1 | FileCheck %s
+
+; CHECK: alignment is not a power of two
+
+define void @foo(i8* %P, i8 %value) {
+ call void @llvm.memset.inline.p0i8.i32(i8* align 3 %P, i8 %value, i32 4, i1 false)
+ ret void
+}
+declare void @llvm.memset.inline.p0i8.i32(i8* nocapture, i8, i32, i1) nounwind