// Fallback: when the compiler offers no feature-detection hook, report every
// queried feature as absent.
#define LLVM_LIBC_HAS_FEATURE(FEATURE) 0
#endif
// Portable "keep this loop rolled" annotation (added lines below):
//   - Clang: _Pragma("nounroll")
//   - GCC:   _Pragma("GCC unroll 0") (closest equivalent)
//   - other: expands to nothing (loop may be unrolled)
// This lets call sites drop the Clang-only `#pragma nounroll`.
+#if defined(LLVM_LIBC_COMPILER_CLANG)
+#define LLVM_LIBC_LOOP_NOUNROLL _Pragma("nounroll")
+#elif defined(LLVM_LIBC_COMPILER_GCC)
+#define LLVM_LIBC_LOOP_NOUNROLL _Pragma("GCC unroll 0")
+#else
+#define LLVM_LIBC_LOOP_NOUNROLL
+#endif
+
#endif // LLVM_LIBC_SUPPORT_COMPILER_FEATURES_H
// Size-optimized bcmp: one byte per iteration, loop deliberately kept rolled.
// The hunk swaps the Clang-only `#pragma nounroll` for the portable
// LLVM_LIBC_LOOP_NOUNROLL macro; the loop body is unchanged.
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
  for (size_t offset = 0; offset < count; ++offset)
    if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset))
      return value;
// NOTE(review): hunk ends mid-function — the final "equal" return and the
// closing brace are outside this chunk.
namespace __llvm_libc {
// Size-optimized memcmp: byte-at-a-time, loop kept rolled via the portable
// macro (replacing the Clang-only pragma).
[[maybe_unused]] static inline MemcmpReturnType
inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
  for (size_t offset = 0; offset < count; ++offset)
    if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
      return value;
}
// NOTE(review): the two lines below belong to a different routine (an x86
// AVX-512BW memcmp tail call) — the intervening context was elided from this
// hunk; the `}` above may not close inline_memcmp_embedded_tiny.
  return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
}
+
#endif // defined(LLVM_LIBC_ARCH_X86)
#if defined(LLVM_LIBC_ARCH_AARCH64)
// Size-optimized memcpy: one byte per iteration, loop kept rolled via the
// portable macro instead of the Clang-only pragma.
[[maybe_unused]] static inline void
inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src,
                            size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
  for (size_t offset = 0; offset < count; ++offset)
    builtin::Memcpy<1>::block(dst + offset, src + offset);
}
// NOTE(review): the lines below come from a separate, overlap-aware copy
// routine (memmove-style); its signature was elided from this hunk.
  if ((count == 0) || (dst == src))
    return;
  // dst precedes src: copying forward cannot clobber unread source bytes.
  if (dst < src) {
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
    for (size_t offset = 0; offset < count; ++offset)
      builtin::Memcpy<1>::block(dst + offset, src + offset);
  } else {
    // dst follows src: copy backward so overlapping bytes are read first.
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
    for (ptrdiff_t offset = count - 1; offset >= 0; --offset)
      builtin::Memcpy<1>::block(dst + offset, src + offset);
  }
// Size-optimized memset: writes one byte per iteration; the rolled loop keeps
// code size minimal (same pragma-to-macro swap as the other *_embedded_tiny
// routines in this patch).
[[maybe_unused]] inline static void
inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
  for (size_t offset = 0; offset < count; ++offset)
    generic::Memset<1, 1>::block(dst + offset, value);
}
// Fragment of a fixed-size copy helper (its template header is elided from
// this hunk). The diff replaces a hard compile-time failure with a plain
// byte-copy fallback for compilers lacking __builtin_memcpy_inline.
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
  return __builtin_memcpy_inline(dst, src, SIZE);
#else
-  deferred_static_assert("Missing __builtin_memcpy_inline");
-  (void)dst;
-  (void)src;
+  // The codegen may be suboptimal.
+  for (size_t i = 0; i < Size; ++i)
+    dst[i] = src[i];
// NOTE(review): the builtin branch uses `SIZE` but the added fallback loop
// uses `Size` — one of these is likely a case mismatch against the (elided)
// template parameter. Confirm the spelling in the enclosing template.
#endif
}
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/common.h"
+#include "src/__support/compiler_features.h"
#include "src/__support/endian.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/utils.h"
// NOTE(review): the two closing braces below terminate a declaration whose
// body was elided from this hunk.
}
};
+// GCC can only take literals as __vector_size__ argument so we have to use
+// template specialization.
// Maps a power-of-two byte count (1..64) to the matching
// __attribute__((__vector_size__(N))) byte-vector type. The primary template
// intentionally has no `type` member, so unsupported sizes fail to compile at
// the point of use.
+template <size_t Size> struct VectorValueType {};
+template <> struct VectorValueType<1> {
+  using type = uint8_t __attribute__((__vector_size__(1)));
+};
+template <> struct VectorValueType<2> {
+  using type = uint8_t __attribute__((__vector_size__(2)));
+};
+template <> struct VectorValueType<4> {
+  using type = uint8_t __attribute__((__vector_size__(4)));
+};
+template <> struct VectorValueType<8> {
+  using type = uint8_t __attribute__((__vector_size__(8)));
+};
+template <> struct VectorValueType<16> {
+  using type = uint8_t __attribute__((__vector_size__(16)));
+};
+template <> struct VectorValueType<32> {
+  using type = uint8_t __attribute__((__vector_size__(32)));
+};
+template <> struct VectorValueType<64> {
+  using type = uint8_t __attribute__((__vector_size__(64)));
+};
+
// Implements load, store and splat for vector types.
template <size_t Size> struct VectorType {
// The diff routes the vector type through VectorValueType so GCC (which only
// accepts literal __vector_size__ arguments) can compile it; Clang accepted
// the dependent `Size` directly.
-  using Type = uint8_t __attribute__((__vector_size__(Size)));
+  using Type = typename VectorValueType<Size>::type;
  static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
  static inline void store(Ptr dst, Type value) {
    ::__llvm_libc::store<Type>(dst, value);
// NOTE(review): context elided — the lines below are from a forward
// loop-and-tail helper in a different scope (they reference `count`, `T` and
// `block`, none of which are declared above).
    const size_t tail_offset = count - Size;
    const auto tail_value = T::load(src + tail_offset);
    size_t offset = 0;
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
    do {
      block(dst + offset, src + offset);
      offset += Size;
// NOTE(review): context elided again — below is a backward (head-anchored)
// variant of the same loop pattern; its enclosing function is not visible.
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    const auto head_value = T::load(src);
    ptrdiff_t offset = count - Size;
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
    do {
      block(dst + offset, src + offset);
      offset -= Size;
namespace sse2 {
// Compares 16 bytes; non-zero result iff any byte differs. The diff guards
// the SSE2 intrinsics so the file still compiles when __SSE2__ is off.
static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
+#if defined(__SSE2__)
  using T = char __attribute__((__vector_size__(16)));
  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
  const int mask =
      _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2)));
  return static_cast<uint32_t>(mask);
+#else
// NOTE(review): this fallback unconditionally reports "equal"; presumably the
// dispatch code never selects sse2::bcmp16 without SSE2 — confirm callers.
+  (void)p1;
+  (void)p2;
+  return BcmpReturnType::ZERO();
+#endif // defined(__SSE2__)
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
} // namespace sse2
namespace avx2 {
// Compares 32 bytes; non-zero result iff any byte differs. Guarded so the
// AVX2 intrinsics are only compiled when __AVX2__ is available.
static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
+#if defined(__AVX2__)
  using T = char __attribute__((__vector_size__(32)));
  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
  const int mask =
// NOTE(review): the initializer expression for `mask` (the
// _mm256_movemask_epi8 call) is missing from this hunk — only its explanatory
// comment survives below. As shown, `mask` has no initializer; verify the
// full expression is intact in the actual file.
// _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
// mask.
  return static_cast<uint32_t>(mask);
+#else
// Fallback reports "equal" regardless of input (see bcmp16 note).
+  (void)p1;
+  (void)p2;
+  return BcmpReturnType::ZERO();
+#endif // defined(__AVX2__)
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
} // namespace avx2
namespace avx512bw {
// Compares 64 bytes; non-zero result iff any byte differs. The diff (a) adds
// the __AVX512BW__ guard and (b) bit_casts the generic vector loads to
// __m512i before the intrinsic call — presumably because GCC does not accept
// the implicit vector-type conversion Clang allowed; confirm against GCC.
static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
+#if defined(__AVX512BW__)
  using T = char __attribute__((__vector_size__(64)));
  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
-  const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
+  const uint64_t mask = _mm512_cmpneq_epi8_mask(
+      cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2)));
  const bool mask_is_set = mask != 0;
  return static_cast<uint32_t>(mask_is_set);
+#else
// Fallback reports "equal" regardless of input (see bcmp16 note).
+  (void)p1;
+  (void)p2;
+  return BcmpReturnType::ZERO();
+#endif // defined(__AVX512BW__)
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
} // namespace avx512bw
namespace sse2 {
// Three-way compare of 16 bytes: on a difference, delegates to
// char_diff_no_zero with the differing-byte mask; otherwise zero ("equal").
// Guarded so SSE2 intrinsics only compile under __SSE2__.
static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
+#if defined(__SSE2__)
  using T = char __attribute__((__vector_size__(16)));
  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
  if (int mask =
          _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2))))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
+#else
// Fallback reports "equal" regardless of input (see bcmp16 note).
+  (void)p1;
+  (void)p2;
+  return MemcmpReturnType::ZERO();
+#endif // defined(__SSE2__)
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
} // namespace sse2
namespace avx2 {
// Three-way compare of 32 bytes; same structure as sse2::memcmp16 but with
// AVX2 loads, guarded by __AVX2__.
static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
+#if defined(__AVX2__)
  using T = char __attribute__((__vector_size__(32)));
  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
  if (int mask = _mm256_movemask_epi8(
          cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2))))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
+#else
// Fallback reports "equal" regardless of input (see bcmp16 note).
+  (void)p1;
+  (void)p2;
+  return MemcmpReturnType::ZERO();
+#endif // defined(__AVX2__)
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
} // namespace avx2
namespace avx512bw {
// Three-way compare of 64 bytes. As in bcmp64, the diff adds the
// __AVX512BW__ guard and explicit bit_casts to __m512i for the intrinsic's
// operands (presumably for GCC's stricter vector-type checking — confirm).
static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
+#if defined(__AVX512BW__)
  using T = char __attribute__((__vector_size__(64)));
  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
-  if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)))
+  if (uint64_t mask =
+          _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)),
+                                  cpp::bit_cast<__m512i>(load<T>(p2))))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
+#else
// Fallback reports "equal" regardless of input (see bcmp16 note).
+  (void)p1;
+  (void)p2;
+  return MemcmpReturnType::ZERO();
+#endif // defined(__AVX512BW__)
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
} // namespace avx512bw