From c4e416ce490aebc2aefafe5fec8e596e816ec978 Mon Sep 17 00:00:00 2001 From: "commit-bot@chromium.org" Date: Tue, 20 May 2014 14:54:04 +0000 Subject: [PATCH] Add Memcpy32 bench. This compares 32-bit copies using memcpy, autovectorization, and when SSE2 is available, aligned and unaligned SSE2. Running this on my desktop (Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz), I see all four perform essentially the same, except Clang's autovectorization looks a little better than GCC's. memcpy is calling libc 2.19's __memcpy_sse2_unaligned. BUG=skia: R=reed@google.com, qiankun.miao@intel.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/290533002 git-svn-id: http://skia.googlecode.com/svn/trunk@14799 2bbb7eff-a529-9590-31e7-b0007b416f81 --- bench/MemcpyBench.cpp | 154 ++++++++++++++++++++++++++++++++++++++++++ gyp/bench.gypi | 1 + 2 files changed, 155 insertions(+) create mode 100644 bench/MemcpyBench.cpp diff --git a/bench/MemcpyBench.cpp b/bench/MemcpyBench.cpp new file mode 100644 index 0000000000..452bf6fdc2 --- /dev/null +++ b/bench/MemcpyBench.cpp @@ -0,0 +1,154 @@ +/* + * Copyright 2014 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. 
+ */ + +#include "SkBenchmark.h" +#include "SkRandom.h" +#include "SkTemplates.h" + +template <typename Memcpy32> +class Memcpy32Bench : public SkBenchmark { +public: + explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name) + : fCount(count) + , fMemcpy32(memcpy32) + , fName(SkStringPrintf("%s_%d", name, count)) {} + + virtual const char* onGetName() SK_OVERRIDE { + return fName.c_str(); + } + + virtual bool isSuitableFor(Backend backend) SK_OVERRIDE { + return backend == kNonRendering_Backend; + } + + virtual void onPreDraw() SK_OVERRIDE { + fDst.reset(fCount); + fSrc.reset(fCount); + + SkRandom rand; + for (int i = 0; i < fCount; i++) { + fSrc[i] = rand.nextU(); + } + } + + virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE { + for (int i = 0; i < loops; i++) { + fMemcpy32(fDst, fSrc, fCount); + } + } + +private: + SkAutoTMalloc<uint32_t> fDst, fSrc; + + int fCount; + Memcpy32 fMemcpy32; + const SkString fName; +}; + +template <typename Memcpy32> +static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) { + return new Memcpy32Bench<Memcpy32>(count, memcpy32, name); +} +#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); ) + + +// Let the libc developers do what they think is best. +static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) { + memcpy(dst, src, sizeof(uint32_t) * count); +} +BENCH(memcpy32_memcpy, 10) +BENCH(memcpy32_memcpy, 100) +BENCH(memcpy32_memcpy, 1000) +BENCH(memcpy32_memcpy, 10000) +BENCH(memcpy32_memcpy, 100000) + +// Let the compiler's autovectorizer do what it thinks is best. +static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) { + while (count --> 0) { + *dst++ = *src++; + } +} +BENCH(memcpy32_autovectorize, 10) +BENCH(memcpy32_autovectorize, 100) +BENCH(memcpy32_autovectorize, 1000) +BENCH(memcpy32_autovectorize, 10000) +BENCH(memcpy32_autovectorize, 100000) + +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + +// Align dst to 16 bytes, then use aligned stores. 
src isn't aligned, so use unaligned loads. +static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { + if (count >= 16) { + while (uintptr_t(dst) & 0xF) { + *dst++ = *src++; + count--; + } + + __m128i* dst128 = reinterpret_cast<__m128i*>(dst); + const __m128i* src128 = reinterpret_cast<const __m128i*>(src); + while (count >= 16) { + __m128i a = _mm_loadu_si128(src128++); + __m128i b = _mm_loadu_si128(src128++); + __m128i c = _mm_loadu_si128(src128++); + __m128i d = _mm_loadu_si128(src128++); + + _mm_store_si128(dst128++, a); + _mm_store_si128(dst128++, b); + _mm_store_si128(dst128++, c); + _mm_store_si128(dst128++, d); + + count -= 16; + } + + dst = reinterpret_cast<uint32_t*>(dst128); + src = reinterpret_cast<const uint32_t*>(src128); + } + + while (count --> 0) { + *dst++ = *src++; + } +} +BENCH(memcpy32_sse2_align, 10) +BENCH(memcpy32_sse2_align, 100) +BENCH(memcpy32_sse2_align, 1000) +BENCH(memcpy32_sse2_align, 10000) +BENCH(memcpy32_sse2_align, 100000) + +// Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads for src. 
+static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) { + __m128i* dst128 = reinterpret_cast<__m128i*>(dst); + const __m128i* src128 = reinterpret_cast<const __m128i*>(src); + while (count >= 16) { + __m128i a = _mm_loadu_si128(src128++); + __m128i b = _mm_loadu_si128(src128++); + __m128i c = _mm_loadu_si128(src128++); + __m128i d = _mm_loadu_si128(src128++); + + _mm_storeu_si128(dst128++, a); + _mm_storeu_si128(dst128++, b); + _mm_storeu_si128(dst128++, c); + _mm_storeu_si128(dst128++, d); + + count -= 16; + } + + dst = reinterpret_cast<uint32_t*>(dst128); + src = reinterpret_cast<const uint32_t*>(src128); + while (count --> 0) { + *dst++ = *src++; + } +} +BENCH(memcpy32_sse2_unalign, 10) +BENCH(memcpy32_sse2_unalign, 100) +BENCH(memcpy32_sse2_unalign, 1000) +BENCH(memcpy32_sse2_unalign, 10000) +BENCH(memcpy32_sse2_unalign, 100000) + +#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + +#undef BENCH + diff --git a/gyp/bench.gypi b/gyp/bench.gypi index 1b3bc12741..295be45b1c 100644 --- a/gyp/bench.gypi +++ b/gyp/bench.gypi @@ -53,6 +53,7 @@ '../bench/Matrix44Bench.cpp', '../bench/MatrixBench.cpp', '../bench/MatrixConvolutionBench.cpp', + '../bench/MemcpyBench.cpp', '../bench/MemoryBench.cpp', '../bench/MemsetBench.cpp', '../bench/MergeBench.cpp', -- 2.34.1