+++ /dev/null
-/*
- * Copyright 2011 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include <emmintrin.h>
-#include "SkBlitRect_opts_SSE2.h"
-#include "SkBlitRow.h"
-#include "SkColorPriv.h"
-
-/* Simple blitting of opaque rectangles less than 31 pixels wide:
- * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
- *
- * Fills `height` rows of `width` pixels with `color` using plain scalar
- * stores (no SSE2 intrinsics are used in this function despite its name).
- * After each row, `destination` is advanced by `rowBytes` bytes.
- *
- * destination: first pixel of the first row to fill.
- * width:       pixels per row; SkASSERT'ed to be > 0 and < 31.
- * height:      number of rows; nothing is written when height <= 0.
- * rowBytes:    byte distance from the start of one row to the next.
- * color:       value stored into every pixel; SkASSERT'ed to have
- *              alpha 255.
- */
-static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
- int width, int height,
- size_t rowBytes, uint32_t color) {
- SkASSERT(255 == SkGetPackedA32(color));
- SkASSERT(width > 0);
- SkASSERT(width < 31);
-
- while (--height >= 0) { // one iteration per row
- SkPMColor* dst = destination;
- int count = width;
-
- while (count > 4) { // unrolled: four stores per pass while more than four pixels remain
- *dst++ = color;
- *dst++ = color;
- *dst++ = color;
- *dst++ = color;
- count -= 4;
- }
-
- while (count > 0) { // the last 1..4 pixels of the row
- *dst++ = color;
- --count;
- }
-
- destination = (uint32_t*)((char*)destination + rowBytes); // step to the next row; rowBytes is a byte stride
- }
-}
-
-/*
- * Fast blitting of opaque rectangles at least 31 pixels wide:
- * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
- * A 31 pixel rectangle is guaranteed to have at least one
- * 16-pixel aligned span that can take advantage of mm_store.
- * (Presumably because SkPMColor is 4 bytes wide, so at most 3 pixels
- * are consumed reaching 16-byte alignment -- confirm against the
- * SkPMColor definition.)
- *
- * Each row is filled in three phases:
- *   1. scalar stores until the write pointer is 16-byte aligned;
- *   2. aligned 128-bit stores, in groups of 8 (32 pixels) and then at
- *      most one group of 4 (16 pixels);
- *   3. scalar stores for the pixels that are left.
- * After each row, `destination` is advanced by `rowBytes` bytes.
- *
- * destination: first pixel of the first row to fill.
- * width:       pixels per row; SkASSERT'ed to be >= 31.
- * height:      number of rows; nothing is written when height <= 0.
- * rowBytes:    byte distance from the start of one row to the next.
- * color:       value stored into every pixel; SkASSERT'ed to have
- *              alpha 255.
- */
-static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
- int width, int height,
- size_t rowBytes, uint32_t color) {
- SkASSERT(255 == SkGetPackedA32(color));
- SkASSERT(width >= 31);
-
- __m128i color_wide = _mm_set1_epi32(color); // color replicated into all four 32-bit lanes
- while (--height >= 0) {
- // Prefetching one row ahead to L1 cache can equal hardware
- // performance for large/tall rects, but never *beats*
- // hardware performance.
- SkPMColor* dst = destination;
- int count = width;
-
- while (((size_t)dst) & 0x0F) { // phase 1: scalar stores until the low four address bits are zero
- *dst++ = color;
- --count;
- }
- __m128i *d = reinterpret_cast<__m128i*>(dst); // dst is 16-byte aligned here, as _mm_store_si128 requires
-
- // Googling suggests _mm_stream is only going to beat _mm_store
- // for things that wouldn't fit in L2 cache anyway, typically
- // >500kB, and precisely fill cache lines. For us, with
- // arrays > 100k elements _mm_stream is still 100%+ slower than
- // mm_store.
-
- // Unrolling to count >= 64 is a break-even for most
- // input patterns; we seem to be saturating the bus and having
- // low enough overhead at 32.
-
- while (count >= 32) { // phase 2: eight aligned stores account for 32 pixels per pass
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- count -= 32;
- }
- if (count >= 16) { // at most one half-size group: four aligned stores for 16 pixels
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- _mm_store_si128(d++, color_wide);
- count -= 16;
- }
- dst = reinterpret_cast<uint32_t*>(d); // resume scalar writes where the vector stores stopped
-
- // Unrolling the loop in the Narrow code is a significant performance
- // gain, but unrolling this loop appears to make no difference in
- // benchmarks with either mm_store_si128 or individual sets.
-
- while (count > 0) { // phase 3: scalar tail, fewer than 16 pixels remain
- *dst++ = color;
- --count;
- }
-
- destination = (uint32_t*)((char*)destination + rowBytes); // step to the next row; rowBytes is a byte stride
- }
-}
-/*
- * Entry point for filling a width x height rectangle with `color`.
- *
- * Returns without writing anything when height, width or color is 0.
- * NOTE(review): the 0 == color early-out presumably relies on a zero
- * packed color leaving the destination unchanged -- confirm against
- * SkBlitRow::ColorRect32.
- *
- * Every other call is currently forwarded to SkBlitRow::ColorRect32():
- * colorA is overwritten with 0 right after it is read, so the
- * `255 == colorA` test below is never true and the
- * BlitRect32_OpaqueNarrow_SSE2 / BlitRect32_OpaqueWide_SSE2 branch is
- * not taken, even for fully opaque colors.
- *
- * destination: first pixel of the first row.
- * width:       rectangle width in pixels.
- * height:      rectangle height in rows.
- * rowBytes:    forwarded unchanged; the static helpers in this file
- *              treat it as the byte stride between rows.
- * color:       packed color; its alpha is read with SkGetPackedA32().
- */
-void ColorRect32_SSE2(SkPMColor* destination,
- int width, int height,
- size_t rowBytes, uint32_t color) {
- if (0 == height || 0 == width || 0 == color) {
- return;
- }
- unsigned colorA = SkGetPackedA32(color);
- colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
- if (255 == colorA) { // never true while colorA is forced to 0 on the line above
- if (width < 31) { // 31 is the width at which the Wide helper's precondition (width >= 31) starts to hold
- BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
- rowBytes, color);
- } else {
- BlitRect32_OpaqueWide_SSE2(destination, width, height,
- rowBytes, color);
- }
- } else { // the only path that runs today
- SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
- }
-}