over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
+/* Prefetch the cache line containing addr into all cache levels
+ * (_MM_HINT_T0). Purely a performance hint; has no architectural
+ * effect on program state.
+ */
+static force_inline void
+cache_prefetch (__m128i* addr)
+{
+ _mm_prefetch ((void const*)addr, _MM_HINT_T0);
+}
+
+/* Prefetch the cache line 64 bytes past addr (addr + 4 in __m128i
+ * units, i.e. 4 * 16 bytes), so the line needed by the next loop
+ * iteration is already in flight.
+ */
+static force_inline void
+cache_prefetch_next (__m128i* addr)
+{
+ _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
+}
+
+/* prefetching NULL is very slow on some systems. don't do that. */
+
+/* NULL-safe variant of cache_prefetch: silently skips a NULL addr.
+ * Used for the optional mask pointer (pm), which may legitimately be
+ * NULL in the combiner loops below.
+ */
+static force_inline void
+maybe_prefetch (__m128i* addr)
+{
+ if (addr)
+ cache_prefetch (addr);
+}
+
+/* NULL-safe variant of cache_prefetch_next: silently skips a NULL
+ * addr. Used for the optional mask pointer (pm) inside the 4-pixel
+ * main loops.
+ */
+static force_inline void
+maybe_prefetch_next (__m128i* addr)
+{
+ if (addr)
+ cache_prefetch_next (addr);
+}
+
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
/* Align dst on a 16-byte boundary */
while (w && ((unsigned long)pd & 15))
{
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
/* I'm loading unaligned because I'm not sure about
* the address alignment.
*/
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
/* Align dst on a 16-byte boundary */
while (w &&
((unsigned long)pd & 15))
pm++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
pm++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
pm++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
const uint32_t* pm,
int w)
{
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
const uint32_t* pm,
int w)
{
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
pm++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
pm++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
pm++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
pm++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
xmm_dst = load_128_aligned ((__m128i*) pd);
const uint32_t* ps = src;
const uint32_t* pm = mask;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
__m128i s;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
s = combine4 ((__m128i*)ps, (__m128i*)pm);
save_128_aligned (
uint32_t pack_cmp;
__m128i xmm_src, xmm_dst;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
pm++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ maybe_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ maybe_prefetch_next ((__m128i*)pm);
+
xmm_dst = load_128_aligned ((__m128i*)pd);
xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
__m128i xmm_mask_lo, xmm_mask_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
{
dst = dst_line;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
dst_line += dst_stride;
w = width;
w--;
}
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
+
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
{
dst = dst_line;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
dst_line += dst_stride;
w = width;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 8)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
+
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_565_128_4x128 (xmm_dst,
dst_line += dst_stride;
mask_line += mask_stride;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
m = *pm++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
dst_line += dst_stride;
mask_line += mask_stride;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w && (unsigned long)pd & 15)
{
m = *pm++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
src_line += src_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)src);
+
while (w && (unsigned long)dst & 15)
{
uint32_t s = *src++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)src);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
+ cache_prefetch_next ((__m128i*)src);
+
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
src_line += src_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+
while (w && (unsigned long)dst & 15)
{
*dst++ = *src++ | 0xff000000;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+
while (w >= 16)
{
__m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+
xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
src_line += src_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)src);
+
while (w && (unsigned long)dst & 15)
{
uint32_t s = (*src++) | 0xff000000;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)src);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
+ cache_prefetch_next ((__m128i*)src);
+
xmm_src = _mm_or_si128 (
load_128_unaligned ((__m128i*)src), mask_ff000000);
xmm_dst = load_128_aligned ((__m128i*)dst);
dst = dst_line;
src = src_line;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
dst_line += dst_stride;
src_line += src_stride;
w = width;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
/* It's a 8 pixel loop */
while (w >= 8)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
mask_line += mask_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
dst++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
return FALSE;
}
+ cache_prefetch ((__m128i*)byte_line);
xmm_def = create_mask_2x32_128 (data, data);
while (height--)
byte_line += stride;
w = byte_width;
+ cache_prefetch_next ((__m128i*)d);
+
while (w >= 1 && ((unsigned long)d & 1))
{
*(uint8_t *)d = data;
d += 4;
}
+ cache_prefetch_next ((__m128i*)d);
+
while (w >= 128)
{
+ cache_prefetch (((__m128i*)d) + 12);
+
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
if (w >= 64)
{
+ cache_prefetch (((__m128i*)d) + 8);
+
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
w -= 64;
}
+ cache_prefetch_next ((__m128i*)d);
+
if (w >= 32)
{
save_128_aligned ((__m128i*)(d), xmm_def);
w -= 16;
}
+ cache_prefetch_next ((__m128i*)d);
+
while (w >= 4)
{
*(uint32_t *)d = data;
mask_line += mask_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
dst++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
mask_line += mask_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w && (unsigned long)dst & 15)
{
m = *mask++;
dst++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 8)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
xmm_dst = load_128_aligned ((__m128i*) dst);
unpack_565_128_4x128 (xmm_dst,
&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
src_line += src_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
while (w && (unsigned long)dst & 15)
{
s = *src++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 8)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+
/* First round */
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
src_line += src_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
while (w && (unsigned long)dst & 15)
{
s = *src++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+
xmm_src_hi = load_128_unaligned ((__m128i*)src);
opaque = is_opaque (xmm_src_hi);
mask_line += mask_stride;
dst_line += dst_stride;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w && ((unsigned long)dst & 15))
{
m = *(uint32_t *) mask;
mask++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 8)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
/* First round */
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
mask_line += mask_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 16)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
dst_line += dst_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
while (w && ((unsigned long)dst & 15))
{
d = (uint32_t) *dst;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 16)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
+
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
src_line += src_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
while (w && ((unsigned long)dst & 15))
{
s = (uint32_t) *src++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 16)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
mask_line += mask_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 16)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
dst_line += dst_stride;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
while (w && ((unsigned long)dst & 15))
{
*dst = (uint8_t)_mm_cvtsi64_si32 (
dst++;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 16)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
+
save_128_aligned (
(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
dst = dst_line;
src = src_line;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
dst_line += dst_stride;
src_line += src_stride;
w = width;
return FALSE;
}
+ cache_prefetch ((__m128i*)src_bytes);
+ cache_prefetch ((__m128i*)dst_bytes);
+
while (height--)
{
int w;
dst_bytes += dst_stride;
w = byte_width;
+ cache_prefetch_next ((__m128i*)s);
+ cache_prefetch_next ((__m128i*)d);
+
while (w >= 2 && ((unsigned long)d & 3))
{
*(uint16_t *)d = *(uint16_t *)s;
d += 4;
}
+ cache_prefetch_next ((__m128i*)s);
+ cache_prefetch_next ((__m128i*)d);
+
while (w >= 64)
{
__m128i xmm0, xmm1, xmm2, xmm3;
+ /* 128 bytes ahead */
+ cache_prefetch (((__m128i*)s) + 8);
+ cache_prefetch (((__m128i*)d) + 8);
+
xmm0 = load_128_unaligned ((__m128i*)(s));
xmm1 = load_128_unaligned ((__m128i*)(s + 16));
xmm2 = load_128_unaligned ((__m128i*)(s + 32));
w -= 64;
}
+ cache_prefetch_next ((__m128i*)s);
+ cache_prefetch_next ((__m128i*)d);
+
while (w >= 16)
{
save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
s += 16;
}
+ cache_prefetch_next ((__m128i*)s);
+ cache_prefetch_next ((__m128i*)d);
+
while (w >= 4)
{
*(uint32_t *)d = *(uint32_t *)s;
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)mask);
+
while (w && (unsigned long)dst & 15)
{
s = 0xff000000 | *src++;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)mask);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+ cache_prefetch_next ((__m128i*)mask);
+
m = *(uint32_t*) mask;
xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i *)src);
+ cache_prefetch ((__m128i *)dst);
+ cache_prefetch ((__m128i *)mask);
+
while (w && (unsigned long)dst & 15)
{
uint32_t sa;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i *)src);
+ cache_prefetch ((__m128i *)dst);
+ cache_prefetch ((__m128i *)mask);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i *)src);
+ cache_prefetch_next ((__m128i *)dst);
+ cache_prefetch_next ((__m128i *)mask);
+
m = *(uint32_t *) mask;
if (m)
{
dst = dst_line;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
dst_line += dst_stride;
w = width;
dst++;
}
+ cache_prefetch ((__m128i*)dst);
+
while (w >= 4)
{
__m128i tmp_lo, tmp_hi;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)(dst + 4));
+
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
w = width;
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i *)src);
+ cache_prefetch ((__m128i *)dst);
+ cache_prefetch ((__m128i *)mask);
+
while (w && (unsigned long)dst & 15)
{
uint32_t sa;
w--;
}
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i *)src);
+ cache_prefetch ((__m128i *)dst);
+ cache_prefetch ((__m128i *)mask);
+
while (w >= 4)
{
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i *)src);
+ cache_prefetch_next ((__m128i *)dst);
+ cache_prefetch_next ((__m128i *)mask);
+
xmm_mask = load_128_unaligned ((__m128i*)mask);
if (!is_transparent (xmm_mask))