_mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
}
+/* Prefetching NULL is very slow on some systems, so don't do that. */
+
+/* Call cache_prefetch () on addr, unless addr is NULL, in which case
+ * no prefetch is issued at all.
+ */
+static force_inline void
+maybe_prefetch (__m128i* addr)
+{
+    /* Nothing to prefetch for a NULL pointer. */
+    if (!addr)
+        return;
+
+    cache_prefetch (addr);
+}
+
+/* Call cache_prefetch_next () on addr, unless addr is NULL, in which
+ * case no prefetch is issued at all.
+ */
+static force_inline void
+maybe_prefetch_next (__m128i* addr)
+{
+    /* Nothing to prefetch for a NULL pointer. */
+    if (!addr)
+        return;
+
+    cache_prefetch_next (addr);
+}
+
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- if (pm)
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
/* Align dst on a 16-byte boundary */
while (w && ((unsigned long)pd & 15))
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- if (pm)
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- if (pm)
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
/* I'm loading unaligned because I'm not sure about
* the address alignment.
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
/* Align dst on a 16-byte boundary */
while (w &&
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
/* I'm loading unaligned because I'm not sure
* about the address alignment.
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
xmm_dst = load_128_aligned ((__m128i*) pd);
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
s = combine4 ((__m128i*)ps, (__m128i*)pm);
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
/* call prefetch hint to optimize cache load*/
cache_prefetch ((__m128i*)ps);
cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
+ maybe_prefetch ((__m128i*)pm);
while (w >= 4)
{
/* fill cache line with next memory */
cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
+ maybe_prefetch_next ((__m128i*)pm);
xmm_dst = load_128_aligned ((__m128i*)pd);
xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);