From ec7c1affcc66c12af1fc29fd000f9885a5d48320 Mon Sep 17 00:00:00 2001
From: M Joonas Pihlaja
Date: Mon, 14 Sep 2009 22:52:29 +0300
Subject: [PATCH] Don't prefetch from NULL in the SSE2 fast paths.

On an Athlon64 box, prefetching from NULL slows down the rgba OVER
rgba fast path for predominantly solid sources by up to 3.5x in the
one-rounded-rectangle test case when run using a tiling polygon
renderer.  This patch conditionalises the prefetches of the mask
everywhere the mask pointer may be NULL in a fast path.
---
 pixman/pixman-sse2.c | 85 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 49 insertions(+), 36 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index fc78765..56fda4d 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -368,6 +368,22 @@ cache_prefetch_next (__m128i* addr)
     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
 }
 
+/* prefetching NULL is very slow on some systems. don't do that. */
+
+static force_inline void
+maybe_prefetch (__m128i* addr)
+{
+    if (addr)
+        cache_prefetch (addr);
+}
+
+static force_inline void
+maybe_prefetch_next (__m128i* addr)
+{
+    if (addr)
+        cache_prefetch_next (addr);
+}
+
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -629,8 +645,7 @@ core_combine_over_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    if (pm)
-        cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
@@ -648,16 +663,14 @@ core_combine_over_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    if (pm)
-        cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        if (pm)
-            cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         /* I'm loading unaligned because I'm not sure about
          * the address alignment.
@@ -723,7 +736,7 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     /* Align dst on a 16-byte boundary */
     while (w &&
@@ -742,14 +755,14 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         /* I'm loading unaligned because I'm not sure
          * about the address alignment.
@@ -825,7 +838,7 @@ core_combine_in_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -842,14 +855,14 @@ core_combine_in_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
@@ -899,7 +912,7 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -916,14 +929,14 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
@@ -968,7 +981,7 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -989,7 +1002,7 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
@@ -999,7 +1012,7 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1050,7 +1063,7 @@ core_combine_out_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1070,7 +1083,7 @@ core_combine_out_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
@@ -1080,7 +1093,7 @@ core_combine_out_u_sse2 (uint32_t* pd,
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1150,7 +1163,7 @@ core_combine_atop_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1167,14 +1180,14 @@ core_combine_atop_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1247,7 +1260,7 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1264,14 +1277,14 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1348,7 +1361,7 @@ core_combine_xor_u_sse2 (uint32_t* dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1365,14 +1378,14 @@ core_combine_xor_u_sse2 (uint32_t* dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
         xmm_dst = load_128_aligned ((__m128i*) pd);
@@ -1433,7 +1446,7 @@ core_combine_add_u_sse2 (uint32_t* dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && (unsigned long)pd & 15)
     {
@@ -1451,7 +1464,7 @@ core_combine_add_u_sse2 (uint32_t* dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
@@ -1460,7 +1473,7 @@ core_combine_add_u_sse2 (uint32_t* dst,
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         s = combine4 ((__m128i*)ps, (__m128i*)pm);
 
@@ -1519,7 +1532,7 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && (unsigned long)pd & 15)
     {
@@ -1536,14 +1549,14 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
         /* fill cache line with next memory */
         cache_prefetch_next ((__m128i*)ps);
         cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+        maybe_prefetch_next ((__m128i*)pm);
 
         xmm_dst = load_128_aligned ((__m128i*)pd);
         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
-- 
2.7.4
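A note for readers applying the idea outside pixman: the guard is easy to
demonstrate standalone.  The sketch below is illustrative only, not pixman
code; maybe_prefetch mirrors the helper the patch adds, with pixman's
force_inline and cache_prefetch wrappers replaced by the raw _mm_prefetch
intrinsic, and the loop stands in for a combiner fast path whose mask
pointer pm is NULL whenever the path runs without a mask (the file name
sketch.c and the toy loop body are invented for the example).  Build with
something like "gcc -O2 -msse2 sketch.c".

#include <xmmintrin.h>          /* _mm_prefetch, _MM_HINT_T0 */
#include <emmintrin.h>          /* __m128i */
#include <stdint.h>

static inline void
maybe_prefetch (__m128i *addr)
{
    /* A prefetch hint never faults, but on some CPUs (the commit
     * message cites an Athlon64) a hint to an unmapped address such
     * as NULL can cost a page walk per call, so skip it. */
    if (addr)
        _mm_prefetch ((char const *) addr, _MM_HINT_T0);
}

int
main (void)
{
    uint32_t dst[64] = { 0 };
    uint32_t *pm = 0;           /* no mask: the common fast-path case */
    int w;

    for (w = 0; w < 64; w += 4)
    {
        /* Guarded hint: with pm == NULL this is a predictable branch
         * rather than a prefetch of address zero. */
        maybe_prefetch (pm ? (__m128i *) (pm + w) : 0);

        /* ... a 4-pixel SSE2 combine step on dst[w..w+3] would go
         * here; a trivial update stands in for it ... */
        dst[w] += 1;
    }
    return (int) dst[0];
}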