From db5803959182f891259b457b5bac2ed54785b709 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Samuel=20R=C3=B8dal?=
Date: Fri, 17 Sep 2010 21:53:43 +0200
Subject: [PATCH] Optimized radial gradient fetch using SSE 2.

On an i7 this improves performance by 22 % in parcycle, 107 % in
default svgviewer example, and 283 % in a synthetic radial gradient
benchmark.

Reviewed-by: Andreas Kling
(cherry picked from commit 26bd3dccdee8c6a8f1cf9d254a2a6be7d403aa8d)
---
 src/gui/painting/qdrawhelper.cpp      |  5 +++
 src/gui/painting/qdrawhelper_p.h      |  9 ++++
 src/gui/painting/qdrawhelper_sse2.cpp | 98 +++++++++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+)

diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index bb46df5..ce96bac 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -7728,6 +7728,11 @@ void qInitDrawhelperAsm()
         qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_sse2;
         qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse2;
         qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse2;
+
+        extern const uint * QT_FASTCALL qt_fetch_radial_gradient_sse2(uint *buffer, const Operator *op, const QSpanData *data,
+                                                                      int y, int x, int length);
+
+        qt_fetch_radial_gradient = qt_fetch_radial_gradient_sse2;
     }
 
 #ifdef QT_HAVE_SSSE3
diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h
index 6377fe1..db5ec70 100644
--- a/src/gui/painting/qdrawhelper_p.h
+++ b/src/gui/painting/qdrawhelper_p.h
@@ -268,8 +268,10 @@ struct QGradientData
 
 #ifdef Q_WS_QWS
 #define GRADIENT_STOPTABLE_SIZE 256
+#define GRADIENT_STOPTABLE_SIZE_SHIFT 8
 #else
 #define GRADIENT_STOPTABLE_SIZE 1024
+#define GRADIENT_STOPTABLE_SIZE_SHIFT 10
 #endif
 
     uint* colorTable; //[GRADIENT_STOPTABLE_SIZE];
@@ -389,6 +391,13 @@ template
 const uint * QT_FASTCALL qt_fetch_radial_gradient_template(uint *buffer, const Operator *op, const QSpanData *data,
                                                            int y, int x, int length)
 {
+    // avoid division by zero
+    if (qFuzzyIsNull(op->radial.a)) {
+        extern void (*qt_memfill32)(quint32 *dest, quint32 value, int count);
+        qt_memfill32(buffer, data->gradient.colorTable[0], length);
+        return buffer;
+    }
+
     const uint *b = buffer;
     qreal rx = data->m21 * (y + qreal(0.5))
                + data->dx + data->m11 * (x + qreal(0.5));
diff --git a/src/gui/painting/qdrawhelper_sse2.cpp b/src/gui/painting/qdrawhelper_sse2.cpp
index aad6bc9..eef4cda 100644
--- a/src/gui/painting/qdrawhelper_sse2.cpp
+++ b/src/gui/painting/qdrawhelper_sse2.cpp
@@ -491,6 +491,104 @@ void qt_bitmapblit16_sse2(QRasterBuffer *rasterBuffer, int x, int y,
     }
 }
 
+extern const uint * QT_FASTCALL qt_fetch_radial_gradient_plain(uint *buffer, const Operator *op, const QSpanData *data,
+                                                              int y, int x, int length);
+// Fetch class passed as the template argument to qt_fetch_radial_gradient_template().
+// fetch() fills [buffer, end) from data->gradient.colorTable, four pixels per iteration.
+// The value under the square root is forward-differenced from det, delta_det and
+// delta_delta_det; b (stepped by delta_b) is subtracted from the root.
+// Every iteration stores four pixels, so up to three entries past end are written
+// when the span length is not a multiple of four.
+class RadialFetchSse2
+{
+public:
+    static inline void fetch(uint *buffer, uint *end, const QSpanData *data, qreal det, qreal delta_det,
+                             qreal delta_delta_det, qreal b, qreal delta_b)
+    {
+        // Four-lane views of an SSE register, used for scalar setup and table lookups.
+        union Vect_buffer_f { __m128 v; float f[4]; };
+        union Vect_buffer_i { __m128i v; int i[4]; };
+
+        Vect_buffer_f det_vec;
+        Vect_buffer_f delta_det4_vec;
+        Vect_buffer_f b_vec;
+
+        // Lane i starts at pixel i; afterwards every lane advances four pixels per iteration.
+        for (int i = 0; i < 4; ++i) {
+            det_vec.f[i] = det;
+            delta_det4_vec.f[i] = 4 * delta_det;
+            b_vec.f[i] = b;
+
+            det += delta_det;
+            delta_det += delta_delta_det;
+            b += delta_b;
+        }
+
+        // Four-pixel step: det += 4 * delta_det + 6 * delta_delta_det, and the 4 * delta_det term grows by 16 * delta_delta_det.
+        const __m128 v_delta_delta_det16 = _mm_set1_ps(16 * delta_delta_det);
+        const __m128 v_delta_delta_det6 = _mm_set1_ps(6 * delta_delta_det);
+        const __m128 v_delta_b4 = _mm_set1_ps(4 * delta_b);
+
+        const __m128 v_min = _mm_set1_ps(0.0f);
+        // NOTE(review): after truncation this caps the pad index at GRADIENT_STOPTABLE_SIZE - 2; confirm intended.
+        const __m128 v_max = _mm_set1_ps(GRADIENT_STOPTABLE_SIZE-1.5f);
+        const __m128 v_half = _mm_set1_ps(0.5f);
+
+        const __m128 v_table_size_minus_one = _mm_set1_ps(float(GRADIENT_STOPTABLE_SIZE-1));
+
+        // Bits at and above the table size (repeat) or twice the table size (reflect); cleared with andnot.
+        const __m128i v_repeat_mask = _mm_set1_epi32(uint(0xffffff) << GRADIENT_STOPTABLE_SIZE_SHIFT);
+        const __m128i v_reflect_mask = _mm_set1_epi32(uint(0xffffff) << (GRADIENT_STOPTABLE_SIZE_SHIFT+1));
+
+        const __m128i v_reflect_limit = _mm_set1_epi32(2 * GRADIENT_STOPTABLE_SIZE - 1);
+
+// The loop is assembled from macros so that each spread mode gets its own loop, chosen once per call.
+#define FETCH_RADIAL_LOOP_PROLOGUE \
+        while (buffer < end) { \
+            const __m128 v_index_local = _mm_sub_ps(_mm_sqrt_ps(_mm_max_ps(v_min, det_vec.v)), b_vec.v); \
+            const __m128 v_index = _mm_add_ps(_mm_mul_ps(v_index_local, v_table_size_minus_one), v_half); \
+            Vect_buffer_i index_vec;
+#define FETCH_RADIAL_LOOP_CLAMP_REPEAT \
+            index_vec.v = _mm_andnot_si128(v_repeat_mask, _mm_cvttps_epi32(v_index));
+// Reflect: a 16-bit min suffices because both operands stay below 2 * GRADIENT_STOPTABLE_SIZE, which fits a signed 16-bit lane.
+#define FETCH_RADIAL_LOOP_CLAMP_REFLECT \
+            const __m128i v_index_i = _mm_andnot_si128(v_reflect_mask, _mm_cvttps_epi32(v_index)); \
+            const __m128i v_index_i_inv = _mm_sub_epi32(v_reflect_limit, v_index_i); \
+            index_vec.v = _mm_min_epi16(v_index_i, v_index_i_inv);
+#define FETCH_RADIAL_LOOP_CLAMP_PAD \
+            index_vec.v = _mm_cvttps_epi32(_mm_min_ps(v_max, _mm_max_ps(v_min, v_index)));
+#define FETCH_RADIAL_LOOP_EPILOGUE \
+            det_vec.v = _mm_add_ps(_mm_add_ps(det_vec.v, delta_det4_vec.v), v_delta_delta_det6); \
+            delta_det4_vec.v = _mm_add_ps(delta_det4_vec.v, v_delta_delta_det16); \
+            b_vec.v = _mm_add_ps(b_vec.v, v_delta_b4); \
+            for (int i = 0; i < 4; ++i) \
+                *buffer++ = data->gradient.colorTable[index_vec.i[i]]; \
+        }
+
+        if (data->gradient.spread == QGradient::RepeatSpread) {
+            FETCH_RADIAL_LOOP_PROLOGUE
+            FETCH_RADIAL_LOOP_CLAMP_REPEAT
+            FETCH_RADIAL_LOOP_EPILOGUE
+        } else if (data->gradient.spread == QGradient::ReflectSpread) {
+            FETCH_RADIAL_LOOP_PROLOGUE
+            FETCH_RADIAL_LOOP_CLAMP_REFLECT
+            FETCH_RADIAL_LOOP_EPILOGUE
+        } else {
+            FETCH_RADIAL_LOOP_PROLOGUE
+            FETCH_RADIAL_LOOP_CLAMP_PAD
+            FETCH_RADIAL_LOOP_EPILOGUE
+        }
+    }
+};
+
+// SSE2 entry point: instantiates the shared radial gradient template with RadialFetchSse2.
+const uint * QT_FASTCALL qt_fetch_radial_gradient_sse2(uint *buffer, const Operator *op, const QSpanData *data,
+                                                       int y, int x, int length)
+{
+    return qt_fetch_radial_gradient_template<RadialFetchSse2>(buffer, op, data, y, x, length);
+}
+
+
 QT_END_NAMESPACE
 
 #endif // QT_HAVE_SSE2
-- 
2.7.4