#if CV_SSE2
void process(__m128i v_r, __m128i v_g, __m128i v_b,
- __m128 v_coeffs,
+ const __m128& v_coeffs_,
float * buf) const
{
__m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
__m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
__m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
+ __m128 v_coeffs = v_coeffs_;
+
v_r0 = _mm_mul_ps(v_r0, v_coeffs);
v_g1 = _mm_mul_ps(v_g1, v_coeffs);
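
The hunks below repeat this same change across several process() overloads. The patch does not state its motivation, but replacing a by-value __m128 parameter with a const __m128& plus a local copy is the standard workaround for MSVC error C2719 on 32-bit x86 builds, where a formal parameter that requires 16-byte alignment cannot be passed by value. A minimal sketch of the pattern, compilable on its own (scale, main, and the buffer contents are illustrative, not part of the patch):

#include <xmmintrin.h>  // SSE: __m128, _mm_loadu_ps, _mm_mul_ps, _mm_set1_ps

// Before: 32-bit MSVC rejects "void scale(__m128 v_coeffs, float* buf)"
// with error C2719, because __m128 needs 16-byte alignment and the x86
// calling convention cannot guarantee that for by-value parameters.
// After: take the vector by const reference and copy it into a local;
// the copy typically lives in a register, so there is no extra cost.
void scale(const __m128& v_coeffs_, float* buf)
{
    __m128 v_coeffs = v_coeffs_;      // local working copy, as in the patch
    __m128 v = _mm_loadu_ps(buf);     // load 4 floats (unaligned)
    v = _mm_mul_ps(v, v_coeffs);      // element-wise multiply
    _mm_storeu_ps(buf, v);            // store the result back
}

int main()
{
    float buf[4] = { 1.f, 2.f, 3.f, 4.f };
    scale(_mm_set1_ps(0.5f), buf);    // an __m128 rvalue binds to const&
    return buf[0] == 0.5f ? 0 : 1;    // expect 0: every element halved
}
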
#if CV_SSE2
// 16s x 8
void process(__m128i v_r, __m128i v_g, __m128i v_b,
- __m128 v_coeffs, __m128 v_res,
+ const __m128& v_coeffs_, const __m128& v_res_,
float * buf) const
{
__m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
__m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
__m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
+ __m128 v_coeffs = v_coeffs_;
+ __m128 v_res = v_res_;
+
v_r0 = _mm_sub_ps(_mm_mul_ps(v_r0, v_coeffs), v_res);
v_g1 = _mm_sub_ps(_mm_mul_ps(v_g1, v_coeffs), v_res);
#if CV_SSE2
// 16s x 8
void process(__m128i v_l, __m128i v_u, __m128i v_v,
- __m128 v_coeffs, __m128 v_res,
+ const __m128& v_coeffs_, const __m128& v_res_,
float * buf) const
{
__m128 v_l0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_l, v_zero));
__m128 v_u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_u, v_zero));
__m128 v_v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_v, v_zero));
+ __m128 v_coeffs = v_coeffs_;
+ __m128 v_res = v_res_;
+
v_l0 = _mm_mul_ps(v_l0, v_coeffs);
v_u1 = _mm_mul_ps(v_u1, v_coeffs);
v_l0 = _mm_sub_ps(v_l0, v_res);
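
The other overloads in this patch (the variants that also take v_res, and the Luv one taking v_l/v_u/v_v) follow the same mechanical recipe: every by-value __m128 parameter gains a trailing underscore, becomes a const __m128&, and is copied back into a local of the original name at the top of the body, so the arithmetic below it is untouched. Call sites should not need any changes, since an __m128 argument binds directly to a const __m128& parameter.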