1.9571441241754002690183222516269 * EXPPOLY_32F_A0,
1.9784560263879509682582499181312 * EXPPOLY_32F_A0,
};
-
+
+
static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);
static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000
static CvStatus CV_STDCALL Exp_32f( const float *_x, float *y, int n )
{
- static const double
- EXPPOLY_32F_A4 = 1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0,
- EXPPOLY_32F_A3 = .6931471805521448196800669615864773144641 / EXPPOLY_32F_A0,
- EXPPOLY_32F_A2 = .2402265109513301490103372422686535526573 / EXPPOLY_32F_A0,
- EXPPOLY_32F_A1 = .5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0;
-
- #undef EXPPOLY
- #define EXPPOLY(x) \
- (((((x) + EXPPOLY_32F_A1)*(x) + EXPPOLY_32F_A2)*(x) + EXPPOLY_32F_A3)*(x) + EXPPOLY_32F_A4)
-
+ static const float
+ A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
+ A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
+ A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
+ A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);
+
+#undef EXPPOLY
+#define EXPPOLY(x) \
+ (((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)
+
int i = 0;
- DBLINT buf[4];
const Cv32suf* x = (const Cv32suf*)_x;
+ Cv32suf buf[4];
- buf[0].i.lo = buf[1].i.lo = buf[2].i.lo = buf[3].i.lo = 0;
+#if CV_SSE2
+    if( n >= 8 && checkHardwareSupport(CV_CPU_SSE2) )
+ {
+ static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
+ static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale);
+ static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
+ static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
+
+ static const __m128 mA1 = _mm_set1_ps(A1);
+ static const __m128 mA2 = _mm_set1_ps(A2);
+ static const __m128 mA3 = _mm_set1_ps(A3);
+ static const __m128 mA4 = _mm_set1_ps(A4);
+ bool y_aligned = (size_t)(void*)y % 16 == 0;
+
+ ushort CV_DECL_ALIGNED(16) tab_idx[8];
+
+ for( ; i <= n - 8; i += 8 )
+ {
+ __m128 xf0, xf1;
+ xf0 = _mm_loadu_ps(&x[i].f);
+ xf1 = _mm_loadu_ps(&x[i+4].f);
+ __m128i xi0, xi1, xi2, xi3;
+
+ xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4);
+ xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4);
+
+ __m128d xd0 = _mm_cvtps_pd(xf0);
+ __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0));
+ __m128d xd1 = _mm_cvtps_pd(xf1);
+ __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1));
+
+ xd0 = _mm_mul_pd(xd0, prescale2);
+ xd2 = _mm_mul_pd(xd2, prescale2);
+ xd1 = _mm_mul_pd(xd1, prescale2);
+ xd3 = _mm_mul_pd(xd3, prescale2);
+
+ xi0 = _mm_cvtpd_epi32(xd0);
+ xi2 = _mm_cvtpd_epi32(xd2);
+
+ xi1 = _mm_cvtpd_epi32(xd1);
+ xi3 = _mm_cvtpd_epi32(xd3);
+
+ xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0));
+ xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2));
+ xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1));
+ xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3));
+
+ xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2));
+ xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3));
+
+ xf0 = _mm_mul_ps(xf0, postscale4);
+ xf1 = _mm_mul_ps(xf1, postscale4);
+
+ xi0 = _mm_unpacklo_epi64(xi0, xi2);
+ xi1 = _mm_unpacklo_epi64(xi1, xi3);
+ xi0 = _mm_packs_epi32(xi0, xi1);
+
+ _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
+
+ xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
+ xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
+ xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
+ xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
+ xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
+
+ __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
+ __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
+ __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5]));
+ __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7]));
+
+ __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
+ __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3));
+ yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23)));
+ yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23)));
+
+ __m128 zf0 = _mm_add_ps(xf0, mA1);
+ __m128 zf1 = _mm_add_ps(xf1, mA1);
+
+ zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2);
+ zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2);
+
+ zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3);
+ zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3);
+
+ zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4);
+ zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4);
+
+ zf0 = _mm_mul_ps(zf0, yf0);
+ zf1 = _mm_mul_ps(zf1, yf1);
+
+ if( y_aligned )
+ {
+ _mm_store_ps(y + i, zf0);
+ _mm_store_ps(y + i + 4, zf1);
+ }
+ else
+ {
+ _mm_storeu_ps(y + i, zf0);
+ _mm_storeu_ps(y + i + 4, zf1);
+ }
+ }
+ }
+ else
+#endif
for( ; i <= n - 4; i += 4 )
{
double x0 = x[i].f * exp_prescale;
double x2 = x[i + 2].f * exp_prescale;
double x3 = x[i + 3].f * exp_prescale;
int val0, val1, val2, val3, t;
-
+
if( ((x[i].i >> 23) & 255) > 127 + 10 )
x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
-
+
if( ((x[i+1].i >> 23) & 255) > 127 + 10 )
x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val;
-
+
if( ((x[i+2].i >> 23) & 255) > 127 + 10 )
x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val;
-
+
if( ((x[i+3].i >> 23) & 255) > 127 + 10 )
x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val;
-
+
val0 = cvRound(x0);
val1 = cvRound(x1);
val2 = cvRound(x2);
val3 = cvRound(x3);
-
+
x0 = (x0 - val0)*exp_postscale;
x1 = (x1 - val1)*exp_postscale;
x2 = (x2 - val2)*exp_postscale;
x3 = (x3 - val3)*exp_postscale;
-
- t = (val0 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
- buf[0].i.hi = t << 20;
-
- t = (val1 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
- buf[1].i.hi = t << 20;
-
- t = (val2 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
- buf[2].i.hi = t << 20;
-
- t = (val3 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
- buf[3].i.hi = t << 20;
-
- x0 = buf[0].d * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
- x1 = buf[1].d * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
-
- y[i] = (float)x0;
- y[i + 1] = (float)x1;
-
- x2 = buf[2].d * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
- x3 = buf[3].d * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
-
- y[i + 2] = (float)x2;
- y[i + 3] = (float)x3;
+
+ t = (val0 >> EXPTAB_SCALE) + 127;
+ t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+ buf[0].i = t << 23;
+
+ t = (val1 >> EXPTAB_SCALE) + 127;
+ t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+ buf[1].i = t << 23;
+
+ t = (val2 >> EXPTAB_SCALE) + 127;
+ t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+ buf[2].i = t << 23;
+
+ t = (val3 >> EXPTAB_SCALE) + 127;
+ t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+ buf[3].i = t << 23;
+
+ x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+ x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
+
+ y[i] = x0;
+ y[i + 1] = x1;
+
+ x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
+ x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
+
+ y[i + 2] = x2;
+ y[i + 3] = x3;
}
-
+
for( ; i < n; i++ )
{
double x0 = x[i].f * exp_prescale;
int val0, t;
-
+
if( ((x[i].i >> 23) & 255) > 127 + 10 )
x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
-
+
val0 = cvRound(x0);
- t = (val0 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-
- buf[0].i.hi = t << 20;
+ t = (val0 >> EXPTAB_SCALE) + 127;
+ t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+
+ buf[0].i = t << 23;
x0 = (x0 - val0)*exp_postscale;
-
- y[i] = (float)(buf[0].d * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0));
+
+ y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0);
}
-
+
return CV_OK;
}
-
+
static CvStatus CV_STDCALL Exp_64f( const double *_x, double *y, int n )
{
static const double
- A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
- A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
- A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
- A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
- A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
- A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
-
- #undef EXPPOLY
- #define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
-
+ A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
+ A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
+ A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
+ A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
+ A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
+ A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
+
+#undef EXPPOLY
+#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
+
int i = 0;
- DBLINT buf[4];
+ Cv64suf buf[4];
const Cv64suf* x = (const Cv64suf*)_x;
-
- buf[0].i.lo = buf[1].i.lo = buf[2].i.lo = buf[3].i.lo = 0;
-
+
+#if CV_SSE2
+    if( checkHardwareSupport(CV_CPU_SSE2) )
+ {
+ static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
+ static const __m128d postscale2 = _mm_set1_pd(exp_postscale);
+ static const __m128d maxval2 = _mm_set1_pd(exp_max_val);
+ static const __m128d minval2 = _mm_set1_pd(-exp_max_val);
+
+ static const __m128d mA0 = _mm_set1_pd(A0);
+ static const __m128d mA1 = _mm_set1_pd(A1);
+ static const __m128d mA2 = _mm_set1_pd(A2);
+ static const __m128d mA3 = _mm_set1_pd(A3);
+ static const __m128d mA4 = _mm_set1_pd(A4);
+ static const __m128d mA5 = _mm_set1_pd(A5);
+
+ int CV_DECL_ALIGNED(16) tab_idx[4];
+
+ for( ; i <= n - 4; i += 4 )
+ {
+ __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f);
+ __m128i xi0, xi1;
+ xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2);
+ xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2);
+ xf0 = _mm_mul_pd(xf0, prescale2);
+ xf1 = _mm_mul_pd(xf1, prescale2);
+
+ xi0 = _mm_cvtpd_epi32(xf0);
+ xi1 = _mm_cvtpd_epi32(xf1);
+ xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2);
+ xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2);
+
+ xi0 = _mm_unpacklo_epi64(xi0, xi1);
+ _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK)));
+
+ xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023));
+ xi0 = _mm_packs_epi32(xi0, xi0);
+ xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
+ xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047));
+ xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
+ xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128());
+ xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128());
+
+ __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
+ __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
+ yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52)));
+ yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52)));
+
+ __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1);
+ __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1);
+
+ zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2);
+ zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2);
+
+ zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3);
+ zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3);
+
+ zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4);
+ zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4);
+
+ zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5);
+ zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5);
+
+ zf0 = _mm_mul_pd(zf0, yf0);
+ zf1 = _mm_mul_pd(zf1, yf1);
+
+ _mm_storeu_pd(y + i, zf0);
+ _mm_storeu_pd(y + i + 2, zf1);
+ }
+ }
+ else
+#endif
for( ; i <= n - 4; i += 4 )
{
double x0 = x[i].f * exp_prescale;
double x1 = x[i + 1].f * exp_prescale;
double x2 = x[i + 2].f * exp_prescale;
double x3 = x[i + 3].f * exp_prescale;
-
+
double y0, y1, y2, y3;
int val0, val1, val2, val3, t;
-
+
t = (int)(x[i].i >> 52);
if( (t & 2047) > 1023 + 10 )
x0 = t < 0 ? -exp_max_val : exp_max_val;
-
+
t = (int)(x[i+1].i >> 52);
if( (t & 2047) > 1023 + 10 )
x1 = t < 0 ? -exp_max_val : exp_max_val;
-
+
t = (int)(x[i+2].i >> 52);
if( (t & 2047) > 1023 + 10 )
x2 = t < 0 ? -exp_max_val : exp_max_val;
-
+
t = (int)(x[i+3].i >> 52);
if( (t & 2047) > 1023 + 10 )
x3 = t < 0 ? -exp_max_val : exp_max_val;
-
+
val0 = cvRound(x0);
val1 = cvRound(x1);
val2 = cvRound(x2);
val3 = cvRound(x3);
-
+
x0 = (x0 - val0)*exp_postscale;
x1 = (x1 - val1)*exp_postscale;
x2 = (x2 - val2)*exp_postscale;
x3 = (x3 - val3)*exp_postscale;
-
+
t = (val0 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
- buf[0].i.hi = t << 20;
-
+ t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+ buf[0].i = (int64)t << 52;
+
t = (val1 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
- buf[1].i.hi = t << 20;
-
+ t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+ buf[1].i = (int64)t << 52;
+
t = (val2 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
- buf[2].i.hi = t << 20;
-
+ t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+ buf[2].i = (int64)t << 52;
+
t = (val3 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
- buf[3].i.hi = t << 20;
-
- y0 = buf[0].d * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
- y1 = buf[1].d * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
-
+ t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+ buf[3].i = (int64)t << 52;
+
+ y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+ y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
+
y[i] = y0;
y[i + 1] = y1;
-
- y2 = buf[2].d * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
- y3 = buf[3].d * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
-
+
+ y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
+ y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
+
y[i + 2] = y2;
y[i + 3] = y3;
}
-
+
for( ; i < n; i++ )
{
double x0 = x[i].f * exp_prescale;
int val0, t;
-
+
t = (int)(x[i].i >> 52);
if( (t & 2047) > 1023 + 10 )
x0 = t < 0 ? -exp_max_val : exp_max_val;
-
+
val0 = cvRound(x0);
t = (val0 >> EXPTAB_SCALE) + 1023;
- t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-
- buf[0].i.hi = t << 20;
+ t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+
+ buf[0].i = (int64)t << 52;
x0 = (x0 - val0)*exp_postscale;
-
- y[i] = buf[0].d * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+
+ y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
}
-
+
return CV_OK;
}
#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1)
#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1)
-static const double icvLogTab[] = {
+static const double CV_DECL_ALIGNED(16) icvLogTab[] = {
0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000,
.00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420,
.00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760,
static CvStatus CV_STDCALL Log_32f( const float *_x, float *y, int n )
{
- static const double shift[] = { 0, -1./512 };
- static const double
- A0 = 0.3333333333333333333333333,
- A1 = -0.5,
- A2 = 1;
+ static const float shift[] = { 0, -1.f/512 };
+ static const float
+ A0 = 0.3333333333333333333333333f,
+ A1 = -0.5f,
+ A2 = 1.f;
#undef LOGPOLY
- #define LOGPOLY(x,k) ((x)+=shift[k],((A0*(x) + A1)*(x) + A2)*(x))
+ #define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x))
int i = 0;
- union
- {
- int i;
- float f;
- }
- buf[4];
-
+ Cv32suf buf[4];
const int* x = (const int*)_x;
- for( i = 0; i <= n - 4; i += 4 )
+#if CV_SSE2
+    if( checkHardwareSupport(CV_CPU_SSE2) )
+ {
+ static const __m128d ln2_2 = _mm_set1_pd(ln_2);
+ static const __m128 _1_4 = _mm_set1_ps(1.f);
+ static const __m128 shift4 = _mm_set1_ps(-1.f/512);
+
+ static const __m128 mA0 = _mm_set1_ps(A0);
+ static const __m128 mA1 = _mm_set1_ps(A1);
+ static const __m128 mA2 = _mm_set1_ps(A2);
+
+ int CV_DECL_ALIGNED(16) idx[4];
+
+ for( ; i <= n - 4; i += 4 )
+ {
+ __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
+ __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127));
+ __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
+ __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2);
+
+ __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23));
+
+ h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2));
+ _mm_store_si128((__m128i*)idx, h0);
+ h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
+
+ __m128d t0, t1, t2, t3, t4;
+ t0 = _mm_load_pd(icvLogTab + idx[0]);
+ t2 = _mm_load_pd(icvLogTab + idx[1]);
+ t1 = _mm_unpackhi_pd(t0, t2);
+ t0 = _mm_unpacklo_pd(t0, t2);
+ t2 = _mm_load_pd(icvLogTab + idx[2]);
+ t4 = _mm_load_pd(icvLogTab + idx[3]);
+ t3 = _mm_unpackhi_pd(t2, t4);
+ t2 = _mm_unpacklo_pd(t2, t4);
+
+ yd0 = _mm_add_pd(yd0, t0);
+ yd1 = _mm_add_pd(yd1, t2);
+
+ __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
+
+ __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4);
+ xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3)));
+ xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4));
+
+ __m128 zf0 = _mm_mul_ps(xf0, mA0);
+ zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0);
+ zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0);
+ yf0 = _mm_add_ps(yf0, zf0);
+
+ _mm_storeu_ps(y + i, yf0);
+ }
+ }
+ else
+#endif
+ for( ; i <= n - 4; i += 4 )
{
double x0, x1, x2, x3;
double y0, y1, y2, y3;
x2 = LOGTAB_TRANSLATE( buf[2].f, h2 );
x3 = LOGTAB_TRANSLATE( buf[3].f, h3 );
- y0 += LOGPOLY( x0, h0 == 510 );
- y1 += LOGPOLY( x1, h1 == 510 );
+ x0 += shift[h0 == 510];
+ x1 += shift[h1 == 510];
+ y0 += LOGPOLY( x0 );
+ y1 += LOGPOLY( x1 );
y[i] = (float) y0;
y[i + 1] = (float) y1;
- y2 += LOGPOLY( x2, h2 == 510 );
- y3 += LOGPOLY( x3, h3 == 510 );
+ x2 += shift[h2 == 510];
+ x3 += shift[h3 == 510];
+ y2 += LOGPOLY( x2 );
+ y3 += LOGPOLY( x3 );
y[i + 2] = (float) y2;
y[i + 3] = (float) y3;
for( ; i < n; i++ )
{
int h0 = x[i];
- double x0, y0;
+ double y0;
+ float x0;
y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
y0 += icvLogTab[h0];
x0 = LOGTAB_TRANSLATE( buf[0].f, h0 );
- y0 += LOGPOLY( x0, h0 == 510 );
+ x0 += shift[h0 == 510];
+ y0 += LOGPOLY( x0 );
y[i] = (float)y0;
}
DBLINT buf[4];
DBLINT *X = (DBLINT *) x;
+#if CV_SSE2
+    if( checkHardwareSupport(CV_CPU_SSE2) )
+ {
+ static const __m128d ln2_2 = _mm_set1_pd(ln_2);
+ static const __m128d _1_2 = _mm_set1_pd(1.);
+ static const __m128d shift2 = _mm_set1_pd(-1./512);
+
+ static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff);
+ static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0);
+
+ static const __m128d mA0 = _mm_set1_pd(A0);
+ static const __m128d mA1 = _mm_set1_pd(A1);
+ static const __m128d mA2 = _mm_set1_pd(A2);
+ static const __m128d mA3 = _mm_set1_pd(A3);
+ static const __m128d mA4 = _mm_set1_pd(A4);
+ static const __m128d mA5 = _mm_set1_pd(A5);
+ static const __m128d mA6 = _mm_set1_pd(A6);
+ static const __m128d mA7 = _mm_set1_pd(A7);
+
+ int CV_DECL_ALIGNED(16) idx[4];
+
+ for( ; i <= n - 4; i += 4 )
+ {
+ __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
+ __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2));
+
+ __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2));
+ __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2));
+
+ h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1));
+
+ __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20),
+ _mm_set1_epi32(2047)), _mm_set1_epi32(1023));
+ __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
+ __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2);
+
+ h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2));
+ _mm_store_si128((__m128i*)idx, h0);
+ h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
+
+ __m128d t0, t1, t2, t3, t4;
+ t0 = _mm_load_pd(icvLogTab + idx[0]);
+ t2 = _mm_load_pd(icvLogTab + idx[1]);
+ t1 = _mm_unpackhi_pd(t0, t2);
+ t0 = _mm_unpacklo_pd(t0, t2);
+ t2 = _mm_load_pd(icvLogTab + idx[2]);
+ t4 = _mm_load_pd(icvLogTab + idx[3]);
+ t3 = _mm_unpackhi_pd(t2, t4);
+ t2 = _mm_unpacklo_pd(t2, t4);
+
+ yd0 = _mm_add_pd(yd0, t0);
+ yd1 = _mm_add_pd(yd1, t2);
+
+ xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1);
+ xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3);
+
+ xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2));
+ xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2));
+
+ __m128d zd0 = _mm_mul_pd(xd0, mA0);
+ __m128d zd1 = _mm_mul_pd(xd1, mA0);
+ zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0);
+ zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1);
+ zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0);
+ zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1);
+ zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0);
+ zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1);
+ zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0);
+ zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1);
+ zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0);
+ zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1);
+ zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0);
+ zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1);
+ zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0);
+ zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1);
+
+ yd0 = _mm_add_pd(yd0, zd0);
+ yd1 = _mm_add_pd(yd1, zd1);
+
+ _mm_storeu_pd(y + i, yd0);
+ _mm_storeu_pd(y + i + 2, yd1);
+ }
+ }
+ else
+#endif
for( ; i <= n - 4; i += 4 )
{
double xq;
y[i + 2] = y2;
y[i + 3] = y3;
}
-
+
for( ; i < n; i++ )
{
int h0 = X[i].i.hi;