src/gallium/drivers/llvmpipe/sse_mathfun.h

   1 /* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
   2
   3    Inspired by Intel Approximate Math library, and based on the
   4    corresponding algorithms of the cephes math library
   5
   6    The default is to use the SSE1 version. If you define USE_SSE2 the
   7    the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   8    not expect any significant performance improvement with SSE2.
   9 */
  10
  11 /* Copyright (C) 2007  Julien Pommier
  12
  13   This software is provided 'as-is', without any express or implied
  14   warranty.  In no event will the authors be held liable for any damages
  15   arising from the use of this software.
  16
  17   Permission is granted to anyone to use this software for any purpose,
  18   including commercial applications, and to alter it and redistribute it
  19   freely, subject to the following restrictions:
  20
  21   1. The origin of this software must not be misrepresented; you must not
  22      claim that you wrote the original software. If you use this software
  23      in a product, an acknowledgment in the product documentation would be
  24      appreciated but is not required.
  25   2. Altered source versions must be plainly marked as such, and must not be
  26      misrepresented as being the original software.
  27   3. This notice may not be removed or altered from any source distribution.
  28
  29   (this is the zlib license)
  30 */
  31
  32 #include <xmmintrin.h>
  33
  34 /* yes I know, the top of this file is quite ugly */
  35
  36 #ifdef _MSC_VER /* visual c++ */
  37 # define ALIGN16_BEG __declspec(align(16))
  38 # define ALIGN16_END
  39 #else /* gcc or icc */
  40 # define ALIGN16_BEG
  41 # define ALIGN16_END __attribute__((aligned(16)))
  42 #endif
  43
  44 /* __m128 is ugly to write */
  45 typedef __m128 v4sf;  // vector of 4 float (sse1)
  46
  47 #ifdef USE_SSE2
  48 # include <emmintrin.h>
  49 typedef __m128i v4si; // vector of 4 int (sse2)
  50 #else
  51 typedef __m64 v2si;   // vector of 2 int (mmx)
  52 #endif
  53
  54 /* declare some SSE constants -- why can't I figure a better way to do that? */
  55 #define _PS_CONST(Name, Val)                                            \
  56   static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
  57 #define _PI32_CONST(Name, Val)                                            \
  58   static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
  59 #define _PS_CONST_TYPE(Name, Type, Val)                                 \
  60   static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
  61
  62 _PS_CONST(1  , 1.0f);
  63 _PS_CONST(0p5, 0.5f);
  64 /* the smallest non denormalized float number */
  65 _PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
  66 _PS_CONST_TYPE(mant_mask, int, 0x7f800000);
  67 _PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
  68
  69 _PS_CONST_TYPE(sign_mask, int, 0x80000000);
  70 _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
  71
  72 _PI32_CONST(1, 1);
  73 _PI32_CONST(inv1, ~1);
  74 _PI32_CONST(2, 2);
  75 _PI32_CONST(4, 4);
  76 _PI32_CONST(0x7f, 0x7f);
  77
  78 _PS_CONST(cephes_SQRTHF, 0.707106781186547524);
  79 _PS_CONST(cephes_log_p0, 7.0376836292E-2);
  80 _PS_CONST(cephes_log_p1, - 1.1514610310E-1);
  81 _PS_CONST(cephes_log_p2, 1.1676998740E-1);
  82 _PS_CONST(cephes_log_p3, - 1.2420140846E-1);
  83 _PS_CONST(cephes_log_p4, + 1.4249322787E-1);
  84 _PS_CONST(cephes_log_p5, - 1.6668057665E-1);
  85 _PS_CONST(cephes_log_p6, + 2.0000714765E-1);
  86 _PS_CONST(cephes_log_p7, - 2.4999993993E-1);
  87 _PS_CONST(cephes_log_p8, + 3.3333331174E-1);
  88 _PS_CONST(cephes_log_q1, -2.12194440e-4);
  89 _PS_CONST(cephes_log_q2, 0.693359375);
  90
  91 v4sf log_ps(v4sf x);
  92 v4sf exp_ps(v4sf x);
  93 v4sf sin_ps(v4sf x);
  94 v4sf cos_ps(v4sf x);
  95 void sincos_ps(v4sf x, v4sf *s, v4sf *c);
  96
  97 #ifndef USE_SSE2
  98 typedef union xmm_mm_union {
  99   __m128 xmm;
 100   __m64 mm[2];
 101 } xmm_mm_union;
 102
 103 #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
 104     xmm_mm_union u; u.xmm = xmm_;                   \
 105     mm0_ = u.mm[0];                                 \
 106     mm1_ = u.mm[1];                                 \
 107 }
 108
 109 #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
 110     xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
 111   }
 112
 113 #endif // USE_SSE2
 114
 115 /* natural logarithm computed for 4 simultaneous float
 116    return NaN for x <= 0
 117 */
 118 v4sf log_ps(v4sf x) {
 119 #ifdef USE_SSE2
 120   v4si emm0;
 121 #else
 122   v2si mm0, mm1;
 123 #endif
 124   v4sf one = *(v4sf*)_ps_1;
 125
 126   v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
 127   v4sf e, mask, tmp, z, y;
 128
 129   x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */
 130
 131 #ifndef USE_SSE2
 132   /* part 1: x = frexpf(x, &e); */
 133   COPY_XMM_TO_MM(x, mm0, mm1);
 134   mm0 = _mm_srli_pi32(mm0, 23);
 135   mm1 = _mm_srli_pi32(mm1, 23);
 136 #else
 137   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
 138 #endif
 139   /* keep only the fractional part */
 140   x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
 141   x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
 142
 143 #ifndef USE_SSE2
 144   /* now e=mm0:mm1 contain the really base-2 exponent */
 145   mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
 146   mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
 147   e = _mm_cvtpi32x2_ps(mm0, mm1);
 148   _mm_empty(); /* bye bye mmx */
 149 #else
 150   emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
 151   e = _mm_cvtepi32_ps(emm0);
 152 #endif
 153
 154   e = _mm_add_ps(e, one);
 155
 156   /* part2:
 157      if( x < SQRTHF ) {
 158        e -= 1;
 159        x = x + x - 1.0;
 160      } else { x = x - 1.0; }
 161   */
 162
 163   mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
 164   tmp = _mm_and_ps(x, mask);
 165   x = _mm_sub_ps(x, one);
 166   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
 167   x = _mm_add_ps(x, tmp);
 168
 169
 170   z = _mm_mul_ps(x,x);
 171
 172   y = *(v4sf*)_ps_cephes_log_p0;
 173   y = _mm_mul_ps(y, x);
 174   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
 175   y = _mm_mul_ps(y, x);
 176   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
 177   y = _mm_mul_ps(y, x);
 178   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
 179   y = _mm_mul_ps(y, x);
 180   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
 181   y = _mm_mul_ps(y, x);
 182   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
 183   y = _mm_mul_ps(y, x);
 184   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
 185   y = _mm_mul_ps(y, x);
 186   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
 187   y = _mm_mul_ps(y, x);
 188   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
 189   y = _mm_mul_ps(y, x);
 190
 191   y = _mm_mul_ps(y, z);
 192
 193
 194   tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
 195   y = _mm_add_ps(y, tmp);
 196
 197
 198   tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 199   y = _mm_sub_ps(y, tmp);
 200
 201   tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
 202   x = _mm_add_ps(x, y);
 203   x = _mm_add_ps(x, tmp);
 204   x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
 205   return x;
 206 }
 207
 208 _PS_CONST(exp_hi,       88.3762626647949f);
 209 _PS_CONST(exp_lo,       -88.3762626647949f);
 210
 211 _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
 212 _PS_CONST(cephes_exp_C1, 0.693359375);
 213 _PS_CONST(cephes_exp_C2, -2.12194440e-4);
 214
 215 _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
 216 _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
 217 _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
 218 _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
 219 _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
 220 _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
 221
 222 v4sf exp_ps(v4sf x) {
 223   v4sf tmp = _mm_setzero_ps(), fx;
 224 #ifdef USE_SSE2
 225   v4si emm0;
 226 #else
 227   v2si mm0, mm1;
 228 #endif
 229   v4sf one = *(v4sf*)_ps_1;
 230   v4sf mask, z, y, pow2n;
 231
 232   x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
 233   x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
 234
 235   /* express exp(x) as exp(g + n*log(2)) */
 236   fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
 237   fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
 238
 239   /* how to perform a floorf with SSE: just below */
 240 #ifndef USE_SSE2
 241   /* step 1 : cast to int */
 242   tmp = _mm_movehl_ps(tmp, fx);
 243   mm0 = _mm_cvttps_pi32(fx);
 244   mm1 = _mm_cvttps_pi32(tmp);
 245   /* step 2 : cast back to float */
 246   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
 247 #else
 248   emm0 = _mm_cvttps_epi32(fx);
 249   tmp  = _mm_cvtepi32_ps(emm0);
 250 #endif
 251   /* if greater, substract 1 */
 252   mask = _mm_cmpgt_ps(tmp, fx);
 253   mask = _mm_and_ps(mask, one);
 254   fx = _mm_sub_ps(tmp, mask);
 255
 256   tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
 257   z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
 258   x = _mm_sub_ps(x, tmp);
 259   x = _mm_sub_ps(x, z);
 260
 261   z = _mm_mul_ps(x,x);
 262
 263   y = *(v4sf*)_ps_cephes_exp_p0;
 264   y = _mm_mul_ps(y, x);
 265   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
 266   y = _mm_mul_ps(y, x);
 267   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
 268   y = _mm_mul_ps(y, x);
 269   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
 270   y = _mm_mul_ps(y, x);
 271   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
 272   y = _mm_mul_ps(y, x);
 273   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
 274   y = _mm_mul_ps(y, z);
 275   y = _mm_add_ps(y, x);
 276   y = _mm_add_ps(y, one);
 277
 278   /* build 2^n */
 279 #ifndef USE_SSE2
 280   z = _mm_movehl_ps(z, fx);
 281   mm0 = _mm_cvttps_pi32(fx);
 282   mm1 = _mm_cvttps_pi32(z);
 283   mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
 284   mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
 285   mm0 = _mm_slli_pi32(mm0, 23);
 286   mm1 = _mm_slli_pi32(mm1, 23);
 287
 288   COPY_MM_TO_XMM(mm0, mm1, pow2n);
 289   _mm_empty();
 290 #else
 291   emm0 = _mm_cvttps_epi32(fx);
 292   emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
 293   emm0 = _mm_slli_epi32(emm0, 23);
 294   pow2n = _mm_castsi128_ps(emm0);
 295 #endif
 296   y = _mm_mul_ps(y, pow2n);
 297   return y;
 298 }
 299
 300 _PS_CONST(minus_cephes_DP1, -0.78515625);
 301 _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
 302 _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
 303 _PS_CONST(sincof_p0, -1.9515295891E-4);
 304 _PS_CONST(sincof_p1,  8.3321608736E-3);
 305 _PS_CONST(sincof_p2, -1.6666654611E-1);
 306 _PS_CONST(coscof_p0,  2.443315711809948E-005);
 307 _PS_CONST(coscof_p1, -1.388731625493765E-003);
 308 _PS_CONST(coscof_p2,  4.166664568298827E-002);
 309 _PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
 310
 311
 312 /* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
 313    it runs also on old athlons XPs and the pentium III of your grand
 314    mother.
 315
 316    The code is the exact rewriting of the cephes sinf function.
 317    Precision is excellent as long as x < 8192 (I did not bother to
 318    take into account the special handling they have for greater values
 319    -- it does not return garbage for arguments over 8192, though, but
 320    the extra precision is missing).
 321
 322    Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
 323    surprising but correct result.
 324
 325    Performance is also surprisingly good, 1.33 times faster than the
 326    macos vsinf SSE2 function, and 1.5 times faster than the
 327    __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
 328    too bad for an SSE1 function (with no special tuning) !
 329    However the latter libraries probably have a much better handling of NaN,
 330    Inf, denormalized and other special arguments..
 331
 332    On my core 1 duo, the execution of this function takes approximately 95 cycles.
 333
 334    From what I have observed on the experiments with Intel AMath lib, switching to an
 335    SSE2 version would improve the perf by only 10%.
 336
 337    Since it is based on SSE intrinsics, it has to be compiled at -O2 to
 338    deliver full speed.
 339 */
 340 v4sf sin_ps(v4sf x) { // any x
 341   v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
 342
 343 #ifdef USE_SSE2
 344   v4si emm0, emm2;
 345 #else
 346   v2si mm0, mm1, mm2, mm3;
 347 #endif
 348   v4sf swap_sign_bit, poly_mask, z, tmp, y2;
 349
 350   sign_bit = x;
 351   /* take the absolute value */
 352   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
 353   /* extract the sign bit (upper one) */
 354   sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
 355
 356   /* scale by 4/Pi */
 357   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
 358
 359   //printf("plop:"); print4(y);
 360 #ifdef USE_SSE2
 361   /* store the integer part of y in mm0 */
 362   emm2 = _mm_cvttps_epi32(y);
 363   /* j=(j+1) & (~1) (see the cephes sources) */
 364   emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
 365   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
 366   y = _mm_cvtepi32_ps(emm2);
 367   /* get the swap sign flag */
 368   emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
 369   emm0 = _mm_slli_epi32(emm0, 29);
 370   /* get the polynom selection mask
 371      there is one polynom for 0 <= x <= Pi/4
 372      and another one for Pi/4<x<=Pi/2
 373
 374      Both branches will be computed.
 375   */
 376   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
 377   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 378
 379   swap_sign_bit = _mm_castsi128_ps(emm0);
 380   poly_mask = _mm_castsi128_ps(emm2);
 381   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
 382 #else
 383   /* store the integer part of y in mm0:mm1 */
 384   xmm2 = _mm_movehl_ps(xmm2, y);
 385   mm2 = _mm_cvttps_pi32(y);
 386   mm3 = _mm_cvttps_pi32(xmm2);
 387   /* j=(j+1) & (~1) (see the cephes sources) */
 388   mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
 389   mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
 390   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
 391   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
 392   y = _mm_cvtpi32x2_ps(mm2, mm3);
 393   /* get the swap sign flag */
 394   mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
 395   mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
 396   mm0 = _mm_slli_pi32(mm0, 29);
 397   mm1 = _mm_slli_pi32(mm1, 29);
 398   /* get the polynom selection mask */
 399   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
 400   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
 401   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
 402   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
 403
 404   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
 405   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
 406   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
 407   _mm_empty(); /* good-bye mmx */
 408 #endif
 409
 410   /* The magic pass: "Extended precision modular arithmetic"
 411      x = ((x - y * DP1) - y * DP2) - y * DP3; */
 412   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
 413   xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
 414   xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
 415   xmm1 = _mm_mul_ps(y, xmm1);
 416   xmm2 = _mm_mul_ps(y, xmm2);
 417   xmm3 = _mm_mul_ps(y, xmm3);
 418   x = _mm_add_ps(x, xmm1);
 419   x = _mm_add_ps(x, xmm2);
 420   x = _mm_add_ps(x, xmm3);
 421
 422   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
 423   y = *(v4sf*)_ps_coscof_p0;
 424   z = _mm_mul_ps(x,x);
 425
 426   y = _mm_mul_ps(y, z);
 427   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
 428   y = _mm_mul_ps(y, z);
 429   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
 430   y = _mm_mul_ps(y, z);
 431   y = _mm_mul_ps(y, z);
 432   tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 433   y = _mm_sub_ps(y, tmp);
 434   y = _mm_add_ps(y, *(v4sf*)_ps_1);
 435
 436   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 437
 438   y2 = *(v4sf*)_ps_sincof_p0;
 439   y2 = _mm_mul_ps(y2, z);
 440   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
 441   y2 = _mm_mul_ps(y2, z);
 442   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
 443   y2 = _mm_mul_ps(y2, z);
 444   y2 = _mm_mul_ps(y2, x);
 445   y2 = _mm_add_ps(y2, x);
 446
 447   /* select the correct result from the two polynoms */
 448   xmm3 = poly_mask;
 449   y2 = _mm_and_ps(xmm3, y2); //, xmm3);
 450   y = _mm_andnot_ps(xmm3, y);
 451   y = _mm_add_ps(y,y2);
 452   /* update the sign */
 453   y = _mm_xor_ps(y, sign_bit);
 454
 455   return y;
 456 }
 457
 458 /* almost the same as sin_ps */
 459 v4sf cos_ps(v4sf x) { // any x
 460   v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
 461 #ifdef USE_SSE2
 462   v4si emm0, emm2;
 463 #else
 464   v2si mm0, mm1, mm2, mm3;
 465 #endif
 466   v4sf sign_bit, poly_mask, z, tmp, y2;
 467
 468   /* take the absolute value */
 469   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
 470
 471   /* scale by 4/Pi */
 472   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
 473
 474 #ifdef USE_SSE2
 475   /* store the integer part of y in mm0 */
 476   emm2 = _mm_cvttps_epi32(y);
 477   /* j=(j+1) & (~1) (see the cephes sources) */
 478   emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
 479   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
 480   y = _mm_cvtepi32_ps(emm2);
 481
 482   emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
 483
 484   /* get the swap sign flag */
 485   emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
 486   emm0 = _mm_slli_epi32(emm0, 29);
 487   /* get the polynom selection mask */
 488   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
 489   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 490
 491   sign_bit = _mm_castsi128_ps(emm0);
 492   poly_mask = _mm_castsi128_ps(emm2);
 493 #else
 494   /* store the integer part of y in mm0:mm1 */
 495   xmm2 = _mm_movehl_ps(xmm2, y);
 496   mm2 = _mm_cvttps_pi32(y);
 497   mm3 = _mm_cvttps_pi32(xmm2);
 498
 499   /* j=(j+1) & (~1) (see the cephes sources) */
 500   mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
 501   mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
 502   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
 503   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
 504
 505   y = _mm_cvtpi32x2_ps(mm2, mm3);
 506
 507
 508   mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
 509   mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
 510
 511   /* get the swap sign flag in mm0:mm1 and the
 512      polynom selection mask in mm2:mm3 */
 513
 514   mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
 515   mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
 516   mm0 = _mm_slli_pi32(mm0, 29);
 517   mm1 = _mm_slli_pi32(mm1, 29);
 518
 519   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
 520   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
 521
 522   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
 523   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
 524
 525   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
 526   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
 527   _mm_empty(); /* good-bye mmx */
 528 #endif
 529   /* The magic pass: "Extended precision modular arithmetic"
 530      x = ((x - y * DP1) - y * DP2) - y * DP3; */
 531   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
 532   xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
 533   xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
 534   xmm1 = _mm_mul_ps(y, xmm1);
 535   xmm2 = _mm_mul_ps(y, xmm2);
 536   xmm3 = _mm_mul_ps(y, xmm3);
 537   x = _mm_add_ps(x, xmm1);
 538   x = _mm_add_ps(x, xmm2);
 539   x = _mm_add_ps(x, xmm3);
 540
 541   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
 542   y = *(v4sf*)_ps_coscof_p0;
 543   z = _mm_mul_ps(x,x);
 544
 545   y = _mm_mul_ps(y, z);
 546   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
 547   y = _mm_mul_ps(y, z);
 548   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
 549   y = _mm_mul_ps(y, z);
 550   y = _mm_mul_ps(y, z);
 551   tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 552   y = _mm_sub_ps(y, tmp);
 553   y = _mm_add_ps(y, *(v4sf*)_ps_1);
 554
 555   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 556
 557   y2 = *(v4sf*)_ps_sincof_p0;
 558   y2 = _mm_mul_ps(y2, z);
 559   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
 560   y2 = _mm_mul_ps(y2, z);
 561   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
 562   y2 = _mm_mul_ps(y2, z);
 563   y2 = _mm_mul_ps(y2, x);
 564   y2 = _mm_add_ps(y2, x);
 565
 566   /* select the correct result from the two polynoms */
 567   xmm3 = poly_mask;
 568   y2 = _mm_and_ps(xmm3, y2); //, xmm3);
 569   y = _mm_andnot_ps(xmm3, y);
 570   y = _mm_add_ps(y,y2);
 571   /* update the sign */
 572   y = _mm_xor_ps(y, sign_bit);
 573
 574   return y;
 575 }
 576
 577 /* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
 578    it is almost as fast, and gives you a free cosine with your sine */
 579 void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
 580   v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
 581 #ifdef USE_SSE2
 582   v4si emm0, emm2, emm4;
 583 #else
 584   v2si mm0, mm1, mm2, mm3, mm4, mm5;
 585 #endif
 586   v4sf swap_sign_bit_sin, poly_mask, z, tmp, y2, ysin1, ysin2;
 587   v4sf sign_bit_cos;
 588
 589   sign_bit_sin = x;
 590   /* take the absolute value */
 591   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
 592   /* extract the sign bit (upper one) */
 593   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
 594
 595   /* scale by 4/Pi */
 596   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
 597
 598 #ifdef USE_SSE2
 599   /* store the integer part of y in emm2 */
 600   emm2 = _mm_cvttps_epi32(y);
 601
 602   /* j=(j+1) & (~1) (see the cephes sources) */
 603   emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
 604   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
 605   y = _mm_cvtepi32_ps(emm2);
 606
 607   emm4 = emm2;
 608
 609   /* get the swap sign flag for the sine */
 610   emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
 611   emm0 = _mm_slli_epi32(emm0, 29);
 612   swap_sign_bit_sin = _mm_castsi128_ps(emm0);
 613
 614   /* get the polynom selection mask for the sine*/
 615   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
 616   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 617   poly_mask = _mm_castsi128_ps(emm2);
 618 #else
 619   /* store the integer part of y in mm2:mm3 */
 620   xmm3 = _mm_movehl_ps(xmm3, y);
 621   mm2 = _mm_cvttps_pi32(y);
 622   mm3 = _mm_cvttps_pi32(xmm3);
 623
 624   /* j=(j+1) & (~1) (see the cephes sources) */
 625   mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
 626   mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
 627   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
 628   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
 629
 630   y = _mm_cvtpi32x2_ps(mm2, mm3);
 631
 632   mm4 = mm2;
 633   mm5 = mm3;
 634
 635   /* get the swap sign flag for the sine */
 636   mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
 637   mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
 638   mm0 = _mm_slli_pi32(mm0, 29);
 639   mm1 = _mm_slli_pi32(mm1, 29);
 640
 641   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
 642
 643   /* get the polynom selection mask for the sine */
 644
 645   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
 646   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
 647   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
 648   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
 649
 650   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
 651 #endif
 652
 653   /* The magic pass: "Extended precision modular arithmetic"
 654      x = ((x - y * DP1) - y * DP2) - y * DP3; */
 655   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
 656   xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
 657   xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
 658   xmm1 = _mm_mul_ps(y, xmm1);
 659   xmm2 = _mm_mul_ps(y, xmm2);
 660   xmm3 = _mm_mul_ps(y, xmm3);
 661   x = _mm_add_ps(x, xmm1);
 662   x = _mm_add_ps(x, xmm2);
 663   x = _mm_add_ps(x, xmm3);
 664
 665 #ifdef USE_SSE2
 666   emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
 667   emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
 668   emm4 = _mm_slli_epi32(emm4, 29);
 669   sign_bit_cos = _mm_castsi128_ps(emm4);
 670 #else
 671   /* get the sign flag for the cosine */
 672   mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
 673   mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
 674   mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
 675   mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
 676   mm4 = _mm_slli_pi32(mm4, 29);
 677   mm5 = _mm_slli_pi32(mm5, 29);
 678   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
 679   _mm_empty(); /* good-bye mmx */
 680 #endif
 681
 682   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
 683
 684
 685   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
 686   z = _mm_mul_ps(x,x);
 687   y = *(v4sf*)_ps_coscof_p0;
 688
 689   y = _mm_mul_ps(y, z);
 690   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
 691   y = _mm_mul_ps(y, z);
 692   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
 693   y = _mm_mul_ps(y, z);
 694   y = _mm_mul_ps(y, z);
 695   tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 696   y = _mm_sub_ps(y, tmp);
 697   y = _mm_add_ps(y, *(v4sf*)_ps_1);
 698
 699   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 700
 701   y2 = *(v4sf*)_ps_sincof_p0;
 702   y2 = _mm_mul_ps(y2, z);
 703   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
 704   y2 = _mm_mul_ps(y2, z);
 705   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
 706   y2 = _mm_mul_ps(y2, z);
 707   y2 = _mm_mul_ps(y2, x);
 708   y2 = _mm_add_ps(y2, x);
 709
 710   /* select the correct result from the two polynoms */
 711   xmm3 = poly_mask;
 712   ysin2 = _mm_and_ps(xmm3, y2);
 713   ysin1 = _mm_andnot_ps(xmm3, y);
 714   y2 = _mm_sub_ps(y2,ysin2);
 715   y = _mm_sub_ps(y, ysin1);
 716
 717   xmm1 = _mm_add_ps(ysin1,ysin2);
 718   xmm2 = _mm_add_ps(y,y2);
 719
 720   /* update the sign */
 721   *s = _mm_xor_ps(xmm1, sign_bit_sin);
 722   *c = _mm_xor_ps(xmm2, sign_bit_cos);
 723 }
 724