src/third_party/openmax_dl/dl/sp/src/x86/x86SP_SSE_Math.h

   1 /*
   2  *  Copyright (c) 2013 The WebRTC project authors. All Rights realserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  *
  10  */
  11
  12 #include <emmintrin.h>
  13 #include <assert.h>
  14
  15 /**
  16  * Two data formats are used by the FFT routines, internally. The
  17  * interface to the main external FFT routines use interleaved complex
  18  * values where the real part is followed by the imaginary part.
  19  *
  20  * One is the split format where a complex vector of real and imaginary
  21  * values are split such that all of the real values are placed in the
  22  * first half of the vector and the corresponding values are placed in
  23  * the second half, in the same order. The conversion from interleaved
  24  * complex values to split format and back is transparent to the
  25  * external FFT interface.
  26  *
  27  * VComplex uses split format.
  28  */
  29
  30 /** VComplex hold 4 complex float elements, with the real parts stored
  31  * in real and corresponding imaginary parts in imag.
  32  */
  33 typedef struct VComplex {
  34   __m128 real;
  35   __m128 imag;
  36 } VC;
  37
  38 /* out = a * b */
  39 static __inline void VC_MUL(VC *out, VC *a, VC *b) {
  40   out->real = _mm_sub_ps(_mm_mul_ps(a->real, b->real),
  41       _mm_mul_ps(a->imag, b->imag));
  42   out->imag = _mm_add_ps(_mm_mul_ps(a->real, b->imag),
  43       _mm_mul_ps(a->imag, b->real));
  44 }
  45
  46 /* out = conj(a) * b */
  47 static __inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
  48   out->real = _mm_add_ps(_mm_mul_ps(a->real, b->real),
  49       _mm_mul_ps(a->imag, b->imag));
  50   out->imag = _mm_sub_ps(_mm_mul_ps(a->real, b->imag),
  51       _mm_mul_ps(a->imag, b->real));
  52 }
  53
  54 /* Scale complex by a real factor */
  55 static __inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
  56   out->real = _mm_mul_ps(factor, a->real);
  57   out->imag = _mm_mul_ps(factor, a->imag);
  58 }
  59
  60 /* out = a + b */
  61 static __inline void VC_ADD(VC *out, VC *a, VC *b) {
  62   out->real = _mm_add_ps(a->real, b->real);
  63   out->imag = _mm_add_ps(a->imag, b->imag);
  64 }
  65
  66 /**
  67  * out.real = a.real + b.imag
  68  * out.imag = a.imag + b.real
  69  */
  70 static __inline void VC_ADD_X(VC *out, VC *a, VC *b) {
  71   out->real = _mm_add_ps(a->real, b->imag);
  72   out->imag = _mm_add_ps(b->real, a->imag);
  73 }
  74
  75 /* VC_ADD and store the result with Split format. */
  76 static __inline void VC_ADD_STORE_SPLIT(
  77     OMX_F32 *out,
  78     VC *a,
  79     VC *b,
  80     OMX_INT offset) {
  81   _mm_store_ps(out, _mm_add_ps(a->real, b->real));
  82   _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag));
  83 }
  84
  85 /* out = a - b */
  86 static __inline void VC_SUB(VC *out, VC *a, VC *b) {
  87   out->real = _mm_sub_ps(a->real, b->real);
  88   out->imag = _mm_sub_ps(a->imag, b->imag);
  89 }
  90
  91 /**
  92  * out.real = a.real - b.imag
  93  * out.imag = a.imag - b.real
  94  */
  95 static __inline void VC_SUB_X(VC *out, VC *a, VC *b) {
  96   out->real = _mm_sub_ps(a->real, b->imag);
  97   out->imag = _mm_sub_ps(b->real, a->imag);
  98 }
  99
 100 /* VC_SUB and store the result with Split format. */
 101 static __inline void VC_SUB_STORE_SPLIT(
 102     OMX_F32 *out,
 103     VC *a,
 104     VC *b,
 105     OMX_INT offset) {
 106   _mm_store_ps(out, _mm_sub_ps(a->real, b->real));
 107   _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag));
 108 }
 109
 110 /**
 111  * out.real = a.real + b.real
 112  * out.imag = a.imag - b.imag
 113  */
 114 static __inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
 115   out->real = _mm_add_ps(a->real, b->real);
 116   out->imag = _mm_sub_ps(a->imag, b->imag);
 117 }
 118
 119 /**
 120  * out.real = a.real + b.imag
 121  * out.imag = a.imag - b.real
 122  */
 123 static __inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
 124   out->real = _mm_add_ps(a->real, b->imag);
 125   out->imag = _mm_sub_ps(a->imag, b->real);
 126 }
 127
 128 /* VC_ADD_SUB_X and store the result with Split format. */
 129 static __inline void VC_ADD_SUB_X_STORE_SPLIT(
 130     OMX_F32 *out,
 131     VC *a,
 132     VC *b,
 133     OMX_INT offset) {
 134   _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
 135   _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real));
 136 }
 137
 138 /**
 139  * out.real = a.real - b.real
 140  * out.imag = a.imag + b.imag
 141  */
 142 static __inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
 143   out->real = _mm_sub_ps(a->real, b->real);
 144   out->imag = _mm_add_ps(a->imag, b->imag);
 145 }
 146
 147 /**
 148  * out.real = a.real - b.imag
 149  * out.imag = a.imag + b.real
 150  */
 151 static __inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
 152   out->real = _mm_sub_ps(a->real, b->imag);
 153   out->imag = _mm_add_ps(a->imag, b->real);
 154 }
 155
 156 /* VC_SUB_ADD_X and store the result with Split format. */
 157 static __inline void VC_SUB_ADD_X_STORE_SPLIT(
 158     OMX_F32 *out,
 159     VC *a, VC *b,
 160     OMX_INT offset) {
 161   _mm_store_ps(out, _mm_sub_ps(a->real, b->imag));
 162   _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real));
 163 }
 164
 165 /**
 166  * out[0]      = in.real
 167  * out[offset] = in.imag
 168  */
 169 static __inline void VC_STORE_SPLIT(
 170     OMX_F32 *out,
 171     VC *in,
 172     OMX_INT offset) {
 173   _mm_store_ps(out, in->real);
 174   _mm_store_ps(out + offset, in->imag);
 175 }
 176
 177 /**
 178  * out.real = in[0];
 179  * out.imag = in[offset];
 180 */
 181 static __inline void VC_LOAD_SPLIT(
 182     VC *out,
 183     const OMX_F32 *in,
 184     OMX_INT offset) {
 185   out->real = _mm_load_ps(in);
 186   out->imag = _mm_load_ps(in + offset);
 187 }
 188
 189 /* Vector Complex Unpack from Split format to Interleaved format. */
 190 static __inline void VC_UNPACK(VC *out, VC *in) {
 191     out->real = _mm_unpacklo_ps(in->real, in->imag);
 192     out->imag = _mm_unpackhi_ps(in->real, in->imag);
 193 }
 194
 195 /**
 196  * Vector Complex load from interleaved complex array.
 197  * out.real = [in[0].real, in[1].real, in[2].real, in[3].real]
 198  * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag]
 199  */
 200 static __inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
 201     __m128 temp0 = _mm_load_ps(in);
 202     __m128 temp1 = _mm_load_ps(in + 4);
 203     out->real = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));
 204     out->imag = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));
 205 }
 206 /**
 207  * Vector Complex Load with Split format.
 208  * The input address is not 16 byte aligned.
 209  */
 210 static __inline void VC_LOADU_SPLIT(
 211     VC *out,
 212     const OMX_F32 *in,
 213     OMX_INT offset) {
 214   out->real = _mm_loadu_ps(in);
 215   out->imag = _mm_loadu_ps(in + offset);
 216 }
 217
 218 /* Reverse the order of the Complex Vector. */
 219 static __inline void VC_REVERSE(VC *v) {
 220   v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3));
 221   v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3));
 222 }
 223 /*
 224  * Vector Complex store to interleaved complex array
 225  * out[0] = in.real[0]
 226  * out[1] = in.imag[0]
 227  * out[2] = in.real[1]
 228  * out[3] = in.imag[1]
 229  * out[4] = in.real[2]
 230  * out[5] = in.imag[2]
 231  * out[6] = in.real[3]
 232  * out[7] = in.imag[3]
 233  */
 234 static __inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
 235   _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag));
 236   _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
 237 }
 238
 239 /**
 240  * Vector Complex Store with Interleaved format.
 241  * Address is not 16 byte aligned.
 242  */
 243 static __inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
 244   _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag));
 245   _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
 246 }
 247
 248 /* VC_ADD_X and store the result with Split format. */
 249 static __inline void VC_ADD_X_STORE_SPLIT(
 250     OMX_F32 *out,
 251     VC *a, VC *b,
 252     OMX_INT offset) {
 253   _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
 254   _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag));
 255 }
 256
 257 /**
 258  * VC_SUB_X and store the result with inverse order.
 259  * Address is not 16 byte aligned.
 260  */
 261 static __inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
 262     OMX_F32 *out,
 263     VC *a,
 264     VC *b,
 265     OMX_INT offset) {
 266   __m128 t;
 267   t = _mm_sub_ps(a->real, b->imag);
 268   _mm_storeu_ps(out, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
 269   t = _mm_sub_ps(b->real, a->imag);
 270   _mm_storeu_ps(out + offset, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
 271 }
 272
 273 /**
 274  * Vector Complex Load from Interleaved format to Split format.
 275  * Store the result into two __m128 registers.
 276  */
 277 static __inline void VC_LOAD_SHUFFLE(
 278     __m128 *out0,
 279     __m128 *out1,
 280     const OMX_F32 *in) {
 281   VC temp;
 282   VC_LOAD_INTERLEAVE(&temp, in);
 283   *out0 = temp.real;
 284   *out1 = temp.imag;
 285 }
 286
 287 /* Finish the butterfly calculation of forward radix4 and store the outputs. */
 288 static __inline void RADIX4_FWD_BUTTERFLY_STORE(
 289     OMX_F32 *out0,
 290     OMX_F32 *out1,
 291     OMX_F32 *out2,
 292     OMX_F32 *out3,
 293     VC *t0,
 294     VC *t1,
 295     VC *t2,
 296     VC *t3,
 297     OMX_INT n) {
 298   /* CADD out0, t0, t2 */
 299   VC_ADD_STORE_SPLIT(out0, t0, t2, n);
 300
 301   /* CSUB out2, t0, t2 */
 302   VC_SUB_STORE_SPLIT(out2, t0, t2, n);
 303
 304   /* CADD_SUB_X out1, t1, t3 */
 305   VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n);
 306
 307   /* CSUB_ADD_X out3, t1, t3 */
 308   VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n);
 309 }
 310
 311 /* Finish the butterfly calculation of inverse radix4 and store the outputs. */
 312 static __inline void RADIX4_INV_BUTTERFLY_STORE(
 313     OMX_F32 *out0,
 314     OMX_F32 *out1,
 315     OMX_F32 *out2,
 316     OMX_F32 *out3,
 317     VC *t0,
 318     VC *t1,
 319     VC *t2,
 320     VC *t3,
 321     OMX_INT n) {
 322   /* CADD out0, t0, t2 */
 323   VC_ADD_STORE_SPLIT(out0, t0, t2, n);
 324
 325   /* CSUB out2, t0, t2 */
 326   VC_SUB_STORE_SPLIT(out2, t0, t2, n);
 327
 328   /* CSUB_ADD_X out1, t1, t3 */
 329   VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n);
 330
 331   /* CADD_SUB_X out3, t1, t3 */
 332   VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n);
 333 }
 334
 335 /* Radix4 forward butterfly */
 336 static __inline void RADIX4_FWD_BUTTERFLY(
 337     VC *t0,
 338     VC *t1,
 339     VC *t2,
 340     VC *t3,
 341     VC *Tw1,
 342     VC *Tw2,
 343     VC *Tw3,
 344     VC *T0,
 345     VC *T1,
 346     VC *T2,
 347     VC *T3) {
 348   VC tt1, tt2, tt3;
 349
 350   /* CMUL tt1, Tw1, T1 */
 351   VC_MUL(&tt1, Tw1, T1);
 352
 353   /* CMUL tt2, Tw2, T2 */
 354   VC_MUL(&tt2, Tw2, T2);
 355
 356   /* CMUL tt3, Tw3, T3 */
 357   VC_MUL(&tt3, Tw3, T3);
 358
 359   /* CADD t0, T0, tt2 */
 360   VC_ADD(t0, T0, &tt2);
 361
 362   /* CSUB t1, T0, tt2 */
 363   VC_SUB(t1, T0, &tt2);
 364
 365   /* CADD t2, tt1, tt3 */
 366   VC_ADD(t2, &tt1, &tt3);
 367
 368   /* CSUB t3, tt1, tt3 */
 369   VC_SUB(t3, &tt1, &tt3);
 370 }
 371
 372 /* Radix4 inverse butterfly */
 373 static __inline void RADIX4_INV_BUTTERFLY(
 374     VC *t0,
 375     VC *t1,
 376     VC *t2,
 377     VC *t3,
 378     VC *Tw1,
 379     VC *Tw2,
 380     VC *Tw3,
 381     VC *T0,
 382     VC *T1,
 383     VC *T2,
 384     VC *T3) {
 385   VC tt1, tt2, tt3;
 386
 387   /* CMUL tt1, Tw1, T1 */
 388   VC_CONJ_MUL(&tt1, Tw1, T1);
 389
 390   /* CMUL tt2, Tw2, T2 */
 391   VC_CONJ_MUL(&tt2, Tw2, T2);
 392
 393   /* CMUL tt3, Tw3, T3 */
 394   VC_CONJ_MUL(&tt3, Tw3, T3);
 395
 396   /* CADD t0, T0, tt2 */
 397   VC_ADD(t0, T0, &tt2);
 398
 399   /* CSUB t1, T0, tt2 */
 400   VC_SUB(t1, T0, &tt2);
 401
 402   /* CADD t2, tt1, tt3 */
 403   VC_ADD(t2, &tt1, &tt3);
 404
 405   /* CSUB t3, tt1, tt3 */
 406   VC_SUB(t3, &tt1, &tt3);
 407 }
 408
 409 /* Radix4 butterfly in first stage for both forward and inverse */
 410 static __inline void RADIX4_BUTTERFLY_FS(
 411     VC *t0,
 412     VC *t1,
 413     VC *t2,
 414     VC *t3,
 415     VC *T0,
 416     VC *T1,
 417     VC *T2,
 418     VC *T3) {
 419   /* CADD t0, T0, T2 */
 420   VC_ADD(t0, T0, T2);
 421
 422   /* CSUB t1, T0, T2 */
 423   VC_SUB(t1, T0, T2);
 424
 425   /* CADD t2, T1, T3 */
 426   VC_ADD(t2, T1, T3);
 427
 428   /* CSUB t3, T1, T3 */
 429   VC_SUB(t3, T1, T3);
 430 }
 431
 432 /**
 433  * Load 16 float elements (4 sse registers) which is a 4 * 4 matrix.
 434  * Then Do transpose on the matrix.
 435  * 3,  2,  1,  0                  12, 8,  4,  0
 436  * 7,  6,  5,  4        =====>    13, 9,  5,  1
 437  * 11, 10, 9,  8                  14, 10, 6,  2
 438  * 15, 14, 13, 12                 15, 11, 7,  3
 439  */
 440 static __inline void VC_LOAD_MATRIX_TRANSPOSE(
 441     VC *T0,
 442     VC *T1,
 443     VC *T2,
 444     VC *T3,
 445     const OMX_F32 *pT0,
 446     const OMX_F32 *pT1,
 447     const OMX_F32 *pT2,
 448     const OMX_F32 *pT3,
 449     OMX_INT n) {
 450   __m128 xmm0;
 451   __m128 xmm1;
 452   __m128 xmm2;
 453   __m128 xmm3;
 454   __m128 xmm4;
 455   __m128 xmm5;
 456   __m128 xmm6;
 457   __m128 xmm7;
 458
 459   xmm0 = _mm_load_ps(pT0);
 460   xmm1 = _mm_load_ps(pT1);
 461   xmm2 = _mm_load_ps(pT2);
 462   xmm3 = _mm_load_ps(pT3);
 463
 464   /* Matrix transpose */
 465   xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
 466   xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
 467   xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
 468   xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
 469   T0->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
 470   T1->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
 471   T2->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
 472   T3->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
 473
 474   xmm0 = _mm_load_ps(pT0 + n);
 475   xmm1 = _mm_load_ps(pT1 + n);
 476   xmm2 = _mm_load_ps(pT2 + n);
 477   xmm3 = _mm_load_ps(pT3 + n);
 478
 479   /* Matrix transpose */
 480   xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
 481   xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
 482   xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
 483   xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
 484   T0->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
 485   T1->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
 486   T2->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
 487   T3->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
 488 }