src/third_party/webrtc/modules/audio_processing/ns/nsx_core_neon.c

   1 /*
   2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "webrtc/modules/audio_processing/ns/nsx_core.h"
  12
  13 #include <arm_neon.h>
  14 #include <assert.h>
  15
  16 // Constants to compensate for shifting signal log(2^shifts).
  17 const int16_t WebRtcNsx_kLogTable[9] = {
  18   0, 177, 355, 532, 710, 887, 1065, 1242, 1420
  19 };
  20
  21 const int16_t WebRtcNsx_kCounterDiv[201] = {
  22   32767, 16384, 10923, 8192, 6554, 5461, 4681, 4096, 3641, 3277, 2979, 2731,
  23   2521, 2341, 2185, 2048, 1928, 1820, 1725, 1638, 1560, 1489, 1425, 1365, 1311,
  24   1260, 1214, 1170, 1130, 1092, 1057, 1024, 993, 964, 936, 910, 886, 862, 840,
  25   819, 799, 780, 762, 745, 728, 712, 697, 683, 669, 655, 643, 630, 618, 607,
  26   596, 585, 575, 565, 555, 546, 537, 529, 520, 512, 504, 496, 489, 482, 475,
  27   468, 462, 455, 449, 443, 437, 431, 426, 420, 415, 410, 405, 400, 395, 390,
  28   386, 381, 377, 372, 368, 364, 360, 356, 352, 349, 345, 341, 338, 334, 331,
  29   328, 324, 321, 318, 315, 312, 309, 306, 303, 301, 298, 295, 293, 290, 287,
  30   285, 282, 280, 278, 275, 273, 271, 269, 266, 264, 262, 260, 258, 256, 254,
  31   252, 250, 248, 246, 245, 243, 241, 239, 237, 236, 234, 232, 231, 229, 228,
  32   226, 224, 223, 221, 220, 218, 217, 216, 214, 213, 211, 210, 209, 207, 206,
  33   205, 204, 202, 201, 200, 199, 197, 196, 195, 194, 193, 192, 191, 189, 188,
  34   187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174, 173,
  35   172, 172, 171, 170, 169, 168, 167, 166, 165, 165, 164, 163
  36 };
  37
  38 const int16_t WebRtcNsx_kLogTableFrac[256] = {
  39   0, 1, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 21,
  40   22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 38, 40, 41, 42,
  41   44, 45, 46, 47, 49, 50, 51, 52, 54, 55, 56, 57, 59, 60, 61, 62,
  42   63, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81,
  43   82, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99,
  44   100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 116,
  45   117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
  46   132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
  47   147, 148, 149, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160,
  48   161, 162, 163, 164, 165, 166, 167, 168, 169, 169, 170, 171, 172, 173, 174,
  49   175, 176, 177, 178, 178, 179, 180, 181, 182, 183, 184, 185, 185, 186, 187,
  50   188, 189, 190, 191, 192, 192, 193, 194, 195, 196, 197, 198, 198, 199, 200,
  51   201, 202, 203, 203, 204, 205, 206, 207, 208, 208, 209, 210, 211, 212, 212,
  52   213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 223, 224, 224,
  53   225, 226, 227, 228, 228, 229, 230, 231, 231, 232, 233, 234, 234, 235, 236,
  54   237, 238, 238, 239, 240, 241, 241, 242, 243, 244, 244, 245, 246, 247, 247,
  55   248, 249, 249, 250, 251, 252, 252, 253, 254, 255, 255
  56 };
  57
  58 // Update the noise estimation information.
  59 static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) {
  60   const int16_t kExp2Const = 11819; // Q13
  61   int16_t* ptr_noiseEstLogQuantile = NULL;
  62   int16_t* ptr_noiseEstQuantile = NULL;
  63   int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
  64   int32x4_t twentyOne32x4 = vdupq_n_s32(21);
  65   int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
  66   int32x4_t constB32x4 = vdupq_n_s32(0x200000);
  67
  68   int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
  69                                         inst->magnLen);
  70
  71   // Guarantee a Q-domain as high as possible and still fit in int16
  72   inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
  73                                                                  tmp16,
  74                                                                  21);
  75
  76   int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);
  77
  78   for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
  79        ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
  80        ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
  81        ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {
  82
  83     // tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
  84     //                                inst->noiseEstLogQuantile[offset + i]);
  85     int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
  86     int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);
  87
  88     // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
  89     int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
  90     v32x4A = vorrq_s32(v32x4A, constB32x4);
  91
  92     // tmp16 = (int16_t)(tmp32no2 >> 21);
  93     v32x4B = vshrq_n_s32(v32x4B, 21);
  94
  95     // tmp16 -= 21;// shift 21 to get result in Q0
  96     v32x4B = vsubq_s32(v32x4B, twentyOne32x4);
  97
  98     // tmp16 += (int16_t) inst->qNoise;
  99     // shift to get result in Q(qNoise)
 100     v32x4B = vaddq_s32(v32x4B, qNoise32x4);
 101
 102     // if (tmp16 < 0) {
 103     //   tmp32no1 >>= -tmp16;
 104     // } else {
 105     //   tmp32no1 <<= tmp16;
 106     // }
 107     v32x4B = vshlq_s32(v32x4A, v32x4B);
 108
 109     // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
 110     v16x4 = vqmovn_s32(v32x4B);
 111
 112     //inst->noiseEstQuantile[i] = tmp16;
 113     vst1_s16(ptr_noiseEstQuantile, v16x4);
 114   }
 115
 116   // Last iteration:
 117
 118   // inst->quantile[i]=exp(inst->lquantile[offset+i]);
 119   // in Q21
 120   int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
 121                                           *ptr_noiseEstLogQuantile);
 122   int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
 123
 124   tmp16 = (int16_t)(tmp32no2 >> 21);
 125   tmp16 -= 21;// shift 21 to get result in Q0
 126   tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
 127   if (tmp16 < 0) {
 128     tmp32no1 >>= -tmp16;
 129   } else {
 130     tmp32no1 <<= tmp16;
 131   }
 132   *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1);
 133 }
 134
 135 // Noise Estimation
 136 void WebRtcNsx_NoiseEstimationNeon(NsxInst_t* inst,
 137                                    uint16_t* magn,
 138                                    uint32_t* noise,
 139                                    int16_t* q_noise) {
 140   int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
 141   int16_t countProd, delta, zeros, frac;
 142   int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
 143   const int16_t log2_const = 22713;
 144   const int16_t width_factor = 21845;
 145
 146   int i, s, offset;
 147
 148   tabind = inst->stages - inst->normData;
 149   assert(tabind < 9);
 150   assert(tabind > -9);
 151   if (tabind < 0) {
 152     logval = -WebRtcNsx_kLogTable[-tabind];
 153   } else {
 154     logval = WebRtcNsx_kLogTable[tabind];
 155   }
 156
 157   int16x8_t logval_16x8 = vdupq_n_s16(logval);
 158
 159   // lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
 160   // magn is in Q(-stages), and the real lmagn values are:
 161   // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
 162   // lmagn in Q8
 163   for (i = 0; i < inst->magnLen; i++) {
 164     if (magn[i]) {
 165       zeros = WebRtcSpl_NormU32((uint32_t)magn[i]);
 166       frac = (int16_t)((((uint32_t)magn[i] << zeros)
 167                         & 0x7FFFFFFF) >> 23);
 168       assert(frac < 256);
 169       // log2(magn(i))
 170       log2 = (int16_t)(((31 - zeros) << 8)
 171                        + WebRtcNsx_kLogTableFrac[frac]);
 172       // log2(magn(i))*log(2)
 173       lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
 174       // + log(2^stages)
 175       lmagn[i] += logval;
 176     } else {
 177       lmagn[i] = logval;
 178     }
 179   }
 180
 181   int16x4_t Q3_16x4  = vdup_n_s16(3);
 182   int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8);
 183   int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor);
 184
 185   int16_t factor = FACTOR_Q7;
 186   if (inst->blockIndex < END_STARTUP_LONG)
 187     factor = FACTOR_Q7_STARTUP;
 188
 189   // Loop over simultaneous estimates
 190   for (s = 0; s < SIMULT; s++) {
 191     offset = s * inst->magnLen;
 192
 193     // Get counter values from state
 194     counter = inst->noiseEstCounter[s];
 195     assert(counter < 201);
 196     countDiv = WebRtcNsx_kCounterDiv[counter];
 197     countProd = (int16_t)WEBRTC_SPL_MUL_16_16(counter, countDiv);
 198
 199     // quant_est(...)
 200     int16_t deltaBuff[8];
 201     int16x4_t tmp16x4_0;
 202     int16x4_t tmp16x4_1;
 203     int16x4_t countDiv_16x4 = vdup_n_s16(countDiv);
 204     int16x8_t countProd_16x8 = vdupq_n_s16(countProd);
 205     int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv);
 206     int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0);
 207     int16x8_t tmp16x8_1;
 208     int16x8_t tmp16x8_2;
 209     int16x8_t tmp16x8_3;
 210     // Initialize tmp16x8_4 to zero to avoid compilaton error.
 211     int16x8_t tmp16x8_4 = vdupq_n_s16(0);
 212     int16x8_t tmp16x8_5;
 213     int32x4_t tmp32x4;
 214
 215     for (i = 0; i < inst->magnLen - 7; i += 8) {
 216       // Compute delta.
 217       // Smaller step size during startup. This prevents from using
 218       // unrealistic values causing overflow.
 219       tmp16x8_0 = vdupq_n_s16(factor);
 220       vst1q_s16(deltaBuff, tmp16x8_0);
 221
 222       int j;
 223       for (j = 0; j < 8; j++) {
 224         if (inst->noiseEstDensity[offset + i + j] > 512) {
 225           // Get values for deltaBuff by shifting intead of dividing.
 226           int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i + j]);
 227           deltaBuff[j] = (int16_t)(FACTOR_Q16 >> (14 - factor));
 228         }
 229       }
 230
 231       // Update log quantile estimate
 232
 233       // tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
 234       tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4);
 235       tmp16x4_1 = vshrn_n_s32(tmp32x4, 14);
 236       tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4);
 237       tmp16x4_0 = vshrn_n_s32(tmp32x4, 14);
 238       tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines.
 239
 240       // prepare for the "if" branch
 241       // tmp16 += 2;
 242       // tmp16_1 = (Word16)(tmp16>>2);
 243       tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2);
 244
 245       // inst->noiseEstLogQuantile[offset+i] + tmp16_1;
 246       tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep
 247       tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines
 248
 249       // Prepare for the "else" branch
 250       // tmp16 += 1;
 251       // tmp16_1 = (Word16)(tmp16>>1);
 252       tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1);
 253
 254       // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
 255       tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4);
 256       tmp16x4_1 = vshrn_n_s32(tmp32x4, 1);
 257
 258       // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
 259       tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4);
 260       tmp16x4_0 = vshrn_n_s32(tmp32x4, 1);
 261
 262       // inst->noiseEstLogQuantile[offset + i] - tmp16_2;
 263       tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep
 264       tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0);
 265
 266       // logval is the smallest fixed point representation we can have. Values
 267       // below that will correspond to values in the interval [0, 1], which
 268       // can't possibly occur.
 269       tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8);
 270
 271       // Do the if-else branches:
 272       tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines
 273       tmp16x8_5 = vsubq_s16(tmp16x8_3, tmp16x8_2);
 274       __asm__("vcgt.s16 %q0, %q1, #0"::"w"(tmp16x8_4), "w"(tmp16x8_5));
 275       __asm__("vbit %q0, %q1, %q2"::
 276               "w"(tmp16x8_2), "w"(tmp16x8_1), "w"(tmp16x8_4));
 277       __asm__("vbif %q0, %q1, %q2"::
 278               "w"(tmp16x8_2), "w"(tmp16x8_0), "w"(tmp16x8_4));
 279       vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2);
 280
 281       // Update density estimate
 282       // tmp16_1 + tmp16_2
 283       tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]);
 284       tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8);
 285       tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8);
 286
 287       // lmagn[i] - inst->noiseEstLogQuantile[offset + i]
 288       tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2);
 289       tmp16x8_3 = vabsq_s16(tmp16x8_3);
 290       tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3);
 291       __asm__("vbit %q0, %q1, %q2"::
 292               "w"(tmp16x8_1), "w"(tmp16x8_0), "w"(tmp16x8_4));
 293       vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1);
 294     }  // End loop over magnitude spectrum
 295
 296     // Last iteration over magnitude spectrum:
 297     // compute delta
 298     if (inst->noiseEstDensity[offset + i] > 512) {
 299       // Get values for deltaBuff by shifting intead of dividing.
 300       int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i]);
 301       delta = (int16_t)(FACTOR_Q16 >> (14 - factor));
 302     } else {
 303       delta = FACTOR_Q7;
 304       if (inst->blockIndex < END_STARTUP_LONG) {
 305         // Smaller step size during startup. This prevents from using
 306         // unrealistic values causing overflow.
 307         delta = FACTOR_Q7_STARTUP;
 308       }
 309     }
 310     // update log quantile estimate
 311     tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
 312     if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
 313       // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
 314       // CounterDiv=1/(inst->counter[s]+1) in Q15
 315       tmp16 += 2;
 316       inst->noiseEstLogQuantile[offset + i] += tmp16 / 4;
 317     } else {
 318       tmp16 += 1;
 319       // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
 320       tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16 / 2, 3, 1);
 321       inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
 322       if (inst->noiseEstLogQuantile[offset + i] < logval) {
 323         // logval is the smallest fixed point representation we can have.
 324         // Values below that will correspond to values in the interval
 325         // [0, 1], which can't possibly occur.
 326         inst->noiseEstLogQuantile[offset + i] = logval;
 327       }
 328     }
 329
 330     // update density estimate
 331     if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
 332         < WIDTH_Q8) {
 333       tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
 334                    inst->noiseEstDensity[offset + i], countProd, 15);
 335       tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
 336                    width_factor, countDiv, 15);
 337       inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
 338     }
 339
 340
 341     if (counter >= END_STARTUP_LONG) {
 342       inst->noiseEstCounter[s] = 0;
 343       if (inst->blockIndex >= END_STARTUP_LONG) {
 344         UpdateNoiseEstimateNeon(inst, offset);
 345       }
 346     }
 347     inst->noiseEstCounter[s]++;
 348
 349   }  // end loop over simultaneous estimates
 350
 351   // Sequentially update the noise during startup
 352   if (inst->blockIndex < END_STARTUP_LONG) {
 353     UpdateNoiseEstimateNeon(inst, offset);
 354   }
 355
 356   for (i = 0; i < inst->magnLen; i++) {
 357     noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise)
 358   }
 359   (*q_noise) = (int16_t)inst->qNoise;
 360 }
 361
 362 // Filter the data in the frequency domain, and create spectrum.
 363 void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
 364   assert(inst->magnLen % 8 == 1);
 365   assert(inst->anaLen2 % 16 == 0);
 366
 367   // (1) Filtering.
 368
 369   // Fixed point C code for the next block is as follows:
 370   // for (i = 0; i < inst->magnLen; i++) {
 371   //   inst->real[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
 372   //      (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
 373   //   inst->imag[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
 374   //      (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
 375   // }
 376
 377   int16_t* preal = &inst->real[0];
 378   int16_t* pimag = &inst->imag[0];
 379   int16_t* pns_filter = (int16_t*)&inst->noiseSupFilter[0];
 380   int16_t* pimag_end = pimag + inst->magnLen - 4;
 381
 382   while (pimag < pimag_end) {
 383     int16x8_t real = vld1q_s16(preal);
 384     int16x8_t imag = vld1q_s16(pimag);
 385     int16x8_t ns_filter = vld1q_s16(pns_filter);
 386
 387     int32x4_t tmp_r_0 = vmull_s16(vget_low_s16(real), vget_low_s16(ns_filter));
 388     int32x4_t tmp_i_0 = vmull_s16(vget_low_s16(imag), vget_low_s16(ns_filter));
 389     int32x4_t tmp_r_1 = vmull_s16(vget_high_s16(real),
 390                                   vget_high_s16(ns_filter));
 391     int32x4_t tmp_i_1 = vmull_s16(vget_high_s16(imag),
 392                                   vget_high_s16(ns_filter));
 393
 394     int16x4_t result_r_0 = vshrn_n_s32(tmp_r_0, 14);
 395     int16x4_t result_i_0 = vshrn_n_s32(tmp_i_0, 14);
 396     int16x4_t result_r_1 = vshrn_n_s32(tmp_r_1, 14);
 397     int16x4_t result_i_1 = vshrn_n_s32(tmp_i_1, 14);
 398
 399     vst1q_s16(preal, vcombine_s16(result_r_0, result_r_1));
 400     vst1q_s16(pimag, vcombine_s16(result_i_0, result_i_1));
 401     preal += 8;
 402     pimag += 8;
 403     pns_filter += 8;
 404   }
 405
 406   // Filter the last element
 407   *preal = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*preal, *pns_filter, 14);
 408   *pimag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*pimag, *pns_filter, 14);
 409
 410   // (2) Create spectrum.
 411
 412   // Fixed point C code for the rest of the function is as follows:
 413   // freq_buf[0] = inst->real[0];
 414   // freq_buf[1] = -inst->imag[0];
 415   // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
 416   //   freq_buf[j] = inst->real[i];
 417   //   freq_buf[j + 1] = -inst->imag[i];
 418   // }
 419   // freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
 420   // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
 421
 422   preal = &inst->real[0];
 423   pimag = &inst->imag[0];
 424   pimag_end = pimag + inst->anaLen2;
 425   int16_t * freq_buf_start = freq_buf;
 426   while (pimag < pimag_end) {
 427     // loop unroll
 428     int16x8x2_t real_imag_0;
 429     int16x8x2_t real_imag_1;
 430     real_imag_0.val[1] = vld1q_s16(pimag);
 431     real_imag_0.val[0] = vld1q_s16(preal);
 432     preal += 8;
 433     pimag += 8;
 434     real_imag_1.val[1] = vld1q_s16(pimag);
 435     real_imag_1.val[0] = vld1q_s16(preal);
 436     preal += 8;
 437     pimag += 8;
 438
 439     real_imag_0.val[1] = vnegq_s16(real_imag_0.val[1]);
 440     real_imag_1.val[1] = vnegq_s16(real_imag_1.val[1]);
 441     vst2q_s16(freq_buf_start, real_imag_0);
 442     freq_buf_start += 16;
 443     vst2q_s16(freq_buf_start, real_imag_1);
 444     freq_buf_start += 16;
 445   }
 446   freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
 447   freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
 448 }
 449
 450 // Denormalize the input buffer.
 451 void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) {
 452   int16_t* ptr_real = &inst->real[0];
 453   int16_t* ptr_in = &in[0];
 454
 455   __asm__ __volatile__("vdup.32 q10, %0" ::
 456                        "r"((int32_t)(factor - inst->normData)) : "q10");
 457   for (; ptr_real < &inst->real[inst->anaLen];) {
 458
 459     // Loop unrolled once. Both pointers are incremented.
 460     __asm__ __volatile__(
 461       // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
 462       //                             factor - inst->normData);
 463       "vld2.16 {d24, d25}, [%[ptr_in]]!\n\t"
 464       "vmovl.s16 q12, d24\n\t"
 465       "vshl.s32 q12, q10\n\t"
 466       // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
 467       "vqmovn.s32 d24, q12\n\t"
 468       "vst1.16 d24, [%[ptr_real]]!\n\t"
 469
 470       // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
 471       //                             factor - inst->normData);
 472       "vld2.16 {d22, d23}, [%[ptr_in]]!\n\t"
 473       "vmovl.s16 q11, d22\n\t"
 474       "vshl.s32 q11, q10\n\t"
 475       // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
 476       "vqmovn.s32 d22, q11\n\t"
 477       "vst1.16 d22, [%[ptr_real]]!\n\t"
 478
 479       // Specify constraints.
 480       :[ptr_in]"+r"(ptr_in),
 481        [ptr_real]"+r"(ptr_real)
 482       :
 483       :"d22", "d23", "d24", "d25"
 484     );
 485   }
 486 }
 487
 488 // For the noise supress process, synthesis, read out fully processed segment,
 489 // and update synthesis buffer.
 490 void WebRtcNsx_SynthesisUpdateNeon(NsxInst_t* inst,
 491                                    int16_t* out_frame,
 492                                    int16_t gain_factor) {
 493   assert(inst->anaLen % 16 == 0);
 494   assert(inst->blockLen10ms % 16 == 0);
 495
 496   int16_t* preal_start = inst->real;
 497   const int16_t* pwindow = inst->window;
 498   int16_t* preal_end = preal_start + inst->anaLen;
 499   int16_t* psynthesis_buffer = inst->synthesisBuffer;
 500
 501   while (preal_start < preal_end) {
 502     // Loop unroll.
 503     int16x8_t window_0 = vld1q_s16(pwindow);
 504     int16x8_t real_0 = vld1q_s16(preal_start);
 505     int16x8_t synthesis_buffer_0 = vld1q_s16(psynthesis_buffer);
 506
 507     int16x8_t window_1 = vld1q_s16(pwindow + 8);
 508     int16x8_t real_1 = vld1q_s16(preal_start + 8);
 509     int16x8_t synthesis_buffer_1 = vld1q_s16(psynthesis_buffer + 8);
 510
 511     int32x4_t tmp32a_0_low = vmull_s16(vget_low_s16(real_0),
 512                                        vget_low_s16(window_0));
 513     int32x4_t tmp32a_0_high = vmull_s16(vget_high_s16(real_0),
 514                                         vget_high_s16(window_0));
 515
 516     int32x4_t tmp32a_1_low = vmull_s16(vget_low_s16(real_1),
 517                                        vget_low_s16(window_1));
 518     int32x4_t tmp32a_1_high = vmull_s16(vget_high_s16(real_1),
 519                                         vget_high_s16(window_1));
 520
 521     int16x4_t tmp16a_0_low = vqrshrn_n_s32(tmp32a_0_low, 14);
 522     int16x4_t tmp16a_0_high = vqrshrn_n_s32(tmp32a_0_high, 14);
 523
 524     int16x4_t tmp16a_1_low = vqrshrn_n_s32(tmp32a_1_low, 14);
 525     int16x4_t tmp16a_1_high = vqrshrn_n_s32(tmp32a_1_high, 14);
 526
 527     int32x4_t tmp32b_0_low = vmull_n_s16(tmp16a_0_low, gain_factor);
 528     int32x4_t tmp32b_0_high = vmull_n_s16(tmp16a_0_high, gain_factor);
 529
 530     int32x4_t tmp32b_1_low = vmull_n_s16(tmp16a_1_low, gain_factor);
 531     int32x4_t tmp32b_1_high = vmull_n_s16(tmp16a_1_high, gain_factor);
 532
 533     int16x4_t tmp16b_0_low = vqrshrn_n_s32(tmp32b_0_low, 13);
 534     int16x4_t tmp16b_0_high = vqrshrn_n_s32(tmp32b_0_high, 13);
 535
 536     int16x4_t tmp16b_1_low = vqrshrn_n_s32(tmp32b_1_low, 13);
 537     int16x4_t tmp16b_1_high = vqrshrn_n_s32(tmp32b_1_high, 13);
 538
 539     synthesis_buffer_0 = vqaddq_s16(vcombine_s16(tmp16b_0_low, tmp16b_0_high),
 540                                     synthesis_buffer_0);
 541     synthesis_buffer_1 = vqaddq_s16(vcombine_s16(tmp16b_1_low, tmp16b_1_high),
 542                                     synthesis_buffer_1);
 543     vst1q_s16(psynthesis_buffer, synthesis_buffer_0);
 544     vst1q_s16(psynthesis_buffer + 8, synthesis_buffer_1);
 545
 546     pwindow += 16;
 547     preal_start += 16;
 548     psynthesis_buffer += 16;
 549   }
 550
 551   // Read out fully processed segment.
 552   int16_t * p_start = inst->synthesisBuffer;
 553   int16_t * p_end = inst->synthesisBuffer + inst->blockLen10ms;
 554   int16_t * p_frame = out_frame;
 555   while (p_start < p_end) {
 556     int16x8_t frame_0 = vld1q_s16(p_start);
 557     vst1q_s16(p_frame, frame_0);
 558     p_start += 8;
 559     p_frame += 8;
 560   }
 561
 562   // Update synthesis buffer.
 563   int16_t* p_start_src = inst->synthesisBuffer + inst->blockLen10ms;
 564   int16_t* p_end_src = inst->synthesisBuffer + inst->anaLen;
 565   int16_t* p_start_dst = inst->synthesisBuffer;
 566   while (p_start_src < p_end_src) {
 567     int16x8_t frame = vld1q_s16(p_start_src);
 568     vst1q_s16(p_start_dst, frame);
 569     p_start_src += 8;
 570     p_start_dst += 8;
 571   }
 572
 573   p_start = inst->synthesisBuffer + inst->anaLen - inst->blockLen10ms;
 574   p_end = p_start + inst->blockLen10ms;
 575   int16x8_t zero = vdupq_n_s16(0);
 576   for (;p_start < p_end; p_start += 8) {
 577     vst1q_s16(p_start, zero);
 578   }
 579 }
 580
 581 // Update analysis buffer for lower band, and window data before FFT.
 582 void WebRtcNsx_AnalysisUpdateNeon(NsxInst_t* inst,
 583                                   int16_t* out,
 584                                   int16_t* new_speech) {
 585   assert(inst->blockLen10ms % 16 == 0);
 586   assert(inst->anaLen % 16 == 0);
 587
 588   // For lower band update analysis buffer.
 589   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
 590   //                      inst->analysisBuffer + inst->blockLen10ms,
 591   //                      inst->anaLen - inst->blockLen10ms);
 592   int16_t* p_start_src = inst->analysisBuffer + inst->blockLen10ms;
 593   int16_t* p_end_src = inst->analysisBuffer + inst->anaLen;
 594   int16_t* p_start_dst = inst->analysisBuffer;
 595   while (p_start_src < p_end_src) {
 596     int16x8_t frame = vld1q_s16(p_start_src);
 597     vst1q_s16(p_start_dst, frame);
 598
 599     p_start_src += 8;
 600     p_start_dst += 8;
 601   }
 602
 603   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
 604   //    + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
 605   p_start_src = new_speech;
 606   p_end_src = new_speech + inst->blockLen10ms;
 607   p_start_dst = inst->analysisBuffer + inst->anaLen - inst->blockLen10ms;
 608   while (p_start_src < p_end_src) {
 609     int16x8_t frame = vld1q_s16(p_start_src);
 610     vst1q_s16(p_start_dst, frame);
 611
 612     p_start_src += 8;
 613     p_start_dst += 8;
 614   }
 615
 616   // Window data before FFT.
 617   int16_t* p_start_window = (int16_t*) inst->window;
 618   int16_t* p_start_buffer = inst->analysisBuffer;
 619   int16_t* p_start_out = out;
 620   const int16_t* p_end_out = out + inst->anaLen;
 621
 622   // Load the first element to reduce pipeline bubble.
 623   int16x8_t window = vld1q_s16(p_start_window);
 624   int16x8_t buffer = vld1q_s16(p_start_buffer);
 625   p_start_window += 8;
 626   p_start_buffer += 8;
 627
 628   while (p_start_out < p_end_out) {
 629     // Unroll loop.
 630     int32x4_t tmp32_low = vmull_s16(vget_low_s16(window), vget_low_s16(buffer));
 631     int32x4_t tmp32_high = vmull_s16(vget_high_s16(window),
 632                                      vget_high_s16(buffer));
 633     window = vld1q_s16(p_start_window);
 634     buffer = vld1q_s16(p_start_buffer);
 635
 636     int16x4_t result_low = vrshrn_n_s32(tmp32_low, 14);
 637     int16x4_t result_high = vrshrn_n_s32(tmp32_high, 14);
 638     vst1q_s16(p_start_out, vcombine_s16(result_low, result_high));
 639
 640     p_start_buffer += 8;
 641     p_start_window += 8;
 642     p_start_out += 8;
 643   }
 644 }
 645
 646 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with
 647 // zeros, and normalize it.
 648 void WebRtcNsx_CreateComplexBufferNeon(NsxInst_t* inst,
 649                                        int16_t* in,
 650                                        int16_t* out) {
 651   int16_t* ptr_out = &out[0];
 652   int16_t* ptr_in = &in[0];
 653
 654   __asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
 655   __asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
 656   for (; ptr_in < &in[inst->anaLen];) {
 657
 658     // Loop unrolled once, so ptr_in is incremented by 8 twice,
 659     // and ptr_out is incremented by 8 four times.
 660     __asm__ __volatile__(
 661       // out[j] = in[i] << inst->normData;  // Q(normData)
 662       "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
 663       "vshl.s16 q11, q10\n\t"
 664       "vmov d24, d23\n\t"
 665
 666       // out[j + 1] = 0; // Insert zeros in imaginary part
 667       "vmov d23, d25\n\t"
 668       "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
 669       "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
 670
 671       // out[j] = in[i] << inst->normData;  // Q(normData)
 672       "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
 673       "vshl.s16 q11, q10\n\t"
 674       "vmov d24, d23\n\t"
 675
 676       // out[j + 1] = 0; // Insert zeros in imaginary part
 677       "vmov d23, d25\n\t"
 678       "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
 679       "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
 680
 681       // Specify constraints.
 682       :[ptr_in]"+r"(ptr_in),
 683        [ptr_out]"+r"(ptr_out)
 684       :
 685       :"d22", "d23", "d24", "d25", "q10", "q11"
 686     );
 687   }
 688 }