src/third_party/webrtc/modules/audio_processing/aecm/aecm_core_mips.c

   1 /*
   2  *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "webrtc/modules/audio_processing/aecm/aecm_core.h"
  12
  13 #include <assert.h>
  14
  15 #include "webrtc/modules/audio_processing/aecm/include/echo_control_mobile.h"
  16 #include "webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h"
  17
  18 static const ALIGN8_BEG int16_t WebRtcAecm_kSqrtHanning[] ALIGN8_END = {
       // 65-entry window table rising from 0 to 16384. The assembly in this
       // file reads it forwards from entry 0 and backwards from byte offset
       // 128 (entry 64). InverseFFTAndWindow shifts the products right by 14,
       // so 16384 acts as unity gain there.
       // NOTE(review): by name this is a square-root Hanning half window in
       // Q14 -- confirm against the generic (non-MIPS) implementation.
  19   0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
  20   3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224,
  21   6591, 6954, 7313, 7668, 8019, 8364, 8705, 9040,
  22   9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514,
  23   11795, 12068, 12335, 12594, 12845, 13089, 13325, 13553,
  24   13773, 13985, 14189, 14384, 14571, 14749, 14918, 15079,
  25   15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034,
  26   16111, 16179, 16237, 16286, 16325, 16354, 16373, 16384
  27 };
  28
  29 static const int16_t kNoiseEstQDomain = 15;
     // NOTE(review): neither constant is referenced in the code shown here;
     // by name they look like comfort-noise estimator tuning values --
     // confirm and document at their point of use.
  30 static const int16_t kNoiseEstIncCount = 5;
  31
  32 static int16_t coefTable[] = {
       // Store offsets used by WindowAndFFT, in BYTES relative to the start of
       // the fft buffer. Entries are consumed in pairs: for loop index k,
       // entry 2k is where the windowed time_signal[k] is written and entry
       // 2k + 1 is where the windowed time_signal[k + 64] is written.
       // Every offset is a multiple of 4 in the range 0..508, so only every
       // other int16 slot of fft is written.
       // NOTE(review): the ordering looks like a bit-reversed permutation,
       // presumably so the FFT can skip its own reordering pass -- confirm
       // against the MIPS WebRtcSpl_ComplexFFT.
  33    0,   4, 256, 260, 128, 132, 384, 388,
  34   64,  68, 320, 324, 192, 196, 448, 452,
  35   32,  36, 288, 292, 160, 164, 416, 420,
  36   96, 100, 352, 356, 224, 228, 480, 484,
  37   16,  20, 272, 276, 144, 148, 400, 404,
  38   80,  84, 336, 340, 208, 212, 464, 468,
  39   48,  52, 304, 308, 176, 180, 432, 436,
  40  112, 116, 368, 372, 240, 244, 496, 500,
  41    8,  12, 264, 268, 136, 140, 392, 396,
  42   72,  76, 328, 332, 200, 204, 456, 460,
  43   40,  44, 296, 300, 168, 172, 424, 428,
  44  104, 108, 360, 364, 232, 236, 488, 492,
  45   24,  28, 280, 284, 152, 156, 408, 412,
  46   88,  92, 344, 348, 216, 220, 472, 476,
  47   56,  60, 312, 316, 184, 188, 440, 444,
  48  120, 124, 376, 380, 248, 252, 504, 508
  49 };
  50
  51 static int16_t coefTable_ifft[] = {
       // Store offsets used by InverseFFTAndWindow, in BYTES relative to the
       // start of the fft buffer. Entries are consumed in pairs: for input
       // bin k, (real, -imag) of efw[k] is written at entry 2k and
       // (real, imag) of efw[k] is written at entry 2k + 1.
       // The first pair's second offset is 512 bytes, i.e. int16 slots 256
       // and 257 of fft.
       // NOTE(review): callers must therefore pass an fft buffer that extends
       // at least that far -- confirm at the call sites.
  52     0, 512, 256, 508, 128, 252, 384, 380,
  53    64, 124, 320, 444, 192, 188, 448, 316,
  54    32,  60, 288, 476, 160, 220, 416, 348,
  55    96,  92, 352, 412, 224, 156, 480, 284,
  56    16,  28, 272, 492, 144, 236, 400, 364,
  57    80, 108, 336, 428, 208, 172, 464, 300,
  58    48,  44, 304, 460, 176, 204, 432, 332,
  59   112,  76, 368, 396, 240, 140, 496, 268,
  60     8,  12, 264, 500, 136, 244, 392, 372,
  61    72, 116, 328, 436, 200, 180, 456, 308,
  62    40,  52, 296, 468, 168, 212, 424, 340,
  63   104,  84, 360, 404, 232, 148, 488, 276,
  64    24,  20, 280, 484, 152, 228, 408, 356,
  65    88, 100, 344, 420, 216, 164, 472, 292,
  66    56,  36, 312, 452, 184, 196, 440, 324,
  67   120,  68, 376, 388, 248, 132, 504, 260
  68 };
  69
  70 static void ComfortNoise(AecmCore_t* aecm,
  71                          const uint16_t* dfa,
  72                          complex16_t* out,
  73                          const int16_t* lambda);  // Forward declaration; body not shown here.
  74
  75 static void WindowAndFFT(AecmCore_t* aecm,
  76                          int16_t* fft,
  77                          const int16_t* time_signal,
  78                          complex16_t* freq_signal,
  79                          int time_signal_scaling) {
       // Windows 128 samples of time_signal with WebRtcAecm_kSqrtHanning,
       // scatters the scaled products into fft at the byte offsets listed in
       // coefTable, runs WebRtcSpl_ComplexFFT on fft, and copies the first 128
       // int16 values of the result to freq_signal with every second value
       // (the imaginary part) negated.
       //
       //   aecm                - not referenced by this function.
       //   fft                 - int16_t work buffer; its first PART_LEN4
       //                         entries are zeroed before use.
       //   time_signal         - input samples; entries 0..127 are read.
       //   freq_signal         - output; 128 consecutive int16 values are
       //                         written (assumes complex16_t is a pair of
       //                         int16 {real, imag} -- TODO confirm).
       //   time_signal_scaling - each product is shifted by
       //                         (time_signal_scaling - 14): left when that is
       //                         >= 0, arithmetically right otherwise.
  80   int i, j;
  81   int32_t tmp1, tmp2, tmp3, tmp4;
  82   int16_t* pfrfi;
  83   complex16_t* pfreq_signal;
  84   int16_t  f_coef, s_coef;
  85   int32_t load_ptr, store_ptr1, store_ptr2, shift, shift1;
  86   int32_t hann, hann1, coefs;
  87
       // Clear the work buffer: the loop below only writes the slots named by
       // coefTable, every other slot must read as zero.
  88   memset(fft, 0, sizeof(int16_t) * PART_LEN4);
  89
  90   // FFT of signal
  91   __asm __volatile (
  92     ".set        push                                                    \n\t"
  93     ".set        noreorder                                               \n\t"
         // With noreorder the assembler does not fill delay slots: the
         // instruction written (indented by one space) after each branch is
         // its delay slot and executes whether or not the branch is taken.
         // Setup: i = 64 iterations, hann walks the window forwards from
         // entry 0, hann1 walks it backwards from byte offset 128.
  94     "addiu       %[shift],          %[time_signal_scaling], -14          \n\t"
  95     "addiu       %[i],              $zero,                  64           \n\t"
  96     "addiu       %[load_ptr],       %[time_signal],         0            \n\t"
  97     "addiu       %[hann],           %[hanning],             0            \n\t"
  98     "addiu       %[hann1],          %[hanning],             128          \n\t"
  99     "addiu       %[coefs],          %[coefTable],           0            \n\t"
         // Pick the loop by the sign of shift; shift1 = -shift is computed in
         // the delay slot on both paths.
 100     "bltz        %[shift],          2f                                   \n\t"
 101     " negu       %[shift1],         %[shift]                             \n\t"
         // Loop 1 (shift >= 0): per iteration k, multiply time_signal[k] by
         // the forward window entry and time_signal[k + 64] by the backward
         // window entry, shift both products left by shift, and store them as
         // int16 at fft + coefTable[2k] and fft + coefTable[2k + 1] (bytes).
 102    "1:                                                                   \n\t"
 103     "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
 104     "lh          %[tmp2],           0(%[hann])                           \n\t"
 105     "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
 106     "lh          %[tmp4],           0(%[hann1])                          \n\t"
 107     "addiu       %[i],              %[i],                   -1           \n\t"
 108     "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
 109     "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
 110     "lh          %[f_coef],         0(%[coefs])                          \n\t"
 111     "lh          %[s_coef],         2(%[coefs])                          \n\t"
 112     "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
 113     "addiu       %[hann],           %[hann],                2            \n\t"
 114     "addiu       %[hann1],          %[hann1],               -2           \n\t"
 115     "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
 116     "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
 117     "sllv        %[tmp1],           %[tmp1],                %[shift]     \n\t"
 118     "sllv        %[tmp3],           %[tmp3],                %[shift]     \n\t"
 119     "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
 120     "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
 121     "bgtz        %[i],              1b                                   \n\t"
 122     " addiu      %[coefs],          %[coefs],               4            \n\t"
 123     "b           3f                                                      \n\t"
 124     " nop                                                                \n\t"
         // Loop 2 (shift < 0): identical to loop 1 except that the products
         // are scaled with an arithmetic right shift by shift1 (= -shift).
 125    "2:                                                                   \n\t"
 126     "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
 127     "lh          %[tmp2],           0(%[hann])                           \n\t"
 128     "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
 129     "lh          %[tmp4],           0(%[hann1])                          \n\t"
 130     "addiu       %[i],              %[i],                   -1           \n\t"
 131     "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
 132     "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
 133     "lh          %[f_coef],         0(%[coefs])                          \n\t"
 134     "lh          %[s_coef],         2(%[coefs])                          \n\t"
 135     "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
 136     "addiu       %[hann],           %[hann],                2            \n\t"
 137     "addiu       %[hann1],          %[hann1],               -2           \n\t"
 138     "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
 139     "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
 140     "srav        %[tmp1],           %[tmp1],                %[shift1]    \n\t"
 141     "srav        %[tmp3],           %[tmp3],                %[shift1]    \n\t"
 142     "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
 143     "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
 144     "bgtz        %[i],              2b                                   \n\t"
 145     " addiu      %[coefs],          %[coefs],               4            \n\t"
 146    "3:                                                                   \n\t"
 147     ".set        pop                                                     \n\t"
 148     : [load_ptr] "=&r" (load_ptr), [shift] "=&r" (shift), [hann] "=&r" (hann),
 149       [hann1] "=&r" (hann1), [shift1] "=&r" (shift1), [coefs] "=&r" (coefs),
 150       [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
 151       [tmp4] "=&r" (tmp4), [i] "=&r" (i), [f_coef] "=&r" (f_coef),
 152       [s_coef] "=&r" (s_coef), [store_ptr1] "=&r" (store_ptr1),
 153       [store_ptr2] "=&r" (store_ptr2)
 154     : [time_signal] "r" (time_signal), [coefTable] "r" (coefTable),
 155       [time_signal_scaling] "r" (time_signal_scaling),
 156       [hanning] "r" (WebRtcAecm_kSqrtHanning), [fft] "r" (fft)
 157     : "memory", "hi", "lo"
 158   );
 159
 160   WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
 161   pfrfi = fft;
 162   pfreq_signal = freq_signal;
 163
       // Copy the first 128 int16 values of fft to freq_signal, eight values
       // (four pairs) per iteration; j counts the values still to copy. The
       // second value of every pair is negated (0 - value) on the way.
 164   __asm __volatile (
 165     ".set        push                                                     \n\t"
 166     ".set        noreorder                                                \n\t"
 167     "addiu       %[j],              $zero,                 128            \n\t"
 168    "1:                                                                    \n\t"
 169     "lh          %[tmp1],           0(%[pfrfi])                           \n\t"
 170     "lh          %[tmp2],           2(%[pfrfi])                           \n\t"
 171     "lh          %[tmp3],           4(%[pfrfi])                           \n\t"
 172     "lh          %[tmp4],           6(%[pfrfi])                           \n\t"
 173     "subu        %[tmp2],           $zero,                 %[tmp2]        \n\t"
 174     "sh          %[tmp1],           0(%[pfreq_signal])                    \n\t"
 175     "sh          %[tmp2],           2(%[pfreq_signal])                    \n\t"
 176     "subu        %[tmp4],           $zero,                 %[tmp4]        \n\t"
 177     "sh          %[tmp3],           4(%[pfreq_signal])                    \n\t"
 178     "sh          %[tmp4],           6(%[pfreq_signal])                    \n\t"
 179     "lh          %[tmp1],           8(%[pfrfi])                           \n\t"
 180     "lh          %[tmp2],           10(%[pfrfi])                          \n\t"
 181     "lh          %[tmp3],           12(%[pfrfi])                          \n\t"
 182     "lh          %[tmp4],           14(%[pfrfi])                          \n\t"
 183     "addiu       %[j],              %[j],                  -8             \n\t"
 184     "subu        %[tmp2],           $zero,                 %[tmp2]        \n\t"
 185     "sh          %[tmp1],           8(%[pfreq_signal])                    \n\t"
 186     "sh          %[tmp2],           10(%[pfreq_signal])                   \n\t"
 187     "subu        %[tmp4],           $zero,                 %[tmp4]        \n\t"
 188     "sh          %[tmp3],           12(%[pfreq_signal])                   \n\t"
 189     "sh          %[tmp4],           14(%[pfreq_signal])                   \n\t"
 190     "addiu       %[pfreq_signal],   %[pfreq_signal],       16             \n\t"
 191     "bgtz        %[j],              1b                                    \n\t"
 192     " addiu      %[pfrfi],          %[pfrfi],              16             \n\t"
 193     ".set        pop                                                      \n\t"
 194     : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
 195       [j] "=&r" (j), [pfrfi] "+r" (pfrfi), [pfreq_signal] "+r" (pfreq_signal),
 196       [tmp4] "=&r" (tmp4)
 197     :
 198     : "memory"
 199   );
 200 }
 201
 202 static void InverseFFTAndWindow(AecmCore_t* aecm,
 203                                 int16_t* fft,
 204                                 complex16_t* efw,
 205                                 int16_t* output,
 206                                 const int16_t* nearendClean) {
       // Scatters efw into fft using coefTable_ifft, runs
       // WebRtcSpl_ComplexIFFT, then windows the result with
       // WebRtcAecm_kSqrtHanning, adds aecm->outBuf to the first 64 samples
       // to produce `output`, and stores the windowed next 64 samples back
       // into aecm->outBuf. Finally moves the second PART_LEN samples of
       // aecm->xBuf, aecm->dBufNoisy and (when nearendClean is non-NULL)
       // aecm->dBufClean to the front of those buffers.
       //
       //   aecm         - state; outBuf, dfaCleanQDomain, xBuf, dBufNoisy and
       //                  dBufClean are accessed.
       //   fft          - int16_t work buffer. The scatter loop writes up to
       //                  byte offset 512 + 2 (int16 slot 257).
       //                  NOTE(review): the buffer must be at least that
       //                  large -- confirm at the call sites.
       //   efw          - frequency-domain input; efw[0..63] are read by the
       //                  assembly and efw[PART_LEN] by the C code after it.
       //   output       - receives 64 int16 samples, saturated to int16 range.
       //   nearendClean - only compared against NULL here.
 207   int i, outCFFT;
 208   int32_t tmp1, tmp2, tmp3, tmp4, tmp_re, tmp_im;
 209   int16_t* pcoefTable_ifft = coefTable_ifft;
 210   int16_t* pfft = fft;
 211   int16_t* ppfft = fft;
 212   complex16_t* pefw = efw;
 213   int32_t out_aecm;
 214   int16_t* paecm_buf = aecm->outBuf;
 215   const int16_t* p_kSqrtHanning = WebRtcAecm_kSqrtHanning;
 216   const int16_t* pp_kSqrtHanning = &WebRtcAecm_kSqrtHanning[PART_LEN];
 217   int16_t* output1 = output;
 218
       // Stage 1: scatter. Four input bins per iteration, 64 bins in total.
       // For bin k, tmp1 = coefTable_ifft[2k] and tmp2 = coefTable_ifft[2k+1]
       // are byte offsets into fft: (re, im) is stored at fft + tmp2 and
       // (re, -im) at fft + tmp1. With noreorder, the indented instruction
       // after the branch is its delay slot and always executes.
 219   __asm __volatile (
 220     ".set      push                                                        \n\t"
 221     ".set      noreorder                                                   \n\t"
 222     "addiu     %[i],                $zero,                   64            \n\t"
 223    "1:                                                                     \n\t"
 224     "lh        %[tmp1],             0(%[pcoefTable_ifft])                  \n\t"
 225     "lh        %[tmp2],             2(%[pcoefTable_ifft])                  \n\t"
 226     "lh        %[tmp_re],           0(%[pefw])                             \n\t"
 227     "lh        %[tmp_im],           2(%[pefw])                             \n\t"
 228     "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
 229     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
 230     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
 231     "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
 232     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
 233     "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
 234     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
 235     "lh        %[tmp1],             4(%[pcoefTable_ifft])                  \n\t"
 236     "lh        %[tmp2],             6(%[pcoefTable_ifft])                  \n\t"
 237     "lh        %[tmp_re],           4(%[pefw])                             \n\t"
 238     "lh        %[tmp_im],           6(%[pefw])                             \n\t"
 239     "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
 240     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
 241     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
 242     "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
 243     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
 244     "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
 245     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
 246     "lh        %[tmp1],             8(%[pcoefTable_ifft])                  \n\t"
 247     "lh        %[tmp2],             10(%[pcoefTable_ifft])                 \n\t"
 248     "lh        %[tmp_re],           8(%[pefw])                             \n\t"
 249     "lh        %[tmp_im],           10(%[pefw])                            \n\t"
 250     "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
 251     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
 252     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
 253     "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
 254     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
 255     "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
 256     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
 257     "lh        %[tmp1],             12(%[pcoefTable_ifft])                 \n\t"
 258     "lh        %[tmp2],             14(%[pcoefTable_ifft])                 \n\t"
 259     "lh        %[tmp_re],           12(%[pefw])                            \n\t"
 260     "lh        %[tmp_im],           14(%[pefw])                            \n\t"
 261     "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
 262     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
 263     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
 264     "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
 265     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
 266     "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
 267     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
 268     "addiu     %[pcoefTable_ifft],  %[pcoefTable_ifft],      16            \n\t"
 269     "addiu     %[i],                %[i],                    -4            \n\t"
 270     "bgtz      %[i],                1b                                     \n\t"
 271     " addiu    %[pefw],             %[pefw],                 16            \n\t"
 272     ".set      pop                                                         \n\t"
 273     : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
 274       [i] "=&r" (i), [tmp_re] "=&r" (tmp_re), [tmp_im] "=&r" (tmp_im),
 275       [pefw] "+r" (pefw), [pcoefTable_ifft] "+r" (pcoefTable_ifft),
 276       [fft] "+r" (fft)
 277     :
 278     : "memory"
 279   );
 280
       // Bin PART_LEN is not covered by the loop above; it goes to int16
       // slots 2 and 3 with its imaginary part negated.
 281   fft[2] = efw[PART_LEN].real;
 282   fft[3] = -efw[PART_LEN].imag;
 283
 284   outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
 285   pfft = fft;
 286
       // Stage 2: compact in place. Reads the int16 at every 4th byte
       // (offsets 0, 4, 8, 12 from ppfft) and writes them to consecutive
       // int16 slots at pfft, four values per iteration, 128 values in
       // total. Both pointers start at fft; the read pointer advances twice
       // as fast as the write pointer.
       // NOTE(review): presumably this keeps only the real parts of the IFFT
       // output -- confirm against WebRtcSpl_ComplexIFFT's output layout.
 287   __asm __volatile (
 288     ".set       push                                               \n\t"
 289     ".set       noreorder                                          \n\t"
 290     "addiu      %[i],            $zero,               128          \n\t"
 291    "1:                                                             \n\t"
 292     "lh         %[tmp1],         0(%[ppfft])                       \n\t"
 293     "lh         %[tmp2],         4(%[ppfft])                       \n\t"
 294     "lh         %[tmp3],         8(%[ppfft])                       \n\t"
 295     "lh         %[tmp4],         12(%[ppfft])                      \n\t"
 296     "addiu      %[i],            %[i],                -4           \n\t"
 297     "sh         %[tmp1],         0(%[pfft])                        \n\t"
 298     "sh         %[tmp2],         2(%[pfft])                        \n\t"
 299     "sh         %[tmp3],         4(%[pfft])                        \n\t"
 300     "sh         %[tmp4],         6(%[pfft])                        \n\t"
 301     "addiu      %[ppfft],        %[ppfft],            16           \n\t"
 302     "bgtz       %[i],            1b                                \n\t"
 303     " addiu     %[pfft],         %[pfft],             8            \n\t"
 304     ".set       pop                                                \n\t"
 305     : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
 306       [i] "=&r" (i), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
 307       [ppfft] "+r" (ppfft)
 308     :
 309     : "memory"
 310   );
 311
 312   pfft = fft;
       // Shift applied to the windowed samples below: left by out_aecm when
       // it is >= 0, arithmetically right by -out_aecm otherwise.
 313   out_aecm = (int32_t)(outCFFT - aecm->dfaCleanQDomain);
 314
       // Stage 3: window and overlap-add, two samples per iteration
       // (i: 64 -> 0 in steps of 2).
       //  - First half: fft[k] * window[k], rounded (+8192) and shifted right
       //    by 14, shifted by out_aecm, plus aecm->outBuf[k], saturated to
       //    int16, stored to both fft[k] and output[k].
       //  - Second half: the value at byte offset 128 from pfft (fft[k + 64])
       //    times the window read backwards from
       //    WebRtcAecm_kSqrtHanning[PART_LEN], shifted right by 14 without
       //    rounding, shifted by out_aecm, saturated, stored to
       //    aecm->outBuf[k].
       // Saturation: with MIPS_DSP_R1_LE a saturating left shift by 16
       // (shll_s.w) followed by an arithmetic right shift by 16; otherwise
       // the value is compared via (x >> 31) vs (x >> 15) and, when they
       // differ, replaced by 0x7fff XOR the sign mask (0x7fff or -0x8000).
 315   __asm __volatile (
 316     ".set       push                                                       \n\t"
 317     ".set       noreorder                                                  \n\t"
 318     "addiu      %[i],                $zero,                  64            \n\t"
 319    "11:                                                                    \n\t"
 320     "lh         %[tmp1],             0(%[pfft])                            \n\t"
 321     "lh         %[tmp2],             0(%[p_kSqrtHanning])                  \n\t"
 322     "addiu      %[i],                %[i],                   -2            \n\t"
 323     "mul        %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
 324     "lh         %[tmp3],             2(%[pfft])                            \n\t"
 325     "lh         %[tmp4],             2(%[p_kSqrtHanning])                  \n\t"
 326     "mul        %[tmp3],             %[tmp3],                %[tmp4]       \n\t"
 327     "addiu      %[tmp1],             %[tmp1],                8192          \n\t"
 328     "sra        %[tmp1],             %[tmp1],                14            \n\t"
 329     "addiu      %[tmp3],             %[tmp3],                8192          \n\t"
 330     "sra        %[tmp3],             %[tmp3],                14            \n\t"
         // tmp2 = -out_aecm is computed in the delay slot on both paths.
 331     "bgez       %[out_aecm],         1f                                    \n\t"
 332     " negu      %[tmp2],             %[out_aecm]                           \n\t"
 333     "srav       %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
 334     "b          2f                                                         \n\t"
 335     " srav      %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
 336    "1:                                                                     \n\t"
 337     "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   \n\t"
 338     "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   \n\t"
 339    "2:                                                                     \n\t"
         // Add the two samples held in aecm->outBuf.
 340     "lh         %[tmp4],             0(%[paecm_buf])                       \n\t"
 341     "lh         %[tmp2],             2(%[paecm_buf])                       \n\t"
 342     "addu       %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
 343     "addu       %[tmp1],             %[tmp1],                %[tmp4]       \n\t"
 344 #if defined(MIPS_DSP_R1_LE)
 345     "shll_s.w   %[tmp1],             %[tmp1],                16            \n\t"
 346     "sra        %[tmp1],             %[tmp1],                16            \n\t"
 347     "shll_s.w   %[tmp3],             %[tmp3],                16            \n\t"
 348     "sra        %[tmp3],             %[tmp3],                16            \n\t"
 349 #else  // #if defined(MIPS_DSP_R1_LE)
 350     "sra        %[tmp4],             %[tmp1],                31            \n\t"
 351     "sra        %[tmp2],             %[tmp1],                15            \n\t"
 352     "beq        %[tmp4],             %[tmp2],                3f            \n\t"
 353     " ori       %[tmp2],             $zero,                  0x7fff        \n\t"
 354     "xor        %[tmp1],             %[tmp2],                %[tmp4]       \n\t"
 355    "3:                                                                     \n\t"
 356     "sra        %[tmp2],             %[tmp3],                31            \n\t"
 357     "sra        %[tmp4],             %[tmp3],                15            \n\t"
 358     "beq        %[tmp2],             %[tmp4],                4f            \n\t"
 359     " ori       %[tmp4],             $zero,                  0x7fff        \n\t"
 360     "xor        %[tmp3],             %[tmp4],                %[tmp2]       \n\t"
 361    "4:                                                                     \n\t"
 362 #endif  // #if defined(MIPS_DSP_R1_LE)
 363     "sh         %[tmp1],             0(%[pfft])                            \n\t"
 364     "sh         %[tmp1],             0(%[output1])                         \n\t"
 365     "sh         %[tmp3],             2(%[pfft])                            \n\t"
 366     "sh         %[tmp3],             2(%[output1])                         \n\t"
         // Second half: samples 64 entries (128 bytes) further on, window
         // read backwards.
 367     "lh         %[tmp1],             128(%[pfft])                          \n\t"
 368     "lh         %[tmp2],             0(%[pp_kSqrtHanning])                 \n\t"
 369     "mul        %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
 370     "lh         %[tmp3],             130(%[pfft])                          \n\t"
 371     "lh         %[tmp4],             -2(%[pp_kSqrtHanning])                \n\t"
 372     "mul        %[tmp3],             %[tmp3],                %[tmp4]       \n\t"
 373     "sra        %[tmp1],             %[tmp1],                14            \n\t"
 374     "sra        %[tmp3],             %[tmp3],                14            \n\t"
 375     "bgez       %[out_aecm],         5f                                    \n\t"
 376     " negu      %[tmp2],             %[out_aecm]                           \n\t"
 377     "srav       %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
 378     "b          6f                                                         \n\t"
 379     " srav      %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
 380    "5:                                                                     \n\t"
 381     "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   \n\t"
 382     "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   \n\t"
 383    "6:                                                                     \n\t"
 384 #if defined(MIPS_DSP_R1_LE)
 385     "shll_s.w   %[tmp1],             %[tmp1],                16            \n\t"
 386     "sra        %[tmp1],             %[tmp1],                16            \n\t"
 387     "shll_s.w   %[tmp3],             %[tmp3],                16            \n\t"
 388     "sra        %[tmp3],             %[tmp3],                16            \n\t"
 389 #else  // #if defined(MIPS_DSP_R1_LE)
 390     "sra        %[tmp4],             %[tmp1],                31            \n\t"
 391     "sra        %[tmp2],             %[tmp1],                15            \n\t"
 392     "beq        %[tmp4],             %[tmp2],                7f            \n\t"
 393     " ori       %[tmp2],             $zero,                  0x7fff        \n\t"
 394     "xor        %[tmp1],             %[tmp2],                %[tmp4]       \n\t"
 395    "7:                                                                     \n\t"
 396     "sra        %[tmp2],             %[tmp3],                31            \n\t"
 397     "sra        %[tmp4],             %[tmp3],                15            \n\t"
 398     "beq        %[tmp2],             %[tmp4],                8f            \n\t"
 399     " ori       %[tmp4],             $zero,                  0x7fff        \n\t"
 400     "xor        %[tmp3],             %[tmp4],                %[tmp2]       \n\t"
 401    "8:                                                                     \n\t"
 402 #endif  // #if defined(MIPS_DSP_R1_LE)
 403     "sh         %[tmp1],             0(%[paecm_buf])                       \n\t"
 404     "sh         %[tmp3],             2(%[paecm_buf])                       \n\t"
 405     "addiu      %[output1],          %[output1],             4             \n\t"
 406     "addiu      %[paecm_buf],        %[paecm_buf],           4             \n\t"
 407     "addiu      %[pfft],             %[pfft],                4             \n\t"
 408     "addiu      %[p_kSqrtHanning],   %[p_kSqrtHanning],      4             \n\t"
 409     "bgtz       %[i],                11b                                   \n\t"
 410     " addiu     %[pp_kSqrtHanning],  %[pp_kSqrtHanning],     -4            \n\t"
 411     ".set       pop                                                        \n\t"
 412     : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
 413       [output1] "+r" (output1), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
 414       [paecm_buf] "+r" (paecm_buf), [i] "=&r" (i),
 415       [pp_kSqrtHanning] "+r" (pp_kSqrtHanning),
 416       [p_kSqrtHanning] "+r" (p_kSqrtHanning)
         // NOTE(review): the [WebRtcAecm_kSqrtHanning] input below is not
         // referenced by the assembly template above.
 417     : [out_aecm] "r" (out_aecm),
 418       [WebRtcAecm_kSqrtHanning] "r" (WebRtcAecm_kSqrtHanning)
 419     : "hi", "lo","memory"
 420   );
 421
 422   // Copy the current block to the old position
 423   // (aecm->outBuf is shifted elsewhere)
 424   memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(int16_t) * PART_LEN);
 425   memcpy(aecm->dBufNoisy,
 426          aecm->dBufNoisy + PART_LEN,
 427          sizeof(int16_t) * PART_LEN);
       // dBufClean is only shifted when a clean near-end signal was supplied.
 428   if (nearendClean != NULL) {
 429     memcpy(aecm->dBufClean,
 430            aecm->dBufClean + PART_LEN,
 431            sizeof(int16_t) * PART_LEN);
 432   }
 433 }
 434
 435 void WebRtcAecm_CalcLinearEnergies_mips(AecmCore_t* aecm,
 436                                         const uint16_t* far_spectrum,
 437                                         int32_t* echo_est,
 438                                         uint32_t* far_energy,
 439                                         uint32_t* echo_energy_adapt,
 440                                         uint32_t* echo_energy_stored) {
 441   int i;
 442   uint32_t par1 = (*far_energy);
 443   uint32_t par2 = (*echo_energy_adapt);
 444   uint32_t par3 = (*echo_energy_stored);
 445   int16_t* ch_stored_p = &(aecm->channelStored[0]);
 446   int16_t* ch_adapt_p = &(aecm->channelAdapt16[0]);
 447   uint16_t* spectrum_p = (uint16_t*)(&(far_spectrum[0]));
 448   int32_t* echo_p = &(echo_est[0]);
 449   int32_t temp0, stored0, echo0, adept0, spectrum0;
 450   int32_t stored1, adept1, spectrum1, echo1, temp1;
 451
 452   // Get energy for the delayed far end signal and estimated
 453   // echo using both stored and adapted channels.
 454   for (i = 0; i < PART_LEN; i+= 4) {
 455     __asm __volatile (
 456       ".set           push                                            \n\t"
 457       ".set           noreorder                                       \n\t"
 458       "lh             %[stored0],     0(%[ch_stored_p])               \n\t"
 459       "lhu            %[adept0],      0(%[ch_adapt_p])                \n\t"
 460       "lhu            %[spectrum0],   0(%[spectrum_p])                \n\t"
 461       "lh             %[stored1],     2(%[ch_stored_p])               \n\t"
 462       "lhu            %[adept1],      2(%[ch_adapt_p])                \n\t"
 463       "lhu            %[spectrum1],   2(%[spectrum_p])                \n\t"
 464       "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
 465       "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
 466       "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
 467       "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
 468       "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
 469       "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
 470       "addiu          %[echo_p],      %[echo_p],      16              \n\t"
 471       "addu           %[par3],        %[par3],        %[echo0]        \n\t"
 472       "addu           %[par2],        %[par2],        %[temp0]        \n\t"
 473       "addu           %[par3],        %[par3],        %[echo1]        \n\t"
 474       "addu           %[par2],        %[par2],        %[temp1]        \n\t"
 475       "usw            %[echo0],       -16(%[echo_p])                  \n\t"
 476       "usw            %[echo1],       -12(%[echo_p])                  \n\t"
 477       "lh             %[stored0],     4(%[ch_stored_p])               \n\t"
 478       "lhu            %[adept0],      4(%[ch_adapt_p])                \n\t"
 479       "lhu            %[spectrum0],   4(%[spectrum_p])                \n\t"
 480       "lh             %[stored1],     6(%[ch_stored_p])               \n\t"
 481       "lhu            %[adept1],      6(%[ch_adapt_p])                \n\t"
 482       "lhu            %[spectrum1],   6(%[spectrum_p])                \n\t"
 483       "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
 484       "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
 485       "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
 486       "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
 487       "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
 488       "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
 489       "addiu          %[ch_stored_p], %[ch_stored_p], 8               \n\t"
 490       "addiu          %[ch_adapt_p],  %[ch_adapt_p],  8               \n\t"
 491       "addiu          %[spectrum_p],  %[spectrum_p],  8               \n\t"
 492       "addu           %[par3],        %[par3],        %[echo0]        \n\t"
 493       "addu           %[par2],        %[par2],        %[temp0]        \n\t"
 494       "addu           %[par3],        %[par3],        %[echo1]        \n\t"
 495       "addu           %[par2],        %[par2],        %[temp1]        \n\t"
 496       "usw            %[echo0],       -8(%[echo_p])                   \n\t"
 497       "usw            %[echo1],       -4(%[echo_p])                   \n\t"
 498       ".set           pop                                             \n\t"
 499       : [temp0] "=&r" (temp0), [stored0] "=&r" (stored0),
 500         [adept0] "=&r" (adept0), [spectrum0] "=&r" (spectrum0),
 501         [echo0] "=&r" (echo0), [echo_p] "+r" (echo_p), [par3] "+r" (par3),
 502         [par1] "+r" (par1), [par2] "+r" (par2), [stored1] "=&r" (stored1),
 503         [adept1] "=&r" (adept1), [echo1] "=&r" (echo1),
 504         [spectrum1] "=&r" (spectrum1), [temp1] "=&r" (temp1),
 505         [ch_stored_p] "+r" (ch_stored_p), [ch_adapt_p] "+r" (ch_adapt_p),
 506         [spectrum_p] "+r" (spectrum_p)
 507       :
 508       : "hi", "lo", "memory"
 509     );
 510   }
 511
 512   echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
 513                                              far_spectrum[PART_LEN]);
 514   par1 += (uint32_t)(far_spectrum[PART_LEN]);
 515   par2 += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[PART_LEN],
 516                                 far_spectrum[PART_LEN]);
 517   par3 += (uint32_t)echo_est[PART_LEN];
 518
 519   (*far_energy) = par1;
 520   (*echo_energy_adapt) = par2;
 521   (*echo_energy_stored) = par3;
 522 }
 523
 524 #if defined(MIPS_DSP_R1_LE)
 525 void WebRtcAecm_StoreAdaptiveChannel_mips(AecmCore_t* aecm,
 526                                           const uint16_t* far_spectrum,
 527                                           int32_t* echo_est) {
       // Copies the adaptive channel (channelAdapt16) into the stored channel
       // (channelStored) and then recomputes the echo estimate as
       // echo_est[i] = channelStored[i] * far_spectrum[i].
       //
       // aecm          [in/out]  Instance whose channelStored is overwritten.
       // far_spectrum  [in]      Far end spectrum; indices 0..PART_LEN are read.
       // echo_est      [out]     Echo estimate; indices 0..PART_LEN are written.
       //
       // The vector loop consumes four bins per iteration, so the scalar tail at
       // the bottom only lands on index PART_LEN if PART_LEN is a multiple of 4
       // (assumed here, not checked).
 528   int i;
 529   int16_t* temp1;   // Read cursor into aecm->channelStored.
 530   uint16_t* temp8;  // Read cursor into far_spectrum.
 531   int32_t temp0, temp2, temp3, temp4, temp5, temp6;
 532   int32_t* temp7 = &(echo_est[0]);  // Write cursor into echo_est.
 533   temp1 = &(aecm->channelStored[0]);
 534   temp8 = (uint16_t*)(&far_spectrum[0]);
 535
 536   // During startup we store the channel every block.
 537   memcpy(aecm->channelStored, aecm->channelAdapt16,
 538          sizeof(int16_t) * PART_LEN1);
 539   // Recalculate echo estimate
 540   for (i = 0; i < PART_LEN; i += 4) {
         // Each iteration handles four bins. Every 32-bit load fetches two
         // adjacent 16-bit values; in this little-endian build (the function is
         // compiled only under MIPS_DSP_R1_LE) the right/low halfword is the
         // lower-addressed element.
         //
         // NOTE(review): muleq_s.w.ph{l,r} multiply the halfwords as signed Q15
         // fractions (product shifted left by one, with saturation), and the
         // later "sra ... 1" undoes that shift. The scalar tail below uses
         // WEBRTC_SPL_MUL_16_U16 instead, i.e. far_spectrum is unsigned there.
         // The two paths would differ for far_spectrum values >= 0x8000 --
         // confirm that callers never produce such values.
 541     __asm __volatile (
           // Load two far_spectrum values (temp0, temp4) and two channelStored
           // values (temp2, temp5) per word.
 542       "ulw            %[temp0],   0(%[temp8])               \n\t"
 543       "ulw            %[temp2],   0(%[temp1])               \n\t"
 544       "ulw            %[temp4],   4(%[temp8])               \n\t"
 545       "ulw            %[temp5],   4(%[temp1])               \n\t"
           // .phl -> product of the left (higher-addressed) elements,
           // .phr -> product of the right (lower-addressed) elements.
 546       "muleq_s.w.phl  %[temp3],   %[temp2],     %[temp0]    \n\t"
 547       "muleq_s.w.phr  %[temp0],   %[temp2],     %[temp0]    \n\t"
 548       "muleq_s.w.phl  %[temp6],   %[temp5],     %[temp4]    \n\t"
 549       "muleq_s.w.phr  %[temp4],   %[temp5],     %[temp4]    \n\t"
           // Advance all cursors now; the stores below therefore use negative
           // offsets relative to the already advanced echo_est cursor.
 550       "addiu          %[temp7],   %[temp7],     16          \n\t"
 551       "addiu          %[temp1],   %[temp1],     8           \n\t"
 552       "addiu          %[temp8],   %[temp8],     8           \n\t"
           // Remove the extra left shift introduced by the fractional multiply.
 553       "sra            %[temp3],   %[temp3],     1           \n\t"
 554       "sra            %[temp0],   %[temp0],     1           \n\t"
 555       "sra            %[temp6],   %[temp6],     1           \n\t"
 556       "sra            %[temp4],   %[temp4],     1           \n\t"
           // Write the four products: right-halfword results go to the even
           // slots (-16, -8), left-halfword results to the odd slots (-12, -4).
 557       "usw            %[temp3],   -12(%[temp7])             \n\t"
 558       "usw            %[temp0],   -16(%[temp7])             \n\t"
 559       "usw            %[temp6],   -4(%[temp7])              \n\t"
 560       "usw            %[temp4],   -8(%[temp7])              \n\t"
 561       : [temp0] "=&r" (temp0), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
 562         [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [temp6] "=&r" (temp6),
 563         [temp1] "+r" (temp1), [temp8] "+r" (temp8), [temp7] "+r" (temp7)
 564       :
 565       : "hi", "lo", "memory"
 566     );
 567   }
       // Scalar tail: i is the first index the loop did not cover (PART_LEN when
       // PART_LEN is a multiple of 4).
 568   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
 569                                       far_spectrum[i]);
 570 }
 571
 572 void WebRtcAecm_ResetAdaptiveChannel_mips(AecmCore_t* aecm) {
       // Overwrites the adaptive channel with the stored channel: channelAdapt16
       // becomes a copy of channelStored, and channelAdapt32[i] becomes
       // channelStored[i] << 16 for indices 0..PART_LEN.
       //
       // The vector loop handles four entries per iteration, so the scalar tail
       // at the bottom only lands on index PART_LEN if PART_LEN is a multiple of
       // 4 (assumed here, not checked).
 573   int i;
 574   int32_t* temp3;  // Write cursor into aecm->channelAdapt32.
 575   int16_t* temp0;  // Read cursor into aecm->channelStored.
 576   int32_t temp1, temp2, temp4, temp5;
 577
 578   temp0 = &(aecm->channelStored[0]);
 579   temp3 = &(aecm->channelAdapt32[0]);
 580
 581   // The stored channel has a significantly lower MSE than the adaptive one for
 582   // two consecutive calculations. Reset the adaptive channel.
 583   memcpy(aecm->channelAdapt16,
 584          aecm->channelStored,
 585          sizeof(int16_t) * PART_LEN1);
 586
 587   // Restore the W32 channel
 588   for (i = 0; i < PART_LEN; i += 4) {
 589     __asm __volatile (
           // Load four 16-bit stored-channel values as two 32-bit words.
 590       "ulw            %[temp1], 0(%[temp0])           \n\t"
 591       "ulw            %[temp4], 4(%[temp0])           \n\t"
           // preceq.w.phl/.phr place the left/right halfword in the upper 16
           // bits of a word, i.e. the same value as the "<< 16" used in the
           // scalar tail below.
 592       "preceq.w.phl   %[temp2], %[temp1]              \n\t"
 593       "preceq.w.phr   %[temp1], %[temp1]              \n\t"
 594       "preceq.w.phl   %[temp5], %[temp4]              \n\t"
 595       "preceq.w.phr   %[temp4], %[temp4]              \n\t"
 596       "addiu          %[temp0], %[temp0], 8           \n\t"
           // Little-endian order: right-halfword results go to the even slots
           // (0, 8), left-halfword results to the odd slots (4, 12).
 597       "usw            %[temp2], 4(%[temp3])           \n\t"
 598       "usw            %[temp1], 0(%[temp3])           \n\t"
 599       "usw            %[temp5], 12(%[temp3])          \n\t"
 600       "usw            %[temp4], 8(%[temp3])           \n\t"
 601       "addiu          %[temp3], %[temp3], 16          \n\t"
 602       : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
 603         [temp4] "=&r" (temp4), [temp5] "=&r" (temp5),
 604         [temp3] "+r" (temp3), [temp0] "+r" (temp0)
 605       :
 606       : "memory"
 607     );
 608   }
 609
       // Scalar tail for the entry the loop did not cover.
 610   aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
 611                               (int32_t)aecm->channelStored[i], 16);
 612 }
 613 #endif  // #if defined(MIPS_DSP_R1_LE)
 614
 615 // Transforms a time domain signal into the frequency domain, outputting the
 616 // complex valued signal, absolute value and sum of absolute values.
 617 //
     // aecm                         AECM instance, passed through to
     //                              WindowAndFFT()
 618 // time_signal          [in]    Pointer to time domain signal
 619 // freq_signal          [out]   Pointer to the complex valued frequency domain
 620 // array (.real/.imag members). It is handed to WindowAndFFT(); this function
 621 // itself only writes freq_signal[0].imag and freq_signal[PART_LEN]
 622 // freq_signal_abs      [out]   Pointer to absolute value of frequency domain
 623 //                              array (indices 0..PART_LEN are written)
 624 // freq_signal_sum_abs  [out]   Pointer to the sum of all absolute values in
 625 //                              the frequency domain array
 626 // return value                 The Q-domain of current frequency values:
     //                              time_signal_scaling, which stays 0 unless
     //                              AECM_DYNAMIC_Q is defined
 627 //
 628 static int TimeToFrequencyDomain(AecmCore_t* aecm,
 629                                  const int16_t* time_signal,
 630                                  complex16_t* freq_signal,
 631                                  uint16_t* freq_signal_abs,
 632                                  uint32_t* freq_signal_sum_abs)
 633 {
 634   int i = 0;
 635   int time_signal_scaling = 0;
 636
 637   // In fft_buf, +16 for 32-byte alignment.
       // (16 extra int16_t = 32 bytes of slack, enough to round the start address
       // up to the next multiple of 32.)
 638   int16_t fft_buf[PART_LEN4 + 16];
 639   int16_t *fft = (int16_t *) (((uintptr_t) fft_buf + 31) & ~31);
 640
 641   int16_t tmp16no1;
       // Scratch variables differ between the generic path and the DSP R2 path.
 642 #if !defined(MIPS_DSP_R2_LE)
 643   int32_t tmp32no1;
 644   int32_t tmp32no2;
 645   int16_t tmp16no2;
 646 #else
 647   int32_t tmp32no10, tmp32no11, tmp32no12, tmp32no13;  // Magnitudes.
 648   int32_t tmp32no20, tmp32no21, tmp32no22, tmp32no23;  // Squared magnitudes.
 649   int16_t* freqp;      // Read cursor into freq_signal.
 650   uint16_t* freqabsp;  // Write cursor into freq_signal_abs.
 651   uint32_t freqt0, freqt1, freqt2, freqt3;
 652   uint32_t freqs;      // Running sum of magnitudes.
 653 #endif
 654
       // With dynamic Q enabled, the scaling is the norm of the largest absolute
       // sample among the first PART_LEN2 samples of time_signal.
 655 #ifdef AECM_DYNAMIC_Q
 656   tmp16no1 = WebRtcSpl_MaxAbsValueW16(time_signal, PART_LEN2);
 657   time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
 658 #endif
 659
 660   WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling);
 661
 662   // Extract imaginary and real part,
 663   // calculate the magnitude for all frequency bins
       // Bins 0 and PART_LEN get a zero imaginary part, so their magnitude is
       // simply |real|.
 664   freq_signal[0].imag = 0;
 665   freq_signal[PART_LEN].imag = 0;
 666   freq_signal[PART_LEN].real = fft[PART_LEN2];
 667   freq_signal_abs[0] = (uint16_t)WEBRTC_SPL_ABS_W16(freq_signal[0].real);
 668   freq_signal_abs[PART_LEN] = (uint16_t)WEBRTC_SPL_ABS_W16(
 669     freq_signal[PART_LEN].real);
 670   (*freq_signal_sum_abs) = (uint32_t)(freq_signal_abs[0]) +
 671     (uint32_t)(freq_signal_abs[PART_LEN]);
 672
 673 #if !defined(MIPS_DSP_R2_LE)
       // Generic path: bins 1..PART_LEN-1, one at a time.
 674   for (i = 1; i < PART_LEN; i++) {
 675     if (freq_signal[i].real == 0)
 676     {
 677       freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(
 678         freq_signal[i].imag);
 679     }
 680     else if (freq_signal[i].imag == 0)
 681     {
 682       freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(
 683         freq_signal[i].real);
 684     }
 685     else
 686     {
 687       // Magnitude of the complex fft output:
 688       // magn = sqrt(real^2 + imag^2)
 689       // evaluated directly: the two squares are added with
 690       // WEBRTC_SPL_ADD_SAT_W32 and passed to WebRtcSpl_SqrtFloor. No
 691       // alpha/beta max/min approximation is applied in this code.
 692       tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real);
 693       tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
 694       tmp32no1 = WEBRTC_SPL_MUL_16_16(tmp16no1, tmp16no1);
 695       tmp32no2 = WEBRTC_SPL_MUL_16_16(tmp16no2, tmp16no2);
 696       tmp32no2 = WEBRTC_SPL_ADD_SAT_W32(tmp32no1, tmp32no2);
 697       tmp32no1 = WebRtcSpl_SqrtFloor(tmp32no2);
 698
 699       freq_signal_abs[i] = (uint16_t)tmp32no1;
 700     }
 701     (*freq_signal_sum_abs) += (uint32_t)freq_signal_abs[i];
 702   }
 703 #else // #if !defined(MIPS_DSP_R2_LE)
       // DSP R2 path: bins 1..3 are handled first, then the loop below handles
       // four bins per iteration. Each 32-bit word read through freqp is treated
       // as one complex bin -- assumes complex16_t is exactly two adjacent
       // int16_t (real, imag); confirm against its declaration.
 704   freqs = (uint32_t)(freq_signal_abs[0]) +
 705           (uint32_t)(freq_signal_abs[PART_LEN]);
 706   freqp = &(freq_signal[1].real);
 707
       // NOTE(review): this block uses aligned "lw" loads (the loop below uses
       // "ulw"), so freq_signal must be at least 4-byte aligned. The callers
       // visible in this file pass a 32-byte aligned buffer.
 708   __asm __volatile (
 709     "lw             %[freqt0],      0(%[freqp])             \n\t"
 710     "lw             %[freqt1],      4(%[freqp])             \n\t"
 711     "lw             %[freqt2],      8(%[freqp])             \n\t"
         // Clear the accumulators (0 * 0).
 712     "mult           $ac0,           $zero,      $zero       \n\t"
 713     "mult           $ac1,           $zero,      $zero       \n\t"
 714     "mult           $ac2,           $zero,      $zero       \n\t"
         // Fractional dot product of each word with itself:
         // acc = 2 * (real^2 + imag^2).
 715     "dpaq_s.w.ph    $ac0,           %[freqt0],  %[freqt0]   \n\t"
 716     "dpaq_s.w.ph    $ac1,           %[freqt1],  %[freqt1]   \n\t"
 717     "dpaq_s.w.ph    $ac2,           %[freqt2],  %[freqt2]   \n\t"
 718     "addiu          %[freqp],       %[freqp],   12          \n\t"
         // Extract with a right shift of 1 to get real^2 + imag^2.
 719     "extr.w         %[tmp32no20],   $ac0,       1           \n\t"
 720     "extr.w         %[tmp32no21],   $ac1,       1           \n\t"
 721     "extr.w         %[tmp32no22],   $ac2,       1           \n\t"
 722     : [freqt0] "=&r" (freqt0), [freqt1] "=&r" (freqt1),
 723       [freqt2] "=&r" (freqt2), [freqp] "+r" (freqp),
 724       [tmp32no20] "=r" (tmp32no20), [tmp32no21] "=r" (tmp32no21),
 725       [tmp32no22] "=r" (tmp32no22)
 726     :
 727     : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo"
 728   );
 729
       // Magnitudes of bins 1..3, stored and added to the running sum.
 730   tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
 731   tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
 732   tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
 733   freq_signal_abs[1] = (uint16_t)tmp32no10;
 734   freq_signal_abs[2] = (uint16_t)tmp32no11;
 735   freq_signal_abs[3] = (uint16_t)tmp32no12;
 736   freqs += (uint32_t)tmp32no10;
 737   freqs += (uint32_t)tmp32no11;
 738   freqs += (uint32_t)tmp32no12;
 739   freqabsp = &(freq_signal_abs[4]);
       // Bins 4..PART_LEN-1, four per iteration.
 740   for (i = 4; i < PART_LEN; i+=4)
 741   {
         // Same squared-magnitude computation as above, for four bins. Both
         // cursors are advanced inside this block, so the store block further
         // down addresses freq_signal_abs with negative offsets.
 742     __asm __volatile (
 743       "ulw            %[freqt0],      0(%[freqp])                 \n\t"
 744       "ulw            %[freqt1],      4(%[freqp])                 \n\t"
 745       "ulw            %[freqt2],      8(%[freqp])                 \n\t"
 746       "ulw            %[freqt3],      12(%[freqp])                \n\t"
 747       "mult           $ac0,           $zero,          $zero       \n\t"
 748       "mult           $ac1,           $zero,          $zero       \n\t"
 749       "mult           $ac2,           $zero,          $zero       \n\t"
 750       "mult           $ac3,           $zero,          $zero       \n\t"
 751       "dpaq_s.w.ph    $ac0,           %[freqt0],      %[freqt0]   \n\t"
 752       "dpaq_s.w.ph    $ac1,           %[freqt1],      %[freqt1]   \n\t"
 753       "dpaq_s.w.ph    $ac2,           %[freqt2],      %[freqt2]   \n\t"
 754       "dpaq_s.w.ph    $ac3,           %[freqt3],      %[freqt3]   \n\t"
 755       "addiu          %[freqp],       %[freqp],       16          \n\t"
 756       "addiu          %[freqabsp],    %[freqabsp],    8           \n\t"
 757       "extr.w         %[tmp32no20],   $ac0,           1           \n\t"
 758       "extr.w         %[tmp32no21],   $ac1,           1           \n\t"
 759       "extr.w         %[tmp32no22],   $ac2,           1           \n\t"
 760       "extr.w         %[tmp32no23],   $ac3,           1           \n\t"
 761       : [freqt0] "=&r" (freqt0), [freqt1] "=&r" (freqt1),
 762         [freqt2] "=&r" (freqt2), [freqt3] "=&r" (freqt3),
 763         [tmp32no20] "=r" (tmp32no20), [tmp32no21] "=r" (tmp32no21),
 764         [tmp32no22] "=r" (tmp32no22), [tmp32no23] "=r" (tmp32no23),
 765         [freqabsp] "+r" (freqabsp), [freqp] "+r" (freqp)
 766       :
 767       : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
 768         "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
 769     );
 770
 771     tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
 772     tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
 773     tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
 774     tmp32no13 = WebRtcSpl_SqrtFloor(tmp32no23);
 775
         // Store the four magnitudes as halfwords just behind the already
         // advanced freqabsp, and add them to the running sum.
 776     __asm __volatile (
 777       "sh             %[tmp32no10],   -8(%[freqabsp])                 \n\t"
 778       "sh             %[tmp32no11],   -6(%[freqabsp])                 \n\t"
 779       "sh             %[tmp32no12],   -4(%[freqabsp])                 \n\t"
 780       "sh             %[tmp32no13],   -2(%[freqabsp])                 \n\t"
 781       "addu           %[freqs],       %[freqs],       %[tmp32no10]    \n\t"
 782       "addu           %[freqs],       %[freqs],       %[tmp32no11]    \n\t"
 783       "addu           %[freqs],       %[freqs],       %[tmp32no12]    \n\t"
 784       "addu           %[freqs],       %[freqs],       %[tmp32no13]    \n\t"
 785       : [freqs] "+r" (freqs)
 786       : [tmp32no10] "r" (tmp32no10), [tmp32no11] "r" (tmp32no11),
 787         [tmp32no12] "r" (tmp32no12), [tmp32no13] "r" (tmp32no13),
 788         [freqabsp] "r" (freqabsp)
 789       : "memory"
 790     );
 791   }
 792
       // freqs already includes bins 0 and PART_LEN, so it replaces the partial
       // sum written before the #if.
 793   (*freq_signal_sum_abs) = freqs;
 794 #endif
 795
 796   return time_signal_scaling;
 797 }
 798
 799 int WebRtcAecm_ProcessBlock(AecmCore_t* aecm,
 800                             const int16_t* farend,
 801                             const int16_t* nearendNoisy,
 802                             const int16_t* nearendClean,
 803                             int16_t* output) {
 804   int i;
 805   uint32_t xfaSum;
 806   uint32_t dfaNoisySum;
 807   uint32_t dfaCleanSum;
 808   uint32_t echoEst32Gained;
 809   uint32_t tmpU32;
 810   int32_t tmp32no1;
 811
 812   uint16_t xfa[PART_LEN1];
 813   uint16_t dfaNoisy[PART_LEN1];
 814   uint16_t dfaClean[PART_LEN1];
 815   uint16_t* ptrDfaClean = dfaClean;
 816   const uint16_t* far_spectrum_ptr = NULL;
 817
 818   // 32 byte aligned buffers (with +8 or +16).
 819   int16_t fft_buf[PART_LEN4 + 2 + 16]; // +2 to make a loop safe.
 820   int32_t echoEst32_buf[PART_LEN1 + 8];
 821   int32_t dfw_buf[PART_LEN2 + 8];
 822   int32_t efw_buf[PART_LEN2 + 8];
 823
 824   int16_t* fft = (int16_t*)(((uint32_t)fft_buf + 31) & ~ 31);
 825   int32_t* echoEst32 = (int32_t*)(((uint32_t)echoEst32_buf + 31) & ~ 31);
 826   complex16_t* dfw = (complex16_t*)(((uint32_t)dfw_buf + 31) & ~ 31);
 827   complex16_t* efw = (complex16_t*)(((uint32_t)efw_buf + 31) & ~ 31);
 828
 829   int16_t hnl[PART_LEN1];
 830   int16_t numPosCoef = 0;
 831   int delay;
 832   int16_t tmp16no1;
 833   int16_t tmp16no2;
 834   int16_t mu;
 835   int16_t supGain;
 836   int16_t zeros32, zeros16;
 837   int16_t zerosDBufNoisy, zerosDBufClean, zerosXBuf;
 838   int far_q;
 839   int16_t resolutionDiff, qDomainDiff, dfa_clean_q_domain_diff;
 840
 841   const int kMinPrefBand = 4;
 842   const int kMaxPrefBand = 24;
 843   int32_t avgHnl32 = 0;
 844
 845   int32_t temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
 846   int16_t* ptr;
 847   int16_t* ptr1;
 848   int16_t* er_ptr;
 849   int16_t* dr_ptr;
 850
 851   ptr = &hnl[0];
 852   ptr1 = &hnl[0];
 853   er_ptr = &efw[0].real;
 854   dr_ptr = &dfw[0].real;
 855
 856   // Determine startup state. There are three states:
 857   // (0) the first CONV_LEN blocks
 858   // (1) another CONV_LEN blocks
 859   // (2) the rest
 860
 861   if (aecm->startupState < 2) {
 862     aecm->startupState = (aecm->totCount >= CONV_LEN) +
 863                          (aecm->totCount >= CONV_LEN2);
 864   }
 865   // END: Determine startup state
 866
 867   // Buffer near and far end signals
 868   memcpy(aecm->xBuf + PART_LEN, farend, sizeof(int16_t) * PART_LEN);
 869   memcpy(aecm->dBufNoisy + PART_LEN,
 870          nearendNoisy,
 871          sizeof(int16_t) * PART_LEN);
 872   if (nearendClean != NULL) {
 873     memcpy(aecm->dBufClean + PART_LEN,
 874            nearendClean,
 875            sizeof(int16_t) * PART_LEN);
 876   }
 877
 878   // Transform far end signal from time domain to frequency domain.
 879   far_q = TimeToFrequencyDomain(aecm,
 880                                 aecm->xBuf,
 881                                 dfw,
 882                                 xfa,
 883                                 &xfaSum);
 884
 885   // Transform noisy near end signal from time domain to frequency domain.
 886   zerosDBufNoisy = TimeToFrequencyDomain(aecm,
 887                                          aecm->dBufNoisy,
 888                                          dfw,
 889                                          dfaNoisy,
 890                                          &dfaNoisySum);
 891   aecm->dfaNoisyQDomainOld = aecm->dfaNoisyQDomain;
 892   aecm->dfaNoisyQDomain = (int16_t)zerosDBufNoisy;
 893
 894   if (nearendClean == NULL) {
 895     ptrDfaClean = dfaNoisy;
 896     aecm->dfaCleanQDomainOld = aecm->dfaNoisyQDomainOld;
 897     aecm->dfaCleanQDomain = aecm->dfaNoisyQDomain;
 898     dfaCleanSum = dfaNoisySum;
 899   } else {
 900     // Transform clean near end signal from time domain to frequency domain.
 901     zerosDBufClean = TimeToFrequencyDomain(aecm,
 902                                            aecm->dBufClean,
 903                                            dfw,
 904                                            dfaClean,
 905                                            &dfaCleanSum);
 906     aecm->dfaCleanQDomainOld = aecm->dfaCleanQDomain;
 907     aecm->dfaCleanQDomain = (int16_t)zerosDBufClean;
 908   }
 909
 910   // Get the delay
 911   // Save far-end history and estimate delay
 912   WebRtcAecm_UpdateFarHistory(aecm, xfa, far_q);
 913
 914   if (WebRtc_AddFarSpectrumFix(aecm->delay_estimator_farend, xfa, PART_LEN1,
 915                                far_q) == -1) {
 916     return -1;
 917   }
 918   delay = WebRtc_DelayEstimatorProcessFix(aecm->delay_estimator,
 919                                           dfaNoisy,
 920                                           PART_LEN1,
 921                                           zerosDBufNoisy);
 922   if (delay == -1) {
 923     return -1;
 924   }
 925   else if (delay == -2) {
 926     // If the delay is unknown, we assume zero.
 927     // NOTE: this will have to be adjusted if we ever add lookahead.
 928     delay = 0;
 929   }
 930
 931   if (aecm->fixedDelay >= 0) {
 932     // Use fixed delay
 933     delay = aecm->fixedDelay;
 934   }
 935
 936   // Get aligned far end spectrum
 937   far_spectrum_ptr = WebRtcAecm_AlignedFarend(aecm, &far_q, delay);
 938   zerosXBuf = (int16_t) far_q;
 939
 940   if (far_spectrum_ptr == NULL) {
 941     return -1;
 942   }
 943
 944   // Calculate log(energy) and update energy threshold levels
 945   WebRtcAecm_CalcEnergies(aecm,
 946                           far_spectrum_ptr,
 947                           zerosXBuf,
 948                           dfaNoisySum,
 949                           echoEst32);
 950   // Calculate stepsize
 951   mu = WebRtcAecm_CalcStepSize(aecm);
 952
 953   // Update counters
 954   aecm->totCount++;
 955
 956   // This is the channel estimation algorithm.
 957   // It is base on NLMS but has a variable step length,
 958   // which was calculated above.
 959   WebRtcAecm_UpdateChannel(aecm,
 960                            far_spectrum_ptr,
 961                            zerosXBuf,
 962                            dfaNoisy,
 963                            mu,
 964                            echoEst32);
 965
 966   supGain = WebRtcAecm_CalcSuppressionGain(aecm);
 967
 968   // Calculate Wiener filter hnl[]
 969   for (i = 0; i < PART_LEN1; i++) {
 970     // Far end signal through channel estimate in Q8
 971     // How much can we shift right to preserve resolution
 972     tmp32no1 = echoEst32[i] - aecm->echoFilt[i];
 973     aecm->echoFilt[i] += WEBRTC_SPL_RSHIFT_W32(
 974                            WEBRTC_SPL_MUL_32_16(tmp32no1, 50), 8);
 975
 976     zeros32 = WebRtcSpl_NormW32(aecm->echoFilt[i]) + 1;
 977     zeros16 = WebRtcSpl_NormW16(supGain) + 1;
 978     if (zeros32 + zeros16 > 16) {
 979       // Multiplication is safe
 980       // Result in
 981       // Q(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN+aecm->xfaQDomainBuf[diff])
 982       echoEst32Gained = WEBRTC_SPL_UMUL_32_16((uint32_t)aecm->echoFilt[i],
 983                                               (uint16_t)supGain);
 984       resolutionDiff = 14 - RESOLUTION_CHANNEL16 - RESOLUTION_SUPGAIN;
 985       resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
 986     } else {
 987       tmp16no1 = 17 - zeros32 - zeros16;
 988       resolutionDiff = 14 + tmp16no1 - RESOLUTION_CHANNEL16 -
 989                        RESOLUTION_SUPGAIN;
 990       resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
 991       if (zeros32 > tmp16no1) {
 992         echoEst32Gained = WEBRTC_SPL_UMUL_32_16(
 993                             (uint32_t)aecm->echoFilt[i],
 994                             (uint16_t)WEBRTC_SPL_RSHIFT_W16(supGain, tmp16no1));
 995       } else {
 996         // Result in Q-(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN-16)
 997         echoEst32Gained = WEBRTC_SPL_UMUL_32_16(
 998                             (uint32_t)WEBRTC_SPL_RSHIFT_W32(aecm->echoFilt[i],
 999                                                             tmp16no1),
1000                             (uint16_t)supGain);
1001       }
1002     }
1003
1004     zeros16 = WebRtcSpl_NormW16(aecm->nearFilt[i]);
1005     assert(zeros16 >= 0);  // |zeros16| is a norm, hence non-negative.
1006     dfa_clean_q_domain_diff = aecm->dfaCleanQDomain - aecm->dfaCleanQDomainOld;
1007     if (zeros16 < dfa_clean_q_domain_diff && aecm->nearFilt[i]) {
1008       tmp16no1 = aecm->nearFilt[i] << zeros16;
1009       qDomainDiff = zeros16 - dfa_clean_q_domain_diff;
1010       tmp16no2 = ptrDfaClean[i] >> -qDomainDiff;
1011     } else {
1012       tmp16no1 = dfa_clean_q_domain_diff < 0
1013           ? aecm->nearFilt[i] >> -dfa_clean_q_domain_diff
1014           : aecm->nearFilt[i] << dfa_clean_q_domain_diff;
1015       qDomainDiff = 0;
1016       tmp16no2 = ptrDfaClean[i];
1017     }
1018
1019     tmp32no1 = (int32_t)(tmp16no2 - tmp16no1);
1020     tmp16no2 = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 4);
1021     tmp16no2 += tmp16no1;
1022     zeros16 = WebRtcSpl_NormW16(tmp16no2);
1023     if ((tmp16no2) & (-qDomainDiff > zeros16)) {
1024       aecm->nearFilt[i] = WEBRTC_SPL_WORD16_MAX;
1025     } else {
1026       aecm->nearFilt[i] = qDomainDiff < 0 ? tmp16no2 << -qDomainDiff
1027                                           : tmp16no2 >> qDomainDiff;
1028     }
1029
1030     // Wiener filter coefficients, resulting hnl in Q14
1031     if (echoEst32Gained == 0) {
1032       hnl[i] = ONE_Q14;
1033       numPosCoef++;
1034     } else if (aecm->nearFilt[i] == 0) {
1035       hnl[i] = 0;
1036     } else {
1037       // Multiply the suppression gain
1038       // Rounding
1039       echoEst32Gained += (uint32_t)(aecm->nearFilt[i] >> 1);
1040       tmpU32 = WebRtcSpl_DivU32U16(echoEst32Gained,
1041                                    (uint16_t)aecm->nearFilt[i]);
1042
1043       // Current resolution is
1044       // Q-(RESOLUTION_CHANNEL + RESOLUTION_SUPGAIN
1045       //    - max(0, 17 - zeros16 - zeros32))
1046       // Make sure we are in Q14
1047       tmp32no1 = (int32_t)WEBRTC_SPL_SHIFT_W32(tmpU32, resolutionDiff);
1048       if (tmp32no1 > ONE_Q14) {
1049         hnl[i] = 0;
1050       } else if (tmp32no1 < 0) {
1051         hnl[i] = ONE_Q14;
1052         numPosCoef++;
1053       } else {
1054         // 1-echoEst/dfa
1055         hnl[i] = ONE_Q14 - (int16_t)tmp32no1;
1056         if (hnl[i] <= 0) {
1057           hnl[i] = 0;
1058         } else {
1059           numPosCoef++;
1060         }
1061       }
1062     }
1063   }
1064
1065   // Only in wideband. Prevent the gain in upper band from being larger than
1066   // in lower band.
1067   if (aecm->mult == 2) {
1068     // TODO(bjornv): Investigate if the scaling of hnl[i] below can cause
1069     //               speech distortion in double-talk.
1070     for (i = 0; i < (PART_LEN1 >> 3); i++) {
1071       __asm __volatile (
1072         "lh         %[temp1],       0(%[ptr1])                  \n\t"
1073         "lh         %[temp2],       2(%[ptr1])                  \n\t"
1074         "lh         %[temp3],       4(%[ptr1])                  \n\t"
1075         "lh         %[temp4],       6(%[ptr1])                  \n\t"
1076         "lh         %[temp5],       8(%[ptr1])                  \n\t"
1077         "lh         %[temp6],       10(%[ptr1])                 \n\t"
1078         "lh         %[temp7],       12(%[ptr1])                 \n\t"
1079         "lh         %[temp8],       14(%[ptr1])                 \n\t"
1080         "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
1081         "mul        %[temp2],       %[temp2],       %[temp2]    \n\t"
1082         "mul        %[temp3],       %[temp3],       %[temp3]    \n\t"
1083         "mul        %[temp4],       %[temp4],       %[temp4]    \n\t"
1084         "mul        %[temp5],       %[temp5],       %[temp5]    \n\t"
1085         "mul        %[temp6],       %[temp6],       %[temp6]    \n\t"
1086         "mul        %[temp7],       %[temp7],       %[temp7]    \n\t"
1087         "mul        %[temp8],       %[temp8],       %[temp8]    \n\t"
1088         "sra        %[temp1],       %[temp1],       14          \n\t"
1089         "sra        %[temp2],       %[temp2],       14          \n\t"
1090         "sra        %[temp3],       %[temp3],       14          \n\t"
1091         "sra        %[temp4],       %[temp4],       14          \n\t"
1092         "sra        %[temp5],       %[temp5],       14          \n\t"
1093         "sra        %[temp6],       %[temp6],       14          \n\t"
1094         "sra        %[temp7],       %[temp7],       14          \n\t"
1095         "sra        %[temp8],       %[temp8],       14          \n\t"
1096         "sh         %[temp1],       0(%[ptr1])                  \n\t"
1097         "sh         %[temp2],       2(%[ptr1])                  \n\t"
1098         "sh         %[temp3],       4(%[ptr1])                  \n\t"
1099         "sh         %[temp4],       6(%[ptr1])                  \n\t"
1100         "sh         %[temp5],       8(%[ptr1])                  \n\t"
1101         "sh         %[temp6],       10(%[ptr1])                 \n\t"
1102         "sh         %[temp7],       12(%[ptr1])                 \n\t"
1103         "sh         %[temp8],       14(%[ptr1])                 \n\t"
1104         "addiu      %[ptr1],        %[ptr1],        16          \n\t"
1105         : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
1106           [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [temp6] "=&r" (temp6),
1107           [temp7] "=&r" (temp7), [temp8] "=&r" (temp8), [ptr1] "+r" (ptr1)
1108         :
1109         : "memory", "hi", "lo"
1110       );
1111     }
1112     for(i = 0; i < (PART_LEN1 & 7); i++) {
1113       __asm __volatile (
1114         "lh         %[temp1],       0(%[ptr1])                  \n\t"
1115         "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
1116         "sra        %[temp1],       %[temp1],       14          \n\t"
1117         "sh         %[temp1],       0(%[ptr1])                  \n\t"
1118         "addiu      %[ptr1],        %[ptr1],        2           \n\t"
1119         : [temp1] "=&r" (temp1), [ptr1] "+r" (ptr1)
1120         :
1121         : "memory", "hi", "lo"
1122       );
1123     }
1124
1125     for (i = kMinPrefBand; i <= kMaxPrefBand; i++) {
1126       avgHnl32 += (int32_t)hnl[i];
1127     }
1128
1129     assert(kMaxPrefBand - kMinPrefBand + 1 > 0);
1130     avgHnl32 /= (kMaxPrefBand - kMinPrefBand + 1);
1131
1132     for (i = kMaxPrefBand; i < PART_LEN1; i++) {
1133       if (hnl[i] > (int16_t)avgHnl32) {
1134         hnl[i] = (int16_t)avgHnl32;
1135       }
1136     }
1137   }
1138
1139   // Calculate NLP gain, result is in Q14
1140   if (aecm->nlpFlag) {
1141     if (numPosCoef < 3) {
1142       for (i = 0; i < PART_LEN1; i++) {
1143         efw[i].real = 0;
1144         efw[i].imag = 0;
1145         hnl[i] = 0;
1146       }
1147     } else {
1148       for (i = 0; i < PART_LEN1; i++) {
1149 #if defined(MIPS_DSP_R1_LE)
1150         __asm __volatile (
1151           ".set       push                                        \n\t"
1152           ".set       noreorder                                   \n\t"
1153           "lh         %[temp1],       0(%[ptr])                   \n\t"
1154           "lh         %[temp2],       0(%[dr_ptr])                \n\t"
1155           "slti       %[temp4],       %[temp1],       0x4001      \n\t"
1156           "beqz       %[temp4],       3f                          \n\t"
1157           " lh        %[temp3],       2(%[dr_ptr])                \n\t"
1158           "slti       %[temp5],       %[temp1],       3277        \n\t"
1159           "bnez       %[temp5],       2f                          \n\t"
1160           " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
1161           "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
1162           "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
1163           "shra_r.w   %[temp2],       %[temp2],       14          \n\t"
1164           "shra_r.w   %[temp3],       %[temp3],       14          \n\t"
1165           "b          4f                                          \n\t"
1166           " nop                                                   \n\t"
1167          "2:                                                      \n\t"
1168           "addu       %[temp1],       $zero,          $zero       \n\t"
1169           "addu       %[temp2],       $zero,          $zero       \n\t"
1170           "addu       %[temp3],       $zero,          $zero       \n\t"
1171           "b          1f                                          \n\t"
1172           " nop                                                   \n\t"
1173          "3:                                                      \n\t"
1174           "addiu      %[temp1],       $0,             0x4000      \n\t"
1175          "1:                                                      \n\t"
1176           "sh         %[temp1],       0(%[ptr])                   \n\t"
1177          "4:                                                      \n\t"
1178           "sh         %[temp2],       0(%[er_ptr])                \n\t"
1179           "sh         %[temp3],       2(%[er_ptr])                \n\t"
1180           "addiu      %[ptr],         %[ptr],         2           \n\t"
1181           "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
1182           ".set       pop                                         \n\t"
1183           : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
1184             [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [ptr] "+r" (ptr),
1185             [er_ptr] "+r" (er_ptr), [dr_ptr] "+r" (dr_ptr)
1186           :
1187           : "memory", "hi", "lo"
1188         );
1189 #else
1190         __asm __volatile (
1191           ".set       push                                        \n\t"
1192           ".set       noreorder                                   \n\t"
1193           "lh         %[temp1],       0(%[ptr])                   \n\t"
1194           "lh         %[temp2],       0(%[dr_ptr])                \n\t"
1195           "slti       %[temp4],       %[temp1],       0x4001      \n\t"
1196           "beqz       %[temp4],       3f                          \n\t"
1197           " lh        %[temp3],       2(%[dr_ptr])                \n\t"
1198           "slti       %[temp5],       %[temp1],       3277        \n\t"
1199           "bnez       %[temp5],       2f                          \n\t"
1200           " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
1201           "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
1202           "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
1203           "addiu      %[temp2],       %[temp2],       0x2000      \n\t"
1204           "addiu      %[temp3],       %[temp3],       0x2000      \n\t"
1205           "sra        %[temp2],       %[temp2],       14          \n\t"
1206           "sra        %[temp3],       %[temp3],       14          \n\t"
1207           "b          4f                                          \n\t"
1208           " nop                                                   \n\t"
1209          "2:                                                      \n\t"
1210           "addu       %[temp1],       $zero,          $zero       \n\t"
1211           "addu       %[temp2],       $zero,          $zero       \n\t"
1212           "addu       %[temp3],       $zero,          $zero       \n\t"
1213           "b          1f                                          \n\t"
1214           " nop                                                   \n\t"
1215          "3:                                                      \n\t"
1216           "addiu      %[temp1],       $0,             0x4000      \n\t"
1217          "1:                                                      \n\t"
1218           "sh         %[temp1],       0(%[ptr])                   \n\t"
1219          "4:                                                      \n\t"
1220           "sh         %[temp2],       0(%[er_ptr])                \n\t"
1221           "sh         %[temp3],       2(%[er_ptr])                \n\t"
1222           "addiu      %[ptr],         %[ptr],         2           \n\t"
1223           "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
1224           ".set       pop                                         \n\t"
1225           : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
1226             [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [ptr] "+r" (ptr),
1227             [er_ptr] "+r" (er_ptr), [dr_ptr] "+r" (dr_ptr)
1228           :
1229           : "memory", "hi", "lo"
1230         );
1231 #endif
1232       }
1233     }
1234   }
1235   else {
1236     // multiply with Wiener coefficients
1237     for (i = 0; i < PART_LEN1; i++) {
1238       efw[i].real = (int16_t)
1239                       (WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real,
1240                                                             hnl[i],
1241                                                             14));
1242       efw[i].imag = (int16_t)
1243                       (WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag,
1244                                                             hnl[i],
1245                                                             14));
1246     }
1247   }
1248
1249   if (aecm->cngMode == AecmTrue) {
1250     ComfortNoise(aecm, ptrDfaClean, efw, hnl);
1251   }
1252
1253   InverseFFTAndWindow(aecm, fft, efw, output, nearendClean);
1254
1255   return 0;
1256 }
1257
1258 // Generate comfort noise and add to output signal.
1259 static void ComfortNoise(AecmCore_t* aecm,
1260                          const uint16_t* dfa,
1261                          complex16_t* out,
1262                          const int16_t* lambda) {
1263   int16_t i;
1264   int16_t tmp16, tmp161, tmp162, tmp163, nrsh1, nrsh2;
1265   int32_t tmp32, tmp321, tnoise, tnoise1;
1266   int32_t tmp322, tmp323, *tmp1;
1267   int16_t* dfap;
1268   int16_t* lambdap;
1269   const int32_t c2049 = 2049;
1270   const int32_t c359 = 359;
1271   const int32_t c114 = ONE_Q14;
1272
1273   int16_t randW16[PART_LEN];
1274   int16_t uReal[PART_LEN1];
1275   int16_t uImag[PART_LEN1];
1276   int32_t outLShift32;
1277
1278   int16_t shiftFromNearToNoise = kNoiseEstQDomain - aecm->dfaCleanQDomain;
1279   int16_t minTrackShift = 9;
1280
1281   assert(shiftFromNearToNoise >= 0);
1282   assert(shiftFromNearToNoise < 16);
1283
1284   if (aecm->noiseEstCtr < 100) {
1285     // Track the minimum more quickly initially.
1286     aecm->noiseEstCtr++;
1287     minTrackShift = 6;
1288   }
1289
1290   // Generate a uniform random array on [0 2^15-1].
1291   WebRtcSpl_RandUArray(randW16, PART_LEN, &aecm->seed);
1292   int16_t* randW16p = (int16_t*)randW16;
1293 #if defined (MIPS_DSP_R1_LE)
1294   int16_t* kCosTablep = (int16_t*)WebRtcAecm_kCosTable;
1295   int16_t* kSinTablep = (int16_t*)WebRtcAecm_kSinTable;
1296 #endif   // #if defined(MIPS_DSP_R1_LE)
1297   tmp1 = (int32_t*)aecm->noiseEst + 1;
1298   dfap = (int16_t*)dfa + 1;
1299   lambdap = (int16_t*)lambda + 1;
1300   // Estimate noise power.
1301   for (i = 1; i < PART_LEN1; i+=2) {
1302   // Shift to the noise domain.
1303     __asm __volatile (
1304       "lh     %[tmp32],       0(%[dfap])                              \n\t"
1305       "lw     %[tnoise],      0(%[tmp1])                              \n\t"
1306       "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
1307       : [tmp32] "=&r" (tmp32), [outLShift32] "=r" (outLShift32),
1308         [tnoise] "=&r" (tnoise)
1309       : [tmp1] "r" (tmp1), [dfap] "r" (dfap),
1310         [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
1311       : "memory"
1312     );
1313
1314     if (outLShift32 < tnoise) {
1315       // Reset "too low" counter
1316       aecm->noiseEstTooLowCtr[i] = 0;
1317       // Track the minimum.
1318       if (tnoise < (1 << minTrackShift)) {
1319         // For small values, decrease noiseEst[i] every
1320         // |kNoiseEstIncCount| block. The regular approach below can not
1321         // go further down due to truncation.
1322         aecm->noiseEstTooHighCtr[i]++;
1323         if (aecm->noiseEstTooHighCtr[i] >= kNoiseEstIncCount) {
1324           tnoise--;
1325           aecm->noiseEstTooHighCtr[i] = 0;  // Reset the counter
1326         }
1327       } else {
1328         __asm __volatile (
1329           "subu   %[tmp32],       %[tnoise],      %[outLShift32]      \n\t"
1330           "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
1331           "subu   %[tnoise],      %[tnoise],      %[tmp32]            \n\t"
1332           : [tmp32] "=&r" (tmp32), [tnoise] "+r" (tnoise)
1333           : [outLShift32] "r" (outLShift32), [minTrackShift] "r" (minTrackShift)
1334         );
1335       }
1336     } else {
1337       // Reset "too high" counter
1338       aecm->noiseEstTooHighCtr[i] = 0;
1339       // Ramp slowly upwards until we hit the minimum again.
1340       if ((tnoise >> 19) <= 0) {
1341         if ((tnoise >> 11) > 0) {
1342           // Large enough for relative increase
1343           __asm __volatile (
1344             "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
1345             "sra    %[tnoise],  %[tnoise],  11          \n\t"
1346             : [tnoise] "+r" (tnoise)
1347             : [c2049] "r" (c2049)
1348             : "hi", "lo"
1349           );
1350         } else {
1351           // Make incremental increases based on size every
1352           // |kNoiseEstIncCount| block
1353           aecm->noiseEstTooLowCtr[i]++;
1354           if (aecm->noiseEstTooLowCtr[i] >= kNoiseEstIncCount) {
1355             __asm __volatile (
1356               "sra    %[tmp32],   %[tnoise],  9           \n\t"
1357               "addi   %[tnoise],  %[tnoise],  1           \n\t"
1358               "addu   %[tnoise],  %[tnoise],  %[tmp32]    \n\t"
1359               : [tnoise] "+r" (tnoise), [tmp32] "=&r" (tmp32)
1360               :
1361             );
1362             aecm->noiseEstTooLowCtr[i] = 0; // Reset counter
1363           }
1364         }
1365       } else {
1366         // Avoid overflow.
1367         // Multiplication with 2049 will cause wrap around. Scale
1368         // down first and then multiply
1369         __asm __volatile (
1370           "sra    %[tnoise],  %[tnoise],  11          \n\t"
1371           "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
1372           : [tnoise] "+r" (tnoise)
1373           : [c2049] "r" (c2049)
1374           : "hi", "lo"
1375         );
1376       }
1377     }
1378
1379     // Shift to the noise domain.
1380     __asm __volatile (
1381       "lh     %[tmp32],       2(%[dfap])                              \n\t"
1382       "lw     %[tnoise1],     4(%[tmp1])                              \n\t"
1383       "addiu  %[dfap],        %[dfap],    4                           \n\t"
1384       "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
1385       : [tmp32] "=&r" (tmp32), [dfap] "+r" (dfap),
1386         [outLShift32] "=r" (outLShift32), [tnoise1] "=&r" (tnoise1)
1387       : [tmp1] "r" (tmp1), [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
1388       : "memory"
1389     );
1390
1391     if (outLShift32 < tnoise1) {
1392       // Reset "too low" counter
1393       aecm->noiseEstTooLowCtr[i + 1] = 0;
1394       // Track the minimum.
1395       if (tnoise1 < (1 << minTrackShift)) {
1396         // For small values, decrease noiseEst[i] every
1397         // |kNoiseEstIncCount| block. The regular approach below can not
1398         // go further down due to truncation.
1399         aecm->noiseEstTooHighCtr[i + 1]++;
1400         if (aecm->noiseEstTooHighCtr[i + 1] >= kNoiseEstIncCount) {
1401           tnoise1--;
1402           aecm->noiseEstTooHighCtr[i + 1] = 0; // Reset the counter
1403         }
1404       } else {
1405         __asm __volatile (
1406           "subu   %[tmp32],       %[tnoise1],     %[outLShift32]      \n\t"
1407           "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
1408           "subu   %[tnoise1],     %[tnoise1],     %[tmp32]            \n\t"
1409           : [tmp32] "=&r" (tmp32), [tnoise1] "+r" (tnoise1)
1410           : [outLShift32] "r" (outLShift32), [minTrackShift] "r" (minTrackShift)
1411         );
1412       }
1413     } else {
1414       // Reset "too high" counter
1415       aecm->noiseEstTooHighCtr[i + 1] = 0;
1416       // Ramp slowly upwards until we hit the minimum again.
1417       if ((tnoise1 >> 19) <= 0) {
1418         if ((tnoise1 >> 11) > 0) {
1419           // Large enough for relative increase
1420           __asm __volatile (
1421             "mul    %[tnoise1], %[tnoise1], %[c2049]   \n\t"
1422             "sra    %[tnoise1], %[tnoise1], 11         \n\t"
1423             : [tnoise1] "+r" (tnoise1)
1424             : [c2049] "r" (c2049)
1425             : "hi", "lo"
1426           );
1427         } else {
1428           // Make incremental increases based on size every
1429           // |kNoiseEstIncCount| block
1430           aecm->noiseEstTooLowCtr[i + 1]++;
1431           if (aecm->noiseEstTooLowCtr[i + 1] >= kNoiseEstIncCount) {
1432             __asm __volatile (
1433               "sra    %[tmp32],   %[tnoise1], 9           \n\t"
1434               "addi   %[tnoise1], %[tnoise1], 1           \n\t"
1435               "addu   %[tnoise1], %[tnoise1], %[tmp32]    \n\t"
1436               : [tnoise1] "+r" (tnoise1), [tmp32] "=&r" (tmp32)
1437               :
1438             );
1439             aecm->noiseEstTooLowCtr[i + 1] = 0; // Reset counter
1440           }
1441         }
1442       } else {
1443         // Avoid overflow.
1444         // Multiplication with 2049 will cause wrap around. Scale
1445         // down first and then multiply
1446         __asm __volatile (
1447           "sra    %[tnoise1], %[tnoise1], 11          \n\t"
1448           "mul    %[tnoise1], %[tnoise1], %[c2049]    \n\t"
1449           : [tnoise1] "+r" (tnoise1)
1450           : [c2049] "r" (c2049)
1451           : "hi", "lo"
1452         );
1453       }
1454     }
1455
1456     __asm __volatile (
1457       "lh     %[tmp16],   0(%[lambdap])                           \n\t"
1458       "lh     %[tmp161],  2(%[lambdap])                           \n\t"
1459       "sw     %[tnoise],  0(%[tmp1])                              \n\t"
1460       "sw     %[tnoise1], 4(%[tmp1])                              \n\t"
1461       "subu   %[tmp16],   %[c114],        %[tmp16]                \n\t"
1462       "subu   %[tmp161],  %[c114],        %[tmp161]               \n\t"
1463       "srav   %[tmp32],   %[tnoise],      %[shiftFromNearToNoise] \n\t"
1464       "srav   %[tmp321],  %[tnoise1],     %[shiftFromNearToNoise] \n\t"
1465       "addiu  %[lambdap], %[lambdap],     4                       \n\t"
1466       "addiu  %[tmp1],    %[tmp1],        8                       \n\t"
1467       : [tmp16] "=&r" (tmp16), [tmp161] "=&r" (tmp161), [tmp1] "+r" (tmp1),
1468         [tmp32] "=&r" (tmp32), [tmp321] "=&r" (tmp321), [lambdap] "+r" (lambdap)
1469       : [tnoise] "r" (tnoise), [tnoise1] "r" (tnoise1), [c114] "r" (c114),
1470         [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
1471       : "memory"
1472     );
1473
1474     if (tmp32 > 32767) {
1475       tmp32 = 32767;
1476       aecm->noiseEst[i] = WEBRTC_SPL_LSHIFT_W32(tmp32, shiftFromNearToNoise);
1477     }
1478     if (tmp321 > 32767) {
1479       tmp321 = 32767;
1480       aecm->noiseEst[i+1] = WEBRTC_SPL_LSHIFT_W32(tmp321, shiftFromNearToNoise);
1481     }
1482
1483     __asm __volatile (
1484       "mul    %[tmp32],   %[tmp32],       %[tmp16]                \n\t"
1485       "mul    %[tmp321],  %[tmp321],      %[tmp161]               \n\t"
1486       "sra    %[nrsh1],   %[tmp32],       14                      \n\t"
1487       "sra    %[nrsh2],   %[tmp321],      14                      \n\t"
1488       : [nrsh1] "=&r" (nrsh1), [nrsh2] "=r" (nrsh2)
1489       : [tmp16] "r" (tmp16), [tmp161] "r" (tmp161), [tmp32] "r" (tmp32),
1490         [tmp321] "r" (tmp321)
1491       : "memory", "hi", "lo"
1492     );
1493
1494     __asm __volatile (
1495       "lh     %[tmp32],       0(%[randW16p])              \n\t"
1496       "lh     %[tmp321],      2(%[randW16p])              \n\t"
1497       "addiu  %[randW16p],    %[randW16p],    4           \n\t"
1498       "mul    %[tmp32],       %[tmp32],       %[c359]     \n\t"
1499       "mul    %[tmp321],      %[tmp321],      %[c359]     \n\t"
1500       "sra    %[tmp16],       %[tmp32],       15          \n\t"
1501       "sra    %[tmp161],      %[tmp321],      15          \n\t"
1502       : [randW16p] "+r" (randW16p), [tmp32] "=&r" (tmp32),
1503         [tmp16] "=r" (tmp16), [tmp161] "=r" (tmp161), [tmp321] "=&r" (tmp321)
1504       : [c359] "r" (c359)
1505       : "memory", "hi", "lo"
1506     );
1507
1508 #if !defined(MIPS_DSP_R1_LE)
1509     tmp32 = WebRtcAecm_kCosTable[tmp16];
1510     tmp321 = WebRtcAecm_kSinTable[tmp16];
1511     tmp322 = WebRtcAecm_kCosTable[tmp161];
1512     tmp323 = WebRtcAecm_kSinTable[tmp161];
1513 #else
1514     __asm __volatile (
1515       "sll    %[tmp16],       %[tmp16],                   1           \n\t"
1516       "sll    %[tmp161],      %[tmp161],                  1           \n\t"
1517       "lhx    %[tmp32],       %[tmp16](%[kCosTablep])                 \n\t"
1518       "lhx    %[tmp321],      %[tmp16](%[kSinTablep])                 \n\t"
1519       "lhx    %[tmp322],      %[tmp161](%[kCosTablep])                \n\t"
1520       "lhx    %[tmp323],      %[tmp161](%[kSinTablep])                \n\t"
1521       : [tmp32] "=&r" (tmp32), [tmp321] "=&r" (tmp321),
1522         [tmp322] "=&r" (tmp322), [tmp323] "=&r" (tmp323)
1523       : [kCosTablep] "r" (kCosTablep), [tmp16] "r" (tmp16),
1524         [tmp161] "r" (tmp161), [kSinTablep] "r" (kSinTablep)
1525       : "memory"
1526     );
1527 #endif
1528     __asm __volatile (
1529       "mul    %[tmp32],       %[tmp32],                   %[nrsh1]    \n\t"
1530       "negu   %[tmp162],      %[nrsh1]                                \n\t"
1531       "mul    %[tmp322],      %[tmp322],                  %[nrsh2]    \n\t"
1532       "negu   %[tmp163],      %[nrsh2]                                \n\t"
1533       "sra    %[tmp32],       %[tmp32],                   13          \n\t"
1534       "mul    %[tmp321],      %[tmp321],                  %[tmp162]   \n\t"
1535       "sra    %[tmp322],      %[tmp322],                  13          \n\t"
1536       "mul    %[tmp323],      %[tmp323],                  %[tmp163]   \n\t"
1537       "sra    %[tmp321],      %[tmp321],                  13          \n\t"
1538       "sra    %[tmp323],      %[tmp323],                  13          \n\t"
1539       : [tmp32] "+r" (tmp32), [tmp321] "+r" (tmp321), [tmp162] "=&r" (tmp162),
1540         [tmp322] "+r" (tmp322), [tmp323] "+r" (tmp323), [tmp163] "=&r" (tmp163)
1541       : [nrsh1] "r" (nrsh1), [nrsh2] "r" (nrsh2)
1542       : "hi", "lo"
1543     );
1544     // Tables are in Q13.
1545     uReal[i] = (int16_t)tmp32;
1546     uImag[i] = (int16_t)tmp321;
1547     uReal[i + 1] = (int16_t)tmp322;
1548     uImag[i + 1] = (int16_t)tmp323;
1549   }
1550
1551   int32_t tt, sgn;
1552   tt = out[0].real;
1553   sgn = ((int)tt) >> 31;
1554   out[0].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1555   tt = out[0].imag;
1556   sgn = ((int)tt) >> 31;
1557   out[0].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1558   for (i = 1; i < PART_LEN; i++) {
1559     tt = out[i].real + uReal[i];
1560     sgn = ((int)tt) >> 31;
1561     out[i].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1562     tt = out[i].imag + uImag[i];
1563     sgn = ((int)tt) >> 31;
1564     out[i].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1565   }
1566   tt = out[PART_LEN].real + uReal[PART_LEN];
1567   sgn = ((int)tt) >> 31;
1568   out[PART_LEN].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1569   tt = out[PART_LEN].imag;
1570   sgn = ((int)tt) >> 31;
1571   out[PART_LEN].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1572 }
1573