3 * Copyright (C) DFS Deutsche Flugsicherung (2004, 2005).
6 * Acoustic Echo Cancellation NLMS-pw algorithm
8 * Version 0.3 filter created with www.dsptutor.freeuk.com
9 * Version 0.3.1 Allow change of stability parameter delta
10 * Version 0.4 Leaky Normalized LMS - pre whitening algorithm
21 #include <pulse/xmalloc.h>
23 #include "adrian-aec.h"
26 #include "adrian-aec-orc-gen.h"
30 #include <xmmintrin.h>
33 /* Vector Dot Product */
34 static REAL dotp(REAL a[], REAL b[])
36 REAL sum0 = 0.0, sum1 = 0.0;
39 for (j = 0; j < NLMS_LEN; j += 2) {
40 // optimize: partial loop unrolling
42 sum1 += a[j + 1] * b[j + 1];
47 static REAL dotp_sse(REAL a[], REAL b[])
50 /* This is taken from speex's inner product implementation */
53 __m128 acc = _mm_setzero_ps();
55 for (j=0;j<NLMS_LEN;j+=8)
57 acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j), _mm_loadu_ps(b+j)));
58 acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j+4), _mm_loadu_ps(b+j+4)));
60 acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
61 acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));
62 _mm_store_ss(&sum, acc);
71 AEC* AEC_init(int RATE, int have_vector)
73 AEC *a = pa_xnew(AEC, 1);
75 memset(a->x, 0, sizeof(a->x));
76 memset(a->xf, 0, sizeof(a->xf));
77 memset(a->w_arr, 0, sizeof(a->w_arr));
80 AEC_setambient(a, NoiseFloor);
81 a->dfast = a->dslow = M75dB_PCM;
82 a->xfast = a->xslow = M80dB_PCM;
84 a->Fx = IIR1_init(2000.0f/RATE);
85 a->Fe = IIR1_init(2000.0f/RATE);
86 a->cutoff = FIR_HP_300Hz_init();
87 a->acMic = IIR_HP_init();
88 a->acSpk = IIR_HP_init();
94 memset(a->ws, 0, sizeof(a->ws));
97 /* Get a 16-byte aligned location */
98 a->w = (REAL *) (((uintptr_t) a->w_arr) + (((uintptr_t) a->w_arr) % 16));
101 /* We don't care about alignment, just use the array as-is */
109 void AEC_done(AEC *a) {
120 // Adrian soft decision DTD
121 // (Dual Average Near-End to Far-End signal Ratio DTD)
122 // This algorithm uses exponential smoothing with differnt
123 // ageing parameters to get fast and slow near-end and far-end
124 // signal averages. The ratio of NFRs term
125 // (dfast / xfast) / (dslow / xslow) is used to compute the stepsize
126 // A ratio value of 2.5 is mapped to stepsize 0, a ratio of 0 is
127 // mapped to 1.0 with a limited linear function.
128 static float AEC_dtd(AEC *a, REAL d, REAL x)
130 float ratio, stepsize;
132 // fast near-end and far-end average
133 a->dfast += ALPHAFAST * (fabsf(d) - a->dfast);
134 a->xfast += ALPHAFAST * (fabsf(x) - a->xfast);
136 // slow near-end and far-end average
137 a->dslow += ALPHASLOW * (fabsf(d) - a->dslow);
138 a->xslow += ALPHASLOW * (fabsf(x) - a->xslow);
140 if (a->xfast < M70dB_PCM) {
141 return 0.0; // no Spk signal
144 if (a->dfast < M70dB_PCM) {
145 return 0.0; // no Mic signal
149 ratio = (a->dfast * a->xslow) / (a->dslow * a->xfast);
151 // Linear interpolation with clamping at the limits
154 else if (ratio > STEPX2)
157 stepsize = STEPY1 + (STEPY2 - STEPY1) * (ratio - STEPX1) / (STEPX2 - STEPX1);
163 static void AEC_leaky(AEC *a)
164 // The xfast signal is used to charge the hangover timer to Thold.
165 // When hangover expires (no Spk signal for some time) the vector w
166 // is erased. This is my implementation of Leaky NLMS.
168 if (a->xfast >= M70dB_PCM) {
169 // vector w is valid for hangover Thold time
172 if (a->hangover > 1) {
174 } else if (1 == a->hangover) {
176 // My Leaky NLMS is to erase vector w when hangover expires
177 memset(a->w_arr, 0, sizeof(a->w_arr));
184 void AEC::openwdisplay() {
185 // open TCP connection to program wdisplay.tcl
186 fdwdisplay = socket_async("127.0.0.1", 50999);
191 static REAL AEC_nlms_pw(AEC *a, REAL d, REAL x_, float stepsize)
196 a->xf[a->j] = IIR1_highpass(a->Fx, x_); // pre-whitening of x
198 // calculate error value
199 // (mic signal - estimated mic signal from spk signal)
201 if (a->hangover > 0) {
202 e -= a->dotp(a->w, a->x + a->j);
204 ef = IIR1_highpass(a->Fe, e); // pre-whitening of e
206 // optimize: iterative dotp(xf, xf)
207 a->dotp_xf_xf += (a->xf[a->j] * a->xf[a->j] - a->xf[a->j + NLMS_LEN - 1] * a->xf[a->j + NLMS_LEN - 1]);
209 if (stepsize > 0.0) {
210 // calculate variable step size
211 REAL mikro_ef = stepsize * ef / a->dotp_xf_xf;
214 // update tap weights (filter learning)
216 for (i = 0; i < NLMS_LEN; i += 2) {
217 // optimize: partial loop unrolling
218 a->w[i] += mikro_ef * a->xf[i + a->j];
219 a->w[i + 1] += mikro_ef * a->xf[i + a->j + 1];
222 update_tap_weights(a->w, &a->xf[a->j], mikro_ef, NLMS_LEN);
227 // optimize: decrease number of memory copies
229 memmove(a->x + a->j + 1, a->x, (NLMS_LEN - 1) * sizeof(REAL));
230 memmove(a->xf + a->j + 1, a->xf, (NLMS_LEN - 1) * sizeof(REAL));
236 } else if (e < -MAXPCM) {
244 int AEC_doAEC(AEC *a, int d_, int x_)
249 // Mic Highpass Filter - to remove DC
250 d = IIR_HP_highpass(a->acMic, d);
252 // Mic Highpass Filter - cut-off below 300Hz
253 d = FIR_HP_300Hz_highpass(a->cutoff, d);
255 // Amplify, for e.g. Soundcards with -6dB max. volume
258 // Spk Highpass Filter - to remove DC
259 x = IIR_HP_highpass(a->acSpk, x);
261 // Double Talk Detector
262 a->stepsize = AEC_dtd(a, d, x);
264 // Leaky (ageing of vector w)
267 // Acoustic Echo Cancellation
268 d = AEC_nlms_pw(a, d, x, a->stepsize);
271 if (fdwdisplay >= 0) {
272 if (++dumpcnt >= (WIDEB*RATE/10)) {
273 // wdisplay creates 10 dumps per seconds = large CPU load!
275 write(fdwdisplay, ws, DUMP_LEN*sizeof(float));
276 // we don't check return value. This is not production quality!!!
277 memset(ws, 0, sizeof(ws));
280 for (i = 0; i < DUMP_LEN; i += 2) {
281 // optimize: partial loop unrolling
283 ws[i + 1] += w[i + 1];