/* Source listing: freeswitch/libs/spandsp/src/echo.c (622 lines, 23 KiB, C) */

/*
* SpanDSP - a series of DSP components for telephony
*
* echo.c - An echo cancellor, suitable for electrical and acoustic
* cancellation. This code does not currently comply with
* any relevant standards (e.g. G.164/5/7/8). One day....
*
* Written by Steve Underwood <steveu@coppice.org>
*
* Copyright (C) 2001, 2003 Steve Underwood
*
* Based on a bit from here, a bit from there, eye of toad,
* ear of bat, etc - plus, of course, my own 2 cents.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 2.1,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*! \file */
/* TODO:
Finish the echo suppressor option, however nasty suppression may be.
Add an option to reintroduce side tone at -24dB under appropriate conditions.
Improve double talk detector (iterative!)
*/
/* We need to differentiate between transmitted energy which will train the echo
canceller well (voice, white noise, and other broadband sources) and energy
which will train it badly (supervisory tones, DTMF, whistles, and other
narrowband sources). There are many ways this might be done. This canceller uses
a method based on the autocorrelation qualities of the transmitted signal. A rather
peaky autocorrelation function is a clear sign of a narrowband signal. We only need
perform the autocorrelation at well spaced intervals, so the compute load is not too
great. Multiple successive autocorrelation functions with a similar peaky shape are a
clear indication of a stationary narrowband signal. Using TKEO, it should be possible to
greatly reduce the compute requirement for narrowband detection. */
/* The FIR taps must be adapted as 32 bit values, to get the necessary finesse
in the adaption process. However, they are applied as 16 bit values (bits 30-15
of the 32 bit values) in the FIR. For the working 16 bit values, we need 4 sets.
3 of the 16 bit sets are used on a rotating basis. Normally the canceller steps
round these 3 sets at regular intervals. Any time we detect double talk, we can go
back to the set from two steps ago with reasonable assurance it is a well adapted
set. We cannot just go back one step, as we may have rotated the sets just before
double talk or tone was detected, and that set may already be somewhat corrupted.
When narrowband energy is detected we need to continue adapting to it, to echo
cancel it. However, the adaption will almost certainly be going astray. Broadband
(or even complex sequences of narrowband) energy will normally lead to a well
trained cancellor, with taps matching the impulse response of the channel.
For stationary narrowband energy, there is usually an infinite number of
alternative tap sets which will cancel it well. A previously well trained set of
taps will tend to drift amongst the alternatives. When broadband energy resumes, the
taps may be a total mismatch for the signal, and could even amplify rather than
attenuate the echo. The solution is to use a fourth set of 16 bit taps. When we first
detect the narrowband energy we save the oldest of the group of three sets, but do
not change back to an older set. We let the canceller cancel, and its adaption drift
while the narrowband energy is present. When we detect the narrowband energy has ceased,
we switch to using the fourth set of taps which was saved.
When we revert to an older set of taps, we must replace both the 16 bit and 32 bit
working tap sets. The saved 16 bit values are good enough to also be used as a replacement
for the 32 bit values. We lose the fractions, but they should soon settle down in a
reasonable way. */
#if defined(HAVE_CONFIG_H)
#include "config.h"
#endif
#include <inttypes.h>
#include <stdlib.h>
#if defined(HAVE_TGMATH_H)
#include <tgmath.h>
#endif
#if defined(HAVE_MATH_H)
#include <math.h>
#endif
#if defined(HAVE_STDBOOL_H)
#include <stdbool.h>
#else
#include "spandsp/stdbool.h"
#endif
#include "floating_fudge.h"
#include <string.h>
#include <stdio.h>
#include "spandsp/telephony.h"
#include "spandsp/alloc.h"
#include "spandsp/fast_convert.h"
#include "spandsp/logging.h"
#include "spandsp/saturated.h"
#include "spandsp/dc_restore.h"
#include "spandsp/bit_operations.h"
#include "spandsp/echo.h"
#include "spandsp/private/echo.h"
#if !defined(NULL)
#define NULL ((void *) 0)
#endif

/* Number of samples for which adaption stays disabled after the receive power
   has been seen to exceed the transmit power: 600 samples, or 75ms. */
#define NONUPDATE_DWELL_TIME            600

/* Minimum short term power before adaption is attempted. The expansions are
   parenthesised so the constants behave as single values inside any larger
   expression (e.g. x/MIN_TX_POWER_FOR_ADAPTION). */
#define MIN_TX_POWER_FOR_ADAPTION       (64*64)
#define MIN_RX_POWER_FOR_ADAPTION       (64*64)
/*! Compare the short term autocorrelation of the FIR history with the one
    computed at the previous call.

    32 samples are read from the circular FIR history, starting at
    ec->curr_pos, and the autocorrelation is evaluated for lags 0 to 8. The
    result is normalised so lag 0 is close to 0x1FFFFFFF, and stored in
    ec->last_acf for the next call.

    \param ec The echo canceller context.
    \return The number of lags (0 to 9) whose normalised value has the same
            sign as, and lies within a factor of two of, the value from the
            previous call. */
static int narrowband_detect(echo_can_state_t *ec)
{
    int k;
    int i;
    float temp;
    float scale;
    float sf[128];
    float f_acf[128];
    int32_t acf[28];
    int score;
    const int len = 32;
    const int alen = 9;

    /* Unwrap the circular history. The buffer is indexed elsewhere in this
       file as ec->taps entries long, so wrap there, not at a fixed length. */
    k = ec->curr_pos;
    for (i = 0; i < len; i++)
    {
        sf[i] = ec->fir_state.history[k++];
        if (k >= ec->taps)
            k = 0;
    }
    for (k = 0; k < alen; k++)
    {
        temp = 0;
        for (i = k; i < len; i++)
            temp += sf[i]*sf[i - k];
        f_acf[k] = temp;
    }
    if (f_acf[0] > 0.0f)
    {
        scale = 0x1FFFFFFF/f_acf[0];
        for (k = 0; k < alen; k++)
            acf[k] = (int32_t) (f_acf[k]*scale);
    }
    else
    {
        /* A completely silent window has no energy to normalise against.
           Dividing by zero would feed NaNs into the integer conversion, so
           report a flat, all-zero function instead (which scores zero). */
        for (k = 0; k < alen; k++)
            acf[k] = 0;
    }
    score = 0;
    for (i = 0; i < alen; i++)
    {
        /* Multiply rather than shift left, as a left shift of a negative
           value is undefined. */
        if (ec->last_acf[i] >= 0 && acf[i] >= 0)
        {
            if ((ec->last_acf[i] >> 1) < acf[i] && acf[i] < ec->last_acf[i]*2)
                score++;
        }
        else if (ec->last_acf[i] < 0 && acf[i] < 0)
        {
            if ((ec->last_acf[i] >> 1) > acf[i] && acf[i] > ec->last_acf[i]*2)
                score++;
        }
    }
    memcpy(ec->last_acf, acf, alen*sizeof(ec->last_acf[0]));
    return score;
}
/*! Apply one LMS correction step to the FIR taps.

    Each 32 bit tap i is moved by history[(i + curr_pos) mod taps]*factor,
    and the matching entry of the active 16 bit tap set (ec->tap_set) is
    refreshed from the 32 bit value shifted right by 15 bits.

    \param ec The echo canceller context.
    \param factor The scaled error term to adapt with. */
static __inline__ void lms_adapt(echo_can_state_t *ec, int factor)
{
    int i;
    int start;
    int split;

    /* The history is circular, so it is walked in two straight runs rather
       than wrapping the index for every tap. */
    start = ec->curr_pos;
    split = ec->taps - start;
    /* Taps [split, taps) pair with history [0, start) */
    for (i = ec->taps - 1; i >= split; i--)
    {
        ec->fir_taps32[i] += (ec->fir_state.history[i - split]*factor);
        ec->fir_taps16[ec->tap_set][i] = (int16_t) (ec->fir_taps32[i] >> 15);
    }
    /* Taps [0, split) pair with history [start, taps) */
    for ( ; i >= 0; i--)
    {
        ec->fir_taps32[i] += (ec->fir_state.history[i + start]*factor);
        ec->fir_taps16[ec->tap_set][i] = (int16_t) (ec->fir_taps32[i] >> 15);
    }
}
/*- End of function --------------------------------------------------------*/
/*! Create and initialise an echo canceller context.

    Allocates the context, one set of 32 bit taps and four sets of 16 bit
    taps (all zeroed), and attaches tap set 0 to the FIR filter.

    \param len The number of FIR taps. Must be greater than zero.
               NOTE(review): tap_mask is set to len - 1, which is only a
               valid wrap mask if len is a power of two - confirm callers.
    \param adaption_mode The initial adaption mode flags.
    \return The new context, or NULL if len is not positive or an
            allocation fails. Release it with echo_can_free(). */
SPAN_DECLARE(echo_can_state_t *) echo_can_init(int len, int adaption_mode)
{
    echo_can_state_t *ec;
    int i;
    int j;

    /* A non-positive length would give a negative curr_pos and zero sized
       tap buffers, so refuse it up front. */
    if (len <= 0)
        return NULL;
    if ((ec = (echo_can_state_t *) span_alloc(sizeof(*ec))) == NULL)
        return NULL;
    memset(ec, 0, sizeof(*ec));
    ec->taps = len;
    ec->curr_pos = ec->taps - 1;
    ec->tap_mask = ec->taps - 1;
    if ((ec->fir_taps32 = (int32_t *) span_alloc(ec->taps*sizeof(int32_t))) == NULL)
    {
        span_free(ec);
        return NULL;
    }
    memset(ec->fir_taps32, 0, ec->taps*sizeof(int32_t));
    for (i = 0; i < 4; i++)
    {
        if ((ec->fir_taps16[i] = (int16_t *) span_alloc(ec->taps*sizeof(int16_t))) == NULL)
        {
            /* Unwind only what has been allocated so far */
            for (j = 0; j < i; j++)
                span_free(ec->fir_taps16[j]);
            span_free(ec->fir_taps32);
            span_free(ec);
            return NULL;
        }
        memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t));
    }
    /* NOTE(review): the result of fir16_create() is not checked here. If it
       can fail (e.g. it allocates the history buffer), that failure goes
       unnoticed - confirm against its definition. */
    fir16_create(&ec->fir_state,
                 ec->fir_taps16[0],
                 ec->taps);
    ec->rx_power_threshold = 10000000;
    ec->geigel_max = 0;
    ec->geigel_lag = 0;
    ec->dtd_onset = false;
    ec->tap_set = 0;
    ec->tap_rotate_counter = 1600;
    ec->cng_level = 1000;
    echo_can_adaption_mode(ec, adaption_mode);
    return ec;
}
/*- End of function --------------------------------------------------------*/
/*! Release an echo canceller context. This currently does nothing; the
    memory is only given back by echo_can_free().

    \param ec The echo canceller context (unused).
    \return Always 0. */
SPAN_DECLARE(int) echo_can_release(echo_can_state_t *ec)
{
    (void) ec;
    return 0;
}
/*- End of function --------------------------------------------------------*/
/*! Free an echo canceller context and everything it owns: the FIR filter
    state, the 32 bit taps, the four 16 bit tap sets, and the context itself.

    \param ec The echo canceller context. It must not be used afterwards.
    \return Always 0. */
SPAN_DECLARE(int) echo_can_free(echo_can_state_t *ec)
{
    int set;

    fir16_free(&ec->fir_state);
    span_free(ec->fir_taps32);
    set = 0;
    while (set < 4)
    {
        span_free(ec->fir_taps16[set]);
        set++;
    }
    span_free(ec);
    return 0;
}
/*- End of function --------------------------------------------------------*/
/*! Set the adaption mode of an echo canceller context.

    The value is stored as is, and is tested as a set of ECHO_CAN_USE_xxx
    bit flags when samples are processed.

    \param ec The echo canceller context.
    \param adaption_mode The new mode flags. */
SPAN_DECLARE(void) echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode)
{
    ec->adaption_mode = adaption_mode;
}
/*- End of function --------------------------------------------------------*/
/*! Return an echo canceller context to its untrained state.

    Power estimates, the FIR history, every tap set, and the double talk,
    narrowband and comfort noise tracking are all cleared. The adaption mode
    is left as it was.

    \param ec The echo canceller context. */
SPAN_DECLARE(void) echo_can_flush(echo_can_state_t *ec)
{
    int i;

    for (i = 0; i < 4; i++)
        ec->tx_power[i] = 0;
    for (i = 0; i < 3; i++)
        ec->rx_power[i] = 0;
    ec->clean_rx_power = 0;
    ec->nonupdate_dwell = 0;

    fir16_flush(&ec->fir_state);
    ec->fir_state.curr_pos = ec->taps - 1;
    memset(ec->fir_taps32, 0, ec->taps*sizeof(int32_t));
    for (i = 0; i < 4; i++)
        memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t));

    ec->curr_pos = ec->taps - 1;

    ec->supp_test1 = 0;
    ec->supp_test2 = 0;
    ec->supp1 = 0;
    ec->supp2 = 0;
    ec->vad = 0;
    ec->cng_level = 1000;
    ec->cng_filter = 0;

    ec->geigel_max = 0;
    ec->geigel_lag = 0;
    ec->dtd_onset = false;
    ec->tap_set = 0;
    /* The FIR must filter with the same tap set that adaption writes to.
       If the sets had been rotated before the flush, the filter would
       otherwise keep running from a set which is no longer being adapted. */
    ec->fir_state.coeffs = ec->fir_taps16[ec->tap_set];
    ec->tap_rotate_counter = 1600;

    ec->latest_correction = 0;

    memset(ec->last_acf, 0, sizeof(ec->last_acf));
    ec->narrowband_count = 0;
    ec->narrowband_score = 0;
}
/*- End of function --------------------------------------------------------*/
/* Running count of calls to echo_can_update(), shared by every canceller
   instance. It is only reported in diagnostic messages. Note that it is not
   static, so the name is visible to the rest of the program. */
int sample_no = 0;
/*! Copy the 16 bit taps of tap set 0 into the context's snapshot buffer.
    Set 0 is always the one copied, whichever set is currently active.

    NOTE(review): nothing in the visible initialisation code allocates
    ec->snapshot - confirm it has storage for ec->taps entries.

    \param ec The echo canceller context. */
SPAN_DECLARE(void) echo_can_snapshot(echo_can_state_t *ec)
{
    const size_t tap_bytes = ec->taps*sizeof(int16_t);

    memcpy(ec->snapshot, ec->fir_taps16[0], tap_bytes);
}
/*- End of function --------------------------------------------------------*/
/*! Run one sample through the DC blocking high pass filter.

    \param coeff The two words of filter state, updated in place. coeff[0]
                 holds the filter output in extended precision, and coeff[1]
                 holds the previous scaled input.
    \param amp The input sample.
    \return The filtered sample, saturated to 16 bits. */
static __inline__ int16_t echo_can_hpf(int32_t coeff[2], int16_t amp)
{
    int32_t z;

    /*
       Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required
       otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta)
       only real axis. Some chip sets (like Si labs) don't need
       this, but something like a $10 X100P card does. Any DC really slows
       down convergence.

       Note: removes some low frequency from the signal, this reduces
       the speech quality when listening to samples through headphones
       but may not be obvious through a telephone handset.

       Note that the 3dB frequency in radians is approx Beta, e.g. for
       Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz.

       This is one of the classic DC removal filters, adjusted to provide sufficient
       bass rolloff to meet the above requirement to protect hybrids from things that
       upset them. The difference between successive samples produces a lousy HPF, and
       then a suitably placed pole flattens things out. The final result is a nicely
       rolled off bass end. The filtering is implemented with extended fractional
       precision, which noise shapes things, giving very clean DC removal.

       Make sure the gain of the HPF is 1.0. The first can still saturate a little under
       impulse conditions, and it might roll to 32768 and need clipping on sustained peak
       level signals. However, the scale of such clipping is small, and the error due to
       any saturation should not markedly affect the downstream processing. */

    /* Scale up by 2^15 with a multiply. A left shift of a negative sample is
       undefined behaviour, while the product always fits in 32 bits. */
    z = (int32_t) amp*32768;
    /* Trim the input to 15/16 of its level */
    z -= (z >> 4);
    /* Difference of successive inputs, plus a leaky integrator (Beta = 1/8) */
    coeff[0] += z - (coeff[0] >> 3) - coeff[1];
    coeff[1] = z;
    z = coeff[0] >> 15;
    return saturate16(z);
}
/*- End of function --------------------------------------------------------*/
/* Per-sample diagnostics are far too chatty for a real time audio path, so
   they are compiled out unless ECHO_CAN_DEBUG is defined at build time. */
#if defined(ECHO_CAN_DEBUG)
#define ECHO_CAN_TRACE(...)     printf(__VA_ARGS__)
#else
#define ECHO_CAN_TRACE(...)     ((void) 0)
#endif

/*! Process one sample through the echo canceller.

    The tx sample is pushed through the FIR model of the echo path, the
    estimated echo is subtracted from the rx sample, and - when the power
    levels, double talk state and narrowband state allow it - the taps are
    adapted. If enabled in the adaption mode, the rx sample is high pass
    filtered first, and the result is passed through the non-linear
    processor and comfort noise generator last.

    \param ec The echo canceller context.
    \param tx The transmitted sample, which is the source of the echo.
    \param rx The received sample, which contains the echo.
    \return The rx sample with the estimated echo removed, saturated to
            16 bits. */
SPAN_DECLARE(int16_t) echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx)
{
    int32_t echo_value;
    int clean_rx;
    int nsuppr;
    int score;
    int oldest;
    int previous;
    int i;

    sample_no++;
    if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF)
        rx = echo_can_hpf(ec->rx_hpf, rx);

    ec->latest_correction = 0;
    /* Evaluate the echo - i.e. apply the FIR filter */
    /* Assume the gain of the FIR does not exceed unity. Exceeding unity
       would seem like a rather poor thing for an echo cancellor to do :)
       This means we can compute the result with a total disregard for
       overflows. 16bits x 16bits -> 31bits, so no overflow can occur in
       any multiply. While accumulating we may overflow and underflow the
       32 bit scale often. However, if the gain does not exceed unity,
       everything should work itself out, and the final result will be
       OK, without any saturation logic. */
    /* Overflow is very much possible here, and we do nothing about it because
       of the compute costs */
    /* 16 bit coeffs for the LMS give lousy results (maths good, actual sound
       bad!), but 32 bit coeffs require some shifting. On balance 32 bit seems
       best */
    echo_value = fir16(&ec->fir_state, tx);

    /* And the answer is..... */
    clean_rx = rx - echo_value;
    ECHO_CAN_TRACE("echo is %" PRId32 "\n", echo_value);
    /* That was the easy part. Now we need to adapt! */
    if (ec->nonupdate_dwell > 0)
        ec->nonupdate_dwell--;

    /* Calculate short term power levels using very simple single pole IIRs */
    /* TODO: Is the nasty modulus approach the fastest, or would a real
             tx*tx power calculation actually be faster? Using the squares
             makes the numbers grow a lot! */
    ec->tx_power[3] += ((abs(tx) - ec->tx_power[3]) >> 5);
    ec->tx_power[2] += ((tx*tx - ec->tx_power[2]) >> 8);
    ec->tx_power[1] += ((tx*tx - ec->tx_power[1]) >> 5);
    ec->tx_power[0] += ((tx*tx - ec->tx_power[0]) >> 3);
    ec->rx_power[1] += ((rx*rx - ec->rx_power[1]) >> 6);
    ec->rx_power[0] += ((rx*rx - ec->rx_power[0]) >> 3);
    ec->clean_rx_power += ((clean_rx*clean_rx - ec->clean_rx_power) >> 6);

    score = 0;
    /* If there is very little being transmitted, any attempt to train is
       futile. We would either be training on the far end's noise or signal,
       the channel's own noise, or our noise. Either way, this is hardly good
       training, so don't do it (avoid trouble). */
    if (ec->tx_power[0] > MIN_TX_POWER_FOR_ADAPTION)
    {
        /* If the received power is very low, either we are sending very little or
           we are already well adapted. There is little point in trying to improve
           the adaption under these circumstances, so don't do it (reduce the
           compute load). */
        if (ec->tx_power[1] > ec->rx_power[0])
        {
            /* There is no (or little) far-end speech. */
            if (ec->nonupdate_dwell == 0)
            {
                if (++ec->narrowband_count >= 160)
                {
                    ec->narrowband_count = 0;
                    score = narrowband_detect(ec);
                    ECHO_CAN_TRACE("Do the narrowband test %d at %d\n", score, ec->curr_pos);
                    if (score > 6)
                    {
                        /* At the onset of narrowband energy, park the oldest
                           of the three rotating sets in the fourth set. */
                        if (ec->narrowband_score == 0)
                            memcpy(ec->fir_taps16[3], ec->fir_taps16[(ec->tap_set + 1)%3], ec->taps*sizeof(int16_t));
                        ec->narrowband_score += score;
                    }
                    else
                    {
                        if (ec->narrowband_score > 200)
                        {
                            ECHO_CAN_TRACE("Revert to %d at %d\n", (ec->tap_set + 1)%3, sample_no);
                            /* The previous set is (tap_set + 2)%3. Writing it
                               as (tap_set - 1)%3 gives -1 when tap_set is 0,
                               which indexes outside the array of tap sets. */
                            previous = (ec->tap_set + 2)%3;
                            memcpy(ec->fir_taps16[ec->tap_set], ec->fir_taps16[3], ec->taps*sizeof(int16_t));
                            memcpy(ec->fir_taps16[previous], ec->fir_taps16[3], ec->taps*sizeof(int16_t));
                            /* Multiply, as a left shift of a negative tap is undefined */
                            for (i = 0; i < ec->taps; i++)
                                ec->fir_taps32[i] = (int32_t) ec->fir_taps16[3][i]*32768;
                            ec->tap_rotate_counter = 1600;
                        }
                        ec->narrowband_score = 0;
                    }
                }
                ec->dtd_onset = false;
                if (--ec->tap_rotate_counter <= 0)
                {
                    ECHO_CAN_TRACE("Rotate to %d at %d\n", ec->tap_set, sample_no);
                    ec->tap_rotate_counter = 1600;
                    ec->tap_set++;
                    if (ec->tap_set > 2)
                        ec->tap_set = 0;
                    ec->fir_state.coeffs = ec->fir_taps16[ec->tap_set];
                }
                /* ... and we are not in the dwell time from previous speech. */
                if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) && ec->narrowband_score == 0)
                {
                    /* If a sudden surge in signal level (e.g. the onset of a tone
                       burst) cause an abnormally high instantaneous to average
                       signal power ratio, we could kick the adaption badly in the
                       wrong direction. This is because the tx_power takes too long
                       to react and rise. We need to stop too rapid adaption to the
                       new signal. We normalise to a value derived from the
                       instantaneous signal if it exceeds the peak by too much. */
                    nsuppr = clean_rx;
                    /* Divide isn't very quick, but the "where is the top bit" and shift
                       instructions are single cycle. */
                    if (tx > 4*ec->tx_power[3])
                        i = top_bit(tx) - 8;
                    else
                        i = top_bit(ec->tx_power[3]) - 8;
                    if (i > 0)
                        nsuppr >>= i;
                    lms_adapt(ec, nsuppr);
                }
            }
        }
        else
        {
            if (!ec->dtd_onset)
            {
                /* Double talk has just started. Fall back to the oldest of
                   the three rotating sets, as the newer ones may already be
                   somewhat corrupted. */
                ECHO_CAN_TRACE("Revert to %d at %d\n", (ec->tap_set + 1)%3, sample_no);
                oldest = (ec->tap_set + 1)%3;
                /* See the narrowband revert for why this is not (tap_set - 1)%3 */
                previous = (ec->tap_set + 2)%3;
                memcpy(ec->fir_taps16[ec->tap_set], ec->fir_taps16[oldest], ec->taps*sizeof(int16_t));
                memcpy(ec->fir_taps16[previous], ec->fir_taps16[oldest], ec->taps*sizeof(int16_t));
                for (i = 0; i < ec->taps; i++)
                    ec->fir_taps32[i] = (int32_t) ec->fir_taps16[oldest][i]*32768;
                ec->tap_rotate_counter = 1600;
                ec->dtd_onset = true;
            }
            ec->nonupdate_dwell = NONUPDATE_DWELL_TIME;
        }
    }

    /* NOTE(review): 8000*clean_rx_power can exceed 32 bits for a loud
       residual if clean_rx_power is a 32 bit field - confirm its width. */
    if (ec->rx_power[1])
        ec->vad = (8000*ec->clean_rx_power)/ec->rx_power[1];
    else
        ec->vad = 0;
    if (ec->rx_power[1] > 2048*2048 && ec->clean_rx_power > 4*ec->rx_power[1])
    {
        /* The EC seems to be making things worse, instead of better. Zap it! */
        memset(ec->fir_taps32, 0, ec->taps*sizeof(int32_t));
        for (i = 0; i < 4; i++)
            memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t));
    }

#if defined(XYZZY)
    if ((ec->adaption_mode & ECHO_CAN_USE_SUPPRESSOR))
    {
        ec->supp_test1 += (ec->fir_state.history[ec->curr_pos] - ec->fir_state.history[(ec->curr_pos - 7) & ec->tap_mask]);
        ec->supp_test2 += (ec->fir_state.history[(ec->curr_pos - 24) & ec->tap_mask] - ec->fir_state.history[(ec->curr_pos - 31) & ec->tap_mask]);
        if (ec->supp_test1 > 42 && ec->supp_test2 > 42)
            supp_change = 25;
        else
            supp_change = 50;
        supp = supp_change + k1*ec->supp1 + k2*ec->supp2;
        ec->supp2 = ec->supp1;
        ec->supp1 = supp;
        clean_rx *= (1 - supp);
    }
#endif

    if ((ec->adaption_mode & ECHO_CAN_USE_NLP))
    {
        /* Non-linear processor - a fancy way to say "zap small signals, to avoid
           residual echo due to (uLaw/ALaw) non-linearity in the channel.". */
        if (ec->rx_power[1] < 30000000)
        {
            if (!ec->cng)
            {
                ec->cng_level = ec->clean_rx_power;
                ec->cng = true;
            }
            if ((ec->adaption_mode & ECHO_CAN_USE_CNG))
            {
                /* Very elementary comfort noise generation */
                /* Just random numbers rolled off very vaguely Hoth-like */
                ec->cng_rndnum = 1664525U*ec->cng_rndnum + 1013904223U;
                ec->cng_filter = ((ec->cng_rndnum & 0xFFFF) - 32768 + 5*ec->cng_filter) >> 3;
                clean_rx = (ec->cng_filter*ec->cng_level) >> 17;
                /* TODO: A better CNG, with more accurate (tracking) spectral shaping! */
            }
            else
            {
                clean_rx = 0;
            }
        }
        else
        {
            ec->cng = false;
        }
    }
    else
    {
        ec->cng = false;
    }

    ECHO_CAN_TRACE("Narrowband score %4d %5d at %d\n", ec->narrowband_score, score, sample_no);
    /* Roll around the rolling buffer */
    if (ec->curr_pos <= 0)
        ec->curr_pos = ec->taps;
    ec->curr_pos--;
    /* Clip rather than truncate. The difference of two 16 bit values can
       fall outside the 16 bit range, and wrapping it would turn a loud
       residual into a full scale click of the opposite sign. */
    return saturate16(clean_rx);
}
/*- End of function --------------------------------------------------------*/
/*! Optionally high pass filter a transmit sample.

    \param ec The echo canceller context.
    \param tx The transmit sample.
    \return The sample filtered through the context's transmit high pass
            filter when ECHO_CAN_USE_TX_HPF is set in the adaption mode,
            otherwise the sample unchanged. */
SPAN_DECLARE(int16_t) echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx)
{
    if ((ec->adaption_mode & ECHO_CAN_USE_TX_HPF) == 0)
        return tx;
    return echo_can_hpf(ec->tx_hpf, tx);
}
/*- End of function --------------------------------------------------------*/
/*- End of file ------------------------------------------------------------*/