freeswitch/src/mod/endpoints/mod_gsmopen/asterisk/celliax_spandsp.c

/*
 * SpanDSP - a series of DSP components for telephony
 *
 * echo.c - An echo cancellor, suitable for electrical and acoustic
 *          cancellation. This code does not currently comply with
 *          any relevant standards (e.g. G.164/5/7/8). One day....
 *
 * Written by Steve Underwood <steveu@coppice.org>
 *
 * Copyright (C) 2001, 2003 Steve Underwood
 *
 * Based on a bit from here, a bit from there, eye of toad,
 * ear of bat, etc - plus, of course, my own 2 cents.
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: echo.c,v 1.20 2006/12/01 18:00:48 steveu Exp $
 */

/*! \file */

/* TODO:
   Finish the echo suppressor option, however nasty suppression may be.
   Add an option to reintroduce side tone at -24dB under appropriate conditions.
   Improve double talk detector (iterative!)
*/

/* We need to differentiate between transmitted energy which will train the echo
   canceller well (voice, white noise, and other broadband sources) and energy
   which will train it badly (supervisory tones, DTMF, whistles, and other
   narrowband sources). There are many ways this might be done. This canceller uses
   a method based on the autocorrelation qualities of the transmitted signal. A rather
   peaky autocorrelation function is a clear sign of a narrowband signal. We only need
   perform the autocorrelation at well spaced intervals, so the compute load is not too
   great. Multiple successive autocorrelation functions with a similar peaky shape are a
   clear indication of a stationary narrowband signal. Using TKEO, it should be possible to
   greatly reduce the compute requirement for narrowband detection. */

/* The FIR taps must be adapted as 32 bit values, to get the necessary finesse
   in the adaption process. However, they are applied as 16 bit values (bits 30-15
   of the 32 bit values) in the FIR. For the working 16 bit values, we need 4 sets.

   3 of the 16 bit sets are used on a rotating basis. Normally the canceller steps
   round these 3 sets at regular intervals. Any time we detect double talk, we can go
   back to the set from two steps ago with reasonable assurance it is a well adapted
   set. We cannot just go back one step, as we may have rotated the sets just before
   double talk or tone was detected, and that set may already be somewhat corrupted.

   When narrowband energy is detected we need to continue adapting to it, to echo
   cancel it. However, the adaption will almost certainly be going astray. Broadband
   (or even complex sequences of narrowband) energy will normally lead to a well
   trained cancellor, with taps matching the impulse response of the channel.
   For stationary narrowband energy, there is usually an infinite number of
   alternative tap sets which will cancel it well. A previously well trained set of
   taps will tend to drift amongst the alternatives. When broadband energy resumes, the
   taps may be a total mismatch for the signal, and could even amplify rather than
   attenuate the echo. The solution is to use a fourth set of 16 bit taps. When we first
   detect the narrowband energy we save the oldest of the group of three sets, but do
   not change back to an older set. We let the canceller cancel, and its adaption drift
   while the narrowband energy is present. When we detect the narrowband energy has ceased,
   we switch to using the fourth set of taps which was saved.

   When we revert to an older set of taps, we must replace both the 16 bit and 32 bit
   working tap sets. The saved 16 bit values are good enough to also be used as a replacement
   for the 32 bit values. We lose the fractions, but they should soon settle down in a
   reasonable way. */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdlib.h>
#include <inttypes.h>
#include <string.h>
#include <stdio.h>

#include "celliax_spandsp.h"

//#include "spandsp/telephony.h"
//#include "spandsp/logging.h"
//#include "spandsp/bit_operations.h"
//#include "spandsp/echo.h"

//#include "bit_operations.h"
//#include "giova.h"

#if !defined(NULL)
#define NULL (void *) 0
#endif
#if !defined(FALSE)
#define FALSE 0
#endif
#if !defined(TRUE)
#define TRUE (!FALSE)
#endif

#if 0
#define MIN_TX_POWER_FOR_ADAPTION   64*64
#define MIN_RX_POWER_FOR_ADAPTION   64*64

/*
 * Compare the short-term autocorrelation of the most recent transmit history
 * with the one computed on the previous call.
 *
 * A window of 32 samples is read from the FIR history starting at
 * ec->curr_pos, and its autocorrelation is evaluated for lags 0..8. The
 * result is normalised so that lag 0 maps to 0x1FFFFFFF, then each lag is
 * compared with the value saved in ec->last_acf: a lag scores one point when
 * it has the same sign as the saved value and lies strictly between half and
 * twice it. The new values replace ec->last_acf before returning.
 *
 * Returns the number of lags (0..9) that matched.
 */
static int narrowband_detect(echo_can_state_t * ec)
{
  enum { WINDOW_LEN = 32, ACF_LAGS = 9 };
  float window[WINDOW_LEN];
  float f_acf[ACF_LAGS];
  int32_t acf[ACF_LAGS];
  float scale;
  float sum;
  int score;
  int k;
  int i;

  /* Copy the window out of the circular history. The history is indexed
     modulo ec->taps everywhere else in this file, so wrap at ec->taps here
     too rather than at a hard-coded 256. */
  k = ec->curr_pos;
  for (i = 0; i < WINDOW_LEN; i++) {
    window[i] = ec->fir_state.history[k++];
    if (k >= ec->taps)
      k = 0;
  }

  for (k = 0; k < ACF_LAGS; k++) {
    sum = 0;
    for (i = k; i < WINDOW_LEN; i++)
      sum += window[i] * window[i - k];
    f_acf[k] = sum;
  }

  if (f_acf[0] > 0.0f) {
    scale = 0x1FFFFFFF / f_acf[0];
    for (k = 0; k < ACF_LAGS; k++)
      acf[k] = (int32_t) (f_acf[k] * scale);
  } else {
    /* An all-zero window has no energy to normalise by. Dividing would give
       infinity, the products NaN, and converting NaN to int32_t is undefined.
       Record a flat (all zero) function instead; it can never score. */
    memset(acf, 0, sizeof(acf));
  }

  score = 0;
  for (i = 0; i < ACF_LAGS; i++) {
    /* Doubling is written as a multiply: left-shifting a negative value is
       undefined behaviour. The normalised values are bounded by lag 0, so
       the doubled value still fits in 32 bits. */
    if (ec->last_acf[i] >= 0 && acf[i] >= 0) {
      if ((ec->last_acf[i] >> 1) < acf[i] && acf[i] < ec->last_acf[i] * 2)
        score++;
    } else if (ec->last_acf[i] < 0 && acf[i] < 0) {
      if ((ec->last_acf[i] >> 1) > acf[i] && acf[i] > ec->last_acf[i] * 2)
        score++;
    }
  }
  memcpy(ec->last_acf, acf, ACF_LAGS * sizeof(ec->last_acf[0]));
  return score;
}

/*
 * Apply one adaption step to the FIR taps.
 *
 * Every 32 bit tap is incremented by (history sample * factor), and the
 * matching 16 bit tap in the currently selected set (ec->tap_set) is
 * refreshed from the 32 bit value shifted right by 15.
 *
 * Only the final #else branch is compiled. The two branches before it are
 * disabled alternatives kept for reference.
 */
static __inline__ void lms_adapt(echo_can_state_t * ec, int factor)
{
  int i;

#if 0
  /* Disabled MMX sketch. It refers to a `fir` variable that does not exist in
     this function and its while loop is never closed, so it cannot be enabled
     as written. */
  mmx_t *mmx_taps;
  mmx_t *mmx_coeffs;
  mmx_t *mmx_hist;
  mmx_t mmx;

  mmx.w[0] = mmx.w[1] = mmx.w[2] = mmx.w[3] = factor;
  mmx_hist = (mmx_t *) & fir->history[fir->curr_pos];
  mmx_taps = (mmx_t *) & fir->taps;
  mmx_coeffs = (mmx_t *) fir->coeffs;
  i = fir->taps;
  movq_m2r(mmx, mm0);
  while (i > 0) {
    movq_m2r(mmx_hist[0], mm1);
    movq_m2r(mmx_taps[0], mm0);
    movq_m2r(mmx_taps[1], mm1);
    movq_r2r(mm1, mm2);
    pmulhw(mm0, mm1);
    pmullw(mm0, mm2);

    pmaddwd_r2r(mm1, mm0);
    pmaddwd_r2r(mm3, mm2);
    paddd_r2r(mm0, mm4);
    paddd_r2r(mm2, mm4);
    movq_r2m(mm0, mmx_taps[0]);
    movq_r2m(mm1, mmx_taps[0]);
    movq_r2m(mm2, mmx_coeffs[0]);
    mmx_taps += 2;
    mmx_coeffs += 1;
    mmx_hist += 1;
    i -= 4;
    )
      emms();
#elif 0
  /* Disabled variant with tap leakage. It indexes the history at
     i + ec->curr_pos without wrapping. */
  /* Update the FIR taps */
  for (i = ec->taps - 1; i >= 0; i--) {
    /* Leak to avoid the coefficients drifting beyond the ability of the
       adaption process to bring them back under control. */
    ec->fir_taps32[i] -= (ec->fir_taps32[i] >> 23);
    ec->fir_taps32[i] += (ec->fir_state.history[i + ec->curr_pos] * factor);
    ec->latest_correction = (ec->fir_state.history[i + ec->curr_pos] * factor);
    ec->fir_taps16[ec->tap_set][i] = ec->fir_taps32[i] >> 15;
  }
#else
  int offset1;
  int offset2;

  /* Update the FIR taps */
  /* The history is a circular buffer, so the walk over the taps is split in
     two. Taps offset1..taps-1 pair with history[0..curr_pos-1]; taps
     0..offset1-1 pair with history[curr_pos..taps-1]. */
  offset2 = ec->curr_pos;
  offset1 = ec->taps - offset2;
  for (i = ec->taps - 1; i >= offset1; i--) {
    ec->fir_taps32[i] += (ec->fir_state.history[i - offset1] * factor);
    ec->fir_taps16[ec->tap_set][i] = (int16_t) (ec->fir_taps32[i] >> 15);
  }
  /* i is now offset1 - 1; carry on down through the remaining taps. */
  for (; i >= 0; i--) {
    ec->fir_taps32[i] += (ec->fir_state.history[i + offset2] * factor);
    ec->fir_taps16[ec->tap_set][i] = (int16_t) (ec->fir_taps32[i] >> 15);
  }
#endif
}

/*- End of function --------------------------------------------------------*/

#ifdef NOT_NEEDED
/*
 * Allocate and initialise an echo canceller.
 *
 * len            Number of FIR taps. Must be greater than zero.
 *                NOTE(review): tap_mask is set to len - 1, which only works
 *                as an index mask when len is a power of two - confirm with
 *                the callers.
 * adaption_mode  Mode flags, stored through echo_can_adaption_mode().
 *
 * Returns the new canceller, or NULL when len is not positive or an
 * allocation fails. Release it with echo_can_free().
 */
echo_can_state_t *echo_can_create(int len, int adaption_mode)
{
  echo_can_state_t *ec;
  int i;

  /* A zero or negative length would leave curr_pos at -1 (or lower), and a
     negative length turns into an enormous size once converted to size_t. */
  if (len <= 0)
    return NULL;

  /* calloc() zero fills, and checks the count * size product for overflow. */
  ec = (echo_can_state_t *) calloc(1, sizeof(*ec));
  if (ec == NULL)
    return NULL;
  ec->taps = len;
  ec->curr_pos = ec->taps - 1;
  ec->tap_mask = ec->taps - 1;

  ec->fir_taps32 = (int32_t *) calloc((size_t) len, sizeof(int32_t));
  if (ec->fir_taps32 == NULL) {
    free(ec);
    return NULL;
  }
  for (i = 0; i < 4; i++) {
    ec->fir_taps16[i] = (int16_t *) calloc((size_t) len, sizeof(int16_t));
    if (ec->fir_taps16[i] == NULL) {
      /* Unwind the tap sets which were successfully allocated */
      while (--i >= 0)
        free(ec->fir_taps16[i]);
      free(ec->fir_taps32);
      free(ec);
      return NULL;
    }
  }

  /* The FIR starts out filtering with tap set 0, matching tap_set below. */
  /* NOTE(review): the result of fir16_create() is not checked here; if it
     allocates the history buffer, a failure would go unnoticed - confirm
     against its definition. */
  fir16_create(&ec->fir_state, ec->fir_taps16[0], ec->taps);
  ec->rx_power_threshold = 10000000;
  ec->geigel_max = 0;
  ec->geigel_lag = 0;
  ec->dtd_onset = FALSE;
  ec->tap_set = 0;
  ec->tap_rotate_counter = 1600;
  ec->cng_level = 1000;
  echo_can_adaption_mode(ec, adaption_mode);
  return ec;
}

/*- End of function --------------------------------------------------------*/

/*
 * Release an echo canceller created by echo_can_create(): the FIR state, the
 * 32 bit taps, the four 16 bit tap sets and the context itself.
 *
 * Passing NULL is a no-op, mirroring free().
 */
void echo_can_free(echo_can_state_t * ec)
{
  int i;

  if (ec == NULL)
    return;

  fir16_free(&ec->fir_state);
  free(ec->fir_taps32);
  for (i = 0; i < 4; i++)
    free(ec->fir_taps16[i]);
  free(ec);
}

/*- End of function --------------------------------------------------------*/

/*
 * Set the mode flags of an echo canceller.
 *
 * The value is stored unmodified in ec->adaption_mode. echo_can_update()
 * tests it against the ECHO_CAN_USE_ADAPTION, ECHO_CAN_USE_NLP and
 * ECHO_CAN_USE_CNG bits.
 */
void echo_can_adaption_mode(echo_can_state_t * ec, int adaption_mode)
{
  ec->adaption_mode = adaption_mode;
}

/*- End of function --------------------------------------------------------*/

/*
 * Return an echo canceller to its untrained state without reallocating it.
 *
 * The power estimators, FIR history, all tap sets (32 bit and the four 16 bit
 * sets), the suppressor and comfort noise state, the double talk state and
 * the narrowband detector state are cleared. The mode flags in
 * ec->adaption_mode are left untouched.
 */
void echo_can_flush(echo_can_state_t * ec)
{
  int i;

  /* Signal level trackers */
  for (i = 0; i < 4; i++)
    ec->tx_power[i] = 0;
  for (i = 0; i < 3; i++)
    ec->rx_power[i] = 0;
  ec->clean_rx_power = 0;
  ec->nonupdate_dwell = 0;

  /* FIR history and taps */
  fir16_flush(&ec->fir_state);
  ec->fir_state.curr_pos = ec->taps - 1;
  memset(ec->fir_taps32, 0, ec->taps * sizeof(int32_t));
  for (i = 0; i < 4; i++)
    memset(ec->fir_taps16[i], 0, ec->taps * sizeof(int16_t));

  ec->curr_pos = ec->taps - 1;

  /* Suppressor and comfort noise */
  ec->supp_test1 = 0;
  ec->supp_test2 = 0;
  ec->supp1 = 0;
  ec->supp2 = 0;
  ec->vad = 0;
  ec->cng_level = 1000;
  ec->cng_filter = 0;

  /* Double talk detection and tap set rotation */
  ec->geigel_max = 0;
  ec->geigel_lag = 0;
  ec->dtd_onset = FALSE;
  ec->tap_set = 0;
  /* The FIR must filter with the set that is being adapted. The rotation in
     echo_can_update() keeps fir_state.coeffs pointing at
     fir_taps16[tap_set]; resetting tap_set alone would leave the filter on a
     stale set, which stays zero until the next rotation. */
  ec->fir_state.coeffs = ec->fir_taps16[0];
  ec->tap_rotate_counter = 1600;

  ec->latest_correction = 0;

  /* Narrowband detector */
  memset(ec->last_acf, 0, sizeof(ec->last_acf));
  ec->narrowband_count = 0;
  ec->narrowband_score = 0;
}

/*- End of function --------------------------------------------------------*/

/* Count of samples passed through echo_can_update(), kept for debugging. */
int sample_no = 0;

/*
 * Replace the working tap set, and the set preceding it in the rotation, with
 * a saved set of 16 bit taps. The 32 bit adaption taps are rebuilt from the
 * same values, and the rotation timer is restarted.
 *
 * `saved` must not be either of the two sets being overwritten.
 */
static void echo_can_restore_taps(echo_can_state_t * ec, const int16_t * saved)
{
  int i;

  memcpy(ec->fir_taps16[ec->tap_set], saved, ec->taps * sizeof(int16_t));
  /* The preceding set is (tap_set + 2) % 3. Writing it as (tap_set - 1) % 3
     gives -1 when tap_set is 0, which indexes before the start of the
     fir_taps16 array. */
  memcpy(ec->fir_taps16[(ec->tap_set + 2) % 3], saved, ec->taps * sizeof(int16_t));
  /* Scale by multiplying: left shifting a negative tap is undefined. */
  for (i = 0; i < ec->taps; i++)
    ec->fir_taps32[i] = saved[i] * 32768;
  ec->tap_rotate_counter = 1600;
}

/*
 * Process one sample pair through the echo canceller.
 *
 * tx  The sample sent towards the echo path; it is pushed through the FIR.
 * rx  The sample received back, carrying the echo.
 *
 * The FIR estimate of the echo is subtracted from rx, the power trackers are
 * updated, and - when conditions allow - the taps are adapted, rotated, or
 * reverted to an older set. The non-linear processor and comfort noise
 * generator are then applied if enabled in ec->adaption_mode.
 *
 * Returns the cleaned receive sample, truncated to 16 bits.
 */
int16_t echo_can_update(echo_can_state_t * ec, int16_t tx, int16_t rx)
{
  int32_t echo_value;
  int clean_rx;
  int nsuppr;
  int score;
  int i;

  sample_no++;
  ec->latest_correction = 0;
  /* Evaluate the echo - i.e. apply the FIR filter */
  /* Assume the gain of the FIR does not exceed unity. Exceeding unity
     would seem like a rather poor thing for an echo cancellor to do :)
     This means we can compute the result with a total disregard for
     overflows. 16bits x 16bits -> 31bits, so no overflow can occur in
     any multiply. While accumulating we may overflow and underflow the
     32 bit scale often. However, if the gain does not exceed unity,
     everything should work itself out, and the final result will be
     OK, without any saturation logic. */
  /* Overflow is very much possible here, and we do nothing about it because
     of the compute costs */
  /* 16 bit coeffs for the LMS give lousy results (maths good, actual sound
     bad!), but 32 bit coeffs require some shifting. On balance 32 bit seems
     best */
  echo_value = fir16(&ec->fir_state, tx);

  /* And the answer is..... */
  clean_rx = rx - echo_value;
  /* That was the easy part. Now we need to adapt! */
  if (ec->nonupdate_dwell > 0)
    ec->nonupdate_dwell--;

  /* Calculate short term power levels using very simple single pole IIRs */
  /* TODO: Is the nasty modulus approach the fastest, or would a real
     tx*tx power calculation actually be faster? Using the squares
     makes the numbers grow a lot! */
  ec->tx_power[3] += ((abs(tx) - ec->tx_power[3]) >> 5);
  ec->tx_power[2] += ((tx * tx - ec->tx_power[2]) >> 8);
  ec->tx_power[1] += ((tx * tx - ec->tx_power[1]) >> 5);
  ec->tx_power[0] += ((tx * tx - ec->tx_power[0]) >> 3);
  ec->rx_power[1] += ((rx * rx - ec->rx_power[1]) >> 6);
  ec->rx_power[0] += ((rx * rx - ec->rx_power[0]) >> 3);
  ec->clean_rx_power += ((clean_rx * clean_rx - ec->clean_rx_power) >> 6);

  score = 0;
  /* If there is very little being transmitted, any attempt to train is
     futile. We would either be training on the far end's noise or signal,
     the channel's own noise, or our noise. Either way, this is hardly good
     training, so don't do it (avoid trouble). */
  if (ec->tx_power[0] > MIN_TX_POWER_FOR_ADAPTION) {
    /* If the received power is very low, either we are sending very little or
       we are already well adapted. There is little point in trying to improve
       the adaption under these circumstances, so don't do it (reduce the
       compute load). */
    if (ec->tx_power[1] > ec->rx_power[0]) {
      /* There is no (or little) far-end speech. */
      if (ec->nonupdate_dwell == 0) {
        /* Run the narrowband test once every 160 samples */
        if (++ec->narrowband_count >= 160) {
          ec->narrowband_count = 0;
          score = narrowband_detect(ec);
          if (score > 6) {
            /* Narrowband energy. On first sight of it, park the oldest of
               the three rotating sets in the fourth set. */
            if (ec->narrowband_score == 0)
              memcpy(ec->fir_taps16[3], ec->fir_taps16[(ec->tap_set + 1) % 3],
                     ec->taps * sizeof(int16_t));
            ec->narrowband_score += score;
          } else {
            /* The narrowband energy has gone. If it lasted long enough to
               matter, go back to the taps parked before it started. */
            if (ec->narrowband_score > 200)
              echo_can_restore_taps(ec, ec->fir_taps16[3]);
            ec->narrowband_score = 0;
          }
        }
        ec->dtd_onset = FALSE;
        if (--ec->tap_rotate_counter <= 0) {
          /* Step round to the next of the three working tap sets */
          ec->tap_rotate_counter = 1600;
          ec->tap_set++;
          if (ec->tap_set > 2)
            ec->tap_set = 0;
          ec->fir_state.coeffs = ec->fir_taps16[ec->tap_set];
        }
        /* ... and we are not in the dwell time from previous speech. */
        if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) && ec->narrowband_score == 0) {
          /* If a sudden surge in signal level (e.g. the onset of a tone
             burst) cause an abnormally high instantaneous to average
             signal power ratio, we could kick the adaption badly in the
             wrong direction. This is because the tx_power takes too long
             to react and rise. We need to stop too rapid adaption to the
             new signal. We normalise to a value derived from the
             instantaneous signal if it exceeds the peak by too much. */
          nsuppr = clean_rx;
          /* Divide isn't very quick, but the "where is the top bit" and shift
             instructions are single cycle. */
          if (tx > 4 * ec->tx_power[3])
            i = top_bit(tx) - 8;
          else
            i = top_bit(ec->tx_power[3]) - 8;
          if (i > 0)
            nsuppr >>= i;
          lms_adapt(ec, nsuppr);
        }
      }
    } else {
      /* Double talk. On its onset, revert to the oldest of the three
         rotating tap sets, then hold off adaption for the dwell time. */
      if (!ec->dtd_onset) {
        echo_can_restore_taps(ec, ec->fir_taps16[(ec->tap_set + 1) % 3]);
        ec->dtd_onset = TRUE;
      }
      ec->nonupdate_dwell = NONUPDATE_DWELL_TIME;
    }
  }

  if (ec->rx_power[1])
    ec->vad = (8000 * ec->clean_rx_power) / ec->rx_power[1];
  else
    ec->vad = 0;
  if (ec->rx_power[1] > 2048 * 2048 && ec->clean_rx_power > 4 * ec->rx_power[1]) {
    /* The EC seems to be making things worse, instead of better. Zap it! */
    memset(ec->fir_taps32, 0, ec->taps * sizeof(int32_t));
    for (i = 0; i < 4; i++)
      memset(ec->fir_taps16[i], 0, ec->taps * sizeof(int16_t));
  }
#if defined(XYZZY)
  /* Unfinished echo suppressor. It uses variables which are not declared in
     this function, so it cannot be enabled as it stands. */
  if ((ec->adaption_mode & ECHO_CAN_USE_SUPPRESSOR)) {
    ec->supp_test1 +=
      (ec->fir_state.history[ec->curr_pos] -
       ec->fir_state.history[(ec->curr_pos - 7) & ec->tap_mask]);
    ec->supp_test2 +=
      (ec->fir_state.history[(ec->curr_pos - 24) & ec->tap_mask] -
       ec->fir_state.history[(ec->curr_pos - 31) & ec->tap_mask]);
    if (ec->supp_test1 > 42 && ec->supp_test2 > 42)
      supp_change = 25;
    else
      supp_change = 50;
    supp = supp_change + k1 * ec->supp1 + k2 * ec->supp2;
    ec->supp2 = ec->supp1;
    ec->supp1 = supp;
    clean_rx *= (1 - supp);
  }
#endif

  if ((ec->adaption_mode & ECHO_CAN_USE_NLP)) {
    /* Non-linear processor - a fancy way to say "zap small signals, to avoid
       residual echo due to (uLaw/ALaw) non-linearity in the channel.". */
    if (ec->rx_power[1] < 30000000) {
      if (!ec->cng) {
        /* Latch the residual level on entry to the zapped state */
        ec->cng_level = ec->clean_rx_power;
        ec->cng = TRUE;
      }
      if ((ec->adaption_mode & ECHO_CAN_USE_CNG)) {
        /* Very elementary comfort noise generation */
        /* Just random numbers rolled off very vaguely Hoth-like */
        ec->cng_rndnum = 1664525U * ec->cng_rndnum + 1013904223U;
        ec->cng_filter = ((ec->cng_rndnum & 0xFFFF) - 32768 + 5 * ec->cng_filter) >> 3;
        clean_rx = (ec->cng_filter * ec->cng_level) >> 17;
        /* TODO: A better CNG, with more accurate (tracking) spectral shaping! */
      } else {
        clean_rx = 0;
      }
    } else {
      ec->cng = FALSE;
    }
  } else {
    ec->cng = FALSE;
  }

  /* Roll around the rolling buffer */
  if (ec->curr_pos <= 0)
    ec->curr_pos = ec->taps;
  ec->curr_pos--;
  return (int16_t) clean_rx;
}

#endif //NOT_NEEDED
/*- End of function --------------------------------------------------------*/
/*- End of file ------------------------------------------------------------*/
#endif

#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <fcntl.h>
#include <math.h>

//#include "spandsp/telephony.h"
//#include "spandsp/tone_detect.h"
//#include "spandsp/tone_generate.h"
//#include "spandsp/super_tone_rx.h"
//#include "giova.h"

#if !defined(M_PI)
/* C99 systems may not define M_PI */
#define M_PI 3.14159265358979323846264338327
#endif

//#define USE_3DNOW

/* Defaults for DTMF generation. In this file they are referenced only by the
   disabled dtmf_tx_initialise(), which passes them to
   make_tone_gen_descriptor() as the tone level and the on/off times.
   NOTE(review): the units (presumably dBm0 and milliseconds) are not visible
   here - confirm against make_tone_gen_descriptor(). */
#define DEFAULT_DTMF_TX_LEVEL       -10
#define DEFAULT_DTMF_TX_ON_TIME     50
#define DEFAULT_DTMF_TX_OFF_TIME    55

/* Detection thresholds used by dtmf_rx(). */
/* Minimum Goertzel energy the strongest row and the strongest column must
   each reach. */
#define DTMF_THRESHOLD              8.0e7f
/* Default limits on the row/column energy ratio, loaded into
   s->normal_twist and s->reverse_twist by dtmf_rx_init(). */
#define DTMF_NORMAL_TWIST           6.3f    /* 8dB */
#define DTMF_REVERSE_TWIST          2.5f    /* 4dB */
/* Factor by which the strongest row (or column) must exceed every other row
   (or column). */
#define DTMF_RELATIVE_PEAK_ROW      6.3f    /* 8dB */
#define DTMF_RELATIVE_PEAK_COL      6.3f    /* 8dB */
/* The summed row and column energy must exceed this multiple of the energy
   accumulated over the block in s->energy. */
#define DTMF_TO_TOTAL_ENERGY        42.0f

/* The four row (low group) tone frequencies, in Hz */
static const float dtmf_row[] = {
  697.0f, 770.0f, 852.0f, 941.0f
};
/* The four column (high group) tone frequencies, in Hz */
static const float dtmf_col[] = {
  1209.0f, 1336.0f, 1477.0f, 1633.0f
};

/* Keypad layout, indexed by (row << 2) + column */
static const char dtmf_positions[] = "123A" "456B" "789C" "*0#D";

/* Goertzel descriptors shared by every detector instance. They are filled in
   by the first call to dtmf_rx_init(). */
static goertzel_descriptor_t dtmf_detect_row[4];
static goertzel_descriptor_t dtmf_detect_col[4];

//
//static int dtmf_tx_inited = 0;
//static tone_gen_descriptor_t dtmf_digit_tones[16];

#if defined(USE_3DNOW)
/*
 * 3DNow! implementation of the Goertzel update for a group of four filters.
 * It is only compiled when USE_3DNOW is defined, which this file leaves
 * commented out.
 *
 * The v2, v3 and fac members of s[0..3] are packed into vv[4..15], the
 * assembly loop runs `samples` times over x, and the updated v2/v3 values in
 * vv[4..11] are copied back into s[0..3].
 *
 * NOTE(review): x is declared as float[], but dtmf_rx() passes a pointer into
 * its const int16_t sample buffer - confirm before enabling this path.
 * NOTE(review): "eax" and "ecx" are listed both as inputs and as clobbers;
 * current GCC releases are expected to reject that - confirm.
 */
static __inline__ void _dtmf_goertzel_update(goertzel_state_t * s, float x[], int samples)
{
  /* n, v and i are never used. vv[0..3] are never written or read by the C
     code. */
  int n;
  float v;
  int i;
  float vv[16];

  vv[4] = s[0].v2;
  vv[5] = s[1].v2;
  vv[6] = s[2].v2;
  vv[7] = s[3].v2;
  vv[8] = s[0].v3;
  vv[9] = s[1].v3;
  vv[10] = s[2].v3;
  vv[11] = s[3].v3;
  vv[12] = s[0].fac;
  vv[13] = s[1].fac;
  vv[14] = s[2].fac;
  vv[15] = s[3].fac;

  //v1 = s->v2;
  //s->v2 = s->v3;
  //s->v3 = s->fac*s->v2 - v1 + x[0];

  /* Register use, from the byte offsets into vv: mm2/mm3 are loaded with the
     v2 values (offsets 16 and 24), mm4/mm5 with the v3 values (32 and 40),
     and mm6/mm7 with the fac values (48 and 56). The loop advances eax
     through x by 4 bytes per pass and counts ecx down to zero. */
  __asm__ __volatile__(" femms;\n" " movq        16(%%edx),%%mm2;\n"
                       " movq        24(%%edx),%%mm3;\n" " movq        32(%%edx),%%mm4;\n"
                       " movq        40(%%edx),%%mm5;\n" " movq        48(%%edx),%%mm6;\n"
                       " movq        56(%%edx),%%mm7;\n" " jmp         1f;\n"
                       " .align 32;\n" " 1: ;\n" " prefetch    (%%eax);\n"
                       " movq        %%mm3,%%mm1;\n" " movq        %%mm2,%%mm0;\n"
                       " movq        %%mm5,%%mm3;\n" " movq        %%mm4,%%mm2;\n"
                       " pfmul       %%mm7,%%mm5;\n" " pfmul       %%mm6,%%mm4;\n"
                       " pfsub       %%mm1,%%mm5;\n" " pfsub       %%mm0,%%mm4;\n"
                       " movq        (%%eax),%%mm0;\n" " movq        %%mm0,%%mm1;\n"
                       " punpckldq   %%mm0,%%mm1;\n" " add         $4,%%eax;\n"
                       " pfadd       %%mm1,%%mm5;\n" " pfadd       %%mm1,%%mm4;\n"
                       " dec         %%ecx;\n" " jnz         1b;\n"
                       " movq        %%mm2,16(%%edx);\n" " movq        %%mm3,24(%%edx);\n"
                       " movq        %%mm4,32(%%edx);\n" " movq        %%mm5,40(%%edx);\n"
                       " femms;\n"::"c"(samples), "a"(x), "d"(vv)
                       :"memory", "eax", "ecx");

  /* Write the updated filter state back. fac is left unchanged. */
  s[0].v2 = vv[4];
  s[1].v2 = vv[5];
  s[2].v2 = vv[6];
  s[3].v2 = vv[7];
  s[0].v3 = vv[8];
  s[1].v3 = vv[9];
  s[2].v3 = vv[10];
  s[3].v3 = vv[11];
}

/*- End of function --------------------------------------------------------*/
#endif

/*
 * Feed a buffer of audio samples to the DTMF detector.
 *
 * Samples are consumed in blocks of 102. s->current_sample holds the fill
 * level of a partially processed block, so a block may span several calls.
 * In the default (non 3DNow!) build each sample is optionally passed through
 * the 350Hz and 440Hz notch filters (when s->filter_dialtone is set), added
 * to the block energy in s->energy, and fed to the four row and four column
 * Goertzel filters.
 *
 * At the end of each block the strongest row and column are put through the
 * level, twist, relative peak and total energy tests. The outcome is
 * debounced against s->last_hit and s->in_digit. A newly started digit is
 * appended to s->digits, or counted in s->lost_digits when the buffer already
 * holds MAX_DTMF_DIGITS. When s->callback is set, the buffered digits are
 * handed to it and the buffer is emptied.
 *
 * s        Detector state, as set up by dtmf_rx_init().
 * amp      The samples to process.
 * samples  The number of samples in amp.
 *
 * Returns 0 in all cases.
 */
int dtmf_rx(dtmf_rx_state_t * s, const int16_t amp[], int samples)
{
  float row_energy[4];
  float col_energy[4];
  float famp;
  float v1;
  int i;
  int j;
  int sample;
  int best_row;
  int best_col;
  int limit;
  uint8_t hit;

  hit = 0;
  for (sample = 0; sample < samples; sample = limit) {
    /* The block length is optimised to meet the DTMF specs. */
    /* limit is the end of the current run of samples: either the point where
       the 102 sample block completes, or the end of the supplied buffer. */
    if ((samples - sample) >= (102 - s->current_sample))
      limit = sample + (102 - s->current_sample);
    else
      limit = samples;
#if defined(USE_3DNOW)
    _dtmf_goertzel_update(s->row_out, amp + sample, limit - sample);
    _dtmf_goertzel_update(s->col_out, amp + sample, limit - sample);
#else
    /* The following unrolled loop takes only 35% (rough estimate) of the
       time of a rolled loop on the machine on which it was developed */
    for (j = sample; j < limit; j++) {
      famp = amp[j];
      if (s->filter_dialtone) {
        /* Sharp notches applied at 350Hz and 440Hz - the two common dialtone frequencies.
           These are rather high Q, to achieve the required narrowness, without using lots of
           sections. */
        v1 = 0.98356f * famp + 1.8954426f * s->z350_1 - 0.9691396f * s->z350_2;
        famp = v1 - 1.9251480f * s->z350_1 + s->z350_2;
        s->z350_2 = s->z350_1;
        s->z350_1 = v1;

        v1 = 0.98456f * famp + 1.8529543f * s->z440_1 - 0.9691396f * s->z440_2;
        famp = v1 - 1.8819938f * s->z440_1 + s->z440_2;
        s->z440_2 = s->z440_1;
        s->z440_1 = v1;
      }
      s->energy += famp * famp;
      /* With GCC 2.95, the following unrolled code seems to take about 35%
         (rough estimate) as long as a neat little 0-3 loop */
      /* Each group of three statements advances one Goertzel filter:
         v3 becomes fac*v3 - v2 + sample, and the old v3 moves to v2. */
      v1 = s->row_out[0].v2;
      s->row_out[0].v2 = s->row_out[0].v3;
      s->row_out[0].v3 = s->row_out[0].fac * s->row_out[0].v2 - v1 + famp;

      v1 = s->col_out[0].v2;
      s->col_out[0].v2 = s->col_out[0].v3;
      s->col_out[0].v3 = s->col_out[0].fac * s->col_out[0].v2 - v1 + famp;

      v1 = s->row_out[1].v2;
      s->row_out[1].v2 = s->row_out[1].v3;
      s->row_out[1].v3 = s->row_out[1].fac * s->row_out[1].v2 - v1 + famp;

      v1 = s->col_out[1].v2;
      s->col_out[1].v2 = s->col_out[1].v3;
      s->col_out[1].v3 = s->col_out[1].fac * s->col_out[1].v2 - v1 + famp;

      v1 = s->row_out[2].v2;
      s->row_out[2].v2 = s->row_out[2].v3;
      s->row_out[2].v3 = s->row_out[2].fac * s->row_out[2].v2 - v1 + famp;

      v1 = s->col_out[2].v2;
      s->col_out[2].v2 = s->col_out[2].v3;
      s->col_out[2].v3 = s->col_out[2].fac * s->col_out[2].v2 - v1 + famp;

      v1 = s->row_out[3].v2;
      s->row_out[3].v2 = s->row_out[3].v3;
      s->row_out[3].v3 = s->row_out[3].fac * s->row_out[3].v2 - v1 + famp;

      v1 = s->col_out[3].v2;
      s->col_out[3].v2 = s->col_out[3].v3;
      s->col_out[3].v3 = s->col_out[3].fac * s->col_out[3].v2 - v1 + famp;
    }
#endif
    s->current_sample += (limit - sample);
    /* Block not yet complete - the supplied buffer has run out */
    if (s->current_sample < 102)
      continue;

    /* We are at the end of a DTMF detection block */
    /* Find the peak row and the peak column */
    row_energy[0] = goertzel_result(&s->row_out[0]);
    best_row = 0;
    col_energy[0] = goertzel_result(&s->col_out[0]);
    best_col = 0;

    for (i = 1; i < 4; i++) {
      row_energy[i] = goertzel_result(&s->row_out[i]);
      if (row_energy[i] > row_energy[best_row])
        best_row = i;
      col_energy[i] = goertzel_result(&s->col_out[i]);
      if (col_energy[i] > col_energy[best_col])
        best_col = i;
    }
    hit = 0;
    /* Basic signal level test and the twist test */
    if (row_energy[best_row] >= DTMF_THRESHOLD && col_energy[best_col] >= DTMF_THRESHOLD
        && col_energy[best_col] < row_energy[best_row] * s->reverse_twist
        && col_energy[best_col] * s->normal_twist > row_energy[best_row]) {
      /* Relative peak test ... */
      /* The loop breaks early if any other row or column comes too close to
         the peak, leaving i < 4 as the failure indication. */
      for (i = 0; i < 4; i++) {
        if ((i != best_col
             && col_energy[i] * DTMF_RELATIVE_PEAK_COL > col_energy[best_col])
            || (i != best_row
                && row_energy[i] * DTMF_RELATIVE_PEAK_ROW > row_energy[best_row])) {
          break;
        }
      }
      /* ... and fraction of total energy test */
      if (i >= 4
          && (row_energy[best_row] + col_energy[best_col]) >
          DTMF_TO_TOTAL_ENERGY * s->energy) {
        hit = dtmf_positions[(best_row << 2) + best_col];
      }
    }
    /* The logic in the next test should ensure the following for different successive hit patterns:
       -----ABB = start of digit B.
       ----B-BB = start of digit B
       ----A-BB = start of digit B
       BBBBBABB = still in digit B.
       BBBBBB-- = end of digit B
       BBBBBBC- = end of digit B
       BBBBACBB = B ends, then B starts again.
       BBBBBBCC = B ends, then C starts.
       BBBBBCDD = B ends, then D starts.
       This can work with:
       - Back to back differing digits. Back-to-back digits should
       not happen. The spec. says there should be a gap between digits.
       However, many real phones do not impose a gap, and rolling across
       the keypad can produce little or no gap.
       - It tolerates nasty phones that give a very wobbly start to a digit.
       - VoIP can give sample slips. The phase jumps that produces will cause
       the block it is in to give no detection. This logic will ride over a
       single missed block, and not falsely declare a second digit. If the
       hiccup happens in the wrong place on a minimum length digit, however
       we would still fail to detect that digit. Could anything be done to
       deal with that? Packet loss is clearly a no-go zone.
       Note this is only relevant to VoIP using A-law, u-law or similar.
       Low bit rate codecs scramble DTMF too much for it to be recognised,
       and often slip in units larger than a sample. */
    if (hit != s->in_digit) {
      if (s->last_hit != s->in_digit) {
        /* We have two successive indications that something has changed. */
        /* To declare digit on, the hits must agree. Otherwise we declare tone off. */
        hit = (hit && hit == s->last_hit) ? hit : 0;
#if 0
        if (s->realtime_callback) {
          /* Avoid reporting multiple no digit conditions on flaky hits */
          if (s->in_digit || hit) {
            i = (s->in_digit
                 && !hit) ? -99 : rint(log10f(s->energy) * 10.0f - 20.08f - 90.30F +
                                       DBM0_MAX_POWER);
            s->realtime_callback(s->realtime_callback_data, hit, i);
          }
        } else {
#endif
          if (hit) {
            /* A new digit has started. Queue it, keeping the buffer
               terminated, or count it as lost if the buffer is full. */
            if (s->current_digits < MAX_DTMF_DIGITS) {
              s->digits[s->current_digits++] = (char) hit;
              s->digits[s->current_digits] = '\0';
              if (s->callback) {
                s->callback(s->callback_data, s->digits, s->current_digits);
                s->current_digits = 0;
              }
            } else {
              s->lost_digits++;
            }
          }
#if 0
        }
#endif
        s->in_digit = hit;
      }
    }
    s->last_hit = hit;
    /* Reinitialise the detector for the next block */
    for (i = 0; i < 4; i++) {
      goertzel_reset(&s->row_out[i]);
      goertzel_reset(&s->col_out[i]);
    }
    s->energy = 0.0f;
    s->current_sample = 0;
  }
  /* Hand over anything still buffered when a callback is installed */
  if (s->current_digits && s->callback) {
    s->callback(s->callback_data, s->digits, s->current_digits);
    s->digits[0] = '\0';
    s->current_digits = 0;
  }
  return 0;
}

/*- End of function --------------------------------------------------------*/

/*
 * Remove up to `max` detected digits from the front of the digit buffer.
 *
 * s    Detector state.
 * buf  Destination. It receives the digits followed by a terminating NUL, so
 *      it must have room for max + 1 characters.
 * max  The maximum number of digits to fetch. A negative value is treated as
 *      zero.
 *
 * Returns the number of digits copied into buf. Digits which were not
 * fetched are moved to the front of s->digits, which is kept NUL terminated.
 */
size_t dtmf_rx_get(dtmf_rx_state_t * s, char *buf, int max)
{
  /* A negative count would otherwise reach buf[max] below and write before
     the start of the caller's buffer. */
  if (max < 0)
    max = 0;
  if (max > s->current_digits)
    max = s->current_digits;
  if (max > 0) {
    memcpy(buf, s->digits, max);
    memmove(s->digits, s->digits + max, s->current_digits - max);
    s->current_digits -= max;
    /* dtmf_rx() keeps s->digits terminated; restore that after the shift. */
    s->digits[s->current_digits] = '\0';
  }
  buf[max] = '\0';
  return max;
}

/*- End of function --------------------------------------------------------*/

#if 0
/*
 * Install a real-time tone report callback in a DTMF detector (disabled
 * code).
 *
 * callback and user_data are stored in s->realtime_callback and
 * s->realtime_callback_data. The only code in dtmf_rx() that would call the
 * callback is itself compiled out.
 */
void dtmf_rx_set_realtime_callback(dtmf_rx_state_t * s, tone_report_func_t callback,
                                   void *user_data)
{
  s->realtime_callback = callback;
  s->realtime_callback_data = user_data;
}
#endif
/*- End of function --------------------------------------------------------*/

/*
 * Adjust the optional parameters of a DTMF detector. A negative value for
 * any parameter leaves the matching setting as it is.
 *
 * filter_dialtone  Stored as the dialtone filtering flag. Storing it also
 *                  clears the state of the 350Hz and 440Hz notch filters.
 * twist            Normal twist limit in dB, stored as a power ratio.
 * reverse_twist    Reverse twist limit in dB, stored as a power ratio.
 */
void dtmf_rx_parms(dtmf_rx_state_t * s, int filter_dialtone, int twist, int reverse_twist)
{
  float tenths;

  /* The twist limits arrive in dB: ratio = 10^(dB/10) */
  if (twist >= 0) {
    tenths = twist / 10.0f;
    s->normal_twist = powf(10.0f, tenths);
  }
  if (reverse_twist >= 0) {
    tenths = reverse_twist / 10.0f;
    s->reverse_twist = powf(10.0f, tenths);
  }

  if (filter_dialtone >= 0) {
    /* Start the notch filters from rest */
    s->z440_2 = 0.0f;
    s->z440_1 = 0.0f;
    s->z350_2 = 0.0f;
    s->z350_1 = 0.0f;
    s->filter_dialtone = filter_dialtone;
  }
}

/*- End of function --------------------------------------------------------*/

/*
 * Prepare a DTMF detector for use.
 *
 * s          The detector state to initialise.
 * callback   Called by dtmf_rx() to deliver detected digits. It may be NULL,
 *            in which case the digits accumulate in s->digits.
 * user_data  Opaque pointer passed back to the callback.
 *
 * The first call also builds the Goertzel descriptors shared by all
 * detectors. Dialtone filtering starts disabled, and the twist limits start
 * at their compile-time defaults.
 *
 * Returns s.
 */
dtmf_rx_state_t *dtmf_rx_init(dtmf_rx_state_t * s, dtmf_rx_callback_t callback,
                              void *user_data)
{
  static int initialised = 0;
  int tone;

  /* One-off construction of the shared row and column descriptors */
  if (!initialised) {
    for (tone = 0; tone < 4; tone++) {
      make_goertzel_descriptor(&dtmf_detect_row[tone], dtmf_row[tone], 102);
      make_goertzel_descriptor(&dtmf_detect_col[tone], dtmf_col[tone], 102);
    }
    initialised = 1;
  }

  /* Digit delivery */
  s->callback = callback;
  s->callback_data = user_data;
  s->realtime_callback = NULL;
  s->realtime_callback_data = NULL;

  /* Detection parameters */
  s->filter_dialtone = 0;
  s->normal_twist = DTMF_NORMAL_TWIST;
  s->reverse_twist = DTMF_REVERSE_TWIST;

  /* Debounce state */
  s->in_digit = 0;
  s->last_hit = 0;

  /* Per-detector filter state */
  tone = 0;
  while (tone < 4) {
    goertzel_init(&s->row_out[tone], &dtmf_detect_row[tone]);
    goertzel_init(&s->col_out[tone], &dtmf_detect_col[tone]);
    tone++;
  }
  s->energy = 0.0f;
  s->current_sample = 0;

  /* An empty digit buffer */
  s->lost_digits = 0;
  s->current_digits = 0;
  s->digits[0] = '\0';
  return s;
}

/*- End of function --------------------------------------------------------*/

#if 0
/*! Build the table of 16 DTMF tone generator descriptors, once.
    Entry (row * 4 + col) pairs dtmf_row[row] with dtmf_col[col], using the
    default transmit level and on/off times. Later calls return immediately. */
static void dtmf_tx_initialise(void)
{
  int digit;

  if (!dtmf_tx_inited) {
    /* Walk the table in index order; index / 4 is the row and index % 4 the
       column, which visits the entries in the same order as a row/col nest. */
    for (digit = 0; digit < 16; digit++) {
      make_tone_gen_descriptor(&dtmf_digit_tones[digit],
                               (int) dtmf_row[digit / 4], DEFAULT_DTMF_TX_LEVEL,
                               (int) dtmf_col[digit % 4], DEFAULT_DTMF_TX_LEVEL,
                               DEFAULT_DTMF_TX_ON_TIME, DEFAULT_DTMF_TX_OFF_TIME,
                               0, 0, FALSE);
    }
    dtmf_tx_inited = TRUE;
  }
}

/*- End of function --------------------------------------------------------*/

/*! Generate audio for the queued DTMF digits.
    Any tone left unfinished by the previous call is continued first; queued
    digits are then consumed until the buffer is full or the queue is empty.
    Queued characters not found in dtmf_positions are consumed silently.
    \param s The DTMF generator context.
    \param amp The buffer to receive the generated samples.
    \param max_samples The capacity of amp, in samples.
    \return The sum of the values returned by tone_gen() during this call. */
int dtmf_tx(dtmf_tx_state_t * s, int16_t amp[], int max_samples)
{
  int len = 0;
  size_t dig = 0;
  char *cp;

  /* A non-negative section means a tone is still part way through. */
  if (s->tones.current_section >= 0)
    len = tone_gen(&(s->tones), amp, max_samples);

  while (dig < s->current_digits && len < max_samples) {
    /* Take the next queued character, whether or not it is a valid digit. */
    cp = strchr(dtmf_positions, s->digits[dig]);
    dig++;
    if (cp != NULL) {
      tone_gen_init(&(s->tones), &(s->tone_descriptors[cp - dtmf_positions]));
      len += tone_gen(&(s->tones), amp + len, max_samples - len);
    }
  }

  if (dig > 0) {
    /* Drop the characters used above and close up the queue. */
    s->current_digits -= dig;
    memmove(s->digits, s->digits + dig, s->current_digits);
  }
  return len;
}

/*- End of function --------------------------------------------------------*/

/*! Queue a string of DTMF digits for transmission.
    The string is accepted on an all-or-nothing basis: it is only copied into
    the digit buffer if every character fits.
    \param s The DTMF generator context.
    \param digits The NUL terminated string of digits to queue.
    \return Zero if the whole string was queued (or the string was empty).
            Otherwise the number of characters that would not fit, which is
            always greater than zero; nothing is queued in that case. */
size_t dtmf_tx_put(dtmf_tx_state_t * s, const char *digits)
{
  size_t len;

  len = strlen(digits);
  if (len == 0)
    return 0;

  if (s->current_digits + len > MAX_DTMF_DIGITS) {
    /* Report the overflow, not the free space. Returning the free space
       would give zero when the buffer is exactly full, which a caller could
       not tell apart from success. */
    return s->current_digits + len - MAX_DTMF_DIGITS;
  }

  memcpy(s->digits + s->current_digits, digits, len);
  s->current_digits += len;
  return 0;
}

/*- End of function --------------------------------------------------------*/

/*! Initialise a DTMF generator context.
    Ensures the shared tone descriptor table has been built, points the
    context at it, and leaves the context with an empty digit queue and no
    tone in progress.
    \param s The DTMF generator context to initialise. It is dereferenced
           unconditionally.
    \return The context pointer that was passed in. */
dtmf_tx_state_t *dtmf_tx_init(dtmf_tx_state_t * s)
{
  if (!dtmf_tx_inited) {
    dtmf_tx_initialise();
  }
  s->tone_descriptors = dtmf_digit_tones;
  s->current_digits = 0;
  s->current_sample = 0;
  /* Set up the generator from the first descriptor, then overwrite the
     section index with -1; dtmf_tx() only resumes a tone when it is >= 0. */
  tone_gen_init(&(s->tones), &dtmf_digit_tones[0]);
  s->tones.current_section = -1;
  return s;
}
#endif //NO TX
/*- End of function --------------------------------------------------------*/
/*- End of file ------------------------------------------------------------*/

/*! Fill in a Goertzel descriptor for one frequency.
    \param t The descriptor to fill in.
    \param freq The frequency to detect, relative to a sample rate that is
           fixed at 8000 here.
    \param samples The block length, stored unchanged in the descriptor. */
void make_goertzel_descriptor(goertzel_descriptor_t * t, float freq, int samples)
{
  t->samples = samples;
  /* The filter coefficient is 2*cos(2*pi*freq/rate). The rate is the literal
     8000 rather than a SAMPLE_RATE macro. */
  t->fac = 2.0f * cosf(2.0f * M_PI * (freq / (float) 8000));
}

/*- End of function --------------------------------------------------------*/

/*! Initialise a Goertzel filter state from a descriptor.
    \param s The state to initialise. If NULL, a new state is obtained from
           malloc() and returned; nothing here retains or frees it.
    \param t The descriptor supplying the coefficient and block length.
    \return The initialised state, or NULL if s was NULL and the allocation
            failed. */
goertzel_state_t *goertzel_init(goertzel_state_t * s, goertzel_descriptor_t * t)
{
  if (s == NULL) {
    s = malloc(sizeof(goertzel_state_t));
    if (s == NULL)
      return NULL;
  }

  /* Copy the descriptor's settings and start from a cleared filter. */
  s->fac = t->fac;
  s->samples = t->samples;
  s->v3 = 0.0;
  s->v2 = 0.0;
  s->current_sample = 0;
  return s;
}

/*- End of function --------------------------------------------------------*/

/*! Return a Goertzel filter state to the start of a block.
    The coefficient and block length are left untouched.
    \param s The state to reset. */
void goertzel_reset(goertzel_state_t * s)
{
  s->current_sample = 0;
  s->v3 = 0.0;
  s->v2 = 0.0;
}

/*- End of function --------------------------------------------------------*/

/*! Feed samples into a Goertzel filter.
    At most enough samples to complete the current block are consumed.
    \param s The filter state to update.
    \param amp The samples to process.
    \param samples The number of samples available in amp.
    \return The number of samples actually consumed, which may be fewer than
            were offered. */
int goertzel_update(goertzel_state_t * s, const int16_t amp[], int samples)
{
  const int room = s->samples - s->current_sample;
  int n;

  /* Never run past the end of the current block. */
  if (samples > room)
    samples = room;

  for (n = 0; n < samples; n++) {
    /* One step of the recurrence: v3 = fac*v2 - v1 + x. */
    const float prev = s->v2;

    s->v2 = s->v3;
    s->v3 = s->fac * s->v2 - prev + amp[n];
  }
  s->current_sample += samples;
  return samples;
}

/*- End of function --------------------------------------------------------*/

/*! Compute the output of a Goertzel filter.
    Note that this modifies the filter state: one extra zero-valued sample is
    run through the recurrence before the result is formed.
    \param s The filter state to evaluate.
    \return v3*v3 + v2*v2 - v2*v3*fac, evaluated on the updated state. The
            value is not scaled down to compensate for the gain of the filter
            (the usual DFT magnification effect). */
float goertzel_result(goertzel_state_t * s)
{
  /* Flush the recursive part by pushing a single zero through it. */
  const float prev = s->v2;

  s->v2 = s->v3;
  s->v3 = s->fac * s->v2 - prev;
  /* Non-recursive part of the filter, left unscaled. */
  return s->v3 * s->v3 + s->v2 * s->v2 - s->v2 * s->v3 * s->fac;
}

/*- End of function --------------------------------------------------------*/
/*- End of file ------------------------------------------------------------*/