Browse Source

Transceiver52M: Replace convolve and related calls with SSE implementation

This large patch replaced the convolve() call with an SSE vector
enabled version. The lower C and SSE intrinsic based code operates
on fixed and aligned vectors for the filter taps. The storage format
of interleaved I/Q for both complex and real vectors is maintained.

SSE filter tap values must:

  1. Start 16-byte aligned
  2. Number with a multiple of 4 between 4 and 20 for real taps
  3. Number with a multiple of 4 for complex taps

Non-compliant values will fall back to non-SSE usage. Fixed length
iterators mean that head and tail cases may require reallocation of
the input vector, which is automatically handled by the upper C++
interface.

Other calls are affected by these changes and adjusted or rewritten
accordingly. The underlying algorithms, however, are unchanged.

  generateGSMPulse()
  analyzeTrafficBurst()
  detectRACHBurst()

Intel SSE configuration is automatically detected and configured at
build time with Autoconf macros.

Signed-off-by: Thomas Tsou <tom@tsou.cc>
kluchnikov/recv-timeout-fix
Thomas Tsou 9 years ago
parent
commit
3eaae80c90
  1. 1
      Makefile.am
  2. 19
      Transceiver52M/Makefile.am
  3. 714
      Transceiver52M/convolve.c
  4. 30
      Transceiver52M/convolve.h
  5. 644
      Transceiver52M/sigProcLib.cpp
  6. 97
      Transceiver52M/sigProcLib.h
  7. 15
      Transceiver52M/sigProcLibTest.cpp
  8. 72
      config/ax_check_compile_flag.m4
  9. 221
      config/ax_ext.m4
  10. 79
      config/ax_gcc_x86_avx_xgetbv.m4
  11. 79
      config/ax_gcc_x86_cpuid.m4
  12. 4
      configure.ac

1
Makefile.am

@ -20,6 +20,7 @@
include $(top_srcdir)/Makefile.common
ACLOCAL_AMFLAGS = -I config
AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(USB_INCLUDES) $(WITH_INCLUDES)
AM_CXXFLAGS = -Wall -pthread -ldl
#AM_CXXFLAGS = -Wall -O2 -NDEBUG -pthread -ldl

19
Transceiver52M/Makefile.am

@ -21,19 +21,18 @@
include $(top_srcdir)/Makefile.common
AM_CFLAGS = $(STD_DEFINES_AND_INCLUDES) -std=gnu99 -march=native
AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES)
AM_CXXFLAGS = -ldl -lpthread
#UHD wins if both are defined
if UHD
AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(UHD_CFLAGS)
AM_CPPFLAGS += $(UHD_CFLAGS)
else
if USRP1
AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(USRP_CFLAGS)
else
#we should never be here, as this doesn't build if one of the above
#doesn't exist
AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES)
AM_CPPFLAGS += $(USRP_CFLAGS)
endif
endif
AM_CXXFLAGS = -ldl -lpthread
rev2dir = $(datadir)/usrp/rev2
rev4dir = $(datadir)/usrp/rev4
@ -53,7 +52,8 @@ COMMON_SOURCES = \
radioClock.cpp \
sigProcLib.cpp \
Transceiver.cpp \
DummyLoad.cpp
DummyLoad.cpp \
convolve.c
libtransceiver_la_SOURCES = \
$(COMMON_SOURCES) \
@ -75,7 +75,8 @@ noinst_HEADERS = \
USRPDevice.h \
DummyLoad.h \
rcvLPF_651.h \
sendLPF_961.h
sendLPF_961.h \
convolve.h
USRPping_SOURCES = USRPping.cpp
USRPping_LDADD = \

714
Transceiver52M/convolve.c

@ -0,0 +1,714 @@
/*
* SSE Convolution
* Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <malloc.h>
#include <string.h>
#include <stdio.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#ifdef HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>
/* 4-tap SSE complex-real convolution */
static void sse_conv_real4(float *restrict x,
float *restrict h,
float *restrict y,
int len)
{
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
/* Quad multiply */
m4 = _mm_mul_ps(m2, m7);
m5 = _mm_mul_ps(m3, m7);
/* Sum and store */
m6 = _mm_hadd_ps(m4, m5);
m0 = _mm_hadd_ps(m6, m6);
_mm_store_ss(&y[2 * i + 0], m0);
m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
_mm_store_ss(&y[2 * i + 1], m0);
}
}
/* 8-tap SSE complex-real convolution */
static void sse_conv_real8(float *restrict x,
float *restrict h,
float *restrict y,
int len)
{
__m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
m2 = _mm_load_ps(&h[8]);
m3 = _mm_load_ps(&h[12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 12]);
m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
/* Quad multiply */
m6 = _mm_mul_ps(m6, m4);
m7 = _mm_mul_ps(m7, m4);
m8 = _mm_mul_ps(m8, m5);
m9 = _mm_mul_ps(m9, m5);
/* Sum and store */
m6 = _mm_add_ps(m6, m8);
m7 = _mm_add_ps(m7, m9);
m6 = _mm_hadd_ps(m6, m7);
m6 = _mm_hadd_ps(m6, m6);
_mm_store_ss(&y[2 * i + 0], m6);
m6 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 3, 2, 1));
_mm_store_ss(&y[2 * i + 1], m6);
}
}
/* 12-tap SSE complex-real convolution */
static void sse_conv_real12(float *restrict x,
float *restrict h,
float *restrict y,
int len)
{
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14;
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
m2 = _mm_load_ps(&h[8]);
m3 = _mm_load_ps(&h[12]);
m4 = _mm_load_ps(&h[16]);
m5 = _mm_load_ps(&h[20]);
m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
m0 = _mm_loadu_ps(&x[2 * i + 16]);
m1 = _mm_loadu_ps(&x[2 * i + 20]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
/* Quad multiply */
m0 = _mm_mul_ps(m4, m12);
m1 = _mm_mul_ps(m5, m12);
m2 = _mm_mul_ps(m6, m13);
m3 = _mm_mul_ps(m7, m13);
m4 = _mm_mul_ps(m8, m14);
m5 = _mm_mul_ps(m9, m14);
/* Sum and store */
m8 = _mm_add_ps(m0, m2);
m9 = _mm_add_ps(m1, m3);
m10 = _mm_add_ps(m8, m4);
m11 = _mm_add_ps(m9, m5);
m2 = _mm_hadd_ps(m10, m11);
m3 = _mm_hadd_ps(m2, m2);
_mm_store_ss(&y[2 * i + 0], m3);
m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
_mm_store_ss(&y[2 * i + 1], m3);
}
}
/* 16-tap SSE complex-real convolution */
static void sse_conv_real16(float *restrict x,
float *restrict h,
float *restrict y,
int len)
{
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14, m15;
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
m2 = _mm_load_ps(&h[8]);
m3 = _mm_load_ps(&h[12]);
m4 = _mm_load_ps(&h[16]);
m5 = _mm_load_ps(&h[20]);
m6 = _mm_load_ps(&h[24]);
m7 = _mm_load_ps(&h[28]);
m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
m15 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
m0 = _mm_loadu_ps(&x[2 * i + 16]);
m1 = _mm_loadu_ps(&x[2 * i + 20]);
m2 = _mm_loadu_ps(&x[2 * i + 24]);
m3 = _mm_loadu_ps(&x[2 * i + 28]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
/* Quad multiply */
m0 = _mm_mul_ps(m4, m12);
m1 = _mm_mul_ps(m5, m12);
m2 = _mm_mul_ps(m6, m13);
m3 = _mm_mul_ps(m7, m13);
m4 = _mm_mul_ps(m8, m14);
m5 = _mm_mul_ps(m9, m14);
m6 = _mm_mul_ps(m10, m15);
m7 = _mm_mul_ps(m11, m15);
/* Sum and store */
m8 = _mm_add_ps(m0, m2);
m9 = _mm_add_ps(m1, m3);
m10 = _mm_add_ps(m4, m6);
m11 = _mm_add_ps(m5, m7);
m0 = _mm_add_ps(m8, m10);
m1 = _mm_add_ps(m9, m11);
m2 = _mm_hadd_ps(m0, m1);
m3 = _mm_hadd_ps(m2, m2);
_mm_store_ss(&y[2 * i + 0], m3);
m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
_mm_store_ss(&y[2 * i + 1], m3);
}
}
/* 20-tap SSE complex-real convolution */
static void sse_conv_real20(float *restrict x,
float *restrict h,
float *restrict y,
int len)
{
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m11, m12, m13, m14, m15;
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
m2 = _mm_load_ps(&h[8]);
m3 = _mm_load_ps(&h[12]);
m4 = _mm_load_ps(&h[16]);
m5 = _mm_load_ps(&h[20]);
m6 = _mm_load_ps(&h[24]);
m7 = _mm_load_ps(&h[28]);
m8 = _mm_load_ps(&h[32]);
m9 = _mm_load_ps(&h[36]);
m11 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m12 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m13 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
m14 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
m15 = _mm_shuffle_ps(m8, m9, _MM_SHUFFLE(0, 2, 0, 2));
for (int i = 0; i < len; i++) {
/* Multiply-accumulate first 12 taps */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 12]);
m4 = _mm_loadu_ps(&x[2 * i + 16]);
m5 = _mm_loadu_ps(&x[2 * i + 20]);
m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
m0 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
m1 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(1, 3, 1, 3));
m2 = _mm_mul_ps(m6, m11);
m3 = _mm_mul_ps(m7, m11);
m4 = _mm_mul_ps(m8, m12);
m5 = _mm_mul_ps(m9, m12);
m6 = _mm_mul_ps(m0, m13);
m7 = _mm_mul_ps(m1, m13);
m0 = _mm_add_ps(m2, m4);
m1 = _mm_add_ps(m3, m5);
m8 = _mm_add_ps(m0, m6);
m9 = _mm_add_ps(m1, m7);
/* Multiply-accumulate last 8 taps */
m0 = _mm_loadu_ps(&x[2 * i + 24]);
m1 = _mm_loadu_ps(&x[2 * i + 28]);
m2 = _mm_loadu_ps(&x[2 * i + 32]);
m3 = _mm_loadu_ps(&x[2 * i + 36]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
m0 = _mm_mul_ps(m4, m14);
m1 = _mm_mul_ps(m5, m14);
m2 = _mm_mul_ps(m6, m15);
m3 = _mm_mul_ps(m7, m15);
m4 = _mm_add_ps(m0, m2);
m5 = _mm_add_ps(m1, m3);
/* Final sum and store */
m0 = _mm_add_ps(m8, m4);
m1 = _mm_add_ps(m9, m5);
m2 = _mm_hadd_ps(m0, m1);
m3 = _mm_hadd_ps(m2, m2);
_mm_store_ss(&y[2 * i + 0], m3);
m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
_mm_store_ss(&y[2 * i + 1], m3);
}
}
/* 4*N-tap SSE complex-real convolution */
static void sse_conv_real4n(float *x, float *h, float *y, int h_len, int len)
{
__m128 m0, m1, m2, m4, m5, m6, m7;
for (int i = 0; i < len; i++) {
/* Zero */
m6 = _mm_setzero_ps();
m7 = _mm_setzero_ps();
for (int n = 0; n < h_len / 4; n++) {
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[8 * n + 0]);
m1 = _mm_load_ps(&h[8 * n + 4]);
m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
/* Quad multiply */
m0 = _mm_mul_ps(m2, m4);
m1 = _mm_mul_ps(m2, m5);
/* Accumulate */
m6 = _mm_add_ps(m6, m0);
m7 = _mm_add_ps(m7, m1);
}
m0 = _mm_hadd_ps(m6, m7);
m0 = _mm_hadd_ps(m0, m0);
_mm_store_ss(&y[2 * i + 0], m0);
m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
_mm_store_ss(&y[2 * i + 1], m0);
}
}
/* 4*N-tap SSE complex-complex convolution */
static void sse_conv_cmplx_4n(float *x, float *h, float *y, int h_len, int len)
{
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
for (int i = 0; i < len; i++) {
/* Zero */
m6 = _mm_setzero_ps();
m7 = _mm_setzero_ps();
for (int n = 0; n < h_len / 4; n++) {
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[8 * n + 0]);
m1 = _mm_load_ps(&h[8 * n + 4]);
m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
/* Quad multiply */
m0 = _mm_mul_ps(m2, m4);
m1 = _mm_mul_ps(m3, m5);
m2 = _mm_mul_ps(m2, m5);
m3 = _mm_mul_ps(m3, m4);
/* Sum */
m0 = _mm_sub_ps(m0, m1);
m2 = _mm_add_ps(m2, m3);
/* Accumulate */
m6 = _mm_add_ps(m6, m0);
m7 = _mm_add_ps(m7, m2);
}
m0 = _mm_hadd_ps(m6, m7);
m0 = _mm_hadd_ps(m0, m0);
_mm_store_ss(&y[2 * i + 0], m0);
m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
_mm_store_ss(&y[2 * i + 1], m0);
}
}
/* 8*N-tap SSE complex-complex convolution */
static void sse_conv_cmplx_8n(float *x, float *h, float *y, int h_len, int len)
{
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14, m15;
for (int i = 0; i < len; i++) {
/* Zero */
m12 = _mm_setzero_ps();
m13 = _mm_setzero_ps();
m14 = _mm_setzero_ps();
m15 = _mm_setzero_ps();
for (int n = 0; n < h_len / 8; n++) {
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[16 * n + 0]);
m1 = _mm_load_ps(&h[16 * n + 4]);
m2 = _mm_load_ps(&h[16 * n + 8]);
m3 = _mm_load_ps(&h[16 * n + 12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
/* Quad multiply */
m0 = _mm_mul_ps(m4, m8);
m1 = _mm_mul_ps(m5, m9);
m2 = _mm_mul_ps(m6, m10);
m3 = _mm_mul_ps(m7, m11);
m4 = _mm_mul_ps(m4, m9);
m5 = _mm_mul_ps(m5, m8);
m6 = _mm_mul_ps(m6, m11);
m7 = _mm_mul_ps(m7, m10);
/* Sum */
m0 = _mm_sub_ps(m0, m1);
m2 = _mm_sub_ps(m2, m3);
m4 = _mm_add_ps(m4, m5);
m6 = _mm_add_ps(m6, m7);
/* Accumulate */
m12 = _mm_add_ps(m12, m0);
m13 = _mm_add_ps(m13, m2);
m14 = _mm_add_ps(m14, m4);
m15 = _mm_add_ps(m15, m6);
}
m0 = _mm_add_ps(m12, m13);
m1 = _mm_add_ps(m14, m15);
m2 = _mm_hadd_ps(m0, m1);
m2 = _mm_hadd_ps(m2, m2);
_mm_store_ss(&y[2 * i + 0], m2);
m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 3, 2, 1));
_mm_store_ss(&y[2 * i + 1], m2);
}
}
#endif
/* Base multiply and accumulate complex-real */
static void mac_real(float *x, float *h, float *y)
{
y[0] += x[0] * h[0];
y[1] += x[1] * h[0];
}
/* Base multiply and accumulate complex-complex */
static void mac_cmplx(float *x, float *h, float *y)
{
y[0] += x[0] * h[0] - x[1] * h[1];
y[1] += x[0] * h[1] + x[1] * h[0];
}
/* Base vector complex-complex multiply and accumulate */
static void mac_real_vec_n(float *x, float *h, float *y,
int len, int step, int offset)
{
for (int i = offset; i < len; i += step)
mac_real(&x[2 * i], &h[2 * i], y);
}
/* Base vector complex-complex multiply and accumulate */
static void mac_cmplx_vec_n(float *x, float *h, float *y,
int len, int step, int offset)
{
for (int i = offset; i < len; i += step)
mac_cmplx(&x[2 * i], &h[2 * i], y);
}
/* Base complex-real convolution */
static int _base_convolve_real(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
for (int i = 0; i < len; i++) {
mac_real_vec_n(&x[2 * (i - (h_len - 1) + start)],
h,
&y[2 * i], h_len,
step, offset);
}
return len;
}
/* Base complex-complex convolution */
static int _base_convolve_complex(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
for (int i = 0; i < len; i++) {
mac_cmplx_vec_n(&x[2 * (i - (h_len - 1) + start)],
h,
&y[2 * i],
h_len, step, offset);
}
return len;
}
/* Buffer validity checks */
static int bounds_check(int x_len, int h_len, int y_len,
int start, int len, int step)
{
if ((x_len < 1) || (h_len < 1) ||
(y_len < 1) || (len < 1) || (step < 1)) {
fprintf(stderr, "Convolve: Invalid input\n");
return -1;
}
if ((start + len > x_len) || (len > y_len) || (x_len < h_len)) {
fprintf(stderr, "Convolve: Boundary exception\n");
fprintf(stderr, "start: %i, len: %i, x: %i, h: %i, y: %i\n",
start, len, x_len, h_len, y_len);
return -1;
}
return 0;
}
/* API: Aligned complex-real */
int convolve_real(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
void (*conv_func)(float *, float *, float *, int) = NULL;
void (*conv_func_n)(float *, float *, float *, int, int) = NULL;
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
memset(y, 0, len * 2 * sizeof(float));
#ifdef HAVE_SSE3
if (step <= 4) {
switch (h_len) {
case 4:
conv_func = sse_conv_real4;
break;
case 8:
conv_func = sse_conv_real8;
break;
case 12:
conv_func = sse_conv_real12;
break;
case 16:
conv_func = sse_conv_real16;
break;
case 20:
conv_func = sse_conv_real20;
break;
default:
if (!(h_len % 4))
conv_func_n = sse_conv_real4n;
}
}
#endif
if (conv_func) {
conv_func(&x[2 * (-(h_len - 1) + start)],
h, y, len);
} else if (conv_func_n) {
conv_func_n(&x[2 * (-(h_len - 1) + start)],
h, y, h_len, len);
} else {
_base_convolve_real(x, x_len,
h, h_len,
y, y_len,
start, len, step, offset);
}
return len;
}
/* API: Aligned complex-complex */
int convolve_complex(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
void (*conv_func)(float *, float *, float *, int, int) = NULL;
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
memset(y, 0, len * 2 * sizeof(float));
#ifdef HAVE_SSE3
if (step <= 4) {
if (!(h_len % 8))
conv_func = sse_conv_cmplx_8n;
else if (!(h_len % 4))
conv_func = sse_conv_cmplx_4n;
}
#endif
if (conv_func) {
conv_func(&x[2 * (-(h_len - 1) + start)],
h, y, h_len, len);
} else {
_base_convolve_complex(x, x_len,
h, h_len,
y, y_len,
start, len, step, offset);
}
return len;
}
/* API: Non-aligned (no SSE) complex-real */
int base_convolve_real(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
memset(y, 0, len * 2 * sizeof(float));
return _base_convolve_real(x, x_len,
h, h_len,
y, y_len,
start, len, step, offset);
}
/* API: Non-aligned (no SSE) complex-complex */
int base_convolve_complex(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
memset(y, 0, len * 2 * sizeof(float));
return _base_convolve_complex(x, x_len,
h, h_len,
y, y_len,
start, len, step, offset);
}
/* Aligned filter tap allocation */
void *convolve_h_alloc(int len)
{
#ifdef HAVE_SSE3
return memalign(16, len * 2 * sizeof(float));
#else
return malloc(len * 2 * sizeof(float));
#endif
}

30
Transceiver52M/convolve.h

@ -0,0 +1,30 @@
#ifndef _CONVOLVE_H_
#define _CONVOLVE_H_
void *convolve_h_alloc(int num);
int convolve_real(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset);
int convolve_complex(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset);
int base_convolve_real(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset);
int base_convolve_complex(float *x, int x_len,
float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset);
#endif /* _CONVOLVE_H_ */

644
Transceiver52M/sigProcLib.cpp

@ -29,6 +29,10 @@
using namespace GSM;
extern "C" {
#include "convolve.h"
}
#define TABLESIZE 1024
/** Lookup tables for trigonometric approximation */
@ -45,28 +49,35 @@ signalVector *GMSKRotation = NULL;
signalVector *GMSKReverseRotation = NULL;
/*
* RACH and midamble correlation waveforms
* RACH and midamble correlation waveforms. Store the buffer separately
* because we need to allocate it explicitly outside of the signal vector
* constructor. This is because C++ (prior to C++11) is unable to natively
* perform 16-byte memory alignment required by many SSE instructions.
*/
struct CorrelationSequence {
CorrelationSequence() : sequence(NULL)
CorrelationSequence() : sequence(NULL), buffer(NULL)
{
}
~CorrelationSequence()
{
delete sequence;
free(buffer);
}
signalVector *sequence;
void *buffer;
float TOA;
complex gain;
};
/*
* Gaussian and empty modulation pulses
* Gaussian and empty modulation pulses. Like the correlation sequences,
* store the runtime (Gaussian) buffer separately because of needed alignment
* for SSE instructions.
*/
struct PulseSequence {
PulseSequence() : gaussian(NULL), empty(NULL)
PulseSequence() : gaussian(NULL), empty(NULL), buffer(NULL)
{
}
@ -74,10 +85,12 @@ struct PulseSequence {
{
delete gaussian;
delete empty;
free(buffer);
}
signalVector *gaussian;
signalVector *empty;
void *buffer;
};
CorrelationSequence *gMidambles[] = {NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL};
@ -246,7 +259,7 @@ void initGMSKRotationTables(int sps)
bool sigProcLibSetup(int sps)
{
if ((sps != 0) && (sps != 2) && (sps != 4))
if ((sps != 1) && (sps != 2) && (sps != 4))
return false;
initTrigTables();
@ -295,174 +308,106 @@ void GMSKReverseRotate(signalVector &x) {
}
}
signalVector* convolve(const signalVector *a,
const signalVector *b,
signalVector *c,
ConvType spanType,
unsigned startIx,
unsigned len)
signalVector *convolve(const signalVector *x,
const signalVector *h,
signalVector *y,
ConvType spanType, int start,
unsigned len, unsigned step, int offset)
{
if ((a==NULL) || (b==NULL)) return NULL;
int La = a->size();
int Lb = b->size();
int startIndex;
unsigned int outSize;
switch (spanType) {
case FULL_SPAN:
startIndex = 0;
outSize = La+Lb-1;
break;
case OVERLAP_ONLY:
startIndex = La;
outSize = abs(La-Lb)+1;
break;
case START_ONLY:
startIndex = 0;
outSize = La;
break;
case WITH_TAIL:
startIndex = Lb;
outSize = La;
break;
case NO_DELAY:
if (Lb % 2)
startIndex = Lb/2;
else
startIndex = Lb/2-1;
outSize = La;
break;
case CUSTOM:
startIndex = startIx;
outSize = len;
break;
default:
return NULL;
}
int rc, head = 0, tail = 0;
bool alloc = false, append = false;
const signalVector *_x = NULL;
if (c==NULL)
c = new signalVector(outSize);
else if (c->size()!=outSize)
if (!x || !h)
return NULL;
signalVector::const_iterator aStart = a->begin();
signalVector::const_iterator bStart = b->begin();
signalVector::const_iterator aEnd = a->end();
signalVector::const_iterator bEnd = b->end();
signalVector::iterator cPtr = c->begin();
int t = startIndex;
int stopIndex = startIndex + outSize;
switch (b->getSymmetry()) {
case NONE:
{
while (t < stopIndex) {
signalVector::const_iterator aP = aStart+t;
signalVector::const_iterator bP = bStart;
if (a->isRealOnly() && b->isRealOnly()) {
float sum = 0.0;
while (bP < bEnd) {
if (aP < aStart) break;
if (aP < aEnd) sum += (aP->real())*(bP->real());
aP--;
bP++;
}
*cPtr++ = sum;
}
else if (a->isRealOnly()) {
complex sum = 0.0;
while (bP < bEnd) {
if (aP < aStart) break;
if (aP < aEnd) sum += (*bP)*(aP->real());
aP--;
bP++;
}
*cPtr++ = sum;
}
else if (b->isRealOnly()) {
complex sum = 0.0;
while (bP < bEnd) {
if (aP < aStart) break;
if (aP < aEnd) sum += (*aP)*(bP->real());
aP--;
bP++;
}
*cPtr++ = sum;
}
else {
complex sum = 0.0;
while (bP < bEnd) {
if (aP < aStart) break;
if (aP < aEnd) sum += (*aP)*(*bP);
aP--;
bP++;
}
*cPtr++ = sum;
}
t++;
}
}
switch (spanType) {
case START_ONLY:
start = 0;
head = h->size();
len = x->size();
append = true;
break;
case ABSSYM:
{
complex sum = 0.0;
bool isOdd = (bool) (Lb % 2);
if (isOdd)
bEnd = bStart + (Lb+1)/2;
else
bEnd = bStart + Lb/2;
while (t < stopIndex) {
signalVector::const_iterator aP = aStart+t;
signalVector::const_iterator aPsym = aP-Lb+1;
signalVector::const_iterator bP = bStart;
sum = 0.0;
if (!b->isRealOnly()) {
while (bP < bEnd) {
if (aP < aStart) break;
if (aP == aPsym)
sum+= (*aP)*(*bP);
else if ((aP < aEnd) && (aPsym >= aStart))
sum+= ((*aP)+(*aPsym))*(*bP);
else if (aP < aEnd)
sum += (*aP)*(*bP);
else if (aPsym >= aStart)
sum += (*aPsym)*(*bP);
aP--;
aPsym++;
bP++;
}
}
else {
while (bP < bEnd) {
if (aP < aStart) break;
if (aP == aPsym)
sum+= (*aP)*(bP->real());
else if ((aP < aEnd) && (aPsym >= aStart))
sum+= ((*aP)+(*aPsym))*(bP->real());
else if (aP < aEnd)
sum += (*aP)*(bP->real());
else if (aPsym >= aStart)
sum += (*aPsym)*(bP->real());
aP--;
aPsym++;
bP++;
}
}
*cPtr++ = sum;
t++;
}
case NO_DELAY:
start = h->size() / 2;
head = start;
tail = start;
len = x->size();
append = true;
break;
case CUSTOM:
if (start < h->size() - 1) {
head = h->size() - start;
append = true;
}
if (start + len > x->size()) {
tail = start + len - x->size();
append = true;
}
break;
default:
return NULL;
break;
}
return c;
}
/*
* Error if the output vector is too small. Create the output vector
* if the pointer is NULL.
*/
if (y && (len > y->size()))
return NULL;
if (!y) {
y = new signalVector(len);
alloc = true;
}
/* Prepend or post-pend the input vector if the parameters require it */
if (append)
_x = new signalVector(*x, head, tail);
else
_x = x;
/*
* Four convovle types:
* 1. Complex-Real (aligned)
* 2. Complex-Complex (aligned)
* 3. Complex-Real (!aligned)
* 4. Complex-Complex (!aligned)
*/
if (h->isRealOnly() && h->isAligned()) {
rc = convolve_real((float *) _x->begin(), _x->size(),
(float *) h->begin(), h->size(),
(float *) y->begin(), y->size(),
start, len, step, offset);
} else if (!h->isRealOnly() && h->isAligned()) {
rc = convolve_complex((float *) _x->begin(), _x->size(),
(float *) h->begin(), h->size(),
(float *) y->begin(), y->size(),
start, len, step, offset);
} else if (h->isRealOnly() && !h->isAligned()) {
rc = base_convolve_real((float *) _x->begin(), _x->size(),
(float *) h->begin(), h->size(),
(float *) y->begin(), y->size(),
start, len, step, offset);
} else if (!h->isRealOnly() && !h->isAligned()) {
rc = base_convolve_complex((float *) _x->begin(), _x->size(),
(float *) h->begin(), h->size(),
(float *) y->begin(), y->size(),
start, len, step, offset);
} else {
rc = -1;
}
if (append)
delete _x;
if (rc < 0) {
if (alloc)
delete y;
return NULL;
}
return y;
}
void generateGSMPulse(int sps, int symbolLength)
{
@ -477,9 +422,17 @@ void generateGSMPulse(int sps, int symbolLength)
GSMPulse->empty->isRealOnly(true);
*(GSMPulse->empty->begin()) = 1.0f;
len = sps * symbolLength;
if (len < 4)
len = 4;
/* GSM pulse approximation */
GSMPulse->gaussian = new signalVector(len);
GSMPulse->buffer = convolve_h_alloc(len);
GSMPulse->gaussian = new signalVector((complex *)
GSMPulse->buffer, 0, len);
GSMPulse->gaussian->setAligned(true);
GSMPulse->gaussian->isRealOnly(true);
signalVector::iterator xP = GSMPulse->gaussian->begin();
center = (float) (len - 1.0) / 2.0;
@ -560,31 +513,6 @@ signalVector* reverseConjugate(signalVector *b)
return tmp;
}
signalVector* correlate(signalVector *a,
signalVector *b,
signalVector *c,
ConvType spanType,
bool bReversedConjugated,
unsigned startIx,
unsigned len)
{
signalVector *tmp = NULL;
if (!bReversedConjugated) {
tmp = reverseConjugate(b);
}
else {
tmp = b;
}
c = convolve(a,tmp,c,spanType,startIx,len);
if (!bReversedConjugated) delete tmp;
return c;
}
/* soft output slicer */
bool vectorSlicer(signalVector *x)
{
@ -599,12 +527,13 @@ bool vectorSlicer(signalVector *x)
}
return true;
}
/* Assume input bits are not differentially encoded */
signalVector *modulateBurst(const BitVector &wBurst, int guardPeriodLength,
int sps, bool emptyPulse)
{
int burstLen;
signalVector *pulse, modBurst;
signalVector *pulse, *shapedBurst, modBurst;
signalVector::iterator modBurstItr;
if (emptyPulse)
@ -628,7 +557,9 @@ signalVector *modulateBurst(const BitVector &wBurst, int guardPeriodLength,
modBurst.isRealOnly(false);
// filter w/ pulse shape
signalVector *shapedBurst = convolve(&modBurst, pulse, NULL, NO_DELAY);
shapedBurst = convolve(&modBurst, pulse, NULL, START_ONLY);
if (!shapedBurst)
return NULL;
return shapedBurst;
}
@ -639,24 +570,24 @@ float sinc(float x)
return 1.0F;
}
void delayVector(signalVector &wBurst,
float delay)
bool delayVector(signalVector &wBurst, float delay)
{
int intOffset = (int) floor(delay);
float fracOffset = delay - intOffset;
// do fractional shift first, only do it for reasonable offsets
if (fabs(fracOffset) > 1e-2) {
// create sinc function
signalVector sincVector(21);
sincVector.isRealOnly(true);
signalVector::iterator sincBurstItr = sincVector.begin();
signalVector::iterator sincBurstItr = sincVector.end();
for (int i = 0; i < 21; i++)
*sincBurstItr++ = (complex) sinc(M_PI_F*(i-10-fracOffset));
*--sincBurstItr = (complex) sinc(M_PI_F*(i-10-fracOffset));
signalVector shiftedBurst(wBurst.size());
convolve(&wBurst,&sincVector,&shiftedBurst,NO_DELAY);
if (!convolve(&wBurst, &sincVector, &shiftedBurst, NO_DELAY))
return false;
wBurst.clone(shiftedBurst);
}
@ -861,25 +792,25 @@ bool generateMidamble(int sps, int tsc)
bool status = true;
complex *data = NULL;
signalVector *autocorr = NULL, *midamble = NULL;
signalVector *midMidamble = NULL;
signalVector *midMidamble = NULL, *_midMidamble = NULL;
if ((tsc < 0) || (tsc > 7))
if ((tsc < 0) || (tsc > 7))
return false;
delete gMidambles[tsc];
/* Use middle 16 bits of each TSC. Correlation sequence is not pulse shaped */
midMidamble = modulateBurst(gTrainingSequence[tsc].segment(5,16), 0, sps, true);
if (!midMidamble)
return false;
/* Simulated receive sequence is pulse shaped */
/* Simulated receive sequence is pulse shaped */
midamble = modulateBurst(gTrainingSequence[tsc], 0, sps, false);
if (!midamble) {
status = false;
goto release;
}
// NOTE: Because ideal TSC 16-bit midamble is 66 symbols into burst,
// the ideal TSC has an + 180 degree phase shift,
// due to the pi/2 frequency shift, that
@ -890,22 +821,32 @@ bool generateMidamble(int sps, int tsc)
conjugateVector(*midMidamble);
autocorr = correlate(midamble, midMidamble, NULL, NO_DELAY);
/* For SSE alignment, reallocate the midamble sequence on 16-byte boundary */
data = (complex *) convolve_h_alloc(midMidamble->size());
_midMidamble = new signalVector(data, 0, midMidamble->size());
_midMidamble->setAligned(true);
memcpy(_midMidamble->begin(), midMidamble->begin(),
midMidamble->size() * sizeof(complex));
autocorr = convolve(midamble, _midMidamble, NULL, NO_DELAY);
if (!autocorr) {
status = false;
goto release;
}
gMidambles[tsc] = new CorrelationSequence;
gMidambles[tsc]->sequence = midMidamble;
gMidambles[tsc]->gain = peakDetect(*autocorr,&gMidambles[tsc]->TOA,NULL);
gMidambles[tsc]->buffer = data;
gMidambles[tsc]->sequence = _midMidamble;
gMidambles[tsc]->gain = peakDetect(*autocorr,&gMidambles[tsc]->TOA, NULL);
release:
delete autocorr;
delete midamble;
delete midMidamble;
if (!status) {
delete midMidamble;
delete _midMidamble;
free(data);
gMidambles[tsc] =