diff --git a/Makefile.am b/Makefile.am index 058876de..06251f0d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -20,6 +20,7 @@ include $(top_srcdir)/Makefile.common +ACLOCAL_AMFLAGS = -I config AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(USB_INCLUDES) $(WITH_INCLUDES) AM_CXXFLAGS = -Wall -pthread -ldl #AM_CXXFLAGS = -Wall -O2 -NDEBUG -pthread -ldl diff --git a/Transceiver52M/Makefile.am b/Transceiver52M/Makefile.am index 6fcef7c6..67ab0ea0 100644 --- a/Transceiver52M/Makefile.am +++ b/Transceiver52M/Makefile.am @@ -21,19 +21,18 @@ include $(top_srcdir)/Makefile.common +AM_CFLAGS = $(STD_DEFINES_AND_INCLUDES) -std=gnu99 -march=native +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) +AM_CXXFLAGS = -ldl -lpthread + #UHD wins if both are defined if UHD -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(UHD_CFLAGS) +AM_CPPFLAGS += $(UHD_CFLAGS) else if USRP1 -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(USRP_CFLAGS) -else -#we should never be here, as this doesn't build if one of the above -#doesn't exist -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) +AM_CPPFLAGS += $(USRP_CFLAGS) endif endif -AM_CXXFLAGS = -ldl -lpthread rev2dir = $(datadir)/usrp/rev2 rev4dir = $(datadir)/usrp/rev4 @@ -53,7 +52,8 @@ COMMON_SOURCES = \ radioClock.cpp \ sigProcLib.cpp \ Transceiver.cpp \ - DummyLoad.cpp + DummyLoad.cpp \ + convolve.c libtransceiver_la_SOURCES = \ $(COMMON_SOURCES) \ @@ -75,7 +75,8 @@ noinst_HEADERS = \ USRPDevice.h \ DummyLoad.h \ rcvLPF_651.h \ - sendLPF_961.h + sendLPF_961.h \ + convolve.h USRPping_SOURCES = USRPping.cpp USRPping_LDADD = \ diff --git a/Transceiver52M/convolve.c b/Transceiver52M/convolve.c new file mode 100644 index 00000000..6f48ea0c --- /dev/null +++ b/Transceiver52M/convolve.c @@ -0,0 +1,714 @@ +/* + * SSE Convolution + * Copyright (C) 2012, 2013 Thomas Tsou + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef HAVE_SSE3 +#include +#include + +/* 4-tap SSE complex-real convolution */ +static void sse_conv_real4(float *restrict x, + float *restrict h, + float *restrict y, + int len) +{ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + + /* Load (aligned) filter taps */ + m0 = _mm_load_ps(&h[0]); + m1 = _mm_load_ps(&h[4]); + m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + + for (int i = 0; i < len; i++) { + /* Load (unaligned) input data */ + m0 = _mm_loadu_ps(&x[2 * i + 0]); + m1 = _mm_loadu_ps(&x[2 * i + 4]); + m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + + /* Quad multiply */ + m4 = _mm_mul_ps(m2, m7); + m5 = _mm_mul_ps(m3, m7); + + /* Sum and store */ + m6 = _mm_hadd_ps(m4, m5); + m0 = _mm_hadd_ps(m6, m6); + + _mm_store_ss(&y[2 * i + 0], m0); + m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1)); + _mm_store_ss(&y[2 * i + 1], m0); + } +} + +/* 8-tap SSE complex-real convolution */ +static void sse_conv_real8(float *restrict x, + float *restrict h, + float *restrict y, + int len) +{ + __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9; + + /* Load (aligned) filter taps */ + m0 = _mm_load_ps(&h[0]); + m1 = _mm_load_ps(&h[4]); + m2 = _mm_load_ps(&h[8]); + m3 = _mm_load_ps(&h[12]); + + m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + + for (int i = 0; i < len; i++) { + /* Load (unaligned) input data */ + m0 = _mm_loadu_ps(&x[2 * i + 0]); + m1 = _mm_loadu_ps(&x[2 * i + 4]); + m2 = _mm_loadu_ps(&x[2 * i + 8]); + m3 = _mm_loadu_ps(&x[2 * i + 12]); + + m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); + + /* Quad multiply */ + m6 = _mm_mul_ps(m6, m4); + m7 = _mm_mul_ps(m7, m4); + m8 = _mm_mul_ps(m8, m5); + m9 = _mm_mul_ps(m9, m5); + + /* Sum and store */ + m6 = _mm_add_ps(m6, m8); + m7 = _mm_add_ps(m7, m9); + m6 = _mm_hadd_ps(m6, m7); + m6 = _mm_hadd_ps(m6, m6); + + _mm_store_ss(&y[2 * i + 0], m6); + m6 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 3, 2, 1)); + _mm_store_ss(&y[2 * i + 1], m6); + } +} + +/* 12-tap SSE complex-real convolution */ +static void sse_conv_real12(float *restrict x, + float *restrict h, + float *restrict y, + int len) +{ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + __m128 m8, m9, m10, m11, m12, m13, m14; + + /* Load (aligned) filter taps */ + m0 = _mm_load_ps(&h[0]); + m1 = _mm_load_ps(&h[4]); + m2 = _mm_load_ps(&h[8]); + m3 = _mm_load_ps(&h[12]); + m4 = _mm_load_ps(&h[16]); + m5 = _mm_load_ps(&h[20]); + + m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2)); + + for (int i = 0; i < len; i++) { + /* Load (unaligned) input data */ + m0 = _mm_loadu_ps(&x[2 * i + 0]); + m1 = _mm_loadu_ps(&x[2 * i + 4]); + m2 = _mm_loadu_ps(&x[2 * i + 8]); + m3 = _mm_loadu_ps(&x[2 * i + 12]); + + m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); + + m0 = _mm_loadu_ps(&x[2 * i + 16]); + m1 = _mm_loadu_ps(&x[2 * i + 20]); + + m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + + /* Quad multiply */ + m0 = _mm_mul_ps(m4, m12); + m1 = _mm_mul_ps(m5, m12); + m2 = _mm_mul_ps(m6, m13); + m3 = _mm_mul_ps(m7, m13); + m4 = _mm_mul_ps(m8, m14); + m5 = _mm_mul_ps(m9, m14); + + /* Sum and store */ + m8 = _mm_add_ps(m0, m2); + m9 = _mm_add_ps(m1, m3); + m10 = _mm_add_ps(m8, m4); + m11 = _mm_add_ps(m9, m5); + + m2 = _mm_hadd_ps(m10, m11); + m3 = _mm_hadd_ps(m2, m2); + + _mm_store_ss(&y[2 * i + 0], m3); + m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1)); + _mm_store_ss(&y[2 * i + 1], m3); + } +} + +/* 16-tap SSE complex-real convolution */ +static void sse_conv_real16(float *restrict x, + float *restrict h, + float *restrict y, + int len) +{ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + __m128 m8, m9, m10, m11, m12, m13, m14, m15; + + /* Load (aligned) filter taps */ + m0 = _mm_load_ps(&h[0]); + m1 = _mm_load_ps(&h[4]); + m2 = _mm_load_ps(&h[8]); + m3 = _mm_load_ps(&h[12]); + + m4 = _mm_load_ps(&h[16]); + m5 = _mm_load_ps(&h[20]); + m6 = _mm_load_ps(&h[24]); + m7 = _mm_load_ps(&h[28]); + + m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2)); + m15 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2)); + + for (int i = 0; i < len; i++) { + /* Load (unaligned) input data */ + m0 = _mm_loadu_ps(&x[2 * i + 0]); + m1 = _mm_loadu_ps(&x[2 * i + 4]); + m2 = _mm_loadu_ps(&x[2 * i + 8]); + m3 = _mm_loadu_ps(&x[2 * i + 12]); + + m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); + + m0 = _mm_loadu_ps(&x[2 * i + 16]); + m1 = _mm_loadu_ps(&x[2 * i + 20]); + m2 = _mm_loadu_ps(&x[2 * i + 24]); + m3 = _mm_loadu_ps(&x[2 * i + 28]); + + m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); + + /* Quad multiply */ + m0 = _mm_mul_ps(m4, m12); + m1 = _mm_mul_ps(m5, m12); + m2 = _mm_mul_ps(m6, m13); + m3 = _mm_mul_ps(m7, m13); + + m4 = _mm_mul_ps(m8, m14); + m5 = _mm_mul_ps(m9, m14); + m6 = _mm_mul_ps(m10, m15); + m7 = _mm_mul_ps(m11, m15); + + /* Sum and store */ + m8 = _mm_add_ps(m0, m2); + m9 = _mm_add_ps(m1, m3); + m10 = _mm_add_ps(m4, m6); + m11 = _mm_add_ps(m5, m7); + + m0 = _mm_add_ps(m8, m10); + m1 = _mm_add_ps(m9, m11); + m2 = _mm_hadd_ps(m0, m1); + m3 = _mm_hadd_ps(m2, m2); + + _mm_store_ss(&y[2 * i + 0], m3); + m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1)); + _mm_store_ss(&y[2 * i + 1], m3); + } +} + +/* 20-tap SSE complex-real convolution */ +static void sse_conv_real20(float *restrict x, + float *restrict h, + float *restrict y, + int len) +{ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + __m128 m8, m9, m11, m12, m13, m14, m15; + + /* Load (aligned) filter taps */ + m0 = _mm_load_ps(&h[0]); + m1 = _mm_load_ps(&h[4]); + m2 = _mm_load_ps(&h[8]); + m3 = _mm_load_ps(&h[12]); + m4 = _mm_load_ps(&h[16]); + m5 = _mm_load_ps(&h[20]); + m6 = _mm_load_ps(&h[24]); + m7 = _mm_load_ps(&h[28]); + m8 = _mm_load_ps(&h[32]); + m9 = _mm_load_ps(&h[36]); + + m11 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m12 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m13 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2)); + m14 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2)); + m15 = _mm_shuffle_ps(m8, m9, _MM_SHUFFLE(0, 2, 0, 2)); + + for (int i = 0; i < len; i++) { + /* Multiply-accumulate first 12 taps */ + m0 = _mm_loadu_ps(&x[2 * i + 0]); + m1 = _mm_loadu_ps(&x[2 * i + 4]); + m2 = _mm_loadu_ps(&x[2 * i + 8]); + m3 = _mm_loadu_ps(&x[2 * i + 12]); + m4 = _mm_loadu_ps(&x[2 * i + 16]); + m5 = _mm_loadu_ps(&x[2 * i + 20]); + + m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); + m0 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2)); + m1 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(1, 3, 1, 3)); + + m2 = _mm_mul_ps(m6, m11); + m3 = _mm_mul_ps(m7, m11); + m4 = _mm_mul_ps(m8, m12); + m5 = _mm_mul_ps(m9, m12); + m6 = _mm_mul_ps(m0, m13); + m7 = _mm_mul_ps(m1, m13); + + m0 = _mm_add_ps(m2, m4); + m1 = _mm_add_ps(m3, m5); + m8 = _mm_add_ps(m0, m6); + m9 = _mm_add_ps(m1, m7); + + /* Multiply-accumulate last 8 taps */ + m0 = _mm_loadu_ps(&x[2 * i + 24]); + m1 = _mm_loadu_ps(&x[2 * i + 28]); + m2 = _mm_loadu_ps(&x[2 * i + 32]); + m3 = _mm_loadu_ps(&x[2 * i + 36]); + + m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); + + m0 = _mm_mul_ps(m4, m14); + m1 = _mm_mul_ps(m5, m14); + m2 = _mm_mul_ps(m6, m15); + m3 = _mm_mul_ps(m7, m15); + + m4 = _mm_add_ps(m0, m2); + m5 = _mm_add_ps(m1, m3); + + /* Final sum and store */ + m0 = _mm_add_ps(m8, m4); + m1 = _mm_add_ps(m9, m5); + m2 = _mm_hadd_ps(m0, m1); + m3 = _mm_hadd_ps(m2, m2); + + _mm_store_ss(&y[2 * i + 0], m3); + m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1)); + _mm_store_ss(&y[2 * i + 1], m3); + } +} + +/* 4*N-tap SSE complex-real convolution */ +static void sse_conv_real4n(float *x, float *h, float *y, int h_len, int len) +{ + __m128 m0, m1, m2, m4, m5, m6, m7; + + for (int i = 0; i < len; i++) { + /* Zero */ + m6 = _mm_setzero_ps(); + m7 = _mm_setzero_ps(); + + for (int n = 0; n < h_len / 4; n++) { + /* Load (aligned) filter taps */ + m0 = _mm_load_ps(&h[8 * n + 0]); + m1 = _mm_load_ps(&h[8 * n + 4]); + m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + + /* Load (unaligned) input data */ + m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]); + m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]); + m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + + /* Quad multiply */ + m0 = _mm_mul_ps(m2, m4); + m1 = _mm_mul_ps(m2, m5); + + /* Accumulate */ + m6 = _mm_add_ps(m6, m0); + m7 = _mm_add_ps(m7, m1); + } + + m0 = _mm_hadd_ps(m6, m7); + m0 = _mm_hadd_ps(m0, m0); + + _mm_store_ss(&y[2 * i + 0], m0); + m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1)); + _mm_store_ss(&y[2 * i + 1], m0); + } +} + +/* 4*N-tap SSE complex-complex convolution */ +static void sse_conv_cmplx_4n(float *x, float *h, float *y, int h_len, int len) +{ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + + for (int i = 0; i < len; i++) { + /* Zero */ + m6 = _mm_setzero_ps(); + m7 = _mm_setzero_ps(); + + for (int n = 0; n < h_len / 4; n++) { + /* Load (aligned) filter taps */ + m0 = _mm_load_ps(&h[8 * n + 0]); + m1 = _mm_load_ps(&h[8 * n + 4]); + m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + + /* Load (unaligned) input data */ + m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]); + m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]); + m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + + /* Quad multiply */ + m0 = _mm_mul_ps(m2, m4); + m1 = _mm_mul_ps(m3, m5); + + m2 = _mm_mul_ps(m2, m5); + m3 = _mm_mul_ps(m3, m4); + + /* Sum */ + m0 = _mm_sub_ps(m0, m1); + m2 = _mm_add_ps(m2, m3); + + /* Accumulate */ + m6 = _mm_add_ps(m6, m0); + m7 = _mm_add_ps(m7, m2); + } + + m0 = _mm_hadd_ps(m6, m7); + m0 = _mm_hadd_ps(m0, m0); + + _mm_store_ss(&y[2 * i + 0], m0); + m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1)); + _mm_store_ss(&y[2 * i + 1], m0); + } +} + +/* 8*N-tap SSE complex-complex convolution */ +static void sse_conv_cmplx_8n(float *x, float *h, float *y, int h_len, int len) +{ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + __m128 m8, m9, m10, m11, m12, m13, m14, m15; + + for (int i = 0; i < len; i++) { + /* Zero */ + m12 = _mm_setzero_ps(); + m13 = _mm_setzero_ps(); + m14 = _mm_setzero_ps(); + m15 = _mm_setzero_ps(); + + for (int n = 0; n < h_len / 8; n++) { + /* Load (aligned) filter taps */ + m0 = _mm_load_ps(&h[16 * n + 0]); + m1 = _mm_load_ps(&h[16 * n + 4]); + m2 = _mm_load_ps(&h[16 * n + 8]); + m3 = _mm_load_ps(&h[16 * n + 12]); + + m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + m6 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 2, 0, 2)); + m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); + + /* Load (unaligned) input data */ + m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]); + m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]); + m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]); + m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]); + + m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); + m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); + m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); + m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); + + /* Quad multiply */ + m0 = _mm_mul_ps(m4, m8); + m1 = _mm_mul_ps(m5, m9); + m2 = _mm_mul_ps(m6, m10); + m3 = _mm_mul_ps(m7, m11); + + m4 = _mm_mul_ps(m4, m9); + m5 = _mm_mul_ps(m5, m8); + m6 = _mm_mul_ps(m6, m11); + m7 = _mm_mul_ps(m7, m10); + + /* Sum */ + m0 = _mm_sub_ps(m0, m1); + m2 = _mm_sub_ps(m2, m3); + m4 = _mm_add_ps(m4, m5); + m6 = _mm_add_ps(m6, m7); + + /* Accumulate */ + m12 = _mm_add_ps(m12, m0); + m13 = _mm_add_ps(m13, m2); + m14 = _mm_add_ps(m14, m4); + m15 = _mm_add_ps(m15, m6); + } + + m0 = _mm_add_ps(m12, m13); + m1 = _mm_add_ps(m14, m15); + m2 = _mm_hadd_ps(m0, m1); + m2 = _mm_hadd_ps(m2, m2); + + _mm_store_ss(&y[2 * i + 0], m2); + m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 3, 2, 1)); + _mm_store_ss(&y[2 * i + 1], m2); + } +} +#endif + +/* Base multiply and accumulate complex-real */ +static void mac_real(float *x, float *h, float *y) +{ + y[0] += x[0] * h[0]; + y[1] += x[1] * h[0]; +} + +/* Base multiply and accumulate complex-complex */ +static void mac_cmplx(float *x, float *h, float *y) +{ + y[0] += x[0] * h[0] - x[1] * h[1]; + y[1] += x[0] * h[1] + x[1] * h[0]; +} + +/* Base vector complex-complex multiply and accumulate */ +static void mac_real_vec_n(float *x, float *h, float *y, + int len, int step, int offset) +{ + for (int i = offset; i < len; i += step) + mac_real(&x[2 * i], &h[2 * i], y); +} + +/* Base vector complex-complex multiply and accumulate */ +static void mac_cmplx_vec_n(float *x, float *h, float *y, + int len, int step, int offset) +{ + for (int i = offset; i < len; i += step) + mac_cmplx(&x[2 * i], &h[2 * i], y); +} + +/* Base complex-real convolution */ +static int _base_convolve_real(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) +{ + for (int i = 0; i < len; i++) { + mac_real_vec_n(&x[2 * (i - (h_len - 1) + start)], + h, + &y[2 * i], h_len, + step, offset); + } + + return len; +} + +/* Base complex-complex convolution */ +static int _base_convolve_complex(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) +{ + for (int i = 0; i < len; i++) { + mac_cmplx_vec_n(&x[2 * (i - (h_len - 1) + start)], + h, + &y[2 * i], + h_len, step, offset); + } + + return len; +} + +/* Buffer validity checks */ +static int bounds_check(int x_len, int h_len, int y_len, + int start, int len, int step) +{ + if ((x_len < 1) || (h_len < 1) || + (y_len < 1) || (len < 1) || (step < 1)) { + fprintf(stderr, "Convolve: Invalid input\n"); + return -1; + } + + if ((start + len > x_len) || (len > y_len) || (x_len < h_len)) { + fprintf(stderr, "Convolve: Boundary exception\n"); + fprintf(stderr, "start: %i, len: %i, x: %i, h: %i, y: %i\n", + start, len, x_len, h_len, y_len); + return -1; + } + + return 0; +} + +/* API: Aligned complex-real */ +int convolve_real(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) +{ + void (*conv_func)(float *, float *, float *, int) = NULL; + void (*conv_func_n)(float *, float *, float *, int, int) = NULL; + + if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) + return -1; + + memset(y, 0, len * 2 * sizeof(float)); + +#ifdef HAVE_SSE3 + if (step <= 4) { + switch (h_len) { + case 4: + conv_func = sse_conv_real4; + break; + case 8: + conv_func = sse_conv_real8; + break; + case 12: + conv_func = sse_conv_real12; + break; + case 16: + conv_func = sse_conv_real16; + break; + case 20: + conv_func = sse_conv_real20; + break; + default: + if (!(h_len % 4)) + conv_func_n = sse_conv_real4n; + } + } +#endif + if (conv_func) { + conv_func(&x[2 * (-(h_len - 1) + start)], + h, y, len); + } else if (conv_func_n) { + conv_func_n(&x[2 * (-(h_len - 1) + start)], + h, y, h_len, len); + } else { + _base_convolve_real(x, x_len, + h, h_len, + y, y_len, + start, len, step, offset); + } + + return len; +} + +/* API: Aligned complex-complex */ +int convolve_complex(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) +{ + void (*conv_func)(float *, float *, float *, int, int) = NULL; + + if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) + return -1; + + memset(y, 0, len * 2 * sizeof(float)); + +#ifdef HAVE_SSE3 + if (step <= 4) { + if (!(h_len % 8)) + conv_func = sse_conv_cmplx_8n; + else if (!(h_len % 4)) + conv_func = sse_conv_cmplx_4n; + } +#endif + if (conv_func) { + conv_func(&x[2 * (-(h_len - 1) + start)], + h, y, h_len, len); + } else { + _base_convolve_complex(x, x_len, + h, h_len, + y, y_len, + start, len, step, offset); + } + + return len; +} + +/* API: Non-aligned (no SSE) complex-real */ +int base_convolve_real(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) +{ + if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) + return -1; + + memset(y, 0, len * 2 * sizeof(float)); + + return _base_convolve_real(x, x_len, + h, h_len, + y, y_len, + start, len, step, offset); +} + +/* API: Non-aligned (no SSE) complex-complex */ +int base_convolve_complex(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) +{ + if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) + return -1; + + memset(y, 0, len * 2 * sizeof(float)); + + return _base_convolve_complex(x, x_len, + h, h_len, + y, y_len, + start, len, step, offset); +} + +/* Aligned filter tap allocation */ +void *convolve_h_alloc(int len) +{ +#ifdef HAVE_SSE3 + return memalign(16, len * 2 * sizeof(float)); +#else + return malloc(len * 2 * sizeof(float)); +#endif +} diff --git a/Transceiver52M/convolve.h b/Transceiver52M/convolve.h new file mode 100644 index 00000000..aef9953b --- /dev/null +++ b/Transceiver52M/convolve.h @@ -0,0 +1,30 @@ +#ifndef _CONVOLVE_H_ +#define _CONVOLVE_H_ + +void *convolve_h_alloc(int num); + +int convolve_real(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset); + +int convolve_complex(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset); + +int base_convolve_real(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset); + +int base_convolve_complex(float *x, int x_len, + float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset); + +#endif /* _CONVOLVE_H_ */ diff --git a/Transceiver52M/sigProcLib.cpp b/Transceiver52M/sigProcLib.cpp index 2ccc714f..8237aa5d 100644 --- a/Transceiver52M/sigProcLib.cpp +++ b/Transceiver52M/sigProcLib.cpp @@ -29,6 +29,10 @@ using namespace GSM; +extern "C" { +#include "convolve.h" +} + #define TABLESIZE 1024 /** Lookup tables for trigonometric approximation */ @@ -45,28 +49,35 @@ signalVector *GMSKRotation = NULL; signalVector *GMSKReverseRotation = NULL; /* - * RACH and midamble correlation waveforms + * RACH and midamble correlation waveforms. Store the buffer separately + * because we need to allocate it explicitly outside of the signal vector + * constructor. This is because C++ (prior to C++11) is unable to natively + * perform 16-byte memory alignment required by many SSE instructions. */ struct CorrelationSequence { - CorrelationSequence() : sequence(NULL) + CorrelationSequence() : sequence(NULL), buffer(NULL) { } ~CorrelationSequence() { delete sequence; + free(buffer); } signalVector *sequence; + void *buffer; float TOA; complex gain; }; /* - * Gaussian and empty modulation pulses + * Gaussian and empty modulation pulses. Like the correlation sequences, + * store the runtime (Gaussian) buffer separately because of needed alignment + * for SSE instructions. */ struct PulseSequence { - PulseSequence() : gaussian(NULL), empty(NULL) + PulseSequence() : gaussian(NULL), empty(NULL), buffer(NULL) { } @@ -74,10 +85,12 @@ struct PulseSequence { { delete gaussian; delete empty; + free(buffer); } signalVector *gaussian; signalVector *empty; + void *buffer; }; CorrelationSequence *gMidambles[] = {NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL}; @@ -246,7 +259,7 @@ void initGMSKRotationTables(int sps) bool sigProcLibSetup(int sps) { - if ((sps != 0) && (sps != 2) && (sps != 4)) + if ((sps != 1) && (sps != 2) && (sps != 4)) return false; initTrigTables(); @@ -295,174 +308,106 @@ void GMSKReverseRotate(signalVector &x) { } } - -signalVector* convolve(const signalVector *a, - const signalVector *b, - signalVector *c, - ConvType spanType, - unsigned startIx, - unsigned len) +signalVector *convolve(const signalVector *x, + const signalVector *h, + signalVector *y, + ConvType spanType, int start, + unsigned len, unsigned step, int offset) { - if ((a==NULL) || (b==NULL)) return NULL; - int La = a->size(); - int Lb = b->size(); + int rc, head = 0, tail = 0; + bool alloc = false, append = false; + const signalVector *_x = NULL; - int startIndex; - unsigned int outSize; - switch (spanType) { - case FULL_SPAN: - startIndex = 0; - outSize = La+Lb-1; - break; - case OVERLAP_ONLY: - startIndex = La; - outSize = abs(La-Lb)+1; - break; - case START_ONLY: - startIndex = 0; - outSize = La; - break; - case WITH_TAIL: - startIndex = Lb; - outSize = La; - break; - case NO_DELAY: - if (Lb % 2) - startIndex = Lb/2; - else - startIndex = Lb/2-1; - outSize = La; - break; - case CUSTOM: - startIndex = startIx; - outSize = len; - break; - default: - return NULL; - } - - - if (c==NULL) - c = new signalVector(outSize); - else if (c->size()!=outSize) + if (!x || !h) return NULL; - signalVector::const_iterator aStart = a->begin(); - signalVector::const_iterator bStart = b->begin(); - signalVector::const_iterator aEnd = a->end(); - signalVector::const_iterator bEnd = b->end(); - signalVector::iterator cPtr = c->begin(); - int t = startIndex; - int stopIndex = startIndex + outSize; - switch (b->getSymmetry()) { - case NONE: - { - while (t < stopIndex) { - signalVector::const_iterator aP = aStart+t; - signalVector::const_iterator bP = bStart; - if (a->isRealOnly() && b->isRealOnly()) { - float sum = 0.0; - while (bP < bEnd) { - if (aP < aStart) break; - if (aP < aEnd) sum += (aP->real())*(bP->real()); - aP--; - bP++; - } - *cPtr++ = sum; - } - else if (a->isRealOnly()) { - complex sum = 0.0; - while (bP < bEnd) { - if (aP < aStart) break; - if (aP < aEnd) sum += (*bP)*(aP->real()); - aP--; - bP++; - } - *cPtr++ = sum; - } - else if (b->isRealOnly()) { - complex sum = 0.0; - while (bP < bEnd) { - if (aP < aStart) break; - if (aP < aEnd) sum += (*aP)*(bP->real()); - aP--; - bP++; - } - *cPtr++ = sum; - } - else { - complex sum = 0.0; - while (bP < bEnd) { - if (aP < aStart) break; - if (aP < aEnd) sum += (*aP)*(*bP); - aP--; - bP++; - } - *cPtr++ = sum; - } - t++; - } - } + switch (spanType) { + case START_ONLY: + start = 0; + head = h->size(); + len = x->size(); + append = true; break; - case ABSSYM: - { - complex sum = 0.0; - bool isOdd = (bool) (Lb % 2); - if (isOdd) - bEnd = bStart + (Lb+1)/2; - else - bEnd = bStart + Lb/2; - while (t < stopIndex) { - signalVector::const_iterator aP = aStart+t; - signalVector::const_iterator aPsym = aP-Lb+1; - signalVector::const_iterator bP = bStart; - sum = 0.0; - if (!b->isRealOnly()) { - while (bP < bEnd) { - if (aP < aStart) break; - if (aP == aPsym) - sum+= (*aP)*(*bP); - else if ((aP < aEnd) && (aPsym >= aStart)) - sum+= ((*aP)+(*aPsym))*(*bP); - else if (aP < aEnd) - sum += (*aP)*(*bP); - else if (aPsym >= aStart) - sum += (*aPsym)*(*bP); - aP--; - aPsym++; - bP++; - } - } - else { - while (bP < bEnd) { - if (aP < aStart) break; - if (aP == aPsym) - sum+= (*aP)*(bP->real()); - else if ((aP < aEnd) && (aPsym >= aStart)) - sum+= ((*aP)+(*aPsym))*(bP->real()); - else if (aP < aEnd) - sum += (*aP)*(bP->real()); - else if (aPsym >= aStart) - sum += (*aPsym)*(bP->real()); - aP--; - aPsym++; - bP++; - } - } - *cPtr++ = sum; - t++; - } + case NO_DELAY: + start = h->size() / 2; + head = start; + tail = start; + len = x->size(); + append = true; + break; + case CUSTOM: + if (start < h->size() - 1) { + head = h->size() - start; + append = true; + } + if (start + len > x->size()) { + tail = start + len - x->size(); + append = true; } break; default: return NULL; - break; } - - - return c; -} + /* + * Error if the output vector is too small. Create the output vector + * if the pointer is NULL. + */ + if (y && (len > y->size())) + return NULL; + if (!y) { + y = new signalVector(len); + alloc = true; + } + + /* Prepend or post-pend the input vector if the parameters require it */ + if (append) + _x = new signalVector(*x, head, tail); + else + _x = x; + + /* + * Four convovle types: + * 1. Complex-Real (aligned) + * 2. Complex-Complex (aligned) + * 3. Complex-Real (!aligned) + * 4. Complex-Complex (!aligned) + */ + if (h->isRealOnly() && h->isAligned()) { + rc = convolve_real((float *) _x->begin(), _x->size(), + (float *) h->begin(), h->size(), + (float *) y->begin(), y->size(), + start, len, step, offset); + } else if (!h->isRealOnly() && h->isAligned()) { + rc = convolve_complex((float *) _x->begin(), _x->size(), + (float *) h->begin(), h->size(), + (float *) y->begin(), y->size(), + start, len, step, offset); + } else if (h->isRealOnly() && !h->isAligned()) { + rc = base_convolve_real((float *) _x->begin(), _x->size(), + (float *) h->begin(), h->size(), + (float *) y->begin(), y->size(), + start, len, step, offset); + } else if (!h->isRealOnly() && !h->isAligned()) { + rc = base_convolve_complex((float *) _x->begin(), _x->size(), + (float *) h->begin(), h->size(), + (float *) y->begin(), y->size(), + start, len, step, offset); + } else { + rc = -1; + } + + if (append) + delete _x; + + if (rc < 0) { + if (alloc) + delete y; + return NULL; + } + + return y; +} void generateGSMPulse(int sps, int symbolLength) { @@ -477,9 +422,17 @@ void generateGSMPulse(int sps, int symbolLength) GSMPulse->empty->isRealOnly(true); *(GSMPulse->empty->begin()) = 1.0f; + len = sps * symbolLength; + if (len < 4) + len = 4; + /* GSM pulse approximation */ - GSMPulse->gaussian = new signalVector(len); + GSMPulse->buffer = convolve_h_alloc(len); + GSMPulse->gaussian = new signalVector((complex *) + GSMPulse->buffer, 0, len); + GSMPulse->gaussian->setAligned(true); GSMPulse->gaussian->isRealOnly(true); + signalVector::iterator xP = GSMPulse->gaussian->begin(); center = (float) (len - 1.0) / 2.0; @@ -560,31 +513,6 @@ signalVector* reverseConjugate(signalVector *b) return tmp; } -signalVector* correlate(signalVector *a, - signalVector *b, - signalVector *c, - ConvType spanType, - bool bReversedConjugated, - unsigned startIx, - unsigned len) -{ - signalVector *tmp = NULL; - - if (!bReversedConjugated) { - tmp = reverseConjugate(b); - } - else { - tmp = b; - } - - c = convolve(a,tmp,c,spanType,startIx,len); - - if (!bReversedConjugated) delete tmp; - - return c; -} - - /* soft output slicer */ bool vectorSlicer(signalVector *x) { @@ -599,12 +527,13 @@ bool vectorSlicer(signalVector *x) } return true; } - + +/* Assume input bits are not differentially encoded */ signalVector *modulateBurst(const BitVector &wBurst, int guardPeriodLength, int sps, bool emptyPulse) { int burstLen; - signalVector *pulse, modBurst; + signalVector *pulse, *shapedBurst, modBurst; signalVector::iterator modBurstItr; if (emptyPulse) @@ -628,7 +557,9 @@ signalVector *modulateBurst(const BitVector &wBurst, int guardPeriodLength, modBurst.isRealOnly(false); // filter w/ pulse shape - signalVector *shapedBurst = convolve(&modBurst, pulse, NULL, NO_DELAY); + shapedBurst = convolve(&modBurst, pulse, NULL, START_ONLY); + if (!shapedBurst) + return NULL; return shapedBurst; } @@ -639,24 +570,24 @@ float sinc(float x) return 1.0F; } -void delayVector(signalVector &wBurst, - float delay) +bool delayVector(signalVector &wBurst, float delay) { int intOffset = (int) floor(delay); float fracOffset = delay - intOffset; - + // do fractional shift first, only do it for reasonable offsets if (fabs(fracOffset) > 1e-2) { // create sinc function signalVector sincVector(21); sincVector.isRealOnly(true); - signalVector::iterator sincBurstItr = sincVector.begin(); + signalVector::iterator sincBurstItr = sincVector.end(); for (int i = 0; i < 21; i++) - *sincBurstItr++ = (complex) sinc(M_PI_F*(i-10-fracOffset)); + *--sincBurstItr = (complex) sinc(M_PI_F*(i-10-fracOffset)); signalVector shiftedBurst(wBurst.size()); - convolve(&wBurst,&sincVector,&shiftedBurst,NO_DELAY); + if (!convolve(&wBurst, &sincVector, &shiftedBurst, NO_DELAY)) + return false; wBurst.clone(shiftedBurst); } @@ -861,25 +792,25 @@ bool generateMidamble(int sps, int tsc) bool status = true; complex *data = NULL; signalVector *autocorr = NULL, *midamble = NULL; - signalVector *midMidamble = NULL; + signalVector *midMidamble = NULL, *_midMidamble = NULL; - if ((tsc < 0) || (tsc > 7)) + if ((tsc < 0) || (tsc > 7)) return false; delete gMidambles[tsc]; - + /* Use middle 16 bits of each TSC. Correlation sequence is not pulse shaped */ midMidamble = modulateBurst(gTrainingSequence[tsc].segment(5,16), 0, sps, true); if (!midMidamble) return false; - /* Simulated receive sequence is pulse shaped */ + /* Simulated receive sequence is pulse shaped */ midamble = modulateBurst(gTrainingSequence[tsc], 0, sps, false); if (!midamble) { status = false; goto release; } - + // NOTE: Because ideal TSC 16-bit midamble is 66 symbols into burst, // the ideal TSC has an + 180 degree phase shift, // due to the pi/2 frequency shift, that @@ -890,22 +821,32 @@ bool generateMidamble(int sps, int tsc) conjugateVector(*midMidamble); - autocorr = correlate(midamble, midMidamble, NULL, NO_DELAY); + /* For SSE alignment, reallocate the midamble sequence on 16-byte boundary */ + data = (complex *) convolve_h_alloc(midMidamble->size()); + _midMidamble = new signalVector(data, 0, midMidamble->size()); + _midMidamble->setAligned(true); + memcpy(_midMidamble->begin(), midMidamble->begin(), + midMidamble->size() * sizeof(complex)); + + autocorr = convolve(midamble, _midMidamble, NULL, NO_DELAY); if (!autocorr) { status = false; goto release; } gMidambles[tsc] = new CorrelationSequence; - gMidambles[tsc]->sequence = midMidamble; - gMidambles[tsc]->gain = peakDetect(*autocorr,&gMidambles[tsc]->TOA,NULL); + gMidambles[tsc]->buffer = data; + gMidambles[tsc]->sequence = _midMidamble; + gMidambles[tsc]->gain = peakDetect(*autocorr,&gMidambles[tsc]->TOA, NULL); release: delete autocorr; delete midamble; + delete midMidamble; if (!status) { - delete midMidamble; + delete _midMidamble; + free(data); gMidambles[tsc] = NULL; } @@ -917,7 +858,7 @@ bool generateRACHSequence(int sps) bool status = true; complex *data = NULL; signalVector *autocorr = NULL; - signalVector *seq0 = NULL, *seq1 = NULL; + signalVector *seq0 = NULL, *seq1 = NULL, *_seq1 = NULL; delete gRACHSequence; @@ -933,74 +874,100 @@ bool generateRACHSequence(int sps) conjugateVector(*seq1); - autocorr = new signalVector(seq0->size()); - if (!convolve(seq0, seq1, autocorr, NO_DELAY)) { + /* For SSE alignment, reallocate the midamble sequence on 16-byte boundary */ + data = (complex *) convolve_h_alloc(seq1->size()); + _seq1 = new signalVector(data, 0, seq1->size()); + _seq1->setAligned(true); + memcpy(_seq1->begin(), seq1->begin(), seq1->size() * sizeof(complex)); + + autocorr = convolve(seq0, _seq1, autocorr, NO_DELAY); + if (!autocorr) { status = false; goto release; } gRACHSequence = new CorrelationSequence; - gRACHSequence->sequence = seq1; - gRACHSequence->gain = peakDetect(*autocorr,&gRACHSequence->TOA,NULL); + gRACHSequence->sequence = _seq1; + gRACHSequence->buffer = data; + gRACHSequence->gain = peakDetect(*autocorr,&gRACHSequence->TOA, NULL); release: delete autocorr; delete seq0; + delete seq1; if (!status) { - delete seq1; + delete _seq1; + free(data); gRACHSequence = NULL; } return status; } - -bool detectRACHBurst(signalVector &rxBurst, - float detectThreshold, - int sps, - complex *amplitude, - float* TOA) + +int detectRACHBurst(signalVector &rxBurst, + float thresh, + int sps, + complex *amp, + float *toa) { + int start, len, num = 0; + float _toa, rms, par, avg = 0.0f; + complex _amp, *peak; + signalVector corr, *sync = gRACHSequence->sequence; - //static complex staticData[500]; - - //signalVector correlatedRACH(staticData,0,rxBurst.size()); - signalVector correlatedRACH(rxBurst.size()); - correlate(&rxBurst,gRACHSequence->sequence,&correlatedRACH,NO_DELAY,true); + if ((sps != 1) && (sps != 2) && (sps != 4)) + return -1; - float meanPower; - complex peakAmpl = peakDetect(correlatedRACH,TOA,&meanPower); + start = 40 * sps; + len = 24 * sps; + corr = signalVector(len); - float valleyPower = 0.0; - - // check for bogus results - if ((*TOA < 0.0) || (*TOA > correlatedRACH.size())) { - *amplitude = 0.0; - return false; - } - complex *peakPtr = correlatedRACH.begin() + (int) rint(*TOA); - - float numSamples = 0.0; - for (int i = 57 * sps; i <= 107 * sps; i++) { - if (peakPtr+i >= correlatedRACH.end()) - break; - valleyPower += (peakPtr+i)->norm2(); - numSamples++; + if (!convolve(&rxBurst, sync, &corr, + CUSTOM, start, len, sps, 0)) { + return -1; } - if (numSamples < 2) { - *amplitude = 0.0; - return false; + _amp = peakDetect(corr, &_toa, NULL); + if ((_toa < 3) || (_toa > len - 3)) + goto notfound; + + peak = corr.begin() + (int) rint(_toa); + + for (int i = 2 * sps; i <= 5 * sps; i++) { + if (peak - i >= corr.begin()) { + avg += (peak - i)->norm2(); + num++; + } + if (peak + i < corr.end()) { + avg += (peak + i)->norm2(); + num++; + } } - float RMS = sqrtf(valleyPower/(float) numSamples)+0.00001; - float peakToMean = peakAmpl.abs()/RMS; + if (num < 2) + goto notfound; - *amplitude = peakAmpl/(gRACHSequence->gain); + rms = sqrtf(avg / (float) num) + 0.00001; + par = _amp.abs() / rms; + if (par < thresh) + goto notfound; - *TOA = (*TOA) - gRACHSequence->TOA - 8 * sps; + /* Subtract forward tail bits from delay */ + if (toa) + *toa = _toa - 8 * sps; + if (amp) + *amp = _amp / gRACHSequence->gain; - return (peakToMean > detectThreshold); + return 1; + +notfound: + if (amp) + *amp = 0.0f; + if (toa) + *toa = 0.0f; + + return 0; } bool energyDetect(signalVector &rxBurst, @@ -1020,120 +987,95 @@ bool energyDetect(signalVector &rxBurst, if (avgPwr) *avgPwr = energy/windowLength; return (energy/windowLength > detectThreshold*detectThreshold); } - -bool analyzeTrafficBurst(signalVector &rxBurst, - unsigned TSC, - float detectThreshold, - int sps, - complex *amplitude, - float *TOA, - unsigned maxTOA, - bool requestChannel, - signalVector **channelResponse, - float *channelResponseOffset) +int analyzeTrafficBurst(signalVector &rxBurst, unsigned tsc, float thresh, + int sps, complex *amp, float *toa, unsigned max_toa, + bool chan_req, signalVector **chan, float *chan_offset) { + int start, target, len, num = 0; + complex _amp, *peak; + float _toa, rms, par, avg = 0.0f; + signalVector corr, *sync, *_chan; - assert(TSC<8); - assert(amplitude); - assert(TOA); - assert(gMidambles[TSC]); + if ((tsc < 0) || (tsc > 7) || ((sps != 1) && (sps != 2) && (sps != 4))) + return -1; - if (maxTOA < 3*sps) maxTOA = 3*sps; - unsigned spanTOA = maxTOA; - if (spanTOA < 5*sps) spanTOA = 5*sps; + target = 3 + 58 + 5 + 16; + start = (target - 8) * sps; + len = (8 + 8 + max_toa) * sps; - unsigned startIx = 66*sps-spanTOA; - unsigned endIx = (66+16)*sps+spanTOA; - unsigned windowLen = endIx - startIx; - unsigned corrLen = 2*maxTOA+1; + sync = gMidambles[tsc]->sequence; + sync = gMidambles[tsc]->sequence; + corr = signalVector(len); - unsigned expectedTOAPeak = (unsigned) round(gMidambles[TSC]->TOA + (gMidambles[TSC]->sequence->size()-1)/2); - - signalVector burstSegment(rxBurst.begin(),startIx,windowLen); - - //static complex staticData[200]; - //signalVector correlatedBurst(staticData,0,corrLen); - signalVector correlatedBurst(corrLen); - correlate(&burstSegment, gMidambles[TSC]->sequence, - &correlatedBurst, CUSTOM,true, - expectedTOAPeak-maxTOA,corrLen); - - float meanPower; - *amplitude = peakDetect(correlatedBurst,TOA,&meanPower); - float valleyPower = 0.0; //amplitude->norm2(); - complex *peakPtr = correlatedBurst.begin() + (int) rint(*TOA); - - // check for bogus results - if ((*TOA < 0.0) || (*TOA > correlatedBurst.size())) { - *amplitude = 0.0; - return false; + if (!convolve(&rxBurst, sync, &corr, + CUSTOM, start, len, sps, 0)) { + return -1; } - int numRms = 0; - for (int i = 2*sps; i <= 5*sps;i++) { - if (peakPtr - i >= correlatedBurst.begin()) { - valleyPower += (peakPtr-i)->norm2(); - numRms++; + _amp = peakDetect(corr, &_toa, NULL); + peak = corr.begin() + (int) rint(_toa); + + /* Check for bogus results */ + if ((_toa < 0.0) || (_toa > corr.size())) + goto notfound; + + for (int i = 2 * sps; i <= 5 * sps; i++) { + if (peak - i >= corr.begin()) { + avg += (peak - i)->norm2(); + num++; } - if (peakPtr + i < correlatedBurst.end()) { - valleyPower += (peakPtr+i)->norm2(); - numRms++; + if (peak + i < corr.end()) { + avg += (peak + i)->norm2(); + num++; } } - if (numRms < 2) { - // check for bogus results - *amplitude = 0.0; - return false; + if (num < 2) + goto notfound; + + rms = sqrtf(avg / (float) num) + 0.00001; + par = (_amp.abs()) / rms; + if (par < thresh) + goto notfound; + + /* + * NOTE: Because ideal TSC is 66 symbols into burst, + * the ideal TSC has an +/- 180 degree phase shift, + * due to the pi/4 frequency shift, that + * needs to be accounted for. + */ + if (amp) + *amp = _amp / gMidambles[tsc]->gain; + + /* Delay one half of peak-centred correlation length */ + _toa -= sps * 8; + + if (toa) + *toa = _toa; + + if (chan_req) { + _chan = new signalVector(6 * sps); + + delayVector(corr, -_toa); + corr.segmentCopyTo(*_chan, target - 3, _chan->size()); + scaleVector(*_chan, complex(1.0, 0.0) / gMidambles[tsc]->gain); + + *chan = _chan; + + if (chan_offset) + *chan_offset = 3.0 * sps;; } - float RMS = sqrtf(valleyPower/(float)numRms)+0.00001; - float peakToMean = (amplitude->abs())/RMS; + return 1; - // NOTE: Because ideal TSC is 66 symbols into burst, - // the ideal TSC has an +/- 180 degree phase shift, - // due to the pi/4 frequency shift, that - // needs to be accounted for. - - *amplitude = (*amplitude)/gMidambles[TSC]->gain; - *TOA = (*TOA) - (maxTOA); +notfound: + if (amp) + *amp = 0.0f; + if (toa) + *toa = 0.0f; - if (requestChannel && (peakToMean > detectThreshold)) { - float TOAoffset = maxTOA; - delayVector(correlatedBurst,-(*TOA)); - // midamble only allows estimation of a 6-tap channel - signalVector chanVector(6 * sps); - float maxEnergy = -1.0; - int maxI = -1; - for (int i = 0; i < 7; i++) { - if (TOAoffset + (i-5) * sps + chanVector.size() > correlatedBurst.size()) - continue; - if (TOAoffset + (i-5) * sps < 0) - continue; - correlatedBurst.segmentCopyTo(chanVector, - (int) floor(TOAoffset + (i - 5) * sps), - chanVector.size()); - float energy = vectorNorm2(chanVector); - if (energy > 0.95*maxEnergy) { - maxI = i; - maxEnergy = energy; - } - } - - *channelResponse = new signalVector(chanVector.size()); - correlatedBurst.segmentCopyTo(**channelResponse, - (int) floor(TOAoffset + (maxI - 5) * sps), - (*channelResponse)->size()); - scaleVector(**channelResponse, complex(1.0, 0.0) / gMidambles[TSC]->gain); - - if (channelResponseOffset) - *channelResponseOffset = 5 * sps - maxI; - - } - - return (peakToMean > detectThreshold); - + return 0; } signalVector *decimateVector(signalVector &wVector, @@ -1452,7 +1394,7 @@ bool designDFE(signalVector &channelResponse, } *feedForwardFilter = new signalVector(Nf); - signalVector::iterator w = (*feedForwardFilter)->begin(); + signalVector::iterator w = (*feedForwardFilter)->end(); for (int i = 0; i < Nf; i++) { delete L[i]; complex w_i = 0.0; @@ -1463,8 +1405,7 @@ bool designDFE(signalVector &channelResponse, w_i += (*vPtr)*(chanPtr->conj()); vPtr++; chanPtr++; } - *w = w_i/d; - w++; + *--w = w_i/d; } @@ -1479,10 +1420,15 @@ SoftVector *equalizeBurst(signalVector &rxBurst, signalVector &w, // feedforward filter signalVector &b) // feedback filter { + signalVector *postForwardFull; - delayVector(rxBurst,-TOA); + if (!delayVector(rxBurst, -TOA)) + return NULL; - signalVector* postForwardFull = convolve(&rxBurst,&w,NULL,FULL_SPAN); + postForwardFull = convolve(&rxBurst, &w, NULL, + CUSTOM, 0, rxBurst.size() + w.size() - 1); + if (!postForwardFull) + return NULL; signalVector* postForward = new signalVector(rxBurst.size()); postForwardFull->segmentCopyTo(*postForward,w.size()-1,rxBurst.size()); diff --git a/Transceiver52M/sigProcLib.h b/Transceiver52M/sigProcLib.h index ee152d54..194002fe 100644 --- a/Transceiver52M/sigProcLib.h +++ b/Transceiver52M/sigProcLib.h @@ -27,13 +27,10 @@ enum Symmetry { /** Convolution type indicator */ enum ConvType { - FULL_SPAN = 0, - OVERLAP_ONLY = 1, - START_ONLY = 2, - WITH_TAIL = 3, - NO_DELAY = 4, - CUSTOM = 5, - UNDEFINED = 255 + START_ONLY, + NO_DELAY, + CUSTOM, + UNDEFINED, }; /** the core data structure of the Transceiver */ @@ -44,13 +41,14 @@ class signalVector: public Vector Symmetry symmetry; ///< the symmetry of the vector bool realOnly; ///< true if vector is real-valued, not complex-valued - + bool aligned; + public: /** Constructors */ signalVector(int dSize=0, Symmetry wSymmetry = NONE): Vector(dSize), - realOnly(false) + realOnly(false), aligned(false) { symmetry = wSymmetry; }; @@ -58,26 +56,45 @@ class signalVector: public Vector signalVector(complex* wData, size_t start, size_t span, Symmetry wSymmetry = NONE): Vector(NULL,wData+start,wData+start+span), - realOnly(false) + realOnly(false), aligned(false) { symmetry = wSymmetry; }; signalVector(const signalVector &vec1, const signalVector &vec2): Vector(vec1,vec2), - realOnly(false) + realOnly(false), aligned(false) { symmetry = vec1.symmetry; }; signalVector(const signalVector &wVector): Vector(wVector.size()), - realOnly(false) + realOnly(false), aligned(false) { wVector.copyTo(*this); symmetry = wVector.getSymmetry(); }; + signalVector(size_t size, size_t start): + Vector(size + start), + realOnly(false), aligned(false) + { + mStart = mData + start; + symmetry = NONE; + }; + + signalVector(const signalVector &wVector, size_t start, size_t tail = 0): + Vector(start + wVector.size() + tail), + realOnly(false), aligned(false) + { + mStart = mData + start; + wVector.copyTo(*this); + memset(mData, 0, start * sizeof(complex)); + memset(mStart + wVector.size(), 0, tail * sizeof(complex)); + symmetry = NONE; + }; + /** symmetry operators */ Symmetry getSymmetry() const { return symmetry;}; void setSymmetry(Symmetry wSymmetry) { symmetry = wSymmetry;}; @@ -85,6 +102,10 @@ class signalVector: public Vector /** real-valued operators */ bool isRealOnly() const { return realOnly;}; void isRealOnly(bool wOnly) { realOnly = wOnly;}; + + /** alignment markers */ + bool isAligned() const { return aligned; }; + void setAligned(bool aligned) { this->aligned = aligned; }; }; /** Convert a linear number to a dB value */ @@ -110,14 +131,15 @@ void sigProcLibDestroy(void); @param a,b The vectors to be convolved. @param c, A preallocated vector to hold the convolution result. @param spanType The type/span of the convolution. - @return The convolution result. + @return The convolution result or NULL on error. */ -signalVector* convolve(const signalVector *a, - const signalVector *b, - signalVector *c, - ConvType spanType, - unsigned startIx = 0, - unsigned len = 0); +signalVector *convolve(const signalVector *a, + const signalVector *b, + signalVector *c, + ConvType spanType, + int start = 0, + unsigned len = 0, + unsigned step = 1, int offset = 0); /** Generate the GSM pulse. @@ -169,8 +191,7 @@ signalVector *modulateBurst(const BitVector &wBurst, float sinc(float x); /** Delay a vector */ -void delayVector(signalVector &wBurst, - float delay); +bool delayVector(signalVector &wBurst, float delay); /** Add two vectors in-place */ bool addVector(signalVector &x, @@ -257,13 +278,13 @@ bool energyDetect(signalVector &rxBurst, @param sps The number of samples per GSM symbol. @param amplitude The estimated amplitude of received RACH burst. @param TOA The estimate time-of-arrival of received RACH burst. - @return True if burst SNR is larger that the detectThreshold value. + @return positive if threshold value is reached, negative on error, zero otherwise */ -bool detectRACHBurst(signalVector &rxBurst, - float detectThreshold, - int sps, - complex *amplitude, - float* TOA); +int detectRACHBurst(signalVector &rxBurst, + float detectThreshold, + int sps, + complex *amplitude, + float* TOA); /** Normal burst correlator, detector, channel estimator. @@ -277,18 +298,18 @@ bool detectRACHBurst(signalVector &rxBurst, @param requestChannel Set to true if channel estimation is desired. @param channelResponse The estimated channel. @param channelResponseOffset The time offset b/w the first sample of the channel response and the reported TOA. - @return True if burst SNR is larger that the detectThreshold value. + @return positive if threshold value is reached, negative on error, zero otherwise */ -bool analyzeTrafficBurst(signalVector &rxBurst, - unsigned TSC, - float detectThreshold, - int sps, - complex *amplitude, - float *TOA, - unsigned maxTOA, - bool requestChannel = false, - signalVector** channelResponse = NULL, - float *channelResponseOffset = NULL); +int analyzeTrafficBurst(signalVector &rxBurst, + unsigned TSC, + float detectThreshold, + int sps, + complex *amplitude, + float *TOA, + unsigned maxTOA, + bool requestChannel = false, + signalVector** channelResponse = NULL, + float *channelResponseOffset = NULL); /** Decimate a vector. diff --git a/Transceiver52M/sigProcLibTest.cpp b/Transceiver52M/sigProcLibTest.cpp index 4f927176..32661f41 100644 --- a/Transceiver52M/sigProcLibTest.cpp +++ b/Transceiver52M/sigProcLibTest.cpp @@ -50,9 +50,6 @@ int main(int argc, char **argv) { sigProcLibSetup(samplesPerSymbol); - signalVector *gsmPulse = generateGSMPulse(2,samplesPerSymbol); - cout << *gsmPulse << endl; - BitVector RACHBurstStart = "01010101"; BitVector RACHBurstRest = "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"; @@ -60,12 +57,9 @@ int main(int argc, char **argv) { signalVector *RACHSeq = modulateBurst(RACHBurst, - *gsmPulse, 9, samplesPerSymbol); - generateRACHSequence(*gsmPulse,samplesPerSymbol); - complex a; float t; detectRACHBurst(*RACHSeq, 5, samplesPerSymbol,&a,&t); @@ -94,12 +88,9 @@ int main(int argc, char **argv) { BitVector normalBurst(BitVector(normalBurstSeg,gTrainingSequence[TSC]),normalBurstSeg); + generateMidamble(samplesPerSymbol,TSC); - generateMidamble(*gsmPulse,samplesPerSymbol,TSC); - - - signalVector *modBurst = modulateBurst(normalBurst,*gsmPulse, - 0,samplesPerSymbol); + signalVector *modBurst = modulateBurst(normalBurst,0,samplesPerSymbol); //delayVector(*rsVector2,6.932); @@ -133,7 +124,7 @@ int main(int argc, char **argv) { cout << "ampl:" << ampl << endl; cout << "TOA: " << TOA << endl; //cout << "chanResp: " << *chanResp << endl; - SoftVector *demodBurst = demodulateBurst(*modBurst,*gsmPulse,samplesPerSymbol,(complex) ampl, TOA); + SoftVector *demodBurst = demodulateBurst(*modBurst,samplesPerSymbol,(complex) ampl, TOA); cout << *demodBurst << endl; diff --git a/config/ax_check_compile_flag.m4 b/config/ax_check_compile_flag.m4 new file mode 100644 index 00000000..c3a8d695 --- /dev/null +++ b/config/ax_check_compile_flag.m4 @@ -0,0 +1,72 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS]) +# +# DESCRIPTION +# +# Check whether the given FLAG works with the current language's compiler +# or gives an error. (Warnings, however, are ignored) +# +# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on +# success/failure. +# +# If EXTRA-FLAGS is defined, it is added to the current language's default +# flags (e.g. CFLAGS) when the check is done. The check is thus made with +# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to +# force the compiler to issue an error when a bad flag is given. +# +# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this +# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim +# Copyright (c) 2011 Maarten Bosmans +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 2 + +AC_DEFUN([AX_CHECK_COMPILE_FLAG], +[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX +AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl +AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ + ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS + _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], + [AS_VAR_SET(CACHEVAR,[yes])], + [AS_VAR_SET(CACHEVAR,[no])]) + _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) +AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes], + [m4_default([$2], :)], + [m4_default([$3], :)]) +AS_VAR_POPDEF([CACHEVAR])dnl +])dnl AX_CHECK_COMPILE_FLAGS diff --git a/config/ax_ext.m4 b/config/ax_ext.m4 new file mode 100644 index 00000000..fbd10cc6 --- /dev/null +++ b/config/ax_ext.m4 @@ -0,0 +1,221 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_ext.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_EXT +# +# DESCRIPTION +# +# Find supported SIMD extensions by requesting cpuid. When an SIMD +# extension is found, the -m"simdextensionname" is added to SIMD_FLAGS if +# compiler supports it. For example, if "sse2" is available, then "-msse2" +# is added to SIMD_FLAGS. +# +# This macro calls: +# +# AC_SUBST(SIMD_FLAGS) +# +# And defines: +# +# HAVE_MMX / HAVE_SSE / HAVE_SSE2 / HAVE_SSE3 / HAVE_SSSE3 / HAVE_SSE4.1 / HAVE_SSE4.2 / HAVE_AVX +# +# LICENSE +# +# Copyright (c) 2007 Christophe Tournayre +# Copyright (c) 2013 Michael Petch +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 12 + +AC_DEFUN([AX_EXT], +[ + AC_REQUIRE([AC_CANONICAL_HOST]) + + case $host_cpu in + i[[3456]]86*|x86_64*|amd64*) + + AC_REQUIRE([AX_GCC_X86_CPUID]) + AC_REQUIRE([AX_GCC_X86_AVX_XGETBV]) + + AX_GCC_X86_CPUID(0x00000001) + ecx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3` + edx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4` + + AC_CACHE_CHECK([whether mmx is supported], [ax_cv_have_mmx_ext], + [ + ax_cv_have_mmx_ext=no + if test "$((0x$edx>>23&0x01))" = 1; then + ax_cv_have_mmx_ext=yes + fi + ]) + + AC_CACHE_CHECK([whether sse is supported], [ax_cv_have_sse_ext], + [ + ax_cv_have_sse_ext=no + if test "$((0x$edx>>25&0x01))" = 1; then + ax_cv_have_sse_ext=yes + fi + ]) + + AC_CACHE_CHECK([whether sse2 is supported], [ax_cv_have_sse2_ext], + [ + ax_cv_have_sse2_ext=no + if test "$((0x$edx>>26&0x01))" = 1; then + ax_cv_have_sse2_ext=yes + fi + ]) + + AC_CACHE_CHECK([whether sse3 is supported], [ax_cv_have_sse3_ext], + [ + ax_cv_have_sse3_ext=no + if test "$((0x$ecx&0x01))" = 1; then + ax_cv_have_sse3_ext=yes + fi + ]) + + AC_CACHE_CHECK([whether ssse3 is supported], [ax_cv_have_ssse3_ext], + [ + ax_cv_have_ssse3_ext=no + if test "$((0x$ecx>>9&0x01))" = 1; then + ax_cv_have_ssse3_ext=yes + fi + ]) + + AC_CACHE_CHECK([whether sse4.1 is supported], [ax_cv_have_sse41_ext], + [ + ax_cv_have_sse41_ext=no + if test "$((0x$ecx>>19&0x01))" = 1; then + ax_cv_have_sse41_ext=yes + fi + ]) + + AC_CACHE_CHECK([whether sse4.2 is supported], [ax_cv_have_sse42_ext], + [ + ax_cv_have_sse42_ext=no + if test "$((0x$ecx>>20&0x01))" = 1; then + ax_cv_have_sse42_ext=yes + fi + ]) + + AC_CACHE_CHECK([whether avx is supported by processor], [ax_cv_have_avx_cpu_ext], + [ + ax_cv_have_avx_cpu_ext=no + if test "$((0x$ecx>>28&0x01))" = 1; then + ax_cv_have_avx_cpu_ext=yes + fi + ]) + + if test x"$ax_cv_have_avx_cpu_ext" = x"yes"; then + AX_GCC_X86_AVX_XGETBV(0x00000000) + + xgetbv_eax="0" + if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then + xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1` + fi + + AC_CACHE_CHECK([whether avx is supported by operating system], [ax_cv_have_avx_ext], + [ + ax_cv_have_avx_ext=no + + if test "$((0x$ecx>>27&0x01))" = 1; then + if test "$((0x$xgetbv_eax&0x6))" = 6; then + ax_cv_have_avx_ext=yes + fi + fi + ]) + if test x"$ax_cv_have_avx_ext" = x"no"; then + AC_MSG_WARN([Your processor supports AVX, but your operating system doesn't]) + fi + fi + + if test "$ax_cv_have_mmx_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-mmmx, ax_cv_support_mmx_ext=yes, []) + if test x"$ax_cv_support_mmx_ext" = x"yes"; then + SIMD_FLAGS="$SIMD_FLAGS -mmmx" + AC_DEFINE(HAVE_MMX,,[Support mmx instructions]) + else + AC_MSG_WARN([Your processor supports mmx instructions but not your compiler, can you try another compiler?]) + fi + fi + + if test "$ax_cv_have_sse_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-msse, ax_cv_support_sse_ext=yes, []) + if test x"$ax_cv_support_sse_ext" = x"yes"; then + SIMD_FLAGS="$SIMD_FLAGS -msse" + AC_DEFINE(HAVE_SSE,,[Support SSE (Streaming SIMD Extensions) instructions]) + else + AC_MSG_WARN([Your processor supports sse instructions but not your compiler, can you try another compiler?]) + fi + fi + + if test "$ax_cv_have_sse2_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-msse2, ax_cv_support_sse2_ext=yes, []) + if test x"$ax_cv_support_sse2_ext" = x"yes"; then + SIMD_FLAGS="$SIMD_FLAGS -msse2" + AC_DEFINE(HAVE_SSE2,,[Support SSE2 (Streaming SIMD Extensions 2) instructions]) + else + AC_MSG_WARN([Your processor supports sse2 instructions but not your compiler, can you try another compiler?]) + fi + fi + + if test "$ax_cv_have_sse3_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-msse3, ax_cv_support_sse3_ext=yes, []) + if test x"$ax_cv_support_sse3_ext" = x"yes"; then + SIMD_FLAGS="$SIMD_FLAGS -msse3" + AC_DEFINE(HAVE_SSE3,,[Support SSE3 (Streaming SIMD Extensions 3) instructions]) + else + AC_MSG_WARN([Your processor supports sse3 instructions but not your compiler, can you try another compiler?]) + fi + fi + + if test "$ax_cv_have_ssse3_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-mssse3, ax_cv_support_ssse3_ext=yes, []) + if test x"$ax_cv_support_ssse3_ext" = x"yes"; then + SIMD_FLAGS="$SIMD_FLAGS -mssse3" + AC_DEFINE(HAVE_SSSE3,,[Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions]) + else + AC_MSG_WARN([Your processor supports ssse3 instructions but not your compiler, can you try another compiler?]) + fi + fi + + if test "$ax_cv_have_sse41_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-msse4.1, ax_cv_support_sse41_ext=yes, []) + if test x"$ax_cv_support_sse41_ext" = x"yes"; then + SIMD_FLAGS="$SIMD_FLAGS -msse4.1" + AC_DEFINE(HAVE_SSE4_1,,[Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions]) + else + AC_MSG_WARN([Your processor supports sse4.1 instructions but not your compiler, can you try another compiler?]) + fi + fi + + if test "$ax_cv_have_sse42_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-msse4.2, ax_cv_support_sse42_ext=yes, []) + if test x"$ax_cv_support_sse42_ext" = x"yes"; then + SIMD_FLAGS="$SIMD_FLAGS -msse4.2" + AC_DEFINE(HAVE_SSE4_2,,[Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions]) + else + AC_MSG_WARN([Your processor supports sse4.2 instructions but not your compiler, can you try another compiler?]) + fi + fi + + if test "$ax_cv_have_avx_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-mavx, ax_cv_support_avx_ext=yes, []) + if test x"$ax_cv_support_avx_ext" = x"yes"; then + SIMD_FLAGS="$SIMD_FLAGS -mavx" + AC_DEFINE(HAVE_AVX,,[Support AVX (Advanced Vector Extensions) instructions]) + else + AC_MSG_WARN([Your processor supports avx instructions but not your compiler, can you try another compiler?]) + fi + fi + + ;; + esac + + AC_SUBST(SIMD_FLAGS) +]) diff --git a/config/ax_gcc_x86_avx_xgetbv.m4 b/config/ax_gcc_x86_avx_xgetbv.m4 new file mode 100644 index 00000000..0624eebc --- /dev/null +++ b/config/ax_gcc_x86_avx_xgetbv.m4 @@ -0,0 +1,79 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_avx_xgetbv.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_GCC_X86_AVX_XGETBV +# +# DESCRIPTION +# +# On later x86 processors with AVX SIMD support, with gcc or a compiler +# that has a compatible syntax for inline assembly instructions, run a +# small program that executes the xgetbv instruction with input OP. This +# can be used to detect if the OS supports AVX instruction usage. +# +# On output, the values of the eax and edx registers are stored as +# hexadecimal strings as "eax:edx" in the cache variable +# ax_cv_gcc_x86_avx_xgetbv. +# +# If the xgetbv instruction fails (because you are running a +# cross-compiler, or because you are not using gcc, or because you are on +# a processor that doesn't have this instruction), +# ax_cv_gcc_x86_avx_xgetbv_OP is set to the string "unknown". +# +# This macro mainly exists to be used in AX_EXT. +# +# LICENSE +# +# Copyright (c) 2013 Michael Petch +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 1 + +AC_DEFUN([AX_GCC_X86_AVX_XGETBV], +[AC_REQUIRE([AC_PROG_CC]) +AC_LANG_PUSH([C]) +AC_CACHE_CHECK(for x86-AVX xgetbv $1 output, ax_cv_gcc_x86_avx_xgetbv_$1, + [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include ], [ + int op = $1, eax, edx; + FILE *f; + /* Opcodes for xgetbv */ + __asm__(".byte 0x0f, 0x01, 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (op)); + f = fopen("conftest_xgetbv", "w"); if (!f) return 1; + fprintf(f, "%x:%x\n", eax, edx); + fclose(f); + return 0; +])], + [ax_cv_gcc_x86_avx_xgetbv_$1=`cat conftest_xgetbv`; rm -f conftest_xgetbv], + [ax_cv_gcc_x86_avx_xgetbv_$1=unknown; rm -f conftest_xgetbv], + [ax_cv_gcc_x86_avx_xgetbv_$1=unknown])]) +AC_LANG_POP([C]) +]) diff --git a/config/ax_gcc_x86_cpuid.m4 b/config/ax_gcc_x86_cpuid.m4 new file mode 100644 index 00000000..7d46fee0 --- /dev/null +++ b/config/ax_gcc_x86_cpuid.m4 @@ -0,0 +1,79 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_cpuid.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_GCC_X86_CPUID(OP) +# +# DESCRIPTION +# +# On Pentium and later x86 processors, with gcc or a compiler that has a +# compatible syntax for inline assembly instructions, run a small program +# that executes the cpuid instruction with input OP. This can be used to +# detect the CPU type. +# +# On output, the values of the eax, ebx, ecx, and edx registers are stored +# as hexadecimal strings as "eax:ebx:ecx:edx" in the cache variable +# ax_cv_gcc_x86_cpuid_OP. +# +# If the cpuid instruction fails (because you are running a +# cross-compiler, or because you are not using gcc, or because you are on +# a processor that doesn't have this instruction), ax_cv_gcc_x86_cpuid_OP +# is set to the string "unknown". +# +# This macro mainly exists to be used in AX_GCC_ARCHFLAG. +# +# LICENSE +# +# Copyright (c) 2008 Steven G. Johnson +# Copyright (c) 2008 Matteo Frigo +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 7 + +AC_DEFUN([AX_GCC_X86_CPUID], +[AC_REQUIRE([AC_PROG_CC]) +AC_LANG_PUSH([C]) +AC_CACHE_CHECK(for x86 cpuid $1 output, ax_cv_gcc_x86_cpuid_$1, + [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include ], [ + int op = $1, eax, ebx, ecx, edx; + FILE *f; + __asm__("cpuid" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (op)); + f = fopen("conftest_cpuid", "w"); if (!f) return 1; + fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx); + fclose(f); + return 0; +])], + [ax_cv_gcc_x86_cpuid_$1=`cat conftest_cpuid`; rm -f conftest_cpuid], + [ax_cv_gcc_x86_cpuid_$1=unknown; rm -f conftest_cpuid], + [ax_cv_gcc_x86_cpuid_$1=unknown])]) +AC_LANG_POP([C]) +]) diff --git a/configure.ac b/configure.ac index 3cf2783f..6ab694d9 100644 --- a/configure.ac +++ b/configure.ac @@ -22,6 +22,7 @@ AC_INIT(openbts,P2.8TRUNK) AC_PREREQ(2.57) AC_CONFIG_SRCDIR([Transceiver52M/Makefile.am]) AC_CONFIG_AUX_DIR([.]) +AC_CONFIG_MACRO_DIR([config]) AM_CONFIG_HEADER(config.h) AC_CANONICAL_BUILD @@ -90,11 +91,14 @@ AS_IF([test "x$with_usrp1" = "xyes"], [ if test "x$libusrp_3_3" = "xyes";then AC_DEFINE(HAVE_LIBUSRP_3_3, 1, Define to 1 if you have libusrp >= 3.3) fi + # Find and define supported SIMD extensions + AX_EXT ]) AS_IF([test "x$with_uhd" = "xyes"],[ PKG_CHECK_MODULES(UHD, uhd >= 003.004.000) AC_DEFINE(USE_UHD, 1, Define to 1 if using UHD) + AX_EXT ]) AS_IF([test "x$with_extref" = "xyes"], [