diff --git a/Makefile.am b/Makefile.am
index 058876de..06251f0d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -20,6 +20,7 @@
 
 include $(top_srcdir)/Makefile.common
 
+ACLOCAL_AMFLAGS = -I config
 AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(USB_INCLUDES) $(WITH_INCLUDES)
 AM_CXXFLAGS = -Wall -pthread -ldl
 #AM_CXXFLAGS = -Wall -O2 -NDEBUG -pthread -ldl
diff --git a/Transceiver52M/Makefile.am b/Transceiver52M/Makefile.am
index 6fcef7c6..67ab0ea0 100644
--- a/Transceiver52M/Makefile.am
+++ b/Transceiver52M/Makefile.am
@@ -21,19 +21,18 @@
 
 include $(top_srcdir)/Makefile.common
 
+AM_CFLAGS = $(STD_DEFINES_AND_INCLUDES) -std=gnu99 -march=native
+AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES)
+AM_CXXFLAGS = -ldl -lpthread
+
 #UHD wins if both are defined
 if UHD
-AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(UHD_CFLAGS)
+AM_CPPFLAGS += $(UHD_CFLAGS)
 else
 if USRP1
-AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(USRP_CFLAGS)
-else
-#we should never be here, as this doesn't build if one of the above
-#doesn't exist
-AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES)
+AM_CPPFLAGS += $(USRP_CFLAGS)
 endif
 endif
-AM_CXXFLAGS = -ldl -lpthread
 
 rev2dir = $(datadir)/usrp/rev2
 rev4dir = $(datadir)/usrp/rev4
@@ -53,7 +52,8 @@ COMMON_SOURCES = \
 	radioClock.cpp \
 	sigProcLib.cpp \
 	Transceiver.cpp \
-	DummyLoad.cpp
+	DummyLoad.cpp \
+	convolve.c
 
 libtransceiver_la_SOURCES = \
 	$(COMMON_SOURCES) \
@@ -75,7 +75,8 @@ noinst_HEADERS = \
 	USRPDevice.h \
 	DummyLoad.h \
 	rcvLPF_651.h \
-	sendLPF_961.h
+	sendLPF_961.h \
+	convolve.h
 
 USRPping_SOURCES = USRPping.cpp
 USRPping_LDADD = \
diff --git a/Transceiver52M/convolve.c b/Transceiver52M/convolve.c
new file mode 100644
index 00000000..6f48ea0c
--- /dev/null
+++ b/Transceiver52M/convolve.c
@@ -0,0 +1,714 @@
+/*
+ * SSE Convolution
+ * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <malloc.h>
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+
+/* 4-tap SSE complex-real convolution */
+static void sse_conv_real4(float *restrict x,
+			   float *restrict h,
+			   float *restrict y,
+			   int len)
+{
+	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+
+	/* Load (aligned) filter taps */
+	m0 = _mm_load_ps(&h[0]);
+	m1 = _mm_load_ps(&h[4]);
+	m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+
+	for (int i = 0; i < len; i++) {
+		/* Load (unaligned) input data */
+		m0 = _mm_loadu_ps(&x[2 * i + 0]);
+		m1 = _mm_loadu_ps(&x[2 * i + 4]);
+		m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+		m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+
+		/* Quad multiply */
+		m4 = _mm_mul_ps(m2, m7);
+		m5 = _mm_mul_ps(m3, m7);
+
+		/* Sum and store */
+		m6 = _mm_hadd_ps(m4, m5);
+		m0 = _mm_hadd_ps(m6, m6);
+
+		_mm_store_ss(&y[2 * i + 0], m0);
+		m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
+		_mm_store_ss(&y[2 * i + 1], m0);
+	}
+}
+
+/* 8-tap SSE complex-real convolution */
+static void sse_conv_real8(float *restrict x,
+			   float *restrict h,
+			   float *restrict y,
+			   int len)
+{
+	__m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
+
+	/* Load (aligned) filter taps */
+	m0 = _mm_load_ps(&h[0]);
+	m1 = _mm_load_ps(&h[4]);
+	m2 = _mm_load_ps(&h[8]);
+	m3 = _mm_load_ps(&h[12]);
+
+	m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+	m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+
+	for (int i = 0; i < len; i++) {
+		/* Load (unaligned) input data */
+		m0 = _mm_loadu_ps(&x[2 * i + 0]);
+		m1 = _mm_loadu_ps(&x[2 * i + 4]);
+		m2 = _mm_loadu_ps(&x[2 * i + 8]);
+		m3 = _mm_loadu_ps(&x[2 * i + 12]);
+
+		m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+		m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+		m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+		m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
+
+		/* Quad multiply */
+		m6 = _mm_mul_ps(m6, m4);
+		m7 = _mm_mul_ps(m7, m4);
+		m8 = _mm_mul_ps(m8, m5);
+		m9 = _mm_mul_ps(m9, m5);
+
+		/* Sum and store */
+		m6 = _mm_add_ps(m6, m8);
+		m7 = _mm_add_ps(m7, m9);
+		m6 = _mm_hadd_ps(m6, m7);
+		m6 = _mm_hadd_ps(m6, m6);
+
+		_mm_store_ss(&y[2 * i + 0], m6);
+		m6 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 3, 2, 1));
+		_mm_store_ss(&y[2 * i + 1], m6);
+	}
+}
+
+/* 12-tap SSE complex-real convolution */
+static void sse_conv_real12(float *restrict x,
+			    float *restrict h,
+			    float *restrict y,
+			    int len)
+{
+	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+	__m128 m8, m9, m10, m11, m12, m13, m14;
+
+	/* Load (aligned) filter taps */
+	m0 = _mm_load_ps(&h[0]);
+	m1 = _mm_load_ps(&h[4]);
+	m2 = _mm_load_ps(&h[8]);
+	m3 = _mm_load_ps(&h[12]);
+	m4 = _mm_load_ps(&h[16]);
+	m5 = _mm_load_ps(&h[20]);
+
+	m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+	m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+	m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
+
+	for (int i = 0; i < len; i++) {
+		/* Load (unaligned) input data */
+		m0 = _mm_loadu_ps(&x[2 * i + 0]);
+		m1 = _mm_loadu_ps(&x[2 * i + 4]);
+		m2 = _mm_loadu_ps(&x[2 * i + 8]);
+		m3 = _mm_loadu_ps(&x[2 * i + 12]);
+
+		m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+		m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+		m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+		m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
+
+		m0 = _mm_loadu_ps(&x[2 * i + 16]);
+		m1 = _mm_loadu_ps(&x[2 * i + 20]);
+
+		m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+		m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+
+		/* Quad multiply */
+		m0 = _mm_mul_ps(m4, m12);
+		m1 = _mm_mul_ps(m5, m12);
+		m2 = _mm_mul_ps(m6, m13);
+		m3 = _mm_mul_ps(m7, m13);
+		m4 = _mm_mul_ps(m8, m14);
+		m5 = _mm_mul_ps(m9, m14);
+
+		/* Sum and store */
+		m8  = _mm_add_ps(m0, m2);
+		m9  = _mm_add_ps(m1, m3);
+		m10 = _mm_add_ps(m8, m4);
+		m11 = _mm_add_ps(m9, m5);
+
+		m2 = _mm_hadd_ps(m10, m11);
+		m3 = _mm_hadd_ps(m2, m2);
+
+		_mm_store_ss(&y[2 * i + 0], m3);
+		m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
+		_mm_store_ss(&y[2 * i + 1], m3);
+	}
+}
+
+/* 16-tap SSE complex-real convolution */
+static void sse_conv_real16(float *restrict x,
+			    float *restrict h,
+			    float *restrict y,
+			    int len)
+{
+	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+	__m128 m8, m9, m10, m11, m12, m13, m14, m15;
+
+	/* Load (aligned) filter taps */
+	m0 = _mm_load_ps(&h[0]);
+	m1 = _mm_load_ps(&h[4]);
+	m2 = _mm_load_ps(&h[8]);
+	m3 = _mm_load_ps(&h[12]);
+
+	m4 = _mm_load_ps(&h[16]);
+	m5 = _mm_load_ps(&h[20]);
+	m6 = _mm_load_ps(&h[24]);
+	m7 = _mm_load_ps(&h[28]);
+
+	m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+	m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+	m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
+	m15 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
+
+	for (int i = 0; i < len; i++) {
+		/* Load (unaligned) input data */
+		m0 = _mm_loadu_ps(&x[2 * i + 0]);
+		m1 = _mm_loadu_ps(&x[2 * i + 4]);
+		m2 = _mm_loadu_ps(&x[2 * i + 8]);
+		m3 = _mm_loadu_ps(&x[2 * i + 12]);
+
+		m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+		m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+		m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+		m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
+
+		m0 = _mm_loadu_ps(&x[2 * i + 16]);
+		m1 = _mm_loadu_ps(&x[2 * i + 20]);
+		m2 = _mm_loadu_ps(&x[2 * i + 24]);
+		m3 = _mm_loadu_ps(&x[2 * i + 28]);
+
+		m8  = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+		m9  = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+		m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+		m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
+
+		/* Quad multiply */
+		m0 = _mm_mul_ps(m4, m12);
+		m1 = _mm_mul_ps(m5, m12);
+		m2 = _mm_mul_ps(m6, m13);
+		m3 = _mm_mul_ps(m7, m13);
+
+		m4 = _mm_mul_ps(m8, m14);
+		m5 = _mm_mul_ps(m9, m14);
+		m6 = _mm_mul_ps(m10, m15);
+		m7 = _mm_mul_ps(m11, m15);
+
+		/* Sum and store */
+		m8  = _mm_add_ps(m0, m2);
+		m9  = _mm_add_ps(m1, m3);
+		m10 = _mm_add_ps(m4, m6);
+		m11 = _mm_add_ps(m5, m7);
+
+		m0 = _mm_add_ps(m8, m10);
+		m1 = _mm_add_ps(m9, m11);
+		m2 = _mm_hadd_ps(m0, m1);
+		m3 = _mm_hadd_ps(m2, m2);
+
+		_mm_store_ss(&y[2 * i + 0], m3);
+		m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
+		_mm_store_ss(&y[2 * i + 1], m3);
+	}
+}
+
+/* 20-tap SSE complex-real convolution */
+static void sse_conv_real20(float *restrict x,
+			    float *restrict h,
+			    float *restrict y,
+			    int len)
+{
+	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+	__m128 m8, m9, m11, m12, m13, m14, m15;
+
+	/* Load (aligned) filter taps */
+	m0 = _mm_load_ps(&h[0]);
+	m1 = _mm_load_ps(&h[4]);
+	m2 = _mm_load_ps(&h[8]);
+	m3 = _mm_load_ps(&h[12]);
+	m4 = _mm_load_ps(&h[16]);
+	m5 = _mm_load_ps(&h[20]);
+	m6 = _mm_load_ps(&h[24]);
+	m7 = _mm_load_ps(&h[28]);
+	m8 = _mm_load_ps(&h[32]);
+	m9 = _mm_load_ps(&h[36]);
+
+	m11 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+	m12 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+	m13 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
+	m14 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
+	m15 = _mm_shuffle_ps(m8, m9, _MM_SHUFFLE(0, 2, 0, 2));
+
+	for (int i = 0; i < len; i++) {
+		/* Multiply-accumulate first 12 taps */
+		m0 = _mm_loadu_ps(&x[2 * i + 0]);
+		m1 = _mm_loadu_ps(&x[2 * i + 4]);
+		m2 = _mm_loadu_ps(&x[2 * i + 8]);
+		m3 = _mm_loadu_ps(&x[2 * i + 12]);
+		m4 = _mm_loadu_ps(&x[2 * i + 16]);
+		m5 = _mm_loadu_ps(&x[2 * i + 20]);
+
+		m6  = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+		m7  = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+		m8  = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+		m9  = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
+		m0  = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
+		m1  = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(1, 3, 1, 3));
+
+		m2 = _mm_mul_ps(m6, m11);
+		m3 = _mm_mul_ps(m7, m11);
+		m4 = _mm_mul_ps(m8, m12);
+		m5 = _mm_mul_ps(m9, m12);
+		m6 = _mm_mul_ps(m0, m13);
+		m7 = _mm_mul_ps(m1, m13);
+
+		m0  = _mm_add_ps(m2, m4);
+		m1  = _mm_add_ps(m3, m5);
+		m8  = _mm_add_ps(m0, m6);
+		m9  = _mm_add_ps(m1, m7);
+
+		/* Multiply-accumulate last 8 taps */
+		m0 = _mm_loadu_ps(&x[2 * i + 24]);
+		m1 = _mm_loadu_ps(&x[2 * i + 28]);
+		m2 = _mm_loadu_ps(&x[2 * i + 32]);
+		m3 = _mm_loadu_ps(&x[2 * i + 36]);
+
+		m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+		m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+		m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+		m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
+
+		m0 = _mm_mul_ps(m4, m14);
+		m1 = _mm_mul_ps(m5, m14);
+		m2 = _mm_mul_ps(m6, m15);
+		m3 = _mm_mul_ps(m7, m15);
+
+		m4  = _mm_add_ps(m0, m2);
+		m5  = _mm_add_ps(m1, m3);
+
+		/* Final sum and store */
+		m0 = _mm_add_ps(m8, m4);
+		m1 = _mm_add_ps(m9, m5);
+		m2 = _mm_hadd_ps(m0, m1);
+		m3 = _mm_hadd_ps(m2, m2);
+
+		_mm_store_ss(&y[2 * i + 0], m3);
+		m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
+		_mm_store_ss(&y[2 * i + 1], m3);
+	}
+}
+
+/* 4*N-tap SSE complex-real convolution */
+static void sse_conv_real4n(float *x, float *h, float *y, int h_len, int len)
+{
+	__m128 m0, m1, m2, m4, m5, m6, m7;
+
+	for (int i = 0; i < len; i++) {
+		/* Zero */
+		m6 = _mm_setzero_ps();
+		m7 = _mm_setzero_ps();
+
+		for (int n = 0; n < h_len / 4; n++) {
+			/* Load (aligned) filter taps */
+			m0 = _mm_load_ps(&h[8 * n + 0]);
+			m1 = _mm_load_ps(&h[8 * n + 4]);
+			m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+
+			/* Load (unaligned) input data */
+			m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
+			m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
+			m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+			m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+
+			/* Quad multiply */
+			m0 = _mm_mul_ps(m2, m4);
+			m1 = _mm_mul_ps(m2, m5);
+
+			/* Accumulate */
+			m6 = _mm_add_ps(m6, m0);
+			m7 = _mm_add_ps(m7, m1);
+		}
+
+		m0 = _mm_hadd_ps(m6, m7);
+		m0 = _mm_hadd_ps(m0, m0);
+
+		_mm_store_ss(&y[2 * i + 0], m0);
+		m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
+		_mm_store_ss(&y[2 * i + 1], m0);
+	}
+}
+
+/* 4*N-tap SSE complex-complex convolution */
+static void sse_conv_cmplx_4n(float *x, float *h, float *y, int h_len, int len)
+{
+	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+
+	for (int i = 0; i < len; i++) {
+		/* Zero */
+		m6 = _mm_setzero_ps();
+		m7 = _mm_setzero_ps();
+
+		for (int n = 0; n < h_len / 4; n++) {
+			/* Load (aligned) filter taps */
+			m0 = _mm_load_ps(&h[8 * n + 0]);
+			m1 = _mm_load_ps(&h[8 * n + 4]);
+			m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+			m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+
+			/* Load (unaligned) input data */
+			m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
+			m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
+			m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+			m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+
+			/* Quad multiply */
+			m0 = _mm_mul_ps(m2, m4);
+			m1 = _mm_mul_ps(m3, m5);
+
+			m2 = _mm_mul_ps(m2, m5);
+			m3 = _mm_mul_ps(m3, m4);
+
+			/* Sum */
+			m0 = _mm_sub_ps(m0, m1);
+			m2 = _mm_add_ps(m2, m3);
+
+			/* Accumulate */
+			m6 = _mm_add_ps(m6, m0);
+			m7 = _mm_add_ps(m7, m2);
+		}
+
+		m0 = _mm_hadd_ps(m6, m7);
+		m0 = _mm_hadd_ps(m0, m0);
+
+		_mm_store_ss(&y[2 * i + 0], m0);
+		m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
+		_mm_store_ss(&y[2 * i + 1], m0);
+	}
+}
+
+/* 8*N-tap SSE complex-complex convolution */
+static void sse_conv_cmplx_8n(float *x, float *h, float *y, int h_len, int len)
+{
+	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+	__m128 m8, m9, m10, m11, m12, m13, m14, m15;
+
+	for (int i = 0; i < len; i++) {
+		/* Zero */
+		m12 = _mm_setzero_ps();
+		m13 = _mm_setzero_ps();
+		m14 = _mm_setzero_ps();
+		m15 = _mm_setzero_ps();
+
+		for (int n = 0; n < h_len / 8; n++) {
+			/* Load (aligned) filter taps */
+			m0 = _mm_load_ps(&h[16 * n + 0]);
+			m1 = _mm_load_ps(&h[16 * n + 4]);
+			m2 = _mm_load_ps(&h[16 * n + 8]);
+			m3 = _mm_load_ps(&h[16 * n + 12]);
+
+			m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+			m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+			m6 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 2, 0, 2));
+			m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
+
+			/* Load (unaligned) input data */
+			m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]);
+			m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]);
+			m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]);
+			m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]);
+
+			m8  = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
+			m9  = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
+			m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
+			m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
+
+			/* Quad multiply */
+			m0 = _mm_mul_ps(m4, m8);
+			m1 = _mm_mul_ps(m5, m9);
+			m2 = _mm_mul_ps(m6, m10);
+			m3 = _mm_mul_ps(m7, m11);
+
+			m4 = _mm_mul_ps(m4, m9);
+			m5 = _mm_mul_ps(m5, m8);
+			m6 = _mm_mul_ps(m6, m11);
+			m7 = _mm_mul_ps(m7, m10);
+
+			/* Sum */
+			m0 = _mm_sub_ps(m0, m1);
+			m2 = _mm_sub_ps(m2, m3);
+			m4 = _mm_add_ps(m4, m5);
+			m6 = _mm_add_ps(m6, m7);
+
+			/* Accumulate */
+			m12 = _mm_add_ps(m12, m0);
+			m13 = _mm_add_ps(m13, m2);
+			m14 = _mm_add_ps(m14, m4);
+			m15 = _mm_add_ps(m15, m6);
+		}
+
+		m0 = _mm_add_ps(m12, m13);
+		m1 = _mm_add_ps(m14, m15);
+		m2 = _mm_hadd_ps(m0, m1);
+		m2 = _mm_hadd_ps(m2, m2);
+
+		_mm_store_ss(&y[2 * i + 0], m2);
+		m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 3, 2, 1));
+		_mm_store_ss(&y[2 * i + 1], m2);
+	}
+}
+#endif
+
+/* Base multiply and accumulate complex-real */
+static void mac_real(float *x, float *h, float *y)
+{
+	y[0] += x[0] * h[0];
+	y[1] += x[1] * h[0];
+}
+
+/* Base multiply and accumulate complex-complex */
+static void mac_cmplx(float *x, float *h, float *y)
+{
+	y[0] += x[0] * h[0] - x[1] * h[1];
+	y[1] += x[0] * h[1] + x[1] * h[0];
+}
+
+/* Base vector complex-complex multiply and accumulate */
+static void mac_real_vec_n(float *x, float *h, float *y,
+			   int len, int step, int offset)
+{
+	for (int i = offset; i < len; i += step)
+		mac_real(&x[2 * i], &h[2 * i], y);
+}
+
+/* Base vector complex-complex multiply and accumulate */
+static void mac_cmplx_vec_n(float *x, float *h, float *y,
+			    int len, int step, int offset)
+{
+	for (int i = offset; i < len; i += step)
+		mac_cmplx(&x[2 * i], &h[2 * i], y);
+}
+
+/* Base complex-real convolution */
+static int _base_convolve_real(float *x, int x_len,
+			       float *h, int h_len,
+			       float *y, int y_len,
+			       int start, int len,
+			       int step, int offset)
+{
+	for (int i = 0; i < len; i++) {
+		mac_real_vec_n(&x[2 * (i - (h_len - 1) + start)],
+			       h,
+			       &y[2 * i], h_len,
+			       step, offset);
+	}
+
+	return len;
+}
+
+/* Base complex-complex convolution */
+static int _base_convolve_complex(float *x, int x_len,
+				  float *h, int h_len,
+				  float *y, int y_len,
+				  int start, int len,
+				  int step, int offset)
+{
+	for (int i = 0; i < len; i++) {
+		mac_cmplx_vec_n(&x[2 * (i - (h_len - 1) + start)],
+				h,
+				&y[2 * i],
+				h_len, step, offset);
+	}
+
+	return len;
+}
+
+/* Buffer validity checks */
+static int bounds_check(int x_len, int h_len, int y_len,
+			int start, int len, int step)
+{
+	if ((x_len < 1) || (h_len < 1) ||
+	    (y_len < 1) || (len < 1) || (step < 1)) {
+		fprintf(stderr, "Convolve: Invalid input\n");
+		return -1;
+	}
+
+	if ((start + len > x_len) || (len > y_len) || (x_len < h_len)) {
+		fprintf(stderr, "Convolve: Boundary exception\n");
+		fprintf(stderr, "start: %i, len: %i, x: %i, h: %i, y: %i\n",
+				start, len, x_len, h_len, y_len);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* API: Aligned complex-real */
+int convolve_real(float *x, int x_len,
+		  float *h, int h_len,
+		  float *y, int y_len,
+		  int start, int len,
+		  int step, int offset)
+{
+	void (*conv_func)(float *, float *, float *, int) = NULL;
+	void (*conv_func_n)(float *, float *, float *, int, int) = NULL;
+
+	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
+		return -1;
+
+	memset(y, 0, len * 2 * sizeof(float));
+
+#ifdef HAVE_SSE3
+	if (step <= 4) {
+		switch (h_len) {
+		case 4:
+			conv_func = sse_conv_real4;
+			break;
+		case 8:
+			conv_func = sse_conv_real8;
+			break;
+		case 12:
+			conv_func = sse_conv_real12;
+			break;
+		case 16:
+			conv_func = sse_conv_real16;
+			break;
+		case 20:
+			conv_func = sse_conv_real20;
+			break;
+		default:
+			if (!(h_len % 4))
+				conv_func_n = sse_conv_real4n;
+		}
+	}
+#endif
+	if (conv_func) {
+		conv_func(&x[2 * (-(h_len - 1) + start)],
+			  h, y, len);
+	} else if (conv_func_n) {
+		conv_func_n(&x[2 * (-(h_len - 1) + start)],
+			    h, y, h_len, len);
+	} else {
+		_base_convolve_real(x, x_len,
+				    h, h_len,
+				    y, y_len,
+				    start, len, step, offset);
+	}
+
+	return len;
+}
+
+/* API: Aligned complex-complex */
+int convolve_complex(float *x, int x_len,
+		     float *h, int h_len,
+		     float *y, int y_len,
+		     int start, int len,
+		     int step, int offset)
+{
+	void (*conv_func)(float *, float *, float *, int, int) = NULL;
+
+	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
+		return -1;
+
+	memset(y, 0, len * 2 * sizeof(float));
+
+#ifdef HAVE_SSE3
+	if (step <= 4) {
+		if (!(h_len % 8))
+			conv_func = sse_conv_cmplx_8n;
+		else if (!(h_len % 4))
+			conv_func = sse_conv_cmplx_4n;
+	}
+#endif
+	if (conv_func) {
+		conv_func(&x[2 * (-(h_len - 1) + start)],
+			  h, y, h_len, len);
+	} else {
+		_base_convolve_complex(x, x_len,
+				       h, h_len,
+				       y, y_len,
+				       start, len, step, offset);
+	}
+
+	return len;
+}
+
+/* API: Non-aligned (no SSE) complex-real */
+int base_convolve_real(float *x, int x_len,
+		       float *h, int h_len,
+		       float *y, int y_len,
+		       int start, int len,
+		       int step, int offset)
+{
+	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
+		return -1;
+
+	memset(y, 0, len * 2 * sizeof(float));
+
+	return _base_convolve_real(x, x_len,
+				   h, h_len,
+				   y, y_len,
+				   start, len, step, offset);
+}
+
+/* API: Non-aligned (no SSE) complex-complex */
+int base_convolve_complex(float *x, int x_len,
+			  float *h, int h_len,
+			  float *y, int y_len,
+			  int start, int len,
+			  int step, int offset)
+{
+	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
+		return -1;
+
+	memset(y, 0, len * 2 * sizeof(float));
+
+	return _base_convolve_complex(x, x_len,
+				      h, h_len,
+				      y, y_len,
+				      start, len, step, offset);
+}
+
+/* Aligned filter tap allocation */
+void *convolve_h_alloc(int len)
+{
+#ifdef HAVE_SSE3
+	return memalign(16, len * 2 * sizeof(float));
+#else
+	return malloc(len * 2 * sizeof(float));
+#endif
+}
diff --git a/Transceiver52M/convolve.h b/Transceiver52M/convolve.h
new file mode 100644
index 00000000..aef9953b
--- /dev/null
+++ b/Transceiver52M/convolve.h
@@ -0,0 +1,30 @@
+#ifndef _CONVOLVE_H_
+#define _CONVOLVE_H_
+
+void *convolve_h_alloc(int num);
+
+int convolve_real(float *x, int x_len,
+		  float *h, int h_len,
+		  float *y, int y_len,
+		  int start, int len,
+		  int step, int offset);
+
+int convolve_complex(float *x, int x_len,
+		     float *h, int h_len,
+		     float *y, int y_len,
+		     int start, int len,
+		     int step, int offset);
+
+int base_convolve_real(float *x, int x_len,
+		       float *h, int h_len,
+		       float *y, int y_len,
+		       int start, int len,
+		       int step, int offset);
+
+int base_convolve_complex(float *x, int x_len,
+			  float *h, int h_len,
+			  float *y, int y_len,
+			  int start, int len,
+			  int step, int offset);
+
+#endif /* _CONVOLVE_H_ */
diff --git a/Transceiver52M/sigProcLib.cpp b/Transceiver52M/sigProcLib.cpp
index 2ccc714f..8237aa5d 100644
--- a/Transceiver52M/sigProcLib.cpp
+++ b/Transceiver52M/sigProcLib.cpp
@@ -29,6 +29,10 @@
 
 using namespace GSM;
 
+extern "C" {
+#include "convolve.h"
+}
+
 #define TABLESIZE 1024
 
 /** Lookup tables for trigonometric approximation */
@@ -45,28 +49,35 @@ signalVector *GMSKRotation = NULL;
 signalVector *GMSKReverseRotation = NULL;
 
 /*
- * RACH and midamble correlation waveforms
+ * RACH and midamble correlation waveforms. Store the buffer separately
+ * because we need to allocate it explicitly outside of the signal vector
+ * constructor. This is because C++ (prior to C++11) is unable to natively
+ * perform 16-byte memory alignment required by many SSE instructions.
  */
 struct CorrelationSequence {
-  CorrelationSequence() : sequence(NULL)
+  CorrelationSequence() : sequence(NULL), buffer(NULL)
   {
   }
 
   ~CorrelationSequence()
   {
     delete sequence;
+    free(buffer);
   }
 
   signalVector *sequence;
+  void         *buffer;
   float        TOA;
   complex      gain;
 };
 
 /*
- * Gaussian and empty modulation pulses
+ * Gaussian and empty modulation pulses. Like the correlation sequences,
+ * store the runtime (Gaussian) buffer separately because of needed alignment
+ * for SSE instructions.
  */
 struct PulseSequence {
-  PulseSequence() : gaussian(NULL), empty(NULL)
+  PulseSequence() : gaussian(NULL), empty(NULL), buffer(NULL)
   {
   }
 
@@ -74,10 +85,12 @@ struct PulseSequence {
   {
     delete gaussian;
     delete empty;
+    free(buffer);
   }
 
   signalVector *gaussian;
   signalVector *empty;
+  void         *buffer;
 };
 
 CorrelationSequence *gMidambles[] = {NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL};
@@ -246,7 +259,7 @@ void initGMSKRotationTables(int sps)
 
 bool sigProcLibSetup(int sps)
 {
-  if ((sps != 0) && (sps != 2) && (sps != 4))
+  if ((sps != 1) && (sps != 2) && (sps != 4))
     return false;
 
   initTrigTables();
@@ -295,174 +308,106 @@ void GMSKReverseRotate(signalVector &x) {
   }
 }
 
-
-signalVector* convolve(const signalVector *a,
-		       const signalVector *b,
-		       signalVector *c,
-		       ConvType spanType,
-		       unsigned startIx,
-		       unsigned len)
+signalVector *convolve(const signalVector *x,
+                        const signalVector *h,
+                        signalVector *y,
+                        ConvType spanType, int start,
+                        unsigned len, unsigned step, int offset)
 {
-  if ((a==NULL) || (b==NULL)) return NULL; 
-  int La = a->size();
-  int Lb = b->size();
+  int rc, head = 0, tail = 0;
+  bool alloc = false, append = false;
+  const signalVector *_x = NULL;
 
-  int startIndex;
-  unsigned int outSize;
-  switch (spanType) {
-    case FULL_SPAN:
-      startIndex = 0;
-      outSize = La+Lb-1;
-      break;
-    case OVERLAP_ONLY:
-      startIndex = La;
-      outSize = abs(La-Lb)+1;
-      break;
-    case START_ONLY:
-      startIndex = 0;
-      outSize = La;
-      break;
-    case WITH_TAIL:
-      startIndex = Lb;
-      outSize = La;
-      break;
-    case NO_DELAY:
-      if (Lb % 2) 
-	startIndex = Lb/2;
-      else
-	startIndex = Lb/2-1;
-      outSize = La;
-      break;
-    case CUSTOM:
-      startIndex = startIx;
-      outSize = len;
-      break;
-    default:
-      return NULL;
-  }
-
-  
-  if (c==NULL)
-    c = new signalVector(outSize);
-  else if (c->size()!=outSize)
+  if (!x || !h)
     return NULL;
 
-  signalVector::const_iterator aStart = a->begin();
-  signalVector::const_iterator bStart = b->begin();
-  signalVector::const_iterator aEnd = a->end();
-  signalVector::const_iterator bEnd = b->end();
-  signalVector::iterator cPtr = c->begin();
-  int t = startIndex;
-  int stopIndex = startIndex + outSize;
-  switch (b->getSymmetry()) {
-  case NONE:
-    {
-      while (t < stopIndex) {
-	signalVector::const_iterator aP = aStart+t;
-	signalVector::const_iterator bP = bStart;
-	if (a->isRealOnly() && b->isRealOnly()) {
-	  float sum = 0.0;
-	  while (bP < bEnd) {
-	    if (aP < aStart) break;
-	    if (aP < aEnd) sum += (aP->real())*(bP->real());
-	    aP--;
-	    bP++;
-	  }
-	  *cPtr++ = sum;
-	}
-	else if (a->isRealOnly()) {
-	  complex sum = 0.0;
-	  while (bP < bEnd) {
-	    if (aP < aStart) break;
-	    if (aP < aEnd) sum += (*bP)*(aP->real());
-	    aP--;
-	    bP++;
-	  }
-	  *cPtr++ = sum;
-	}
-	else if (b->isRealOnly()) {
-	  complex sum = 0.0;
-	  while (bP < bEnd) {
-	    if (aP < aStart) break;
-	    if (aP < aEnd) sum += (*aP)*(bP->real());
-	    aP--;
-	    bP++;
-	  }
-	  *cPtr++ = sum;
-	}
-	else {
-	  complex sum = 0.0;
-	  while (bP < bEnd) {
-	    if (aP < aStart) break;
-	    if (aP < aEnd) sum += (*aP)*(*bP);
-	    aP--;
-	    bP++;
-	  }
-	  *cPtr++ = sum;
-	}
-	t++;
-      }
-    }
+  switch (spanType) {
+  case START_ONLY:
+    start = 0;
+    head = h->size();
+    len = x->size();
+    append = true;
     break;
-  case ABSSYM:
-    {
-      complex sum = 0.0;
-      bool isOdd = (bool) (Lb % 2);
-      if (isOdd) 
-	bEnd = bStart + (Lb+1)/2;
-      else 
-	bEnd = bStart + Lb/2;
-      while (t < stopIndex) {
-	signalVector::const_iterator aP = aStart+t;
-	signalVector::const_iterator aPsym = aP-Lb+1;
-	signalVector::const_iterator bP = bStart;
-	sum = 0.0;
-        if (!b->isRealOnly()) {
-	  while (bP < bEnd) {
-	    if (aP < aStart) break;
-	    if (aP == aPsym)
-	      sum+= (*aP)*(*bP);
-	    else if ((aP < aEnd) && (aPsym >= aStart)) 
-	      sum+= ((*aP)+(*aPsym))*(*bP);
-	    else if (aP < aEnd)
-	      sum += (*aP)*(*bP);
-	    else if (aPsym >= aStart)
-	      sum += (*aPsym)*(*bP);
-	    aP--;
-	    aPsym++;
-	    bP++;
-	  }
-        }
-        else {
-          while (bP < bEnd) {
-            if (aP < aStart) break;
-            if (aP == aPsym)
-              sum+= (*aP)*(bP->real());
-            else if ((aP < aEnd) && (aPsym >= aStart))
-              sum+= ((*aP)+(*aPsym))*(bP->real());
-            else if (aP < aEnd)
-              sum += (*aP)*(bP->real());
-            else if (aPsym >= aStart)
-              sum += (*aPsym)*(bP->real());
-            aP--;
-            aPsym++;
-            bP++;
-          }
-        }
-	*cPtr++ = sum;
-	t++;
-      }
+  case NO_DELAY:
+    start = h->size() / 2;
+    head = start;
+    tail = start;
+    len = x->size();
+    append = true;
+    break;
+  case CUSTOM:
+    if (start < h->size() - 1) {
+      head = h->size() - start;
+      append = true;
+    }
+    if (start + len > x->size()) {
+      tail = start + len - x->size();
+      append = true;
     }
     break;
   default:
     return NULL;
-    break;
   }
-    
-    
-  return c;
-}
 
+  /*
+   * Error if the output vector is too small. Create the output vector
+   * if the pointer is NULL.
+   */
+  if (y && (len > y->size()))
+    return NULL;
+  if (!y) {
+    y = new signalVector(len);
+    alloc = true;
+  }
+
+  /* Prepend or post-pend the input vector if the parameters require it */
+  if (append)
+    _x = new signalVector(*x, head, tail);
+  else
+    _x = x;
+
+  /*
+   * Four convovle types:
+   *   1. Complex-Real (aligned)
+   *   2. Complex-Complex (aligned)
+   *   3. Complex-Real (!aligned)
+   *   4. Complex-Complex (!aligned)
+   */
+  if (h->isRealOnly() && h->isAligned()) {
+    rc = convolve_real((float *) _x->begin(), _x->size(),
+                       (float *) h->begin(), h->size(),
+                       (float *) y->begin(), y->size(),
+                       start, len, step, offset);
+  } else if (!h->isRealOnly() && h->isAligned()) {
+    rc = convolve_complex((float *) _x->begin(), _x->size(),
+                          (float *) h->begin(), h->size(),
+                          (float *) y->begin(), y->size(),
+                          start, len, step, offset);
+  } else if (h->isRealOnly() && !h->isAligned()) {
+    rc = base_convolve_real((float *) _x->begin(), _x->size(),
+                            (float *) h->begin(), h->size(),
+                            (float *) y->begin(), y->size(),
+                            start, len, step, offset);
+  } else if (!h->isRealOnly() && !h->isAligned()) {
+    rc = base_convolve_complex((float *) _x->begin(), _x->size(),
+                               (float *) h->begin(), h->size(),
+                               (float *) y->begin(), y->size(),
+                               start, len, step, offset);
+  } else {
+    rc = -1;
+  }
+
+  if (append)
+    delete _x;
+
+  if (rc < 0) {
+    if (alloc)
+      delete y;
+    return NULL;
+  }
+
+  return y;
+}
 
 void generateGSMPulse(int sps, int symbolLength)
 {
@@ -477,9 +422,17 @@ void generateGSMPulse(int sps, int symbolLength)
   GSMPulse->empty->isRealOnly(true);
   *(GSMPulse->empty->begin()) = 1.0f;
 
+  len = sps * symbolLength;
+  if (len < 4)
+    len = 4;
+
   /* GSM pulse approximation */
-  GSMPulse->gaussian = new signalVector(len);
+  GSMPulse->buffer = convolve_h_alloc(len);
+  GSMPulse->gaussian = new signalVector((complex *)
+                                        GSMPulse->buffer, 0, len);
+  GSMPulse->gaussian->setAligned(true);
   GSMPulse->gaussian->isRealOnly(true);
+
   signalVector::iterator xP = GSMPulse->gaussian->begin();
 
   center = (float) (len - 1.0) / 2.0;
@@ -560,31 +513,6 @@ signalVector* reverseConjugate(signalVector *b)
     return tmp;
 }
 
-signalVector* correlate(signalVector *a,
-			signalVector *b,
-			signalVector *c,
-			ConvType spanType,
-			bool bReversedConjugated,
-		        unsigned startIx,
-			unsigned len)
-{
-  signalVector *tmp = NULL;
-
-  if (!bReversedConjugated) {
-    tmp = reverseConjugate(b);
-  }
-  else {
-    tmp = b;
-  }
-
-  c = convolve(a,tmp,c,spanType,startIx,len);
-
-  if (!bReversedConjugated) delete tmp;
-
-  return c;
-}
-
-
 /* soft output slicer */
 bool vectorSlicer(signalVector *x) 
 {
@@ -599,12 +527,13 @@ bool vectorSlicer(signalVector *x)
   }
   return true;
 }
-  
+
+/* Assume input bits are not differentially encoded */
 signalVector *modulateBurst(const BitVector &wBurst, int guardPeriodLength,
 			    int sps, bool emptyPulse)
 {
   int burstLen;
-  signalVector *pulse, modBurst;
+  signalVector *pulse, *shapedBurst, modBurst;
   signalVector::iterator modBurstItr;
 
   if (emptyPulse)
@@ -628,7 +557,9 @@ signalVector *modulateBurst(const BitVector &wBurst, int guardPeriodLength,
   modBurst.isRealOnly(false);
 
   // filter w/ pulse shape
-  signalVector *shapedBurst = convolve(&modBurst, pulse, NULL, NO_DELAY);
+  shapedBurst = convolve(&modBurst, pulse, NULL, START_ONLY);
+  if (!shapedBurst)
+    return NULL;
 
   return shapedBurst;
 }
@@ -639,24 +570,24 @@ float sinc(float x)
   return 1.0F;
 }
 
-void delayVector(signalVector &wBurst,
-		 float delay)
+bool delayVector(signalVector &wBurst, float delay)
 {
   
   int   intOffset = (int) floor(delay);
   float fracOffset = delay - intOffset;
-  
+
   // do fractional shift first, only do it for reasonable offsets
   if (fabs(fracOffset) > 1e-2) {
     // create sinc function
     signalVector sincVector(21); 
     sincVector.isRealOnly(true);
-    signalVector::iterator sincBurstItr = sincVector.begin();
+    signalVector::iterator sincBurstItr = sincVector.end();
     for (int i = 0; i < 21; i++) 
-      *sincBurstItr++ = (complex) sinc(M_PI_F*(i-10-fracOffset));
+      *--sincBurstItr = (complex) sinc(M_PI_F*(i-10-fracOffset));
   
     signalVector shiftedBurst(wBurst.size());
-    convolve(&wBurst,&sincVector,&shiftedBurst,NO_DELAY);
+    if (!convolve(&wBurst, &sincVector, &shiftedBurst, NO_DELAY))
+      return false;
     wBurst.clone(shiftedBurst);
   }
 
@@ -861,25 +792,25 @@ bool generateMidamble(int sps, int tsc)
   bool status = true;
   complex *data = NULL;
   signalVector *autocorr = NULL, *midamble = NULL;
-  signalVector *midMidamble = NULL;
+  signalVector *midMidamble = NULL, *_midMidamble = NULL;
 
-  if ((tsc < 0) || (tsc > 7)) 
+  if ((tsc < 0) || (tsc > 7))
     return false;
 
   delete gMidambles[tsc];
- 
+
   /* Use middle 16 bits of each TSC. Correlation sequence is not pulse shaped */
   midMidamble = modulateBurst(gTrainingSequence[tsc].segment(5,16), 0, sps, true);
   if (!midMidamble)
     return false;
 
-  /* Simulated receive sequence is pulse shaped */ 
+  /* Simulated receive sequence is pulse shaped */
   midamble = modulateBurst(gTrainingSequence[tsc], 0, sps, false);
   if (!midamble) {
     status = false;
     goto release;
   }
- 
+
   // NOTE: Because ideal TSC 16-bit midamble is 66 symbols into burst,
   //       the ideal TSC has an + 180 degree phase shift,
   //       due to the pi/2 frequency shift, that 
@@ -890,22 +821,32 @@ bool generateMidamble(int sps, int tsc)
 
   conjugateVector(*midMidamble);
 
-  autocorr = correlate(midamble, midMidamble, NULL, NO_DELAY);
+  /* For SSE alignment, reallocate the midamble sequence on 16-byte boundary */
+  data = (complex *) convolve_h_alloc(midMidamble->size());
+  _midMidamble = new signalVector(data, 0, midMidamble->size());
+  _midMidamble->setAligned(true);
+  memcpy(_midMidamble->begin(), midMidamble->begin(),
+	 midMidamble->size() * sizeof(complex));
+
+  autocorr = convolve(midamble, _midMidamble, NULL, NO_DELAY);
   if (!autocorr) {
     status = false;
     goto release;
   }
 
   gMidambles[tsc] = new CorrelationSequence;
-  gMidambles[tsc]->sequence = midMidamble;
-  gMidambles[tsc]->gain = peakDetect(*autocorr,&gMidambles[tsc]->TOA,NULL);
+  gMidambles[tsc]->buffer = data;
+  gMidambles[tsc]->sequence = _midMidamble;
+  gMidambles[tsc]->gain = peakDetect(*autocorr,&gMidambles[tsc]->TOA, NULL);
 
 release:
   delete autocorr;
   delete midamble;
+  delete midMidamble;
 
   if (!status) {
-    delete midMidamble;
+    delete _midMidamble;
+    free(data);
     gMidambles[tsc] = NULL;
   }
 
@@ -917,7 +858,7 @@ bool generateRACHSequence(int sps)
   bool status = true;
   complex *data = NULL;
   signalVector *autocorr = NULL;
-  signalVector *seq0 = NULL, *seq1 = NULL;
+  signalVector *seq0 = NULL, *seq1 = NULL, *_seq1 = NULL;
 
   delete gRACHSequence;
 
@@ -933,74 +874,100 @@ bool generateRACHSequence(int sps)
 
   conjugateVector(*seq1);
 
-  autocorr = new signalVector(seq0->size());
-  if (!convolve(seq0, seq1, autocorr, NO_DELAY)) {
+  /* For SSE alignment, reallocate the midamble sequence on 16-byte boundary */
+  data = (complex *) convolve_h_alloc(seq1->size());
+  _seq1 = new signalVector(data, 0, seq1->size());
+  _seq1->setAligned(true);
+  memcpy(_seq1->begin(), seq1->begin(), seq1->size() * sizeof(complex));
+
+  autocorr = convolve(seq0, _seq1, autocorr, NO_DELAY);
+  if (!autocorr) {
     status = false;
     goto release;
   }
 
   gRACHSequence = new CorrelationSequence;
-  gRACHSequence->sequence = seq1;
-  gRACHSequence->gain = peakDetect(*autocorr,&gRACHSequence->TOA,NULL);
+  gRACHSequence->sequence = _seq1;
+  gRACHSequence->buffer = data;
+  gRACHSequence->gain = peakDetect(*autocorr,&gRACHSequence->TOA, NULL);
 
 release:
   delete autocorr;
   delete seq0;
+  delete seq1;
 
   if (!status) {
-    delete seq1;
+    delete _seq1;
+    free(data);
     gRACHSequence = NULL;
   }
 
   return status;
 }
-				
-bool detectRACHBurst(signalVector &rxBurst,
-		     float detectThreshold,
-		     int sps,
-		     complex *amplitude,
-		     float* TOA)
+
+int detectRACHBurst(signalVector &rxBurst,
+		    float thresh,
+		    int sps,
+		    complex *amp,
+		    float *toa)
 {
+  int start, len, num = 0;
+  float _toa, rms, par, avg = 0.0f;
+  complex _amp, *peak;
+  signalVector corr, *sync = gRACHSequence->sequence;
 
-  //static complex staticData[500];
- 
-  //signalVector correlatedRACH(staticData,0,rxBurst.size());
-  signalVector correlatedRACH(rxBurst.size());
-  correlate(&rxBurst,gRACHSequence->sequence,&correlatedRACH,NO_DELAY,true);
+  if ((sps != 1) && (sps != 2) && (sps != 4))
+    return -1;
 
-  float meanPower;
-  complex peakAmpl = peakDetect(correlatedRACH,TOA,&meanPower);
+  start = 40 * sps;
+  len = 24 * sps;
+  corr = signalVector(len);
 
-  float valleyPower = 0.0; 
-
-  // check for bogus results
-  if ((*TOA < 0.0) || (*TOA > correlatedRACH.size())) {
-        *amplitude = 0.0;
-	return false;
-  }
-  complex *peakPtr = correlatedRACH.begin() + (int) rint(*TOA);
-
-  float numSamples = 0.0;
-  for (int i = 57 * sps; i <= 107 * sps; i++) {
-    if (peakPtr+i >= correlatedRACH.end())
-      break;
-    valleyPower += (peakPtr+i)->norm2();
-    numSamples++;
+  if (!convolve(&rxBurst, sync, &corr,
+                CUSTOM, start, len, sps, 0)) {
+    return -1;
   }
 
-  if (numSamples < 2) {
-        *amplitude = 0.0;
-        return false;
+  _amp = peakDetect(corr, &_toa, NULL);
+  if ((_toa < 3) || (_toa > len - 3))
+    goto notfound;
+
+  peak = corr.begin() + (int) rint(_toa);
+
+  for (int i = 2 * sps; i <= 5 * sps; i++) {
+    if (peak - i >= corr.begin()) {
+      avg += (peak - i)->norm2();
+      num++;
+    }
+    if (peak + i < corr.end()) {
+      avg += (peak + i)->norm2();
+      num++;
+    }
   }
 
-  float RMS = sqrtf(valleyPower/(float) numSamples)+0.00001;
-  float peakToMean = peakAmpl.abs()/RMS;
+  if (num < 2)
+    goto notfound;
 
-  *amplitude = peakAmpl/(gRACHSequence->gain);
+  rms = sqrtf(avg / (float) num) + 0.00001;
+  par = _amp.abs() / rms;
+  if (par < thresh)
+    goto notfound;
 
-  *TOA = (*TOA) - gRACHSequence->TOA - 8 * sps;
+  /* Subtract forward tail bits from delay */
+  if (toa)
+    *toa = _toa - 8 * sps;
+  if (amp)
+    *amp = _amp / gRACHSequence->gain;
 
-  return (peakToMean > detectThreshold);
+  return 1;
+
+notfound:
+  if (amp)
+    *amp = 0.0f;
+  if (toa)
+    *toa = 0.0f;
+
+  return 0;
 }
 
 bool energyDetect(signalVector &rxBurst,
@@ -1020,120 +987,95 @@ bool energyDetect(signalVector &rxBurst,
   if (avgPwr) *avgPwr = energy/windowLength;
   return (energy/windowLength > detectThreshold*detectThreshold);
 }
-  
 
-bool analyzeTrafficBurst(signalVector &rxBurst,
-			 unsigned TSC,
-			 float detectThreshold,
-			 int sps,
-			 complex *amplitude,
-			 float *TOA,
-			 unsigned maxTOA,
-                         bool requestChannel,
-                         signalVector **channelResponse,
-			 float *channelResponseOffset) 
+int analyzeTrafficBurst(signalVector &rxBurst, unsigned tsc, float thresh,
+                        int sps, complex *amp, float *toa, unsigned max_toa,
+                        bool chan_req, signalVector **chan, float *chan_offset)
 {
+  int start, target, len, num = 0;
+  complex _amp, *peak;
+  float _toa, rms, par, avg = 0.0f;
+  signalVector corr, *sync, *_chan;
 
-  assert(TSC<8);
-  assert(amplitude);
-  assert(TOA);
-  assert(gMidambles[TSC]);
+  if ((tsc < 0) || (tsc > 7) || ((sps != 1) && (sps != 2) && (sps != 4)))
+    return -1;
 
-  if (maxTOA < 3*sps) maxTOA = 3*sps;
-  unsigned spanTOA = maxTOA;
-  if (spanTOA < 5*sps) spanTOA = 5*sps;
+  target = 3 + 58 + 5 + 16;
+  start = (target - 8) * sps;
+  len = (8 + 8 + max_toa) * sps;
 
-  unsigned startIx = 66*sps-spanTOA;
-  unsigned endIx = (66+16)*sps+spanTOA;
-  unsigned windowLen = endIx - startIx;
-  unsigned corrLen = 2*maxTOA+1;
+  sync = gMidambles[tsc]->sequence;
+  sync = gMidambles[tsc]->sequence;
+  corr = signalVector(len);
 
-  unsigned expectedTOAPeak = (unsigned) round(gMidambles[TSC]->TOA + (gMidambles[TSC]->sequence->size()-1)/2);
-
-  signalVector burstSegment(rxBurst.begin(),startIx,windowLen);
-
-  //static complex staticData[200];
-  //signalVector correlatedBurst(staticData,0,corrLen);
-  signalVector correlatedBurst(corrLen);
-  correlate(&burstSegment, gMidambles[TSC]->sequence,
-					    &correlatedBurst, CUSTOM,true,
-					    expectedTOAPeak-maxTOA,corrLen);
-
-  float meanPower;
-  *amplitude = peakDetect(correlatedBurst,TOA,&meanPower);
-  float valleyPower = 0.0; //amplitude->norm2();
-  complex *peakPtr = correlatedBurst.begin() + (int) rint(*TOA);
-
-  // check for bogus results
-  if ((*TOA < 0.0) || (*TOA > correlatedBurst.size())) {
-        *amplitude = 0.0;
-        return false;
+  if (!convolve(&rxBurst, sync, &corr,
+                CUSTOM, start, len, sps, 0)) {
+    return -1;
   }
 
-  int numRms = 0;
-  for (int i = 2*sps; i <= 5*sps;i++) {
-    if (peakPtr - i >= correlatedBurst.begin()) { 
-      valleyPower += (peakPtr-i)->norm2();
-      numRms++;
+  _amp = peakDetect(corr, &_toa, NULL);
+  peak = corr.begin() + (int) rint(_toa);
+
+  /* Check for bogus results */
+  if ((_toa < 0.0) || (_toa > corr.size()))
+    goto notfound;
+
+  for (int i = 2 * sps; i <= 5 * sps; i++) {
+    if (peak - i >= corr.begin()) {
+      avg += (peak - i)->norm2();
+      num++;
     }
-    if (peakPtr + i < correlatedBurst.end()) {
-      valleyPower += (peakPtr+i)->norm2();
-      numRms++;
+    if (peak + i < corr.end()) {
+      avg += (peak + i)->norm2();
+      num++;
     }
   }
 
-  if (numRms < 2) {
-        // check for bogus results
-        *amplitude = 0.0;
-        return false;
+  if (num < 2)
+    goto notfound;
+
+  rms = sqrtf(avg / (float) num) + 0.00001;
+  par = (_amp.abs()) / rms;
+  if (par < thresh)
+    goto notfound;
+
+  /*
+   *  NOTE: Because ideal TSC is 66 symbols into burst,
+   *      the ideal TSC has an +/- 180 degree phase shift,
+   *      due to the pi/4 frequency shift, that 
+   *      needs to be accounted for.
+   */
+  if (amp)
+    *amp = _amp / gMidambles[tsc]->gain;
+
+  /* Delay one half of peak-centred correlation length */
+  _toa -= sps * 8;
+
+  if (toa)
+    *toa = _toa;
+
+  if (chan_req) {
+    _chan = new signalVector(6 * sps);
+
+    delayVector(corr, -_toa);
+    corr.segmentCopyTo(*_chan, target - 3, _chan->size());
+    scaleVector(*_chan, complex(1.0, 0.0) / gMidambles[tsc]->gain);
+
+    *chan = _chan;
+
+    if (chan_offset)
+      *chan_offset = 3.0 * sps;;
   }
 
-  float RMS = sqrtf(valleyPower/(float)numRms)+0.00001;
-  float peakToMean = (amplitude->abs())/RMS;
+  return 1;
 
-  // NOTE: Because ideal TSC is 66 symbols into burst,
-  //       the ideal TSC has an +/- 180 degree phase shift,
-  //       due to the pi/4 frequency shift, that 
-  //       needs to be accounted for.
-  
-  *amplitude = (*amplitude)/gMidambles[TSC]->gain;
-  *TOA = (*TOA) - (maxTOA); 
+notfound:
+  if (amp)
+    *amp = 0.0f;
+  if (toa)
+    *toa = 0.0f;
 
-  if (requestChannel && (peakToMean > detectThreshold)) {
-    float TOAoffset = maxTOA;
-    delayVector(correlatedBurst,-(*TOA));
-    // midamble only allows estimation of a 6-tap channel
-    signalVector chanVector(6 * sps);
-    float maxEnergy = -1.0;
-    int maxI = -1;
-    for (int i = 0; i < 7; i++) {
-      if (TOAoffset + (i-5) * sps + chanVector.size() > correlatedBurst.size())
-        continue;
-      if (TOAoffset + (i-5) * sps < 0)
-        continue;
-      correlatedBurst.segmentCopyTo(chanVector,
-                                    (int) floor(TOAoffset + (i - 5) * sps),
-                                    chanVector.size());
-      float energy = vectorNorm2(chanVector);
-      if (energy > 0.95*maxEnergy) {
-	maxI = i;
-	maxEnergy = energy;
-      }
-    }
-	
-    *channelResponse = new signalVector(chanVector.size());
-    correlatedBurst.segmentCopyTo(**channelResponse,
-                                  (int) floor(TOAoffset + (maxI - 5) * sps),
-                                  (*channelResponse)->size());
-    scaleVector(**channelResponse, complex(1.0, 0.0) / gMidambles[TSC]->gain);
-    
-    if (channelResponseOffset) 
-      *channelResponseOffset = 5 * sps - maxI;
-      
-  }
-
-  return (peakToMean > detectThreshold);
-		  
+  return 0;
 }
 
 signalVector *decimateVector(signalVector &wVector,
@@ -1452,7 +1394,7 @@ bool designDFE(signalVector &channelResponse,
   }
 
   *feedForwardFilter = new signalVector(Nf);
-  signalVector::iterator w = (*feedForwardFilter)->begin();
+  signalVector::iterator w = (*feedForwardFilter)->end();
   for (int i = 0; i < Nf; i++) {
     delete L[i];
     complex w_i = 0.0;
@@ -1463,8 +1405,7 @@ bool designDFE(signalVector &channelResponse,
       w_i += (*vPtr)*(chanPtr->conj());
       vPtr++; chanPtr++;
     }
-    *w = w_i/d;
-    w++;
+    *--w = w_i/d;
   }
 
 
@@ -1479,10 +1420,15 @@ SoftVector *equalizeBurst(signalVector &rxBurst,
 		       signalVector &w, // feedforward filter
 		       signalVector &b) // feedback filter
 {
+  signalVector *postForwardFull;
 
-  delayVector(rxBurst,-TOA);
+  if (!delayVector(rxBurst, -TOA))
+    return NULL;
 
-  signalVector* postForwardFull = convolve(&rxBurst,&w,NULL,FULL_SPAN);
+  postForwardFull = convolve(&rxBurst, &w, NULL,
+                             CUSTOM, 0, rxBurst.size() + w.size() - 1);
+  if (!postForwardFull)
+    return NULL;
 
   signalVector* postForward = new signalVector(rxBurst.size());
   postForwardFull->segmentCopyTo(*postForward,w.size()-1,rxBurst.size());
diff --git a/Transceiver52M/sigProcLib.h b/Transceiver52M/sigProcLib.h
index ee152d54..194002fe 100644
--- a/Transceiver52M/sigProcLib.h
+++ b/Transceiver52M/sigProcLib.h
@@ -27,13 +27,10 @@ enum Symmetry {
 
 /** Convolution type indicator */
 enum ConvType {
-  FULL_SPAN = 0,
-  OVERLAP_ONLY = 1,
-  START_ONLY = 2,
-  WITH_TAIL = 3,
-  NO_DELAY = 4,
-  CUSTOM = 5,
-  UNDEFINED = 255
+  START_ONLY,
+  NO_DELAY,
+  CUSTOM,
+  UNDEFINED,
 };
 
 /** the core data structure of the Transceiver */
@@ -44,13 +41,14 @@ class signalVector: public Vector<complex>
   
   Symmetry symmetry;   ///< the symmetry of the vector
   bool realOnly;       ///< true if vector is real-valued, not complex-valued
-  
+  bool aligned;
+ 
  public:
   
   /** Constructors */
   signalVector(int dSize=0, Symmetry wSymmetry = NONE):
     Vector<complex>(dSize),
-    realOnly(false)
+    realOnly(false), aligned(false)
     { 
       symmetry = wSymmetry; 
     };
@@ -58,26 +56,45 @@ class signalVector: public Vector<complex>
   signalVector(complex* wData, size_t start, 
 	       size_t span, Symmetry wSymmetry = NONE):
     Vector<complex>(NULL,wData+start,wData+start+span),
-    realOnly(false)
+    realOnly(false), aligned(false)
     { 
       symmetry = wSymmetry; 
     };
       
   signalVector(const signalVector &vec1, const signalVector &vec2):
     Vector<complex>(vec1,vec2),
-    realOnly(false)
+    realOnly(false), aligned(false)
     { 
       symmetry = vec1.symmetry; 
     };
 	
   signalVector(const signalVector &wVector):
     Vector<complex>(wVector.size()),
-    realOnly(false)
+    realOnly(false), aligned(false)
     {
       wVector.copyTo(*this); 
       symmetry = wVector.getSymmetry();
     };
 
+  signalVector(size_t size, size_t start):
+    Vector<complex>(size + start),
+    realOnly(false), aligned(false)
+    {
+      mStart = mData + start;
+      symmetry = NONE;
+    };
+
+  signalVector(const signalVector &wVector, size_t start, size_t tail = 0):
+    Vector<complex>(start + wVector.size() + tail),
+    realOnly(false), aligned(false)
+    {
+      mStart = mData + start;
+      wVector.copyTo(*this);
+      memset(mData, 0, start * sizeof(complex));
+      memset(mStart + wVector.size(), 0, tail * sizeof(complex));
+      symmetry = NONE;
+    };
+
   /** symmetry operators */
   Symmetry getSymmetry() const { return symmetry;};
   void setSymmetry(Symmetry wSymmetry) { symmetry = wSymmetry;}; 
@@ -85,6 +102,10 @@ class signalVector: public Vector<complex>
   /** real-valued operators */
   bool isRealOnly() const { return realOnly;};
   void isRealOnly(bool wOnly) { realOnly = wOnly;};
+
+  /** alignment markers */
+  bool isAligned() const { return aligned; };
+  void setAligned(bool aligned) { this->aligned = aligned; };
 };
 
 /** Convert a linear number to a dB value */
@@ -110,14 +131,15 @@ void sigProcLibDestroy(void);
 	@param a,b The vectors to be convolved.
 	@param c, A preallocated vector to hold the convolution result.
 	@param spanType The type/span of the convolution.
-	@return The convolution result.
+	@return The convolution result or NULL on error.
 */
-signalVector* convolve(const signalVector *a,
-		       const signalVector *b,
-		       signalVector *c,
-		       ConvType spanType,
-		       unsigned startIx = 0,
-		       unsigned len = 0);
+signalVector *convolve(const signalVector *a,
+                       const signalVector *b,
+                       signalVector *c,
+                       ConvType spanType,
+                       int start = 0,
+                       unsigned len = 0,
+                       unsigned step = 1, int offset = 0);
 
 /** 
 	Generate the GSM pulse. 
@@ -169,8 +191,7 @@ signalVector *modulateBurst(const BitVector &wBurst,
 float sinc(float x);
 
 /** Delay a vector */
-void delayVector(signalVector &wBurst,
-		 float delay);
+bool delayVector(signalVector &wBurst, float delay);
 
 /** Add two vectors in-place */
 bool addVector(signalVector &x,
@@ -257,13 +278,13 @@ bool energyDetect(signalVector &rxBurst,
         @param sps The number of samples per GSM symbol.
         @param amplitude The estimated amplitude of received RACH burst.
         @param TOA The estimate time-of-arrival of received RACH burst.
-        @return True if burst SNR is larger that the detectThreshold value.
+        @return positive if threshold value is reached, negative on error, zero otherwise
 */
-bool detectRACHBurst(signalVector &rxBurst,
-		     float detectThreshold,
-		     int sps,
-		     complex *amplitude,
-		     float* TOA);
+int detectRACHBurst(signalVector &rxBurst,
+                    float detectThreshold,
+                    int sps,
+                    complex *amplitude,
+                    float* TOA);
 
 /**
         Normal burst correlator, detector, channel estimator.
@@ -277,18 +298,18 @@ bool detectRACHBurst(signalVector &rxBurst,
         @param requestChannel Set to true if channel estimation is desired.
         @param channelResponse The estimated channel.
         @param channelResponseOffset The time offset b/w the first sample of the channel response and the reported TOA.
-        @return True if burst SNR is larger that the detectThreshold value.
+        @return positive if threshold value is reached, negative on error, zero otherwise
 */
-bool analyzeTrafficBurst(signalVector &rxBurst,
-			 unsigned TSC,
-			 float detectThreshold,
-			 int sps,
-			 complex *amplitude,
-			 float *TOA,
-                         unsigned maxTOA,
-                         bool requestChannel = false,
-			 signalVector** channelResponse = NULL,
-			 float *channelResponseOffset = NULL);
+int analyzeTrafficBurst(signalVector &rxBurst,
+			unsigned TSC,
+			float detectThreshold,
+			int sps,
+			complex *amplitude,
+			float *TOA,
+                        unsigned maxTOA,
+                        bool requestChannel = false,
+			signalVector** channelResponse = NULL,
+			float *channelResponseOffset = NULL);
 
 /**
 	Decimate a vector.
diff --git a/Transceiver52M/sigProcLibTest.cpp b/Transceiver52M/sigProcLibTest.cpp
index 4f927176..32661f41 100644
--- a/Transceiver52M/sigProcLibTest.cpp
+++ b/Transceiver52M/sigProcLibTest.cpp
@@ -50,9 +50,6 @@ int main(int argc, char **argv) {
 
   sigProcLibSetup(samplesPerSymbol);
   
-  signalVector *gsmPulse = generateGSMPulse(2,samplesPerSymbol);
-  cout << *gsmPulse << endl;
-
   BitVector RACHBurstStart = "01010101";
   BitVector RACHBurstRest = "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000";
 
@@ -60,12 +57,9 @@ int main(int argc, char **argv) {
  
 
   signalVector *RACHSeq = modulateBurst(RACHBurst,
-                                        *gsmPulse,
                                         9,
                                         samplesPerSymbol);
 
-  generateRACHSequence(*gsmPulse,samplesPerSymbol);
-
   complex a; float t;
   detectRACHBurst(*RACHSeq, 5, samplesPerSymbol,&a,&t); 
 
@@ -94,12 +88,9 @@ int main(int argc, char **argv) {
 
   BitVector normalBurst(BitVector(normalBurstSeg,gTrainingSequence[TSC]),normalBurstSeg);
 
+  generateMidamble(samplesPerSymbol,TSC);
 
-  generateMidamble(*gsmPulse,samplesPerSymbol,TSC);
-
-
-  signalVector *modBurst = modulateBurst(normalBurst,*gsmPulse,
-                                         0,samplesPerSymbol);
+  signalVector *modBurst = modulateBurst(normalBurst,0,samplesPerSymbol);
 
   
   //delayVector(*rsVector2,6.932);
@@ -133,7 +124,7 @@ int main(int argc, char **argv) {
   cout << "ampl:" << ampl << endl;
   cout << "TOA: " << TOA << endl;
   //cout << "chanResp: " << *chanResp << endl;
-  SoftVector *demodBurst = demodulateBurst(*modBurst,*gsmPulse,samplesPerSymbol,(complex) ampl, TOA);
+  SoftVector *demodBurst = demodulateBurst(*modBurst,samplesPerSymbol,(complex) ampl, TOA);
   
   cout << *demodBurst << endl;
 
diff --git a/config/ax_check_compile_flag.m4 b/config/ax_check_compile_flag.m4
new file mode 100644
index 00000000..c3a8d695
--- /dev/null
+++ b/config/ax_check_compile_flag.m4
@@ -0,0 +1,72 @@
+# ===========================================================================
+#   http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS])
+#
+# DESCRIPTION
+#
+#   Check whether the given FLAG works with the current language's compiler
+#   or gives an error.  (Warnings, however, are ignored)
+#
+#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+#   success/failure.
+#
+#   If EXTRA-FLAGS is defined, it is added to the current language's default
+#   flags (e.g. CFLAGS) when the check is done.  The check is thus made with
+#   the flags: "CFLAGS EXTRA-FLAGS FLAG".  This can for example be used to
+#   force the compiler to issue an error when a bad flag is given.
+#
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+#   macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 2
+
+AC_DEFUN([AX_CHECK_COMPILE_FLAG],
+[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX
+AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
+    [AS_VAR_SET(CACHEVAR,[yes])],
+    [AS_VAR_SET(CACHEVAR,[no])])
+  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
+  [m4_default([$2], :)],
+  [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl
+])dnl AX_CHECK_COMPILE_FLAGS
diff --git a/config/ax_ext.m4 b/config/ax_ext.m4
new file mode 100644
index 00000000..fbd10cc6
--- /dev/null
+++ b/config/ax_ext.m4
@@ -0,0 +1,221 @@
+# ===========================================================================
+#          http://www.gnu.org/software/autoconf-archive/ax_ext.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_EXT
+#
+# DESCRIPTION
+#
+#   Find supported SIMD extensions by requesting cpuid. When an SIMD
+#   extension is found, the -m"simdextensionname" is added to SIMD_FLAGS if
+#   compiler supports it. For example, if "sse2" is available, then "-msse2"
+#   is added to SIMD_FLAGS.
+#
+#   This macro calls:
+#
+#     AC_SUBST(SIMD_FLAGS)
+#
+#   And defines:
+#
+#     HAVE_MMX / HAVE_SSE / HAVE_SSE2 / HAVE_SSE3 / HAVE_SSSE3 / HAVE_SSE4.1 / HAVE_SSE4.2 / HAVE_AVX
+#
+# LICENSE
+#
+#   Copyright (c) 2007 Christophe Tournayre <turn3r@users.sourceforge.net>
+#   Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 12
+
+AC_DEFUN([AX_EXT],
+[
+  AC_REQUIRE([AC_CANONICAL_HOST])
+
+  case $host_cpu in
+    i[[3456]]86*|x86_64*|amd64*)
+
+      AC_REQUIRE([AX_GCC_X86_CPUID])
+      AC_REQUIRE([AX_GCC_X86_AVX_XGETBV])
+
+      AX_GCC_X86_CPUID(0x00000001)
+      ecx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3`
+      edx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4`
+
+      AC_CACHE_CHECK([whether mmx is supported], [ax_cv_have_mmx_ext],
+      [
+        ax_cv_have_mmx_ext=no
+        if test "$((0x$edx>>23&0x01))" = 1; then
+          ax_cv_have_mmx_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse is supported], [ax_cv_have_sse_ext],
+      [
+        ax_cv_have_sse_ext=no
+        if test "$((0x$edx>>25&0x01))" = 1; then
+          ax_cv_have_sse_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse2 is supported], [ax_cv_have_sse2_ext],
+      [
+        ax_cv_have_sse2_ext=no
+        if test "$((0x$edx>>26&0x01))" = 1; then
+          ax_cv_have_sse2_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse3 is supported], [ax_cv_have_sse3_ext],
+      [
+        ax_cv_have_sse3_ext=no
+        if test "$((0x$ecx&0x01))" = 1; then
+          ax_cv_have_sse3_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether ssse3 is supported], [ax_cv_have_ssse3_ext],
+      [
+        ax_cv_have_ssse3_ext=no
+        if test "$((0x$ecx>>9&0x01))" = 1; then
+          ax_cv_have_ssse3_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse4.1 is supported], [ax_cv_have_sse41_ext],
+      [
+        ax_cv_have_sse41_ext=no
+        if test "$((0x$ecx>>19&0x01))" = 1; then
+          ax_cv_have_sse41_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse4.2 is supported], [ax_cv_have_sse42_ext],
+      [
+        ax_cv_have_sse42_ext=no
+        if test "$((0x$ecx>>20&0x01))" = 1; then
+          ax_cv_have_sse42_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether avx is supported by processor], [ax_cv_have_avx_cpu_ext],
+      [
+        ax_cv_have_avx_cpu_ext=no
+        if test "$((0x$ecx>>28&0x01))" = 1; then
+          ax_cv_have_avx_cpu_ext=yes
+        fi
+      ])
+
+      if test x"$ax_cv_have_avx_cpu_ext" = x"yes"; then
+        AX_GCC_X86_AVX_XGETBV(0x00000000)
+
+        xgetbv_eax="0"
+        if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then
+          xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1`
+        fi
+
+        AC_CACHE_CHECK([whether avx is supported by operating system], [ax_cv_have_avx_ext],
+        [
+          ax_cv_have_avx_ext=no
+
+          if test "$((0x$ecx>>27&0x01))" = 1; then
+            if test "$((0x$xgetbv_eax&0x6))" = 6; then
+              ax_cv_have_avx_ext=yes
+            fi
+          fi
+        ])
+        if test x"$ax_cv_have_avx_ext" = x"no"; then
+          AC_MSG_WARN([Your processor supports AVX, but your operating system doesn't])
+        fi
+      fi
+
+      if test "$ax_cv_have_mmx_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mmmx, ax_cv_support_mmx_ext=yes, [])
+        if test x"$ax_cv_support_mmx_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -mmmx"
+          AC_DEFINE(HAVE_MMX,,[Support mmx instructions])
+        else
+          AC_MSG_WARN([Your processor supports mmx instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse, ax_cv_support_sse_ext=yes, [])
+        if test x"$ax_cv_support_sse_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse"
+          AC_DEFINE(HAVE_SSE,,[Support SSE (Streaming SIMD Extensions) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse2_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse2, ax_cv_support_sse2_ext=yes, [])
+        if test x"$ax_cv_support_sse2_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse2"
+          AC_DEFINE(HAVE_SSE2,,[Support SSE2 (Streaming SIMD Extensions 2) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse2 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse3_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse3, ax_cv_support_sse3_ext=yes, [])
+        if test x"$ax_cv_support_sse3_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse3"
+          AC_DEFINE(HAVE_SSE3,,[Support SSE3 (Streaming SIMD Extensions 3) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse3 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_ssse3_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mssse3, ax_cv_support_ssse3_ext=yes, [])
+        if test x"$ax_cv_support_ssse3_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -mssse3"
+          AC_DEFINE(HAVE_SSSE3,,[Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions])
+        else
+          AC_MSG_WARN([Your processor supports ssse3 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse41_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse4.1, ax_cv_support_sse41_ext=yes, [])
+        if test x"$ax_cv_support_sse41_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse4.1"
+          AC_DEFINE(HAVE_SSE4_1,,[Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse4.1 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse42_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse4.2, ax_cv_support_sse42_ext=yes, [])
+        if test x"$ax_cv_support_sse42_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse4.2"
+          AC_DEFINE(HAVE_SSE4_2,,[Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse4.2 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_avx_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mavx, ax_cv_support_avx_ext=yes, [])
+        if test x"$ax_cv_support_avx_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -mavx"
+          AC_DEFINE(HAVE_AVX,,[Support AVX (Advanced Vector Extensions) instructions])
+        else
+          AC_MSG_WARN([Your processor supports avx instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+  ;;
+  esac
+
+  AC_SUBST(SIMD_FLAGS)
+])
diff --git a/config/ax_gcc_x86_avx_xgetbv.m4 b/config/ax_gcc_x86_avx_xgetbv.m4
new file mode 100644
index 00000000..0624eebc
--- /dev/null
+++ b/config/ax_gcc_x86_avx_xgetbv.m4
@@ -0,0 +1,79 @@
+# ===========================================================================
+#   http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_avx_xgetbv.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_GCC_X86_AVX_XGETBV
+#
+# DESCRIPTION
+#
+#   On later x86 processors with AVX SIMD support, with gcc or a compiler
+#   that has a compatible syntax for inline assembly instructions, run a
+#   small program that executes the xgetbv instruction with input OP. This
+#   can be used to detect if the OS supports AVX instruction usage.
+#
+#   On output, the values of the eax and edx registers are stored as
+#   hexadecimal strings as "eax:edx" in the cache variable
+#   ax_cv_gcc_x86_avx_xgetbv.
+#
+#   If the xgetbv instruction fails (because you are running a
+#   cross-compiler, or because you are not using gcc, or because you are on
+#   a processor that doesn't have this instruction),
+#   ax_cv_gcc_x86_avx_xgetbv_OP is set to the string "unknown".
+#
+#   This macro mainly exists to be used in AX_EXT.
+#
+# LICENSE
+#
+#   Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com>
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 1
+
+AC_DEFUN([AX_GCC_X86_AVX_XGETBV],
+[AC_REQUIRE([AC_PROG_CC])
+AC_LANG_PUSH([C])
+AC_CACHE_CHECK(for x86-AVX xgetbv $1 output, ax_cv_gcc_x86_avx_xgetbv_$1,
+ [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
+     int op = $1, eax, edx;
+     FILE *f;
+      /* Opcodes for xgetbv */
+      __asm__(".byte 0x0f, 0x01, 0xd0"
+        : "=a" (eax), "=d" (edx)
+        : "c" (op));
+     f = fopen("conftest_xgetbv", "w"); if (!f) return 1;
+     fprintf(f, "%x:%x\n", eax, edx);
+     fclose(f);
+     return 0;
+])],
+     [ax_cv_gcc_x86_avx_xgetbv_$1=`cat conftest_xgetbv`; rm -f conftest_xgetbv],
+     [ax_cv_gcc_x86_avx_xgetbv_$1=unknown; rm -f conftest_xgetbv],
+     [ax_cv_gcc_x86_avx_xgetbv_$1=unknown])])
+AC_LANG_POP([C])
+])
diff --git a/config/ax_gcc_x86_cpuid.m4 b/config/ax_gcc_x86_cpuid.m4
new file mode 100644
index 00000000..7d46fee0
--- /dev/null
+++ b/config/ax_gcc_x86_cpuid.m4
@@ -0,0 +1,79 @@
+# ===========================================================================
+#     http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_cpuid.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_GCC_X86_CPUID(OP)
+#
+# DESCRIPTION
+#
+#   On Pentium and later x86 processors, with gcc or a compiler that has a
+#   compatible syntax for inline assembly instructions, run a small program
+#   that executes the cpuid instruction with input OP. This can be used to
+#   detect the CPU type.
+#
+#   On output, the values of the eax, ebx, ecx, and edx registers are stored
+#   as hexadecimal strings as "eax:ebx:ecx:edx" in the cache variable
+#   ax_cv_gcc_x86_cpuid_OP.
+#
+#   If the cpuid instruction fails (because you are running a
+#   cross-compiler, or because you are not using gcc, or because you are on
+#   a processor that doesn't have this instruction), ax_cv_gcc_x86_cpuid_OP
+#   is set to the string "unknown".
+#
+#   This macro mainly exists to be used in AX_GCC_ARCHFLAG.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
+#   Copyright (c) 2008 Matteo Frigo
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 7
+
+AC_DEFUN([AX_GCC_X86_CPUID],
+[AC_REQUIRE([AC_PROG_CC])
+AC_LANG_PUSH([C])
+AC_CACHE_CHECK(for x86 cpuid $1 output, ax_cv_gcc_x86_cpuid_$1,
+ [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
+     int op = $1, eax, ebx, ecx, edx;
+     FILE *f;
+      __asm__("cpuid"
+        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+        : "a" (op));
+     f = fopen("conftest_cpuid", "w"); if (!f) return 1;
+     fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx);
+     fclose(f);
+     return 0;
+])],
+     [ax_cv_gcc_x86_cpuid_$1=`cat conftest_cpuid`; rm -f conftest_cpuid],
+     [ax_cv_gcc_x86_cpuid_$1=unknown; rm -f conftest_cpuid],
+     [ax_cv_gcc_x86_cpuid_$1=unknown])])
+AC_LANG_POP([C])
+])
diff --git a/configure.ac b/configure.ac
index 3cf2783f..6ab694d9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -22,6 +22,7 @@ AC_INIT(openbts,P2.8TRUNK)
 AC_PREREQ(2.57)
 AC_CONFIG_SRCDIR([Transceiver52M/Makefile.am])
 AC_CONFIG_AUX_DIR([.])	
+AC_CONFIG_MACRO_DIR([config])
 AM_CONFIG_HEADER(config.h)
 
 AC_CANONICAL_BUILD
@@ -90,11 +91,14 @@ AS_IF([test "x$with_usrp1" = "xyes"], [
     if test "x$libusrp_3_3" = "xyes";then
         AC_DEFINE(HAVE_LIBUSRP_3_3, 1, Define to 1 if you have libusrp >= 3.3)
     fi
+    # Find and define supported SIMD extensions
+    AX_EXT
 ])
 
 AS_IF([test "x$with_uhd" = "xyes"],[
     PKG_CHECK_MODULES(UHD, uhd >= 003.004.000)
     AC_DEFINE(USE_UHD, 1, Define to 1 if using UHD)
+    AX_EXT
 ])
 
 AS_IF([test "x$with_extref" = "xyes"], [