core/conv: do not mix up AVX and SSE code

According to GCC's wiki: If you specify command-line switches such as -msse, the compiler could use the extended instruction sets even if the built-ins are not used explicitly in the program. For this reason, applications that perform run-time CPU detection must compile separate files for each supported architecture, using the appropriate flags. In particular, the file containing the CPU detection code should be compiled without these options. So, this change introduces a separate Viterbi implementation, which is almost the same as previous one, but is being compiled with -mavx2. This implementation will be only used by CPUs with both SSE and AVX support: SSE3 and AVX2: viterbi_sse_avx.c SSE3 only: viterbi_sse.c Generic: viterbi_generic.c Change-Id: I042cc76258df7e4c6c90a73af3d0a6e75999b2b0
2017-05-28 18:20:02 +07:00 · 2017-05-28 18:20:02 +07:00 · 0d49f47aeb
parent b51d51b29c
commit 0d49f47aeb
6 changed files with 741 additions and 571 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -25,11 +25,24 @@ libosmocore_la_SOURCES = timer.c timer_gettimeofday.c select.c signal.c msgb.c b

 if HAVE_SSE3
 libosmocore_la_SOURCES += viterbi_sse.c
-# Per-object flags hack
-viterbi_sse.lo : CFLAGS += $(SIMD_FLAGS)
+if HAVE_SSE4_1
+viterbi_sse.lo : CFLAGS += -msse3 -msse4.1
+else
+viterbi_sse.lo : CFLAGS += -msse3
+endif
+
+if HAVE_AVX2
+libosmocore_la_SOURCES += viterbi_sse_avx.c
+if HAVE_SSE4_1
+viterbi_sse_avx.lo : CFLAGS += -msse3 -mavx2 -msse4.1
+else
+viterbi_sse_avx.lo : CFLAGS += -msse3 -mavx2
+endif
+endif
 endif

 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
+EXTRA_DIST = viterbi_sse_common.h

 if ENABLE_PLUGIN
 libosmocore_la_SOURCES += plugin.c
--- a/src/viterbi.c
+++ b/src/viterbi.c
@ -31,6 +31,18 @@
 #define BIT2NRZ(REG,N)	(((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)	(K == 7 ? 64 : 16)

+#define INIT_POINTERS(simd) \
+{ \
+	osmo_conv_metrics_k5_n2 = osmo_conv_##simd##_metrics_k5_n2; \
+	osmo_conv_metrics_k5_n3 = osmo_conv_##simd##_metrics_k5_n3; \
+	osmo_conv_metrics_k5_n4 = osmo_conv_##simd##_metrics_k5_n4; \
+	osmo_conv_metrics_k7_n2 = osmo_conv_##simd##_metrics_k7_n2; \
+	osmo_conv_metrics_k7_n3 = osmo_conv_##simd##_metrics_k7_n3; \
+	osmo_conv_metrics_k7_n4 = osmo_conv_##simd##_metrics_k7_n4; \
+	vdec_malloc = &osmo_conv_##simd##_vdec_malloc; \
+	vdec_free = &osmo_conv_##simd##_vdec_free; \
+}
+
 static int init_complete = 0;

 __attribute__ ((visibility("hidden"))) int avx2_supported = 0;
@ -38,19 +50,37 @@ __attribute__ ((visibility("hidden"))) int sse3_supported = 0;
 __attribute__ ((visibility("hidden"))) int sse41_supported = 0;

 /**
- * This pointers will be initialized by the osmo_conv_init()
- * depending on supported SIMD extensions.
+ * These pointers are being initialized at runtime by the
+ * osmo_conv_init() depending on supported SIMD extensions.
 */
 static int16_t *(*vdec_malloc)(size_t n);
 static void (*vdec_free)(int16_t *ptr);

-/* Forward malloc wrappers */
-int16_t *osmo_conv_vdec_malloc(size_t n);
-void osmo_conv_vdec_free(int16_t *ptr);
+void (*osmo_conv_metrics_k5_n2)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k5_n3)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k5_n4)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k7_n2)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k7_n3)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k7_n4)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);

-#ifdef HAVE_SSE3
-int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
-void osmo_conv_vdec_free_sse3(int16_t *ptr);
+/* Forward malloc wrappers */
+int16_t *osmo_conv_gen_vdec_malloc(size_t n);
+void osmo_conv_gen_vdec_free(int16_t *ptr);
+
+#if defined(HAVE_SSE3)
+int16_t *osmo_conv_sse_vdec_malloc(size_t n);
+void osmo_conv_sse_vdec_free(int16_t *ptr);
+#endif
+
+#if defined(HAVE_SSE3) && defined(HAVE_AVX2)
+int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n);
+void osmo_conv_sse_avx_vdec_free(int16_t *ptr);
 #endif

 /* Forward Metric Units */
@ -67,18 +97,33 @@ void osmo_conv_gen_metrics_k7_n3(const int8_t *seq, const int16_t *out,
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);

-#ifdef HAVE_SSE3
-void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+#if defined(HAVE_SSE3)
+void osmo_conv_sse_metrics_k5_n2(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
-void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+void osmo_conv_sse_metrics_k5_n3(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
-void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+void osmo_conv_sse_metrics_k5_n4(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
-void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+void osmo_conv_sse_metrics_k7_n2(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
-void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+void osmo_conv_sse_metrics_k7_n3(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
-void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+void osmo_conv_sse_metrics_k7_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+#endif
+
+#if defined(HAVE_SSE3) && defined(HAVE_AVX2)
+void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
 #endif

@ -488,31 +533,13 @@ static struct vdecoder *alloc_vdec(const struct osmo_conv_code *code)
 	if (dec->k == 5) {
 		switch (dec->n) {
 		case 2:
-		#ifdef HAVE_SSE3
-			dec->metric_func = !sse3_supported ?
-				osmo_conv_gen_metrics_k5_n2 :
-				osmo_conv_gen_metrics_k5_n2_sse;
-		#else
-			dec->metric_func = osmo_conv_gen_metrics_k5_n2;
-		#endif
+			dec->metric_func = osmo_conv_metrics_k5_n2;
 			break;
 		case 3:
-		#ifdef HAVE_SSE3
-			dec->metric_func = !sse3_supported ?
-				osmo_conv_gen_metrics_k5_n3 :
-				osmo_conv_gen_metrics_k5_n3_sse;
-		#else
-			dec->metric_func = osmo_conv_gen_metrics_k5_n3;
-		#endif
+			dec->metric_func = osmo_conv_metrics_k5_n3;
 			break;
 		case 4:
-		#ifdef HAVE_SSE3
-			dec->metric_func = !sse3_supported ?
-				osmo_conv_gen_metrics_k5_n4 :
-				osmo_conv_gen_metrics_k5_n4_sse;
-		#else
-			dec->metric_func = osmo_conv_gen_metrics_k5_n4;
-		#endif
+			dec->metric_func = osmo_conv_metrics_k5_n4;
 			break;
 		default:
 			goto fail;
@ -520,31 +547,13 @@ static struct vdecoder *alloc_vdec(const struct osmo_conv_code *code)
 	} else if (dec->k == 7) {
 		switch (dec->n) {
 		case 2:
-		#ifdef HAVE_SSE3
-			dec->metric_func = !sse3_supported ?
-				osmo_conv_gen_metrics_k7_n2 :
-				osmo_conv_gen_metrics_k7_n2_sse;
-		#else
-			dec->metric_func = osmo_conv_gen_metrics_k7_n2;
-		#endif
+			dec->metric_func = osmo_conv_metrics_k7_n2;
 			break;
 		case 3:
-		#ifdef HAVE_SSE3
-			dec->metric_func = !sse3_supported ?
-				osmo_conv_gen_metrics_k7_n3 :
-				osmo_conv_gen_metrics_k7_n3_sse;
-		#else
-			dec->metric_func = osmo_conv_gen_metrics_k7_n3;
-		#endif
+			dec->metric_func = osmo_conv_metrics_k7_n3;
 			break;
 		case 4:
-		#ifdef HAVE_SSE3
-			dec->metric_func = !sse3_supported ?
-				osmo_conv_gen_metrics_k7_n4 :
-				osmo_conv_gen_metrics_k7_n4_sse;
-		#else
-			dec->metric_func = osmo_conv_gen_metrics_k7_n4;
-		#endif
+			dec->metric_func = osmo_conv_metrics_k7_n4;
 			break;
 		default:
 			goto fail;
@ -656,14 +665,26 @@ static void osmo_conv_init(void)
 	#endif
 #endif

-#ifdef HAVE_SSE3
-	vdec_malloc = !sse3_supported ?
-		&osmo_conv_vdec_malloc : &osmo_conv_vdec_malloc_sse3;
-	vdec_free = !sse3_supported ?
-		&osmo_conv_vdec_free : &osmo_conv_vdec_free_sse3;
+/**
+ * Usage of curly braces is mandatory,
+ * because we use multi-line define.
+ */
+#if defined(HAVE_SSE3) && defined(HAVE_AVX2)
+	if (sse3_supported && avx2_supported) {
+		INIT_POINTERS(sse_avx);
+	} else if (sse3_supported) {
+		INIT_POINTERS(sse);
+	} else {
+		INIT_POINTERS(gen);
+	}
+#elif defined(HAVE_SSE3)
+	if (sse3_supported) {
+		INIT_POINTERS(sse);
+	} else {
+		INIT_POINTERS(gen);
+	}
 #else
-	vdec_malloc = &osmo_conv_vdec_malloc;
-	vdec_free = &osmo_conv_vdec_free;
+	INIT_POINTERS(gen);
 #endif
 }

--- a/src/viterbi_generic.c
+++ b/src/viterbi_generic.c
@ -129,13 +129,13 @@ static void gen_path_metrics(int num_states, int16_t *sums,

 /* Not-aligned Memory Allocator */
 __attribute__ ((visibility("hidden")))
-int16_t *osmo_conv_vdec_malloc(size_t n)
+int16_t *osmo_conv_gen_vdec_malloc(size_t n)
 {
 	return (int16_t *) malloc(sizeof(int16_t) * n);
 }

 __attribute__ ((visibility("hidden")))
-void osmo_conv_vdec_free(int16_t *ptr)
+void osmo_conv_gen_vdec_free(int16_t *ptr)
 {
 	free(ptr);
 }
--- a/src/viterbi_sse.c
+++ b/src/viterbi_sse.c
@ -21,159 +21,18 @@
 */

 #include <stdint.h>
+#include "config.h"
+
 #include <emmintrin.h>
 #include <tmmintrin.h>
 #include <xmmintrin.h>

-#include "config.h"
-
-#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
-	#include <smmintrin.h>
-#endif
-
-#ifdef HAVE_AVX2
-	#include <immintrin.h>
+#if defined(HAVE_SSE4_1)
+#include <smmintrin.h>
 #endif

 #define SSE_ALIGN 16

-extern int sse41_supported;
-extern int sse3_supported;
-extern int avx2_supported;
-
-/* Octo-Viterbi butterfly
- * Compute 8-wide butterfly generating 16 path decisions and 16 accumulated
- * sums. Inputs all packed 16-bit integers in three 128-bit XMM registers.
- * Two intermediate registers are used and results are set in the upper 4
- * registers.
- *
- * Input:
- * M0 - Path metrics 0 (packed 16-bit integers)
- * M1 - Path metrics 1 (packed 16-bit integers)
- * M2 - Branch metrics (packed 16-bit integers)
- *
- * Output:
- * M2 - Selected and accumulated path metrics 0
- * M4 - Selected and accumulated path metrics 1
- * M3 - Path selections 0
- * M1 - Path selections 1
- */
-#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
-{ \
-	M3 = _mm_adds_epi16(M0, M2); \
-	M4 = _mm_subs_epi16(M1, M2); \
-	M0 = _mm_subs_epi16(M0, M2); \
-	M1 = _mm_adds_epi16(M1, M2); \
-	M2 = _mm_max_epi16(M3, M4); \
-	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
-	M4 = _mm_max_epi16(M0, M1); \
-	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
-}
-
-/* Two lane deinterleaving K = 5:
- * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
- * registers. The operation summarized below. Four registers are used with
- * the lower 2 as input and upper 2 as output.
- *
- * In   - 10101010 10101010 10101010 10101010
- * Out  - 00000000 11111111 00000000 11111111
- *
- * Input:
- * M0:1 - Packed 16-bit integers
- *
- * Output:
- * M2:3 - Deinterleaved packed 16-bit integers
- */
-#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
-
-#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
-{ \
-	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
-	M0 = _mm_shuffle_epi8(M0, M2); \
-	M1 = _mm_shuffle_epi8(M1, M2); \
-	M2 = _mm_unpacklo_epi64(M0, M1); \
-	M3 = _mm_unpackhi_epi64(M0, M1); \
-}
-
-/* Two lane deinterleaving K = 7:
- * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
- * registers. The operation summarized below. 16 registers are used with the
- * lower 8 as input and upper 8 as output.
- *
- * In   - 10101010 10101010 10101010 10101010 ...
- * Out  - 00000000 11111111 00000000 11111111 ...
- *
- * Input:
- * M0:7 - Packed 16-bit integers
- *
- * Output:
- * M8:15 - Deinterleaved packed 16-bit integers
- */
-#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
-	M8, M9, M10, M11, M12, M13, M14, M15) \
-{ \
-	M8  = _mm_set_epi8(_I8_SHUFFLE_MASK); \
-	M0  = _mm_shuffle_epi8(M0, M8); \
-	M1  = _mm_shuffle_epi8(M1, M8); \
-	M2  = _mm_shuffle_epi8(M2, M8); \
-	M3  = _mm_shuffle_epi8(M3, M8); \
-	M4  = _mm_shuffle_epi8(M4, M8); \
-	M5  = _mm_shuffle_epi8(M5, M8); \
-	M6  = _mm_shuffle_epi8(M6, M8); \
-	M7  = _mm_shuffle_epi8(M7, M8); \
-	M8  = _mm_unpacklo_epi64(M0, M1); \
-	M9  = _mm_unpackhi_epi64(M0, M1); \
-	M10 = _mm_unpacklo_epi64(M2, M3); \
-	M11 = _mm_unpackhi_epi64(M2, M3); \
-	M12 = _mm_unpacklo_epi64(M4, M5); \
-	M13 = _mm_unpackhi_epi64(M4, M5); \
-	M14 = _mm_unpacklo_epi64(M6, M7); \
-	M15 = _mm_unpackhi_epi64(M6, M7); \
-}
-
-/* Generate branch metrics N = 2:
- * Compute 16 branch metrics from trellis outputs and input values.
- *
- * Input:
- * M0:3 - 16 x 2 packed 16-bit trellis outputs
- * M4   - Expanded and packed 16-bit input value
- *
- * Output:
- * M6:7 - 16 computed 16-bit branch metrics
- */
-#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
-{ \
-	M0 = _mm_sign_epi16(M4, M0); \
-	M1 = _mm_sign_epi16(M4, M1); \
-	M2 = _mm_sign_epi16(M4, M2); \
-	M3 = _mm_sign_epi16(M4, M3); \
-	M6 = _mm_hadds_epi16(M0, M1); \
-	M7 = _mm_hadds_epi16(M2, M3); \
-}
-
-/* Generate branch metrics N = 4:
- * Compute 8 branch metrics from trellis outputs and input values. This
- * macro is reused for N less than 4 where the extra soft input bits are
- * padded.
- *
- * Input:
- * M0:3 - 8 x 4 packed 16-bit trellis outputs
- * M4   - Expanded and packed 16-bit input value
- *
- * Output:
- * M5   - 8 computed 16-bit branch metrics
- */
-#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
-{ \
-	M0 = _mm_sign_epi16(M4, M0); \
-	M1 = _mm_sign_epi16(M4, M1); \
-	M2 = _mm_sign_epi16(M4, M2); \
-	M3 = _mm_sign_epi16(M4, M3); \
-	M0 = _mm_hadds_epi16(M0, M1); \
-	M1 = _mm_hadds_epi16(M2, M3); \
-	M5 = _mm_hadds_epi16(M0, M1); \
-}
-
 /* Broadcast 16-bit integer
 * Repeat the low 16-bit integer to all elements of the 128-bit SSE
 * register. Only AVX2 has a dedicated broadcast instruction; use repeat
@ -186,364 +45,17 @@ extern int avx2_supported;
 * Output:
 * M0 - Contains broadcasted values
 */
-#ifdef HAVE_AVX2
-#define SSE_BROADCAST(M0) \
-{ \
-	if (avx2_supported) { \
-		M0 = _mm_broadcastw_epi16(M0); \
-	} else { \
-		M0 = _mm_unpacklo_epi16(M0, M0); \
-		M0 = _mm_unpacklo_epi32(M0, M0); \
-		M0 = _mm_unpacklo_epi64(M0, M0); \
-	} \
-}
-#else
 #define SSE_BROADCAST(M0) \
 { \
 	M0 = _mm_unpacklo_epi16(M0, M0); \
 	M0 = _mm_unpacklo_epi32(M0, M0); \
 	M0 = _mm_unpacklo_epi64(M0, M0); \
 }
-#endif

-/* Horizontal minimum
- * Compute horizontal minimum of packed unsigned 16-bit integers and place
- * result in the low 16-bit element of the source register. Only SSE 4.1
- * has a dedicated minpos instruction. One intermediate register is used
- * if SSE 4.1 is not available. This is a destructive operation and the
- * source register is overwritten.
- *
- * Input:
- * M0 - Packed unsigned 16-bit integers
- *
- * Output:
- * M0 - Minimum value placed in low 16-bit element
+/**
+ * Include common SSE implementation
 */
-#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
-#define SSE_MINPOS(M0, M1) \
-{ \
-	if (sse41_supported) { \
-		M0 = _mm_minpos_epu16(M0); \
-	} else { \
-		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
-		M0 = _mm_min_epi16(M0, M1); \
-		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
-		M0 = _mm_min_epi16(M0, M1); \
-		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
-		M0 = _mm_min_epi16(M0, M1); \
-	} \
-}
-#else
-#define SSE_MINPOS(M0, M1) \
-{ \
-	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
-	M0 = _mm_min_epi16(M0, M1); \
-	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
-	M0 = _mm_min_epi16(M0, M1); \
-	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
-	M0 = _mm_min_epi16(M0, M1); \
-}
-#endif
-
-/* Normalize state metrics K = 5:
- * Compute 16-wide normalization by subtracting the smallest value from
- * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
- * Two intermediate registers are used and normalized results are placed
- * in the originating locations.
- *
- * Input:
- * M0:1 - Path metrics 0:1 (packed 16-bit integers)
- *
- * Output:
- * M0:1 - Normalized path metrics 0:1
- */
-#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
-{ \
-	M2 = _mm_min_epi16(M0, M1); \
-	SSE_MINPOS(M2, M3) \
-	SSE_BROADCAST(M2) \
-	M0 = _mm_subs_epi16(M0, M2); \
-	M1 = _mm_subs_epi16(M1, M2); \
-}
-
-/* Normalize state metrics K = 7:
- * Compute 64-wide normalization by subtracting the smallest value from
- * all values. Inputs are 8 registers of accumulated sums and 4 temporary
- * registers. Normalized results are returned in the originating locations.
- *
- * Input:
- * M0:7 - Path metrics 0:7 (packed 16-bit integers)
- *
- * Output:
- * M0:7 - Normalized path metrics 0:7
- */
-#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
-{ \
-	M8  = _mm_min_epi16(M0, M1); \
-	M9  = _mm_min_epi16(M2, M3); \
-	M10 = _mm_min_epi16(M4, M5); \
-	M11 = _mm_min_epi16(M6, M7); \
-	M8  = _mm_min_epi16(M8, M9); \
-	M10 = _mm_min_epi16(M10, M11); \
-	M8  = _mm_min_epi16(M8, M10); \
-	SSE_MINPOS(M8, M9) \
-	SSE_BROADCAST(M8) \
-	M0  = _mm_subs_epi16(M0, M8); \
-	M1  = _mm_subs_epi16(M1, M8); \
-	M2  = _mm_subs_epi16(M2, M8); \
-	M3  = _mm_subs_epi16(M3, M8); \
-	M4  = _mm_subs_epi16(M4, M8); \
-	M5  = _mm_subs_epi16(M5, M8); \
-	M6  = _mm_subs_epi16(M6, M8); \
-	M7  = _mm_subs_epi16(M7, M8); \
-}
-
-/* Combined BMU/PMU (K=5, N=2)
- * Compute branch metrics followed by path metrics for half rate 16-state
- * trellis. 8 butterflies are computed. Accumulated path sums are not
- * preserved and read and written into the same memory location. Normalize
- * sums if requires.
- */
-__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
-	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
-{
-	__m128i m0, m1, m2, m3, m4, m5, m6;
-
-	/* (BMU) Load input sequence */
-	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
-
-	/* (BMU) Load trellis outputs */
-	m0 = _mm_load_si128((__m128i *) &out[0]);
-	m1 = _mm_load_si128((__m128i *) &out[8]);
-
-	/* (BMU) Compute branch metrics */
-	m0 = _mm_sign_epi16(m2, m0);
-	m1 = _mm_sign_epi16(m2, m1);
-	m2 = _mm_hadds_epi16(m0, m1);
-
-	/* (PMU) Load accumulated path metrics */
-	m0 = _mm_load_si128((__m128i *) &sums[0]);
-	m1 = _mm_load_si128((__m128i *) &sums[8]);
-
-	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
-
-	/* (PMU) Butterflies: 0-7 */
-	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
-
-	if (norm)
-		SSE_NORMALIZE_K5(m2, m6, m0, m1)
-
-	_mm_store_si128((__m128i *) &sums[0], m2);
-	_mm_store_si128((__m128i *) &sums[8], m6);
-	_mm_store_si128((__m128i *) &paths[0], m5);
-	_mm_store_si128((__m128i *) &paths[8], m4);
-}
-
-/* Combined BMU/PMU (K=5, N=3 and N=4)
- * Compute branch metrics followed by path metrics for 16-state and rates
- * to 1/4. 8 butterflies are computed. The input sequence is read four 16-bit
- * values at a time, and extra values should be set to zero for rates other
- * than 1/4. Normally only rates 1/3 and 1/4 are used as there is a
- * dedicated implementation of rate 1/2.
- */
-__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
-	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
-{
-	__m128i m0, m1, m2, m3, m4, m5, m6;
-
-	/* (BMU) Load input sequence */
-	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
-
-	/* (BMU) Load trellis outputs */
-	m0 = _mm_load_si128((__m128i *) &out[0]);
-	m1 = _mm_load_si128((__m128i *) &out[8]);
-	m2 = _mm_load_si128((__m128i *) &out[16]);
-	m3 = _mm_load_si128((__m128i *) &out[24]);
-
-	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)
-
-	/* (PMU) Load accumulated path metrics */
-	m0 = _mm_load_si128((__m128i *) &sums[0]);
-	m1 = _mm_load_si128((__m128i *) &sums[8]);
-
-	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
-
-	/* (PMU) Butterflies: 0-7 */
-	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
-
-	if (norm)
-		SSE_NORMALIZE_K5(m2, m6, m0, m1)
-
-	_mm_store_si128((__m128i *) &sums[0], m2);
-	_mm_store_si128((__m128i *) &sums[8], m6);
-	_mm_store_si128((__m128i *) &paths[0], m5);
-	_mm_store_si128((__m128i *) &paths[8], m4);
-}
-
-/* Combined BMU/PMU (K=7, N=2)
- * Compute branch metrics followed by path metrics for half rate 64-state
- * trellis. 32 butterfly operations are computed. Deinterleaving path
- * metrics requires usage of the full SSE register file, so separate sums
- * before computing branch metrics to avoid register spilling.
- */
-__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
-	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
-{
-	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
-		m9, m10, m11, m12, m13, m14, m15;
-
-	/* (PMU) Load accumulated path metrics */
-	m0 = _mm_load_si128((__m128i *) &sums[0]);
-	m1 = _mm_load_si128((__m128i *) &sums[8]);
-	m2 = _mm_load_si128((__m128i *) &sums[16]);
-	m3 = _mm_load_si128((__m128i *) &sums[24]);
-	m4 = _mm_load_si128((__m128i *) &sums[32]);
-	m5 = _mm_load_si128((__m128i *) &sums[40]);
-	m6 = _mm_load_si128((__m128i *) &sums[48]);
-	m7 = _mm_load_si128((__m128i *) &sums[56]);
-
-	/* (PMU) Deinterleave to even-odd registers */
-	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7,
-			    m8, m9, m10, m11, m12, m13, m14, m15)
-
-	/* (BMU) Load input symbols */
-	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
-
-	/* (BMU) Load trellis outputs */
-	m0 = _mm_load_si128((__m128i *) &out[0]);
-	m1 = _mm_load_si128((__m128i *) &out[8]);
-	m2 = _mm_load_si128((__m128i *) &out[16]);
-	m3 = _mm_load_si128((__m128i *) &out[24]);
-
-	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)
-
-	m0 = _mm_load_si128((__m128i *) &out[32]);
-	m1 = _mm_load_si128((__m128i *) &out[40]);
-	m2 = _mm_load_si128((__m128i *) &out[48]);
-	m3 = _mm_load_si128((__m128i *) &out[56]);
-
-	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)
-
-	/* (PMU) Butterflies: 0-15 */
-	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
-	SSE_BUTTERFLY(m10, m11, m5, m2, m3)
-
-	_mm_store_si128((__m128i *) &paths[0], m0);
-	_mm_store_si128((__m128i *) &paths[8], m2);
-	_mm_store_si128((__m128i *) &paths[32], m9);
-	_mm_store_si128((__m128i *) &paths[40], m11);
-
-	/* (PMU) Butterflies: 17-31 */
-	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
-	SSE_BUTTERFLY(m14, m15, m7, m9, m11)
-
-	_mm_store_si128((__m128i *) &paths[16], m0);
-	_mm_store_si128((__m128i *) &paths[24], m9);
-	_mm_store_si128((__m128i *) &paths[48], m13);
-	_mm_store_si128((__m128i *) &paths[56], m15);
-
-	if (norm)
-		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
-				 m7, m11, m0, m8, m9, m10)
-
-	_mm_store_si128((__m128i *) &sums[0], m4);
-	_mm_store_si128((__m128i *) &sums[8], m5);
-	_mm_store_si128((__m128i *) &sums[16], m6);
-	_mm_store_si128((__m128i *) &sums[24], m7);
-	_mm_store_si128((__m128i *) &sums[32], m1);
-	_mm_store_si128((__m128i *) &sums[40], m3);
-	_mm_store_si128((__m128i *) &sums[48], m2);
-	_mm_store_si128((__m128i *) &sums[56], m11);
-}
-
-/* Combined BMU/PMU (K=7, N=3 and N=4)
- * Compute branch metrics followed by path metrics for half rate 64-state
- * trellis. 32 butterfly operations are computed. Deinterleave path
- * metrics before computing branch metrics as in the half rate case.
- */
-__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
-	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
-{
-	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
-	__m128i m8, m9, m10, m11, m12, m13, m14, m15;
-
-	/* (PMU) Load accumulated path metrics */
-	m0 = _mm_load_si128((__m128i *) &sums[0]);
-	m1 = _mm_load_si128((__m128i *) &sums[8]);
-	m2 = _mm_load_si128((__m128i *) &sums[16]);
-	m3 = _mm_load_si128((__m128i *) &sums[24]);
-	m4 = _mm_load_si128((__m128i *) &sums[32]);
-	m5 = _mm_load_si128((__m128i *) &sums[40]);
-	m6 = _mm_load_si128((__m128i *) &sums[48]);
-	m7 = _mm_load_si128((__m128i *) &sums[56]);
-
-	/* (PMU) Deinterleave into even and odd packed registers */
-	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7,
-			    m8, m9, m10, m11, m12, m13, m14, m15)
-
-	/* (BMU) Load and expand 8-bit input out to 16-bits */
-	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
-
-	/* (BMU) Load and compute branch metrics */
-	m0 = _mm_load_si128((__m128i *) &out[0]);
-	m1 = _mm_load_si128((__m128i *) &out[8]);
-	m2 = _mm_load_si128((__m128i *) &out[16]);
-	m3 = _mm_load_si128((__m128i *) &out[24]);
-
-	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)
-
-	m0 = _mm_load_si128((__m128i *) &out[32]);
-	m1 = _mm_load_si128((__m128i *) &out[40]);
-	m2 = _mm_load_si128((__m128i *) &out[48]);
-	m3 = _mm_load_si128((__m128i *) &out[56]);
-
-	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)
-
-	m0 = _mm_load_si128((__m128i *) &out[64]);
-	m1 = _mm_load_si128((__m128i *) &out[72]);
-	m2 = _mm_load_si128((__m128i *) &out[80]);
-	m3 = _mm_load_si128((__m128i *) &out[88]);
-
-	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)
-
-	m0 = _mm_load_si128((__m128i *) &out[96]);
-	m1 = _mm_load_si128((__m128i *) &out[104]);
-	m2 = _mm_load_si128((__m128i *) &out[112]);
-	m3 = _mm_load_si128((__m128i *) &out[120]);
-
-	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)
-
-	/* (PMU) Butterflies: 0-15 */
-	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
-	SSE_BUTTERFLY(m10, m11, m5, m2, m3)
-
-	_mm_store_si128((__m128i *) &paths[0], m0);
-	_mm_store_si128((__m128i *) &paths[8], m2);
-	_mm_store_si128((__m128i *) &paths[32], m9);
-	_mm_store_si128((__m128i *) &paths[40], m11);
-
-	/* (PMU) Butterflies: 17-31 */
-	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
-	SSE_BUTTERFLY(m14, m15, m7, m9, m11)
-
-	_mm_store_si128((__m128i *) &paths[16], m0);
-	_mm_store_si128((__m128i *) &paths[24], m9);
-	_mm_store_si128((__m128i *) &paths[48], m13);
-	_mm_store_si128((__m128i *) &paths[56], m15);
-
-	if (norm)
-		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
-				 m7, m11, m0, m8, m9, m10)
-
-	_mm_store_si128((__m128i *) &sums[0], m4);
-	_mm_store_si128((__m128i *) &sums[8], m5);
-	_mm_store_si128((__m128i *) &sums[16], m6);
-	_mm_store_si128((__m128i *) &sums[24], m7);
-	_mm_store_si128((__m128i *) &sums[32], m1);
-	_mm_store_si128((__m128i *) &sums[40], m3);
-	_mm_store_si128((__m128i *) &sums[48], m2);
-	_mm_store_si128((__m128i *) &sums[56], m11);
-}
+#include <viterbi_sse_common.h>

 /* Aligned Memory Allocator
 * SSE requires 16-byte memory alignment. We store relevant trellis values
@ -551,19 +63,19 @@ __always_inline static void _sse_metrics_k7_n4(const int16_t *val,
 * so the allocated memory is casted as such.
 */
 __attribute__ ((visibility("hidden")))
-int16_t *osmo_conv_vdec_malloc_sse3(size_t n)
+int16_t *osmo_conv_sse_vdec_malloc(size_t n)
 {
 	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
 }

 __attribute__ ((visibility("hidden")))
-void osmo_conv_vdec_free_sse3(int16_t *ptr)
+void osmo_conv_sse_vdec_free(int16_t *ptr)
 {
 	_mm_free(ptr);
 }

 __attribute__ ((visibility("hidden")))
-void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *val, const int16_t *out,
+void osmo_conv_sse_metrics_k5_n2(const int8_t *val, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm)
 {
 	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
@ -572,7 +84,7 @@ void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *val, const int16_t *out,
 }

 __attribute__ ((visibility("hidden")))
-void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *val, const int16_t *out,
+void osmo_conv_sse_metrics_k5_n3(const int8_t *val, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm)
 {
 	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
@ -581,7 +93,7 @@ void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *val, const int16_t *out,
 }

 __attribute__ ((visibility("hidden")))
-void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *val, const int16_t *out,
+void osmo_conv_sse_metrics_k5_n4(const int8_t *val, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm)
 {
 	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
@ -590,7 +102,7 @@ void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *val, const int16_t *out,
 }

 __attribute__ ((visibility("hidden")))
-void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *val, const int16_t *out,
+void osmo_conv_sse_metrics_k7_n2(const int8_t *val, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm)
 {
 	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
@ -599,7 +111,7 @@ void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *val, const int16_t *out,
 }

 __attribute__ ((visibility("hidden")))
-void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *val, const int16_t *out,
+void osmo_conv_sse_metrics_k7_n3(const int8_t *val, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm)
 {
 	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
@ -608,7 +120,7 @@ void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *val, const int16_t *out,
 }

 __attribute__ ((visibility("hidden")))
-void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *val, const int16_t *out,
+void osmo_conv_sse_metrics_k7_n4(const int8_t *val, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm)
 {
 	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
--- a/src/viterbi_sse_avx.c
+++ b/src/viterbi_sse_avx.c
@ -0,0 +1,129 @@
+/*
+ * Intel SSE + AVX Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+#include "config.h"
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+#include <immintrin.h>
+
+#if defined(HAVE_SSE4_1)
+#include <smmintrin.h>
+#endif
+
+#define SSE_ALIGN 16
+
+
+/* Broadcast 16-bit integer
+ * Repeat the low 16-bit integer to all elements of the 128-bit SSE
+ * register. Only AVX2 has a dedicated broadcast instruction; use repeat
+ * unpacks for SSE only architectures. This is a destructive operation and
+ * the source register is overwritten.
+ *
+ * Input:
+ * M0 - Low 16-bit element is read
+ *
+ * Output:
+ * M0 - Contains broadcasted values
+ */
+#define SSE_BROADCAST(M0) \
+{ \
+	M0 = _mm_broadcastw_epi16(M0); \
+}
+
+/**
+ * Include common SSE implementation
+ */
+#include <viterbi_sse_common.h>
+
+/* Aligned Memory Allocator
+ * SSE requires 16-byte memory alignment. We store relevant trellis values
+ * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
+ * so the allocated memory is casted as such.
+ */
+__attribute__ ((visibility("hidden")))
+int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n)
+{
+	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_vdec_free(int16_t *ptr)
+{
+	_mm_free(ptr);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k5_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k7_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
--- a/src/viterbi_sse_common.h
+++ b/src/viterbi_sse_common.h
@ -0,0 +1,495 @@
+/*
+ * Intel SSE Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+extern int sse41_supported;
+
+/* Octo-Viterbi butterfly
+ * Compute 8-wide butterfly generating 16 path decisions and 16 accumulated
+ * sums. Inputs all packed 16-bit integers in three 128-bit XMM registers.
+ * Two intermediate registers are used and results are set in the upper 4
+ * registers.
+ *
+ * Input:
+ * M0 - Path metrics 0 (packed 16-bit integers)
+ * M1 - Path metrics 1 (packed 16-bit integers)
+ * M2 - Branch metrics (packed 16-bit integers)
+ *
+ * Output:
+ * M2 - Selected and accumulated path metrics 0
+ * M4 - Selected and accumulated path metrics 1
+ * M3 - Path selections 0
+ * M1 - Path selections 1
+ */
+#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
+{ \
+	M3 = _mm_adds_epi16(M0, M2); \
+	M4 = _mm_subs_epi16(M1, M2); \
+	M0 = _mm_subs_epi16(M0, M2); \
+	M1 = _mm_adds_epi16(M1, M2); \
+	M2 = _mm_max_epi16(M3, M4); \
+	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
+	M4 = _mm_max_epi16(M0, M1); \
+	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
+}
+
+/* Two lane deinterleaving K = 5:
+ * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
+ * registers. The operation summarized below. Four registers are used with
+ * the lower 2 as input and upper 2 as output.
+ *
+ * In   - 10101010 10101010 10101010 10101010
+ * Out  - 00000000 11111111 00000000 11111111
+ *
+ * Input:
+ * M0:1 - Packed 16-bit integers
+ *
+ * Output:
+ * M2:3 - Deinterleaved packed 16-bit integers
+ */
+#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
+
+#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
+{ \
+	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
+	M0 = _mm_shuffle_epi8(M0, M2); \
+	M1 = _mm_shuffle_epi8(M1, M2); \
+	M2 = _mm_unpacklo_epi64(M0, M1); \
+	M3 = _mm_unpackhi_epi64(M0, M1); \
+}
+
+/* Two lane deinterleaving K = 7:
+ * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
+ * registers. The operation summarized below. 16 registers are used with the
+ * lower 8 as input and upper 8 as output.
+ *
+ * In   - 10101010 10101010 10101010 10101010 ...
+ * Out  - 00000000 11111111 00000000 11111111 ...
+ *
+ * Input:
+ * M0:7 - Packed 16-bit integers
+ *
+ * Output:
+ * M8:15 - Deinterleaved packed 16-bit integers
+ */
+#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
+	M8, M9, M10, M11, M12, M13, M14, M15) \
+{ \
+	M8  = _mm_set_epi8(_I8_SHUFFLE_MASK); \
+	M0  = _mm_shuffle_epi8(M0, M8); \
+	M1  = _mm_shuffle_epi8(M1, M8); \
+	M2  = _mm_shuffle_epi8(M2, M8); \
+	M3  = _mm_shuffle_epi8(M3, M8); \
+	M4  = _mm_shuffle_epi8(M4, M8); \
+	M5  = _mm_shuffle_epi8(M5, M8); \
+	M6  = _mm_shuffle_epi8(M6, M8); \
+	M7  = _mm_shuffle_epi8(M7, M8); \
+	M8  = _mm_unpacklo_epi64(M0, M1); \
+	M9  = _mm_unpackhi_epi64(M0, M1); \
+	M10 = _mm_unpacklo_epi64(M2, M3); \
+	M11 = _mm_unpackhi_epi64(M2, M3); \
+	M12 = _mm_unpacklo_epi64(M4, M5); \
+	M13 = _mm_unpackhi_epi64(M4, M5); \
+	M14 = _mm_unpacklo_epi64(M6, M7); \
+	M15 = _mm_unpackhi_epi64(M6, M7); \
+}
+
+/* Generate branch metrics N = 2:
+ * Compute 16 branch metrics from trellis outputs and input values.
+ *
+ * Input:
+ * M0:3 - 16 x 2 packed 16-bit trellis outputs
+ * M4   - Expanded and packed 16-bit input value
+ *
+ * Output:
+ * M6:7 - 16 computed 16-bit branch metrics
+ */
+#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
+{ \
+	M0 = _mm_sign_epi16(M4, M0); \
+	M1 = _mm_sign_epi16(M4, M1); \
+	M2 = _mm_sign_epi16(M4, M2); \
+	M3 = _mm_sign_epi16(M4, M3); \
+	M6 = _mm_hadds_epi16(M0, M1); \
+	M7 = _mm_hadds_epi16(M2, M3); \
+}
+
+/* Generate branch metrics N = 4:
+ * Compute 8 branch metrics from trellis outputs and input values. This
+ * macro is reused for N less than 4 where the extra soft input bits are
+ * padded.
+ *
+ * Input:
+ * M0:3 - 8 x 4 packed 16-bit trellis outputs
+ * M4   - Expanded and packed 16-bit input value
+ *
+ * Output:
+ * M5   - 8 computed 16-bit branch metrics
+ */
+#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
+{ \
+	M0 = _mm_sign_epi16(M4, M0); \
+	M1 = _mm_sign_epi16(M4, M1); \
+	M2 = _mm_sign_epi16(M4, M2); \
+	M3 = _mm_sign_epi16(M4, M3); \
+	M0 = _mm_hadds_epi16(M0, M1); \
+	M1 = _mm_hadds_epi16(M2, M3); \
+	M5 = _mm_hadds_epi16(M0, M1); \
+}
+
+/* Horizontal minimum
+ * Compute horizontal minimum of packed unsigned 16-bit integers and place
+ * result in the low 16-bit element of the source register. Only SSE 4.1
+ * has a dedicated minpos instruction. One intermediate register is used
+ * if SSE 4.1 is not available. This is a destructive operation and the
+ * source register is overwritten.
+ *
+ * Input:
+ * M0 - Packed unsigned 16-bit integers
+ *
+ * Output:
+ * M0 - Minimum value placed in low 16-bit element
+ */
+#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
+#define SSE_MINPOS(M0, M1) \
+{ \
+	if (sse41_supported) { \
+		M0 = _mm_minpos_epu16(M0); \
+	} else { \
+		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
+		M0 = _mm_min_epi16(M0, M1); \
+		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
+		M0 = _mm_min_epi16(M0, M1); \
+		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
+		M0 = _mm_min_epi16(M0, M1); \
+	} \
+}
+#else
+#define SSE_MINPOS(M0, M1) \
+{ \
+	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
+	M0 = _mm_min_epi16(M0, M1); \
+	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
+	M0 = _mm_min_epi16(M0, M1); \
+	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
+	M0 = _mm_min_epi16(M0, M1); \
+}
+#endif
+
+/* Normalize state metrics K = 5:
+ * Compute 16-wide normalization by subtracting the smallest value from
+ * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
+ * Two intermediate registers are used and normalized results are placed
+ * in the originating locations.
+ *
+ * Input:
+ * M0:1 - Path metrics 0:1 (packed 16-bit integers)
+ *
+ * Output:
+ * M0:1 - Normalized path metrics 0:1
+ */
+#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
+{ \
+	M2 = _mm_min_epi16(M0, M1); \
+	SSE_MINPOS(M2, M3) \
+	SSE_BROADCAST(M2) \
+	M0 = _mm_subs_epi16(M0, M2); \
+	M1 = _mm_subs_epi16(M1, M2); \
+}
+
+/* Normalize state metrics K = 7:
+ * Compute 64-wide normalization by subtracting the smallest value from
+ * all values. Inputs are 8 registers of accumulated sums and 4 temporary
+ * registers. Normalized results are returned in the originating locations.
+ *
+ * Input:
+ * M0:7 - Path metrics 0:7 (packed 16-bit integers)
+ *
+ * Output:
+ * M0:7 - Normalized path metrics 0:7
+ */
+#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
+{ \
+	M8  = _mm_min_epi16(M0, M1); \
+	M9  = _mm_min_epi16(M2, M3); \
+	M10 = _mm_min_epi16(M4, M5); \
+	M11 = _mm_min_epi16(M6, M7); \
+	M8  = _mm_min_epi16(M8, M9); \
+	M10 = _mm_min_epi16(M10, M11); \
+	M8  = _mm_min_epi16(M8, M10); \
+	SSE_MINPOS(M8, M9) \
+	SSE_BROADCAST(M8) \
+	M0  = _mm_subs_epi16(M0, M8); \
+	M1  = _mm_subs_epi16(M1, M8); \
+	M2  = _mm_subs_epi16(M2, M8); \
+	M3  = _mm_subs_epi16(M3, M8); \
+	M4  = _mm_subs_epi16(M4, M8); \
+	M5  = _mm_subs_epi16(M5, M8); \
+	M6  = _mm_subs_epi16(M6, M8); \
+	M7  = _mm_subs_epi16(M7, M8); \
+}
+
+/* Combined BMU/PMU (K=5, N=2)
+ * Compute branch metrics followed by path metrics for half rate 16-state
+ * trellis. 8 butterflies are computed. Accumulated path sums are not
+ * preserved and read and written into the same memory location. Normalize
+ * sums if requires.
+ */
+__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6;
+
+	/* (BMU) Load input sequence */
+	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load trellis outputs */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+
+	/* (BMU) Compute branch metrics */
+	m0 = _mm_sign_epi16(m2, m0);
+	m1 = _mm_sign_epi16(m2, m1);
+	m2 = _mm_hadds_epi16(m0, m1);
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+
+	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
+
+	/* (PMU) Butterflies: 0-7 */
+	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
+
+	if (norm)
+		SSE_NORMALIZE_K5(m2, m6, m0, m1)
+
+	_mm_store_si128((__m128i *) &sums[0], m2);
+	_mm_store_si128((__m128i *) &sums[8], m6);
+	_mm_store_si128((__m128i *) &paths[0], m5);
+	_mm_store_si128((__m128i *) &paths[8], m4);
+}
+
+/* Combined BMU/PMU (K=5, N=3 and N=4)
+ * Compute branch metrics followed by path metrics for 16-state and rates
+ * to 1/4. 8 butterflies are computed. The input sequence is read four 16-bit
+ * values at a time, and extra values should be set to zero for rates other
+ * than 1/4. Normally only rates 1/3 and 1/4 are used as there is a
+ * dedicated implementation of rate 1/2.
+ */
+__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6;
+
+	/* (BMU) Load input sequence */
+	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load trellis outputs */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+	m2 = _mm_load_si128((__m128i *) &out[16]);
+	m3 = _mm_load_si128((__m128i *) &out[24]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+
+	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
+
+	/* (PMU) Butterflies: 0-7 */
+	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
+
+	if (norm)
+		SSE_NORMALIZE_K5(m2, m6, m0, m1)
+
+	_mm_store_si128((__m128i *) &sums[0], m2);
+	_mm_store_si128((__m128i *) &sums[8], m6);
+	_mm_store_si128((__m128i *) &paths[0], m5);
+	_mm_store_si128((__m128i *) &paths[8], m4);
+}
+
+/* Combined BMU/PMU (K=7, N=2)
+ * Compute branch metrics followed by path metrics for half rate 64-state
+ * trellis. 32 butterfly operations are computed. Deinterleaving path
+ * metrics requires usage of the full SSE register file, so separate sums
+ * before computing branch metrics to avoid register spilling.
+ */
+__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
+		m9, m10, m11, m12, m13, m14, m15;
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+	m2 = _mm_load_si128((__m128i *) &sums[16]);
+	m3 = _mm_load_si128((__m128i *) &sums[24]);
+	m4 = _mm_load_si128((__m128i *) &sums[32]);
+	m5 = _mm_load_si128((__m128i *) &sums[40]);
+	m6 = _mm_load_si128((__m128i *) &sums[48]);
+	m7 = _mm_load_si128((__m128i *) &sums[56]);
+
+	/* (PMU) Deinterleave to even-odd registers */
+	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7,
+			    m8, m9, m10, m11, m12, m13, m14, m15)
+
+	/* (BMU) Load input symbols */
+	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load trellis outputs */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+	m2 = _mm_load_si128((__m128i *) &out[16]);
+	m3 = _mm_load_si128((__m128i *) &out[24]);
+
+	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)
+
+	m0 = _mm_load_si128((__m128i *) &out[32]);
+	m1 = _mm_load_si128((__m128i *) &out[40]);
+	m2 = _mm_load_si128((__m128i *) &out[48]);
+	m3 = _mm_load_si128((__m128i *) &out[56]);
+
+	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)
+
+	/* (PMU) Butterflies: 0-15 */
+	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
+	SSE_BUTTERFLY(m10, m11, m5, m2, m3)
+
+	_mm_store_si128((__m128i *) &paths[0], m0);
+	_mm_store_si128((__m128i *) &paths[8], m2);
+	_mm_store_si128((__m128i *) &paths[32], m9);
+	_mm_store_si128((__m128i *) &paths[40], m11);
+
+	/* (PMU) Butterflies: 17-31 */
+	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
+	SSE_BUTTERFLY(m14, m15, m7, m9, m11)
+
+	_mm_store_si128((__m128i *) &paths[16], m0);
+	_mm_store_si128((__m128i *) &paths[24], m9);
+	_mm_store_si128((__m128i *) &paths[48], m13);
+	_mm_store_si128((__m128i *) &paths[56], m15);
+
+	if (norm)
+		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
+				 m7, m11, m0, m8, m9, m10)
+
+	_mm_store_si128((__m128i *) &sums[0], m4);
+	_mm_store_si128((__m128i *) &sums[8], m5);
+	_mm_store_si128((__m128i *) &sums[16], m6);
+	_mm_store_si128((__m128i *) &sums[24], m7);
+	_mm_store_si128((__m128i *) &sums[32], m1);
+	_mm_store_si128((__m128i *) &sums[40], m3);
+	_mm_store_si128((__m128i *) &sums[48], m2);
+	_mm_store_si128((__m128i *) &sums[56], m11);
+}
+
+/* Combined BMU/PMU (K=7, N=3 and N=4)
+ * Compute branch metrics followed by path metrics for half rate 64-state
+ * trellis. 32 butterfly operations are computed. Deinterleave path
+ * metrics before computing branch metrics as in the half rate case.
+ */
+__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
+	__m128i m8, m9, m10, m11, m12, m13, m14, m15;
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+	m2 = _mm_load_si128((__m128i *) &sums[16]);
+	m3 = _mm_load_si128((__m128i *) &sums[24]);
+	m4 = _mm_load_si128((__m128i *) &sums[32]);
+	m5 = _mm_load_si128((__m128i *) &sums[40]);
+	m6 = _mm_load_si128((__m128i *) &sums[48]);
+	m7 = _mm_load_si128((__m128i *) &sums[56]);
+
+	/* (PMU) Deinterleave into even and odd packed registers */
+	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7,
+			    m8, m9, m10, m11, m12, m13, m14, m15)
+
+	/* (BMU) Load and expand 8-bit input out to 16-bits */
+	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load and compute branch metrics */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+	m2 = _mm_load_si128((__m128i *) &out[16]);
+	m3 = _mm_load_si128((__m128i *) &out[24]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)
+
+	m0 = _mm_load_si128((__m128i *) &out[32]);
+	m1 = _mm_load_si128((__m128i *) &out[40]);
+	m2 = _mm_load_si128((__m128i *) &out[48]);
+	m3 = _mm_load_si128((__m128i *) &out[56]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)
+
+	m0 = _mm_load_si128((__m128i *) &out[64]);
+	m1 = _mm_load_si128((__m128i *) &out[72]);
+	m2 = _mm_load_si128((__m128i *) &out[80]);
+	m3 = _mm_load_si128((__m128i *) &out[88]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)
+
+	m0 = _mm_load_si128((__m128i *) &out[96]);
+	m1 = _mm_load_si128((__m128i *) &out[104]);
+	m2 = _mm_load_si128((__m128i *) &out[112]);
+	m3 = _mm_load_si128((__m128i *) &out[120]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)
+
+	/* (PMU) Butterflies: 0-15 */
+	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
+	SSE_BUTTERFLY(m10, m11, m5, m2, m3)
+
+	_mm_store_si128((__m128i *) &paths[0], m0);
+	_mm_store_si128((__m128i *) &paths[8], m2);
+	_mm_store_si128((__m128i *) &paths[32], m9);
+	_mm_store_si128((__m128i *) &paths[40], m11);
+
+	/* (PMU) Butterflies: 17-31 */
+	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
+	SSE_BUTTERFLY(m14, m15, m7, m9, m11)
+
+	_mm_store_si128((__m128i *) &paths[16], m0);
+	_mm_store_si128((__m128i *) &paths[24], m9);
+	_mm_store_si128((__m128i *) &paths[48], m13);
+	_mm_store_si128((__m128i *) &paths[56], m15);
+
+	if (norm)
+		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
+				 m7, m11, m0, m8, m9, m10)
+
+	_mm_store_si128((__m128i *) &sums[0], m4);
+	_mm_store_si128((__m128i *) &sums[8], m5);
+	_mm_store_si128((__m128i *) &sums[16], m6);
+	_mm_store_si128((__m128i *) &sums[24], m7);
+	_mm_store_si128((__m128i *) &sums[32], m1);
+	_mm_store_si128((__m128i *) &sums[40], m3);
+	_mm_store_si128((__m128i *) &sums[48], m2);
+	_mm_store_si128((__m128i *) &sums[56], m11);
+}