srsRAN/lib/src/phy/fec/turbodecoder_sse.c

685 lines
20 KiB
C

/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include "srslte/phy/fec/turbodecoder_sse.h"
#include "srslte/phy/utils/vector.h"
#include <inttypes.h>
#ifdef LV_HAVE_SSE
#include <smmintrin.h>
#include <srslte/phy/fec/turbodecoder_sse.h>
#endif
#define NUMSTATES 8
#define NINPUTS 2
#define TAIL 3
#define TOTALTAIL 12
#define INF 10000
#ifdef LV_HAVE_SSE
#define debug_enabled 0
#if debug_enabled
#define debug_state(c,d) printf("k=%5d, in=%5d, pa=%5d, out=%5d, alpha=", k-d,\
s->branch[2*(k-d)] + s->branch[2*(k-d)+1], \
-s->branch[2*(k-d)] + s->branch[2*(k-d)+1], output[k-d]);print_128i(alpha_k);\
printf(", beta=");print_128i(beta_k);printf("\n");
static void print_128i(__m128i x) {
int16_t *s = (int16_t*) &x;
printf("[%5d", s[0]);
for (int i=1;i<8;i++) {
printf(",%5d", s[i]);
}
printf("]");
}
static uint32_t max_128i(__m128i x) {
int16_t *s = (int16_t*) &x;
int16_t m = -INF;
uint32_t max = 0;
for (int i=1;i<8;i++) {
if (s[i] > m) {
max = i;
m = s[i];
}
}
return max;
}
#else
#define debug_state(c,d)
#endif
//#define use_beta_transposed_max
#ifndef use_beta_transposed_max
/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */
static inline int16_t hMax(__m128i buffer)
{
__m128i tmp1 = _mm_sub_epi16(_mm_set1_epi16(0x7FFF), buffer);
__m128i tmp3 = _mm_minpos_epu16(tmp1);
return (int16_t)(_mm_cvtsi128_si32(tmp3));
}
/* Computes beta values */
void tdec_sse_beta(tdec_sse_t * s, int16_t * output, uint32_t long_cb)
{
int k;
uint32_t end = long_cb + 3;
const __m128i *alphaPtr = (const __m128i*) s->alpha;
__m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
__m128i g, bp, bn, alpha_k;
/* Define the shuffle constant for the positive beta */
__m128i shuf_bp = _mm_set_epi8(
15, 14, // 7
7, 6, // 3
5, 4, // 2
13, 12, // 6
11, 10, // 5
3, 2, // 1
1, 0, // 0
9, 8 // 4
);
/* Define the shuffle constant for the negative beta */
__m128i shuf_bn = _mm_set_epi8(
7, 6, // 3
15, 14, // 7
13, 12, // 6
5, 4, // 2
3, 2, // 1
11, 10, // 5
9, 8, // 4
1, 0 // 0
);
alphaPtr += long_cb-1;
/* Define shuffle for branch costs */
__m128i shuf_g[4];
shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
__m128i gv;
int16_t *b = &s->branch[2*long_cb-8];
__m128i *gPtr = (__m128i*) b;
/* Define shuffle for beta normalization */
__m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
/* This defines a beta computation step:
* Adds and substracts the branch metrics to the previous beta step,
* shuffles the states according to the trellis path and selects maximum state
*/
#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g);\
bn = _mm_sub_epi16(beta_k, g);\
bp = _mm_shuffle_epi8(bp, shuf_bp);\
bn = _mm_shuffle_epi8(bn, shuf_bn);\
beta_k = _mm_max_epi16(bp, bn);
/* Loads the alpha metrics from memory and adds them to the temporal bn and bp
* metrics. Then computes horizontal maximum of both metrics and computes difference
*/
#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\
BETA_STEP(g)\
alpha_k = _mm_load_si128(alphaPtr);\
alphaPtr--;\
bp = _mm_add_epi16(bp, alpha_k);\
bn = _mm_add_epi16(bn, alpha_k);\
output[k-d] = hMax(bn)-hMax(bp);\
debug_state(c,d);
/* The tail does not require to load alpha or produce outputs. Only update
* beta metrics accordingly */
for (k=end-1; k>=long_cb; k--) {
int16_t g0 = s->branch[2*k];
int16_t g1 = s->branch[2*k+1];
g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
BETA_STEP(g);
}
/* We inline 2 trelis steps for each normalization */
__m128i norm;
for (; k >= 0; k-=8) {
gv = _mm_load_si128(gPtr);
gPtr--;
BETA_STEP_CNT(0,0);
BETA_STEP_CNT(1,1);
BETA_STEP_CNT(2,2);
BETA_STEP_CNT(3,3);
norm = _mm_shuffle_epi8(beta_k, shuf_norm);
beta_k = _mm_sub_epi16(beta_k, norm);
gv = _mm_load_si128(gPtr);
gPtr--;
BETA_STEP_CNT(0,4);
BETA_STEP_CNT(1,5);
BETA_STEP_CNT(2,6);
BETA_STEP_CNT(3,7);
norm = _mm_shuffle_epi8(beta_k, shuf_norm);
beta_k = _mm_sub_epi16(beta_k, norm);
}
}
#endif
/* Computes alpha metrics */
void tdec_sse_alpha(tdec_sse_t * s, uint32_t long_cb)
{
uint32_t k;
int16_t *alpha = s->alpha;
uint32_t i;
alpha[0] = 0;
for (i = 1; i < 8; i++) {
alpha[i] = -INF;
}
/* Define the shuffle constant for the positive alpha */
__m128i shuf_ap = _mm_set_epi8(
15, 14, // 7
9, 8, // 4
7, 6, // 3
1, 0, // 0
13, 12, // 6
11, 10, // 5
5, 4, // 2
3, 2 // 1
);
/* Define the shuffle constant for the negative alpha */
__m128i shuf_an = _mm_set_epi8(
13, 12, // 6
11, 10, // 5
5, 4, // 2
3, 2, // 1
15, 14, // 7
9, 8, // 4
7, 6, // 3
1, 0 // 0
);
/* Define shuffle for branch costs */
__m128i shuf_g[4];
shuf_g[0] = _mm_set_epi8(3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2);
shuf_g[1] = _mm_set_epi8(7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6);
shuf_g[2] = _mm_set_epi8(11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10);
shuf_g[3] = _mm_set_epi8(15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14);
__m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
__m128i* alphaPtr = (__m128i*) alpha;
alphaPtr++;
__m128i gv;
__m128i *gPtr = (__m128i*) s->branch;
__m128i g, ap, an;
__m128i alpha_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
/* This defines a alpha computation step:
* Adds and substracts the branch metrics to the previous alpha step,
* shuffles the states according to the trellis path and selects maximum state
*/
#define ALPHA_STEP(c) g = _mm_shuffle_epi8(gv, shuf_g[c]); \
ap = _mm_add_epi16(alpha_k, g);\
an = _mm_sub_epi16(alpha_k, g);\
ap = _mm_shuffle_epi8(ap, shuf_ap);\
an = _mm_shuffle_epi8(an, shuf_an);\
alpha_k = _mm_max_epi16(ap, an);\
_mm_store_si128(alphaPtr, alpha_k);\
alphaPtr++; \
/* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */
__m128i norm;
for (k = 0; k < long_cb/8; k++) {
gv = _mm_load_si128(gPtr);
gPtr++;
ALPHA_STEP(0);
ALPHA_STEP(1);
ALPHA_STEP(2);
ALPHA_STEP(3);
norm = _mm_shuffle_epi8(alpha_k, shuf_norm);
alpha_k = _mm_sub_epi16(alpha_k, norm);
gv = _mm_load_si128(gPtr);
gPtr++;
ALPHA_STEP(0);
ALPHA_STEP(1);
ALPHA_STEP(2);
ALPHA_STEP(3);
norm = _mm_shuffle_epi8(alpha_k, shuf_norm);
alpha_k = _mm_sub_epi16(alpha_k, norm);
}
}
/* Compute branch metrics (gamma) */
void tdec_sse_gamma(tdec_sse_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb)
{
__m128i res00, res10, res01, res11, res0, res1;
__m128i in, ap, pa, g1, g0;
__m128i *inPtr = (__m128i*) input;
__m128i *appPtr = (__m128i*) app;
__m128i *paPtr = (__m128i*) parity;
__m128i *resPtr = (__m128i*) h->branch;
__m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
__m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
__m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
__m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
for (int i=0;i<long_cb/8;i++) {
in = _mm_load_si128(inPtr);
inPtr++;
pa = _mm_load_si128(paPtr);
paPtr++;
if (appPtr) {
ap = _mm_load_si128(appPtr);
appPtr++;
in = _mm_add_epi16(ap, in);
}
g1 = _mm_add_epi16(in, pa);
g0 = _mm_sub_epi16(in, pa);
g1 = _mm_srai_epi16(g1, 1);
g0 = _mm_srai_epi16(g0, 1);
res00 = _mm_shuffle_epi8(g0, res00_mask);
res10 = _mm_shuffle_epi8(g0, res10_mask);
res01 = _mm_shuffle_epi8(g1, res01_mask);
res11 = _mm_shuffle_epi8(g1, res11_mask);
res0 = _mm_or_si128(res00, res01);
res1 = _mm_or_si128(res10, res11);
_mm_store_si128(resPtr, res0);
resPtr++;
_mm_store_si128(resPtr, res1);
resPtr++;
//printf("k=%d, in=%d, pa=%d, g0=%d, g1=%d\n", i, input[i], parity[i], h->branch[2*i], h->branch[2*i+1]);
}
for (int i=long_cb;i<long_cb+3;i++) {
h->branch[2*i] = (input[i] - parity[i])/2;
h->branch[2*i+1] = (input[i] + parity[i])/2;
}
}
/* Inititalizes constituent decoder object */
int tdec_sse_init(void **hh, uint32_t max_long_cb)
{
*hh = calloc(1, sizeof(tdec_sse_t));
tdec_sse_t *h = (tdec_sse_t*) *hh;
h->max_long_cb = max_long_cb;
h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + TOTALTAIL + 1) * NUMSTATES);
if (!h->alpha) {
perror("srslte_vec_malloc");
return -1;
}
h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + TOTALTAIL + 1) * NUMSTATES);
if (!h->branch) {
perror("srslte_vec_malloc");
return -1;
}
return 1;
}
void tdec_sse_free(void *hh)
{
tdec_sse_t *h = (tdec_sse_t*) hh;
if (h->alpha) {
free(h->alpha);
}
if (h->branch) {
free(h->branch);
}
bzero(h, sizeof(tdec_sse_t));
}
/* Runs one instance of a decoder */
void tdec_sse_dec(void *hh, int16_t * input, int16_t *app, int16_t * parity,
int16_t *output, uint32_t long_cb)
{
tdec_sse_t *h = (tdec_sse_t*) hh;
// Compute branch metrics
tdec_sse_gamma(h, input, app, parity, long_cb);
// Forward recursion
tdec_sse_alpha(h, long_cb);
// Backwards recursion + LLR computation
tdec_sse_beta(h, output, long_cb);
}
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into
* 3 buffers ready to be used by compute_gamma()
*/
void tdec_sse_extract_input(int16_t *input, int16_t *syst0, int16_t *app2, int16_t *parity0, int16_t *parity1, uint32_t long_cb) {
uint32_t i;
__m128i *inputPtr = (__m128i*) input;
__m128i in0, in1, in2;
__m128i s0, s1, s2, s;
__m128i p00, p01, p02, p0;
__m128i p10, p11, p12, p1;
__m128i *sysPtr = (__m128i*) syst0;
__m128i *pa0Ptr = (__m128i*) parity0;
__m128i *pa1Ptr = (__m128i*) parity1;
// pick bits 0, 3, 6 from 1st word
__m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
// pick bits 1, 4, 7 from 2st word
__m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 2, 5 from 3rd word
__m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 1, 4, 7 from 1st word
__m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
// pick bits 2, 5, from 2st word
__m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 0, 3, 6 from 3rd word
__m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 2, 5 from 1st word
__m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
// pick bits 0, 3, 6, from 2st word
__m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
// pick bits 1, 4, 7 from 3rd word
__m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// Split systematic and parity bits
for (i = 0; i < long_cb/8; i++) {
in0 = _mm_load_si128(inputPtr); inputPtr++;
in1 = _mm_load_si128(inputPtr); inputPtr++;
in2 = _mm_load_si128(inputPtr); inputPtr++;
/* Deinterleave Systematic bits */
s0 = _mm_shuffle_epi8(in0, s0_mask);
s1 = _mm_shuffle_epi8(in1, s1_mask);
s2 = _mm_shuffle_epi8(in2, s2_mask);
s = _mm_or_si128(s0, s1);
s = _mm_or_si128(s, s2);
_mm_store_si128(sysPtr, s);
sysPtr++;
/* Deinterleave parity 0 bits */
p00 = _mm_shuffle_epi8(in0, p00_mask);
p01 = _mm_shuffle_epi8(in1, p01_mask);
p02 = _mm_shuffle_epi8(in2, p02_mask);
p0 = _mm_or_si128(p00, p01);
p0 = _mm_or_si128(p0, p02);
_mm_store_si128(pa0Ptr, p0);
pa0Ptr++;
/* Deinterleave parity 1 bits */
p10 = _mm_shuffle_epi8(in0, p10_mask);
p11 = _mm_shuffle_epi8(in1, p11_mask);
p12 = _mm_shuffle_epi8(in2, p12_mask);
p1 = _mm_or_si128(p10, p11);
p1 = _mm_or_si128(p1, p12);
_mm_store_si128(pa1Ptr, p1);
pa1Ptr++;
}
for (i = 0; i < 3; i++) {
syst0[i+long_cb] = input[3*long_cb + 2*i];
parity0[i+long_cb] = input[3*long_cb + 2*i + 1];
}
for (i = 0; i < 3; i++) {
app2[i+long_cb] = input[3*long_cb + 6 + 2*i];
parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
}
}
void tdec_sse_decision_byte(int16_t *app1, uint8_t *output, uint32_t long_cb)
{
uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
// long_cb is always byte aligned
for (uint32_t i = 0; i < long_cb/8; i++) {
uint8_t out0 = app1[8*i+0]>0?mask[0]:0;
uint8_t out1 = app1[8*i+1]>0?mask[1]:0;
uint8_t out2 = app1[8*i+2]>0?mask[2]:0;
uint8_t out3 = app1[8*i+3]>0?mask[3]:0;
uint8_t out4 = app1[8*i+4]>0?mask[4]:0;
uint8_t out5 = app1[8*i+5]>0?mask[5]:0;
uint8_t out6 = app1[8*i+6]>0?mask[6]:0;
uint8_t out7 = app1[8*i+7]>0?mask[7]:0;
output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
}
}
/***********************
*
* This is an attempt to parallelize the horizontal max
* by doing a 8x8 tranpose of the vectors and computing max
* in cascade. However since we need to store 16 registers
* for the positive and negative values the performance is not very good
*/
#ifdef use_beta_transposed_max
static inline __m128i transposed_max(__m128i a, __m128i b, __m128i c, __m128i d,
__m128i e, __m128i f, __m128i g, __m128i h)
{
// Transpose 8 vectors
__m128i t0 = _mm_unpacklo_epi16(a, b);
__m128i t1 = _mm_unpacklo_epi16(c, d);
__m128i t2 = _mm_unpacklo_epi16(e, f);
__m128i t3 = _mm_unpacklo_epi16(g, h);
__m128i t4 = _mm_unpackhi_epi16(a, b);
__m128i t5 = _mm_unpackhi_epi16(c, d);
__m128i t6 = _mm_unpackhi_epi16(e, f);
__m128i t7 = _mm_unpackhi_epi16(g, h);
__m128i s0 = _mm_unpacklo_epi32(t0, t1);
__m128i s1 = _mm_unpackhi_epi32(t0, t1);
__m128i s2 = _mm_unpacklo_epi32(t2, t3);
__m128i s3 = _mm_unpackhi_epi32(t2, t3);
__m128i s4 = _mm_unpacklo_epi32(t4, t5);
__m128i s5 = _mm_unpackhi_epi32(t4, t5);
__m128i s6 = _mm_unpacklo_epi32(t6, t7);
__m128i s7 = _mm_unpackhi_epi32(t6, t7);
__m128i x0 = _mm_unpacklo_epi64(s0, s2);
__m128i x1 = _mm_unpackhi_epi64(s0, s2);
__m128i x2 = _mm_unpacklo_epi64(s1, s3);
__m128i x3 = _mm_unpackhi_epi64(s1, s3);
__m128i x4 = _mm_unpacklo_epi64(s4, s6);
__m128i x5 = _mm_unpackhi_epi64(s4, s6);
__m128i x6 = _mm_unpacklo_epi64(s5, s7);
__m128i x7 = _mm_unpackhi_epi64(s5, s7);
// Cascade max on the transposed vector
__m128i res = _mm_max_epi16(x0,
_mm_max_epi16(x1,
_mm_max_epi16(x2,
_mm_max_epi16(x3,
_mm_max_epi16(x4,
_mm_max_epi16(x5,
_mm_max_epi16(x6,
x7)))))));
return res;
}
void tdec_sse_beta(tdec_sse_t * s, int16_t * output, uint32_t long_cb)
{
int k;
uint32_t end = long_cb + 3;
const __m128i *alphaPtr = (const __m128i*) s->alpha;
__m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
__m128i g, alpha_k;
__m128i bn, bn_0, bn_1, bn_2, bn_3, bn_4, bn_5, bn_6, bn_7;
__m128i bp, bp_0, bp_1, bp_2, bp_3, bp_4, bp_5, bp_6, bp_7;
/* Define the shuffle constant for the positive beta */
__m128i shuf_bp = _mm_set_epi8(
15, 14, // 7
7, 6, // 3
5, 4, // 2
13, 12, // 6
11, 10, // 5
3, 2, // 1
1, 0, // 0
9, 8 // 4
);
/* Define the shuffle constant for the negative beta */
__m128i shuf_bn = _mm_set_epi8(
7, 6, // 3
15, 14, // 7
13, 12, // 6
5, 4, // 2
3, 2, // 1
11, 10, // 5
9, 8, // 4
1, 0 // 0
);
alphaPtr += long_cb-1;
/* Define shuffle for branch costs */
__m128i shuf_g[4];
shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
__m128i gv;
int16_t *b = &s->branch[2*long_cb-8];
__m128i *gPtr = (__m128i*) b;
/* Define shuffle for beta normalization */
__m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
/* This defines a beta computation step:
* Adds and substracts the branch metrics to the previous beta step,
* shuffles the states according to the trellis path and selects maximum state
*/
#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g);\
bn = _mm_sub_epi16(beta_k, g);\
bp = _mm_shuffle_epi8(bp, shuf_bp);\
bn = _mm_shuffle_epi8(bn, shuf_bn);\
beta_k = _mm_max_epi16(bp, bn);
/* Loads the alpha metrics from memory and adds them to the temporal bn and bp
* metrics.
*/
#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\
BETA_STEP(g)\
alpha_k = _mm_load_si128(alphaPtr);\
alphaPtr--;\
bp_##d = _mm_add_epi16(bp, alpha_k);\
bn_##d = _mm_add_epi16(bn, alpha_k);\
/* The tail does not require to load alpha or produce outputs. Only update
* beta metrics accordingly */
for (k=end-1; k>=long_cb; k--) {
int16_t g0 = s->branch[2*k];
int16_t g1 = s->branch[2*k+1];
g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
BETA_STEP(g);
}
/* We inline 2 trelis steps for each normalization */
__m128i norm;
__m128i *outPtr = (__m128i*) &output[long_cb-8];
for (; k >= 0; k-=8) {
gv = _mm_load_si128(gPtr);
gPtr--;
BETA_STEP_CNT(0,0);
BETA_STEP_CNT(1,1);
BETA_STEP_CNT(2,2);
BETA_STEP_CNT(3,3);
norm = _mm_shuffle_epi8(beta_k, shuf_norm);
beta_k = _mm_sub_epi16(beta_k, norm);
gv = _mm_load_si128(gPtr);
gPtr--;
BETA_STEP_CNT(0,4);
BETA_STEP_CNT(1,5);
BETA_STEP_CNT(2,6);
BETA_STEP_CNT(3,7);
norm = _mm_shuffle_epi8(beta_k, shuf_norm);
beta_k = _mm_sub_epi16(beta_k, norm);
__m128i bn_transp = transposed_max(bn_7, bn_6, bn_5, bn_4, bn_3, bn_2, bn_1, bn_0);
__m128i bp_transp = transposed_max(bp_7, bp_6, bp_5, bp_4, bp_3, bp_2, bp_1, bp_0);
__m128i outval = _mm_sub_epi16(bp_transp,bn_transp);
_mm_store_si128(outPtr, outval);
outPtr--;
}
}
#endif
#endif