Merge branch 'next' into next_novolk

2017-06-22 18:08:14 +02:00 · 2017-06-22 18:08:14 +02:00 · 681a2455d9
parent ec34d56e77 c47c4b8b03
commit 681a2455d9
42 changed files with 2855 additions and 970 deletions
--- a/9
+++ b/9
@ -1,6 +1,15 @@
 Change Log for Releases
 ==============================

+## 002.000.000
+  * Added fully functional srsENB to srsLTE code
+  * Merged srsUE code into srsLTE and reestructured PHY code 
+  * Added support for SoapySDR devices (eg LimeSDR)
+  * Fixed issues in RLC AM 
+  * Added support for NEON and AVX in many kernels and Viterbi decoder
+  * Added support for CPU affinity
+  * Other minor bug-fixes and new features 
+
 ## 001.004.000
  * Fixed issue in rv for format1C causing incorrect SIB1 decoding in some networks
  * Improved PDCCH decoding BER (fixed incorrect trellis initialization)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -213,6 +213,7 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  endif (HAVE_AVX2)
 endif(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")

+ADD_CXX_COMPILER_FLAG_IF_AVAILABLE("-Werror=incompatible-pointer-types" HAVE_ERROR_INCOMPATIBLE)

 if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${GCC_ARCH} -Wall -Wno-comment -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE")
--- a/README.md
+++ b/README.md
@ -3,31 +3,61 @@ srsLTE

 [![Coverity Scan Build Status](https://scan.coverity.com/projects/10045/badge.svg)](https://scan.coverity.com/projects/10045)

-srsLTE is a free and open-source LTE library for SDR UE and eNodeB developed by SRS (www.softwareradiosystems.com). The library is highly modular with minimum inter-module or external dependencies. It is entirely written in C and, if available in the system, uses the acceleration library VOLK distributed in GNURadio. 
+srsLTE is a free and open-source LTE software suite developed by SRS (www.softwareradiosystems.com). 

-**srsLTE is used by srsUE, a full stack (PHY to IP) implementation of an LTE UE. srsUE is available at https://github.com/srslte/srsue**   
+It includes:
+  * srsUE - a complete SDR LTE UE application featuring all layers from PHY to IP
+  * srsENB - a complete SDR LTE eNodeB application 
+  * a highly modular set of common libraries for PHY, MAC, RLC, PDCP, RRC, NAS, S1AP and GW layers. 

+srsLTE is released under the AGPLv3 license and uses software from the OpenLTE project (http://sourceforge.net/projects/openlte) for some security functions and for RRC/NAS message parsing.

-The srsLTE software license is AGPLv3.
+Common Features
+---------------

-Current Features: 
 * LTE Release 8 compliant
 * FDD configuration
 * Tested bandwidths: 1.4, 3, 5, 10, 15 and 20 MHz
 * Transmission mode 1 (single antenna) and 2 (transmit diversity) 
- * Cell search and synchronization procedure for the UE
- * All DL channels/signals are supported for UE and eNodeB side: PSS, SSS, PBCH, PCFICH, PHICH, PDCCH, PDSCH
- * All UL channels/signals are supported for UE side: PRACH, PUSCH, PUCCH, SRS 
 * Frequency-based ZF and MMSE equalizer
 * Highly optimized Turbo Decoder available in Intel SSE4.1/AVX (+100 Mbps) and standard C (+25 Mbps)
- * UE receiver tested and verified with Amarisoft LTE 100 eNodeB and commercial LTE networks (Telefonica Spain, Three.ie and Eircom in Ireland)
+ * MAC, RLC, PDCP, RRC, NAS, S1AP and GW layers
+ * Detailed log system with per-layer log levels and hex dumps
+ * MAC layer wireshark packet capture
+ * Command-line trace metrics
+ * Detailed input configuration files

-Missing Features: 
- * Closed-loop power control 
- * Semi-Persistent Scheduling
+srsUE Features
+--------------
+ 
+ * Cell search and synchronization procedure for the UE
+ * Soft USIM supporting Milenage and XOR authentication 
+ * Virtual network interface *tun_srsue* created upon network attach
+ * 75 Mbps DL in 20 MHz SISO configuration in i7 Quad-Core CPU.
+ * 36 Mbps DL in 10 MHz SISO configuration in i5 Dual-Core CPU.
+
+srsUE has been fully tested and validated with the following network equipment: 
+ * Amarisoft LTE100 eNodeB and EPC
+ * Nokia FlexiRadio family FSMF system module with 1800MHz FHED radio module and TravelHawk EPC simulator
+ * Huawei DBS3900 
+ * Octasic Flexicell LTE-FDD NIB
+
+srsENB Features
+---------------
+
+ * Round Robin MAC scheduler with FAPI-like C++ API
+ * SR support
+ * Periodic and Aperiodic CQI feedback support
+ * Standard S1AP and GTP-U interfaces to the Core Network
+ * Tested up to 75 Mbps DL in SISO configuration with commercial UEs 
+
+srsENB has been tested and validated with the following handsets:
+ * LG Nexus 5
+ * LG Nexus 4
+ * Motorola Moto G4 plus

 Hardware
-========
+--------

 The library currently supports the Ettus Universal Hardware Driver (UHD) and the bladeRF driver. Thus, any hardware supported by UHD or bladeRF can be used. There is no sampling rate conversion, therefore the hardware should support 30.72 MHz clock in order to work correctly with LTE sampling frequencies and decode signals from live LTE base stations. 

@ -35,15 +65,29 @@ We have tested the following hardware:
 * USRP B210
 * USRP X300
 * bladeRF
+ * limeSDR

-Download & Install Instructions
-=================================
+Build Instructions
+------------------
+
+* Mandatory requirements: 
+  * Common:
+    * libfftw            http://www.fftw.org/
+    * PolarSSL/mbedTLS   https://tls.mbed.org
+  * srsUE:
+    * Boost:             http://www.boost.org
+  * srsENB:
+    * Boost:             http://www.boost.org
+    * lksctp:            http://lksctp.sourceforge.net/
+    * config:            http://www.hyperrealm.com/libconfig/

-* Mandatory dependencies: 
-  * libfftw
 * Optional requirements: 
-  * srsgui:        for real-time plotting. Download it here: https://github.com/srslte/srsgui 
-  * VOLK:          if the VOLK library and headers are detected, they will be used for accelerating some signal processing functions.  
+  * srsgui:              https://github.com/srslte/srsgui - for real-time plotting.
+  * VOLK:                https://github.com/gnuradio/volk -  if the VOLK library and headers are detected, they will be used to accelerate some signal processing functions.
+
+* RF front-end driver:
+  * UHD:                 https://github.com/EttusResearch/uhd
+  * BladeRF:             https://github.com/Nuand/bladeRF

 Download and build srsLTE: 
 ```
@ -55,64 +99,27 @@ cmake ../
 make 
 ```

-The library can also be installed using the command ```sudo make install```. 
+The software suite can also be installed using the command ```sudo make install```. 

-Running srsLTE Examples
-========================
+Execution Instructions
+----------------------

-* SIB1 reception and UE measurement from commercial LTE networks: 
+The srsUE and srsENB applications include example configuration files. Execute the applications with root privileges to enable real-time thread priorities and to permit creation of virtual network interfaces.
+
+### srsUE
+
+Run the srsUE application as follows:
 ```
-lte/examples/pdsch_ue -f [frequency_in_Hz]
-```
-Where -f is the LTE channel frequency. 
-
-* eNodeB to UE Downlink PHY test
-
-You will need two computers, each equipped with a USRP. At the transmitter side, run: 
-
-```
-lte/examples/pdsch_enodeb -f [frequency_in_Hz] [-h for more commands]
+sudo ./srsue ue.conf
 ```

-At the receiver run:
+### srsENB
+
+As the srsLTE software suite does not include EPC functionality, a separate EPC is required to run srsENB. Run the application as follows:
 ```
-lte/examples/pdsch_ue -r 1234 -f [frequency_in_Hz]
+sudo ./srsenb enb.conf
 ```

-At the transmitter console, it is possible to change the Modulation and Coding Scheme (MCS) by typing a new number (between 0 and 28) and pressing Enter. 
-
-
-The output at the receiver should look something similar to the following video. In this example, we removed the transmitter and receiver antennas in the middle of the demonstration, showing how reception is still possible (despite with some erros). 
-
-https://www.dropbox.com/s/txh1nuzdb0igq5n/demo_pbch.ogv
-
-![Screenshopt of the PBCH example output](pbch_capture.png "Screenshopt of the PBCH example output")
-
-* Video over Downlink PHY (eNodeB to UE)
-
-The previous example sends random bits to the UE. It is possible to open a TCP socket and stream video over the LTE PHY DL wireless connection. At the transmitter side, run the following command:  
-
-```
-lte/examples/pdsch_enodeb -f [frequency_in_Hz] -u 2000 [-h for more commands]
-```
-
-The argument -u 2000 will open port 2000 for listening for TCP connections. Set a high-order MCS, like 16 by typing 16 in the eNodeB console and pressing Enter. 
-
-```
-lte/examples/pdsch_ue -r 1234 -u 2001 -U 127.0.0.1 -f [frequency_in_Hz]
-```
-
-The arguments -u 2001 -U 127.0.0.1 will forward the data that was injected at the eNodeB to address:port indicated by the argument. Once you have the system running, you can transmit some useful data, like a video stream. At the transmitter side, run:  
-
-```
-avconv -f video4linux2 -i /dev/video0 -c:v mp4 -f mpegts tcp://127.0.0.1:2000 
-```
-to stream the video captured from the webcam throught the local host port 2000. At the receiver, run: 
-
-```
-avplay tcp://127.0.0.1:2001?listen -analyzeduration 100 -loglevel verbose
-```
-to watch the video. 

 Support
 ========
--- a/cmake/modules/SRSLTEVersion.cmake
+++ b/cmake/modules/SRSLTEVersion.cmake
@ -18,7 +18,7 @@
 # and at http://www.gnu.org/licenses/.
 #

-SET(SRSLTE_VERSION_MAJOR 001)
-SET(SRSLTE_VERSION_MINOR 004)
+SET(SRSLTE_VERSION_MAJOR 002)
+SET(SRSLTE_VERSION_MINOR 000)
 SET(SRSLTE_VERSION_PATCH 000)
 SET(SRSLTE_VERSION_STRING "${SRSLTE_VERSION_MAJOR}.${SRSLTE_VERSION_MINOR}.${SRSLTE_VERSION_PATCH}")
--- a/lib/include/srslte/phy/common/phy_common.h
+++ b/lib/include/srslte/phy/common/phy_common.h
@ -53,6 +53,8 @@
 #define SRSLTE_MAX_LAYERS    4
 #define SRSLTE_MAX_CODEWORDS 2

+#define SRSLTE_MAX_CODEBLOCKS 32
+
 #define SRSLTE_LTE_CRC24A  0x1864CFB
 #define SRSLTE_LTE_CRC24B  0X1800063
 #define SRSLTE_LTE_CRC16   0x11021
--- a/lib/include/srslte/phy/fec/turbodecoder.h
+++ b/lib/include/srslte/phy/fec/turbodecoder.h
@ -52,12 +52,14 @@
 #include "srslte/phy/fec/turbodecoder_gen.h"

 #ifdef LV_HAVE_SSE
-#include "srslte/phy/fec/turbodecoder_sse.h"
+#include "srslte/phy/fec/turbodecoder_simd.h"
+#else
+#define SRSLTE_TDEC_NPAR 1
 #endif

 typedef struct SRSLTE_API {
 #ifdef LV_HAVE_SSE
-  srslte_tdec_sse_t tdec_sse;
+  srslte_tdec_simd_t tdec_simd;
 #else
  float *input_conv; 
  srslte_tdec_gen_t tdec_gen;
@ -69,7 +71,16 @@ SRSLTE_API int srslte_tdec_init(srslte_tdec_t * h,

 SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h);

-SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb);
+SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, 
+                                 uint32_t long_cb);
+
+SRSLTE_API int srslte_tdec_reset_cb(srslte_tdec_t * h, 
+                                    uint32_t cb_idx);
+
+SRSLTE_API int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h, 
+                                                 uint32_t cb_idx);
+
+SRSLTE_API int srslte_tdec_get_nof_parallel(srslte_tdec_t * h); 

 SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h, 
                                      int16_t* input, 
@ -89,4 +100,27 @@ SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h,
                                   uint32_t nof_iterations, 
                                   uint32_t long_cb);

+SRSLTE_API void srslte_tdec_iteration_par(srslte_tdec_t * h, 
+                                          int16_t* input[SRSLTE_TDEC_NPAR], 
+                                          uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_decision_par(srslte_tdec_t * h, 
+                                         uint8_t *output[SRSLTE_TDEC_NPAR], 
+                                         uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_decision_byte_par(srslte_tdec_t * h, 
+                                              uint8_t *output[SRSLTE_TDEC_NPAR], 
+                                              uint32_t long_cb); 
+
+SRSLTE_API void srslte_tdec_decision_byte_par_cb(srslte_tdec_t * h, 
+                                                 uint8_t *output, 
+                                                 uint32_t cb_idx, 
+                                                 uint32_t long_cb);
+
+SRSLTE_API int srslte_tdec_run_all_par(srslte_tdec_t * h, 
+                                       int16_t * input[SRSLTE_TDEC_NPAR], 
+                                       uint8_t *output[SRSLTE_TDEC_NPAR],
+                                       uint32_t nof_iterations, 
+                                       uint32_t long_cb);
+
 #endif
--- a/lib/include/srslte/phy/fec/turbodecoder_gen.h
+++ b/lib/include/srslte/phy/fec/turbodecoder_gen.h
@ -66,6 +66,8 @@ typedef struct SRSLTE_API {
  float *parity;

  int current_cbidx; 
+  uint32_t current_cb_len;
+  uint32_t n_iter; 
  srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
 } srslte_tdec_gen_t;

--- a/lib/include/srslte/phy/fec/turbodecoder_simd.h
+++ b/lib/include/srslte/phy/fec/turbodecoder_simd.h
@ -0,0 +1,135 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+/**********************************************************************************************
+ *  File:         turbodecoder.h
+ *
+ *  Description:  Turbo Decoder.
+ *                Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
+ *                encoders and one turbo code internal interleaver. The coding rate of turbo
+ *                encoder is 1/3.
+ *                MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
+ *
+ *  Reference:    3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
+ *********************************************************************************************/
+
+#ifndef TURBODECODER_SSE_
+#define TURBODECODER_SSE_
+
+#include "srslte/config.h"
+#include "srslte/phy/fec/tc_interl.h"
+#include "srslte/phy/fec/cbsegm.h"
+
+//#define ENABLE_SIMD_INTER
+
+// The constant SRSLTE_TDEC_NPAR defines the maximum number of parallel CB supported by all SIMD decoders 
+#ifdef ENABLE_SIMD_INTER
+  #include "srslte/phy/fec/turbodecoder_simd_inter.h"
+  #if LV_HAVE_AVX2
+    #define SRSLTE_TDEC_NPAR_INTRA 2
+  #else
+    #define SRSLTE_TDEC_NPAR_INTRA 1
+  #endif
+#else
+  #if LV_HAVE_AVX2
+    #define SRSLTE_TDEC_NPAR 2
+  #else
+    #define SRSLTE_TDEC_NPAR 1
+  #endif
+#endif
+
+#define SRSLTE_TCOD_RATE 3
+#define SRSLTE_TCOD_TOTALTAIL 12
+
+#define SRSLTE_TCOD_MAX_LEN_CB     6144
+#define SRSLTE_TCOD_MAX_LEN_CODED  (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL)
+
+typedef struct SRSLTE_API {
+  uint32_t max_long_cb;
+  uint32_t max_par_cb; 
+  int16_t *alpha;
+  int16_t *branch;
+} map_gen_t;
+
+typedef struct SRSLTE_API {
+  uint32_t max_long_cb;
+  uint32_t max_par_cb; 
+  
+  map_gen_t dec;
+
+  int16_t *app1[SRSLTE_TDEC_NPAR];
+  int16_t *app2[SRSLTE_TDEC_NPAR];
+  int16_t *ext1[SRSLTE_TDEC_NPAR];
+  int16_t *ext2[SRSLTE_TDEC_NPAR];
+  int16_t *syst[SRSLTE_TDEC_NPAR];
+  int16_t *parity0[SRSLTE_TDEC_NPAR];
+  int16_t *parity1[SRSLTE_TDEC_NPAR];
+  
+  int cb_mask; 
+  int current_cbidx; 
+  srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
+  int n_iter[SRSLTE_TDEC_NPAR];
+} srslte_tdec_simd_t;
+
+SRSLTE_API int srslte_tdec_simd_init(srslte_tdec_simd_t * h, 
+                                     uint32_t max_par_cb, 
+                                     uint32_t max_long_cb);
+
+SRSLTE_API void srslte_tdec_simd_free(srslte_tdec_simd_t * h);
+
+SRSLTE_API int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, 
+                                      uint32_t long_cb);
+
+SRSLTE_API int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h, 
+                                                      uint32_t cb_idx);
+
+SRSLTE_API int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h, 
+                                         uint32_t cb_idx);
+
+SRSLTE_API void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, 
+                                           int16_t * input[SRSLTE_TDEC_NPAR], 
+                                           uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, 
+                                          uint8_t *output[SRSLTE_TDEC_NPAR], 
+                                          uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, 
+                                               uint8_t *output[SRSLTE_TDEC_NPAR], 
+                                               uint32_t long_cb); 
+
+SRSLTE_API void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h, 
+                                                  uint8_t *output, 
+                                                  uint32_t cbidx, 
+                                                  uint32_t long_cb); 
+
+SRSLTE_API int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, 
+                                        int16_t * input[SRSLTE_TDEC_NPAR], 
+                                        uint8_t *output[SRSLTE_TDEC_NPAR],
+                                        uint32_t nof_iterations, 
+                                        uint32_t long_cb);
+
+#endif
--- a/lib/include/srslte/phy/fec/turbodecoder_simd_inter.h
+++ b/lib/include/srslte/phy/fec/turbodecoder_simd_inter.h
@ -0,0 +1,119 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+/**********************************************************************************************
+ *  File:         turbodecoder.h
+ *
+ *  Description:  Turbo Decoder.
+ *                Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
+ *                encoders and one turbo code internal interleaver. The coding rate of turbo
+ *                encoder is 1/3.
+ *                MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
+ *
+ *  Reference:    3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
+ *********************************************************************************************/
+
+#ifndef TURBODECODER_SSE_INTER_
+#define TURBODECODER_SSE_INTER_
+
+
+/** This is an simd inter-frame parallel turbo decoder. Parallizes 8 code-blocks using SSE
+ *  This implementation is currently not functional and not used by the rest of the code 
+ */
+
+#include "srslte/config.h"
+#include "srslte/phy/fec/tc_interl.h"
+#include "srslte/phy/fec/cbsegm.h"
+
+#if LV_HAVE_AVX2
+  #define SRSLTE_TDEC_NPAR 16
+#else
+  #define SRSLTE_TDEC_NPAR 8
+#endif
+
+
+typedef struct SRSLTE_API {
+  int max_long_cb;
+
+  int16_t *syst0;
+  int16_t *parity0;
+  int16_t *syst1;
+  int16_t *parity1;
+  int16_t *llr1;
+  int16_t *llr2;
+  int16_t *w;
+  int16_t *alpha;
+
+  uint32_t max_par_cb;   
+  int current_cbidx; 
+  uint32_t current_long_cb;
+  srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
+  int n_iter[SRSLTE_TDEC_NPAR];
+} srslte_tdec_simd_inter_t;
+
+SRSLTE_API int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h, 
+                                           uint32_t max_par_cb, 
+                                           uint32_t max_long_cb);
+
+SRSLTE_API void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h);
+
+SRSLTE_API int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h, 
+                                      uint32_t long_cb);
+
+SRSLTE_API int srslte_tdec_simd_inter_get_nof_iterations_cb(srslte_tdec_simd_inter_t * h, 
+                                                      uint32_t cb_idx);
+
+SRSLTE_API int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h, 
+                                         uint32_t cb_idx);
+
+SRSLTE_API void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h, 
+                                           int16_t * input[SRSLTE_TDEC_NPAR], 
+                                           uint32_t nof_cb,
+                                           uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h, 
+                                          uint8_t *output[SRSLTE_TDEC_NPAR], 
+                                          uint32_t nof_cb,
+                                          uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h, 
+                                               uint8_t *output[SRSLTE_TDEC_NPAR], 
+                                               uint32_t nof_cb,
+                                               uint32_t long_cb); 
+
+SRSLTE_API void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h, 
+                                                  uint8_t *output, 
+                                                  uint32_t cbidx, 
+                                                  uint32_t long_cb); 
+
+SRSLTE_API int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h, 
+                                        int16_t *input[SRSLTE_TDEC_NPAR], 
+                                        uint8_t *output[SRSLTE_TDEC_NPAR],
+                                        uint32_t nof_iterations, 
+                                        uint32_t nof_cb,
+                                        uint32_t long_cb);
+
+#endif
--- a/lib/include/srslte/phy/utils/vector_simd.h
+++ b/lib/include/srslte/phy/utils/vector_simd.h
@ -37,44 +37,52 @@ extern "C" {

 SRSLTE_API int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len); 

-SRSLTE_API int srslte_vec_dot_prod_sss_avx(short *x, short *y, uint32_t len); 
+SRSLTE_API int srslte_vec_dot_prod_sss_avx2(short *x, short *y, uint32_t len); 

-
-  
 SRSLTE_API void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len);

-SRSLTE_API void srslte_vec_sum_sss_avx(short *x, short *y, short *z, uint32_t len);
-
-
+SRSLTE_API void srslte_vec_sum_sss_avx2(short *x, short *y, short *z, uint32_t len);

 SRSLTE_API void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len); 

-SRSLTE_API void srslte_vec_sub_sss_avx(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len);

+SRSLTE_API void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len);

+SRSLTE_API void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len);

+SRSLTE_API void srslte_vec_sc_prod_fff_sse(float *x, float h, float *z, uint32_t len);

+SRSLTE_API void srslte_vec_sc_prod_ccc_sse(cf_t *x, cf_t h, cf_t *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_prod_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_prod_sss_avx2(short *x, short *y, short *z, uint32_t len);
+
+SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_sse(cf_t *x, cf_t *y, uint32_t len);
+
+SRSLTE_API void srslte_vec_prod_conj_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len);
+
+SRSLTE_API cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len);
+
+SRSLTE_API  void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_abs_square_cf_sse(cf_t *x, float *z, uint32_t len);

 SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len); 

 SRSLTE_API void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len);

-
 SRSLTE_API void srslte_vec_sc_div2_sss_sse(short *x, int n_rightshift, short *z, uint32_t len); 

 SRSLTE_API  void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len);

-
-
-
-
 SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len); 

 SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len); 

-
-
 SRSLTE_API void srslte_vec_mult_scalar_cf_f_avx( cf_t *z,const cf_t *x,const float h,const uint32_t len);
+
 #ifdef __cplusplus
 }
 #endif
--- a/lib/src/phy/CMakeLists.txt
+++ b/lib/src/phy/CMakeLists.txt
@ -81,28 +81,6 @@ else(MKL_FOUND)
  target_link_libraries(srslte_phy ${FFTW3F_LIBRARIES})
 endif(MKL_FOUND)

-## This linkage is required for the examples and tests only
-if(RF_FOUND)
-
-  target_link_libraries(srslte_phy)
-
-  if(UHD_FOUND)
-    target_link_libraries(srslte_phy ${UHD_LIBRARIES})
-  endif(UHD_FOUND)
-
-  if(BLADERF_FOUND)
-    target_link_libraries(srslte_phy ${BLADERF_LIBRARIES})
-  endif(BLADERF_FOUND)
-
-  if(LIMESDR_FOUND)
-    target_link_libraries(srslte_phy ${LIMESDR_LIBRARIES})
-  endif(LIMESDR_FOUND)
-
-  if(SOAPYSDR_FOUND)
-    target_link_libraries(srslte_phy ${SOAPYSDR_LIBRARIES})
-  endif(SOAPYSDR_FOUND)
-
-endif(RF_FOUND)

 if(VOLK_FOUND)
  target_link_libraries(srslte_phy ${VOLK_LIBRARIES})
--- a/lib/src/phy/fec/cbsegm.c
+++ b/lib/src/phy/fec/cbsegm.c
@ -48,7 +48,13 @@ const uint32_t tc_cb_sizes[SRSLTE_NOF_TC_CB_SIZES] = { 40, 48, 56, 64, 72, 80, 8
    4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504,
    5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144 };

-/* Calculate Codeblock Segmentation as in Section 5.1.2 of 36.212 */
+/**
+ * Calculate Codeblock Segmentation parameters as in Section 5.1.2 of 36.212
+ *
+ * @param[out] s Output of code block segmentation calculation
+ * @param[in] tbs Input Transport Block Size in bits. CRC's will be added to this
+ * @return Error code
+ */
 int srslte_cbsegm(srslte_cbsegm_t *s, uint32_t tbs) {
  uint32_t Bp, B, idx1;
  int ret; 
@ -104,6 +110,8 @@ int srslte_cbsegm(srslte_cbsegm_t *s, uint32_t tbs) {

 /*
 * Finds index of minimum K>=long_cb in Table 5.1.3-3 of 36.212
+ *
+ * @return I_TBS or error code
 */
 int srslte_cbsegm_cbindex(uint32_t long_cb) {
  int j = 0;
@ -120,6 +128,8 @@ int srslte_cbsegm_cbindex(uint32_t long_cb) {

 /*
 * Returns Turbo coder interleaver size for Table 5.1.3-3 (36.212) index
+ *
+ * @return Code block size in bits or error code
 */
 int srslte_cbsegm_cbsize(uint32_t index) {
  if (index < SRSLTE_NOF_TC_CB_SIZES) {
@ -129,6 +139,12 @@ int srslte_cbsegm_cbsize(uint32_t index) {
  }
 }

+/**
+ * Check is code block size is valid for LTE Turbo Code
+ *
+ * @param[in] size Size of code block in bits
+ * @return true if Code Block size is allowed
+ */
 bool srslte_cbsegm_cbsize_isvalid(uint32_t size) {
  for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
    if (tc_cb_sizes[i] == size) {
--- a/lib/src/phy/fec/convcoder.c
+++ b/lib/src/phy/fec/convcoder.c
@ -33,6 +33,19 @@
 #include "srslte/phy/fec/convcoder.h"
 #include "parity.h"

+/**
+ * Convolution encodes according to given parameters.
+ *
+ * q->R is rate
+ * q->tail_biting enables tail biting
+ * q->K is a parameter for tail biting
+ *
+ * @param[in] q Convolution coder parameters
+ * @param[in] input Unpacked bit array. Size frame_length
+ * @param[out] output Unpacked bit array. Size q->R*frame_length if q->tail_biting, else q->R*(frame_length + q->K - 1)
+ * @param[in] frame_length Number of bits in input_array
+ * @return Number of bits in output
+ */
 int srslte_convcoder_encode(srslte_convcoder_t *q, uint8_t *input, uint8_t *output, uint32_t frame_length) {
  uint32_t sr;
  uint32_t i,j;
--- a/lib/src/phy/fec/rm_conv.c
+++ b/lib/src/phy/fec/rm_conv.c
@ -39,6 +39,12 @@ uint8_t RM_PERM_CC_INV[NCOLS] =
    { 16, 0, 24, 8, 20, 4, 28, 12, 18, 2, 26, 10, 22, 6, 30, 14, 17, 1, 25, 9,
        21, 5, 29, 13, 19, 3, 27, 11, 23, 7, 31, 15 };

+/**
+ * Rate matching for convolution encoder
+ *
+ * @param[in] input Unpacked bit array. Size in_len
+ * @param[output] output Unpacked bit array. Size out_len <= in_len
+ */
 int srslte_rm_conv_tx(uint8_t *input, uint32_t in_len, uint8_t *output, uint32_t out_len) {

  uint8_t tmp[3 * NCOLS * NROWS_MAX];
--- a/lib/src/phy/fec/rm_turbo.c
+++ b/lib/src/phy/fec/rm_turbo.c
@ -245,6 +245,20 @@ void srslte_rm_turbo_gentables() {
 }


+/**
+ * Rate matching for LTE Turbo Coder
+ *
+ * @param[out] w_buff Preallocated softbuffer
+ * @param[in] systematic Input code block in a byte array
+ * @param[in] parity Input code turbo coder parity bits in a byte array
+ * @param[out] output Rate matched output array of size out_len
+ * @param out_len Output buffer size to be filled with as many FEC bits as fit
+ * @param w_offset Start writing to output at this bit offset
+ * @param cb_idx Code block index. Used to lookup interleaver parameters
+ * @param rv_idx Redundancy Version Index. Indexed offset of FEC bits to copy
+ *
+ * @return Error code
+ */
 int srslte_rm_turbo_tx_lut(uint8_t *w_buff, uint8_t *systematic, uint8_t *parity, uint8_t *output, 
                           uint32_t cb_idx, uint32_t out_len, 
                           uint32_t w_offset, uint32_t rv_idx) 
@ -289,6 +303,15 @@ int srslte_rm_turbo_tx_lut(uint8_t *w_buff, uint8_t *systematic, uint8_t *parity
  }
 }

+/**
+ * Undoes rate matching for LTE Turbo Coder. Expands rate matched buffer to full size buffer.
+ *
+ * @param[in] input Input buffer of size in_len
+ * @param[out] output Output buffer of size 3*srslte_cbsegm_cbsize(cb_idx)+12
+ * @param[in] cb_idx Code block table index
+ * @param[in] rv_idx Redundancy Version from DCI control message
+ * @return Error code
+ */
 int srslte_rm_turbo_rx_lut(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) 
 { 
  
--- a/lib/src/phy/fec/test/turbodecoder_test.c
+++ b/lib/src/phy/fec/test/turbodecoder_test.c
@ -46,6 +46,7 @@ uint32_t seed = 0;
 int K = -1;

 #define MAX_ITERATIONS  10
+int nof_cb = 1; 
 int nof_iterations = MAX_ITERATIONS;
 int test_known_data = 0;
 int test_errors = 0;
@ -59,6 +60,7 @@ void usage(char *prog) {
  printf("Usage: %s [nlesv]\n", prog);
  printf(
      "\t-k Test with known data (ignores frame_length) [Default disabled]\n");
+  printf("\t-c nof_cb in parallel [Default %d]\n", nof_cb);
  printf("\t-i nof_iterations [Default %d]\n", nof_iterations);
  printf("\t-n nof_frames [Default %d]\n", nof_frames);
  printf("\t-N nof_repetitions [Default %d]\n", nof_repetitions);
@ -70,8 +72,11 @@ void usage(char *prog) {

 void parse_args(int argc, char **argv) {
  int opt;
-  while ((opt = getopt(argc, argv, "inNlstvekt")) != -1) {
+  while ((opt = getopt(argc, argv, "cinNlstvekt")) != -1) {
    switch (opt) {
+    case 'c':
+      nof_cb = atoi(argv[optind]);
+      break;
    case 'n':
      nof_frames = atoi(argv[optind]);
      break;
@ -112,7 +117,7 @@ int main(int argc, char **argv) {
  float *llr;
  short *llr_s;
  uint8_t *llr_c;
-  uint8_t *data_tx, *data_rx, *data_rx_bytes, *symbols;
+  uint8_t *data_tx, *data_rx, *data_rx_bytes[SRSLTE_TDEC_NPAR], *symbols;
  uint32_t i, j;
  float var[SNR_POINTS];
  uint32_t snr_points;
@ -154,10 +159,12 @@ int main(int argc, char **argv) {
    perror("malloc");
    exit(-1);
  }
-  data_rx_bytes = srslte_vec_malloc(frame_length * sizeof(uint8_t));
-  if (!data_rx_bytes) {
-    perror("malloc");
-    exit(-1);
+  for (int cb=0;cb<SRSLTE_TDEC_NPAR;cb++) {
+    data_rx_bytes[cb] = srslte_vec_malloc(frame_length * sizeof(uint8_t));
+    if (!data_rx_bytes[cb]) {
+      perror("malloc");
+      exit(-1);
+    }    
  }

  symbols = srslte_vec_malloc(coded_length * sizeof(uint8_t));
@ -232,7 +239,6 @@ int main(int argc, char **argv) {
      for (j = 0; j < coded_length; j++) {
        llr[j] = symbols[j] ? 1 : -1;
      }
-
      srslte_ch_awgn_f(llr, llr, var[i], coded_length);

      for (j=0;j<coded_length;j++) {
@ -248,23 +254,39 @@ int main(int argc, char **argv) {
        t = nof_iterations;
      }

+      int16_t *input[SRSLTE_TDEC_NPAR];
+      uint8_t *output[SRSLTE_TDEC_NPAR];        
+      
+      for (int n=0;n<SRSLTE_TDEC_NPAR;n++) {
+        if (n < nof_cb) {
+          input[n] = llr_s;         
+        } else {
+          input[n] = NULL; 
+        }
+        output[n] = data_rx_bytes[n];           
+      }
+
      gettimeofday(&tdata[1], NULL); 
-      for (int k=0;k<nof_repetitions;k++) {     
-        srslte_tdec_run_all(&tdec, llr_s, data_rx_bytes, t, frame_length);        
+      for (int k=0;k<nof_repetitions;k++) { 
+        srslte_tdec_run_all_par(&tdec, input, output, t, frame_length);        
      }
      gettimeofday(&tdata[2], NULL);
      get_time_interval(tdata);
      mean_usec = (float) mean_usec * 0.9 + (float) (tdata[0].tv_usec/nof_repetitions) * 0.1;
      
-      srslte_bit_unpack_vector(data_rx_bytes, data_rx, frame_length);
-
-      errors += srslte_bit_diff(data_tx, data_rx, frame_length);
-      
      frame_cnt++;
+      uint32_t errors_this = 0; 
+      for (int cb=0;cb<nof_cb;cb++) {
+        srslte_bit_unpack_vector(data_rx_bytes[cb], data_rx, frame_length);
+        
+        errors_this=srslte_bit_diff(data_tx, data_rx, frame_length);
+        //printf("error[%d]=%d\n", cb, errors_this);
+        errors += errors_this;
+      }
      printf("Eb/No: %2.2f %10d/%d   ", SNR_MIN + i * ebno_inc, frame_cnt, nof_frames);
-      printf("BER: %.2e  ", (float) errors / (frame_cnt * frame_length));
-      printf("%3.1f Mbps (%6.2f usec)", (float) frame_length / mean_usec, mean_usec);
-      printf("\r");
+      printf("BER: %.2e  ", (float) errors / (nof_cb*frame_cnt * frame_length));
+      printf("%3.1f Mbps (%6.2f usec)", (float) (nof_cb*frame_length) / mean_usec, mean_usec);
+      printf("\r");        

    }    
    printf("\n");
@ -273,7 +295,7 @@ int main(int argc, char **argv) {
  printf("\n");
  if (snr_points == 1) {
    if (errors) {
-      printf("%d Errors\n", errors);
+      printf("%d Errors\n", errors/nof_cb);
    }
  }    

--- a/lib/src/phy/fec/turbodecoder.c
+++ b/lib/src/phy/fec/turbodecoder.c
@ -35,7 +35,7 @@


 #ifdef LV_HAVE_SSE
-#include "srslte/phy/fec/turbodecoder_sse.h"
+#include "srslte/phy/fec/turbodecoder_simd.h"
 #endif

 #include "srslte/phy/utils/vector.h"
@ -43,7 +43,7 @@

 int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) {
 #ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_init(&h->tdec_sse, max_long_cb);
+  return srslte_tdec_simd_init(&h->tdec_simd, SRSLTE_TDEC_NPAR, max_long_cb);
 #else
  h->input_conv = srslte_vec_malloc(sizeof(float) * (3*max_long_cb+12));
  if (!h->input_conv) {
@ -56,7 +56,7 @@ int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) {

 void srslte_tdec_free(srslte_tdec_t * h) {
 #ifdef LV_HAVE_SSE
-  srslte_tdec_sse_free(&h->tdec_sse);
+  srslte_tdec_simd_free(&h->tdec_simd);  
 #else
  if (h->input_conv) {
    free(h->input_conv);
@ -68,45 +68,99 @@ void srslte_tdec_free(srslte_tdec_t * h) {

 int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) {
 #ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_reset(&h->tdec_sse, long_cb);
+  return srslte_tdec_simd_reset(&h->tdec_simd, long_cb);
 #else
  return srslte_tdec_gen_reset(&h->tdec_gen, long_cb);
 #endif
 }

-void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) {
+int srslte_tdec_reset_cb(srslte_tdec_t * h, uint32_t cb_idx) {
 #ifdef LV_HAVE_SSE
-  srslte_tdec_sse_iteration(&h->tdec_sse, input, long_cb);
+  return srslte_tdec_simd_reset_cb(&h->tdec_simd, cb_idx);      
 #else
-  srslte_vec_convert_if(input, h->input_conv, 0.01, 3*long_cb+12);
+  return srslte_tdec_gen_reset(&h->tdec_gen, h->tdec_gen.current_cb_len);
+#endif
+}
+
+int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h, uint32_t cb_idx)
+{
+#ifdef LV_HAVE_SSE
+  return srslte_tdec_simd_get_nof_iterations_cb(&h->tdec_simd, cb_idx);
+#else
+  return h->tdec_gen.n_iter;
+#endif  
+}
+
+void srslte_tdec_iteration_par(srslte_tdec_t * h, int16_t* input[SRSLTE_TDEC_NPAR], uint32_t long_cb) {
+#ifdef LV_HAVE_SSE
+  srslte_tdec_simd_iteration(&h->tdec_simd, input, long_cb);      
+#else
+  srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12);
  srslte_tdec_gen_iteration(&h->tdec_gen, h->input_conv, long_cb);
 #endif
 }

-void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
-#ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_decision(&h->tdec_sse, output, long_cb);
-#else
-  return srslte_tdec_gen_decision(&h->tdec_gen, output, long_cb);
-#endif
+void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) {
+  int16_t *input_par[SRSLTE_TDEC_NPAR]; 
+  input_par[0] = input; 
+  return srslte_tdec_iteration_par(h, input_par, long_cb);
+}

+void srslte_tdec_decision_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t long_cb) {
+#ifdef LV_HAVE_SSE
+  return srslte_tdec_simd_decision(&h->tdec_simd, output, long_cb);
+#else
+  return srslte_tdec_gen_decision(&h->tdec_gen, output[0], long_cb);
+#endif
+}
+
+void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
+  uint8_t *output_par[SRSLTE_TDEC_NPAR]; 
+  output_par[0] = output; 
+  srslte_tdec_decision_par(h, output_par, long_cb);
+}
+
+void srslte_tdec_decision_byte_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t long_cb) {
+#ifdef LV_HAVE_SSE
+  srslte_tdec_simd_decision_byte(&h->tdec_simd, output, long_cb);  
+#else
+  srslte_tdec_gen_decision_byte(&h->tdec_gen, output[0], long_cb);
+#endif  
+}
+
+void srslte_tdec_decision_byte_par_cb(srslte_tdec_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb) {
+#ifdef LV_HAVE_SSE
+  srslte_tdec_simd_decision_byte_cb(&h->tdec_simd, output, cb_idx, long_cb);
+#else
+  srslte_tdec_gen_decision_byte(&h->tdec_gen, output, long_cb);
+#endif  
 }

 void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
+  uint8_t *output_par[SRSLTE_TDEC_NPAR]; 
+  output_par[0] = output; 
+  srslte_tdec_decision_byte_par(h, output_par, long_cb);
+}
+
+int srslte_tdec_run_all_par(srslte_tdec_t * h, int16_t * input[SRSLTE_TDEC_NPAR], 
+                            uint8_t *output[SRSLTE_TDEC_NPAR], 
+                            uint32_t nof_iterations, uint32_t long_cb) {
 #ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_decision_byte(&h->tdec_sse, output, long_cb);
+  return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, long_cb);  
 #else
-  return srslte_tdec_gen_decision_byte(&h->tdec_gen, output, long_cb);
+  srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12);
+  return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output[0], nof_iterations, long_cb);
 #endif
-  
 }

 int srslte_tdec_run_all(srslte_tdec_t * h, int16_t * input, uint8_t *output, uint32_t nof_iterations, uint32_t long_cb)
 {
-#ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_run_all(&h->tdec_sse, input, output, nof_iterations, long_cb);
-#else
-  srslte_vec_convert_if(input, h->input_conv, 0.01, 3*long_cb+12);
-  return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output, nof_iterations, long_cb);
-#endif
+  uint8_t *output_par[SRSLTE_TDEC_NPAR]; 
+  output_par[0] = output;   
+  int16_t *input_par[SRSLTE_TDEC_NPAR]; 
+  input_par[0] = input; 
+ 
+  return srslte_tdec_run_all_par(h, input_par, output_par, nof_iterations, long_cb);
 }
+
+
--- a/lib/src/phy/fec/turbodecoder_avx.c
+++ b/lib/src/phy/fec/turbodecoder_avx.c
@ -0,0 +1,475 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <math.h>
+
+#include "srslte/phy/fec/turbodecoder_simd.h"
+#include "srslte/phy/utils/vector.h"
+
+#include <inttypes.h>
+
+#define NUMSTATES       8
+#define NINPUTS         2
+#define TAIL            3
+#define TOTALTAIL       12
+
+#define INF 10000
+#define ZERO 0
+
+
+#ifdef LV_HAVE_AVX2
+
+#include <smmintrin.h>
+#include <immintrin.h>
+
+
+// Number of CB processed in parllel in AVX
+#define NCB 2 
+
+/*
+static void print_256i(__m256i x) {
+  int16_t *s = (int16_t*) &x; 
+  printf("[%d", s[0]);
+  for (int i=1;i<16;i++) {
+    printf(",%d", s[i]);
+  }
+  printf("]\n");
+}
+*/
+
+/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */
+static inline int16_t hMax0(__m256i masked_value)
+{
+  __m128i tmp1 = _mm256_extractf128_si256(masked_value, 0); 
+  __m128i tmp3 = _mm_minpos_epu16(tmp1);
+  return (int16_t)(_mm_cvtsi128_si32(tmp3));
+}
+
+static inline int16_t hMax1(__m256i masked_value)
+{
+  __m128i tmp1 = _mm256_extractf128_si256(masked_value, 1); 
+  __m128i tmp3 = _mm_minpos_epu16(tmp1);
+  return (int16_t)(_mm_cvtsi128_si32(tmp3));
+}
+
+/* Computes beta values */
+void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t long_cb)
+{
+  int k;
+  uint32_t end = long_cb + 3;
+  const __m256i *alphaPtr = (const __m256i*) s->alpha;
+ 
+  __m256i beta_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
+  __m256i g, bp, bn, alpha_k; 
+  
+  /* Define the shuffle constant for the positive beta */
+  __m256i shuf_bp = _mm256_set_epi8(
+    // 1st CB
+    15+16, 14+16, // 7
+    7+16,  6+16,  // 3
+    5+16,  4+16,  // 2
+    13+16, 12+16, // 6
+    11+16, 10+16, // 5
+    3+16,  2+16,  // 1
+    1+16,  0+16,  // 0
+    9+16,  8+16,  // 4
+
+    // 2nd CB
+    15, 14, // 7
+    7,  6,  // 3
+    5,  4,  // 2
+    13, 12, // 6
+    11, 10, // 5
+    3,  2,  // 1
+    1,  0,  // 0
+    9,  8   // 4
+  );
+
+  /* Define the shuffle constant for the negative beta */
+  __m256i shuf_bn = _mm256_set_epi8(
+    7+16, 6+16,   // 3
+    15+16, 14+16, // 7
+    13+16, 12+16, // 6
+    5+16,  4+16,  // 2
+    3+16,  2+16,  // 1
+    11+16, 10+16, // 5
+    9+16,  8+16,  // 4
+    1+16,  0+16,  // 0
+    
+    7,   6, // 3
+    15, 14, // 7
+    13, 12, // 6
+    5,  4,  // 2
+    3,  2,  // 1
+    11, 10, // 5
+    9,  8,  // 4
+    1,  0   // 0
+  );
+ 
+  alphaPtr += long_cb-1;
+
+  /* Define shuffle for branch costs */
+  __m256i shuf_g[4];
+  shuf_g[3] = _mm256_set_epi8(3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16, 
+                              3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
+  shuf_g[2] = _mm256_set_epi8(7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,
+                              7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
+  shuf_g[1] = _mm256_set_epi8(11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,
+                              11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
+  shuf_g[0] = _mm256_set_epi8(15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,
+                              15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
+
+  /* Define shuffle for beta normalization */
+  __m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
+
+  __m256i gv;
+  int16_t *b = &s->branch[2*NCB*long_cb-16];
+  __m256i *gPtr = (__m256i*) b;
+  
+  /* This defines a beta computation step: 
+   * Adds and substracts the branch metrics to the previous beta step, 
+   * shuffles the states according to the trellis path and selects maximum state 
+   */
+#define BETA_STEP(g)     bp = _mm256_add_epi16(beta_k, g);\
+    bn = _mm256_sub_epi16(beta_k, g);\
+    bp = _mm256_shuffle_epi8(bp, shuf_bp);\
+    bn = _mm256_shuffle_epi8(bn, shuf_bn);\
+    beta_k = _mm256_max_epi16(bp, bn);    
+
+    /* Loads the alpha metrics from memory and adds them to the temporal bn and bp 
+     * metrics. Then computes horizontal maximum of both metrics and computes difference
+     */
+#define BETA_STEP_CNT(c,d) g = _mm256_shuffle_epi8(gv, shuf_g[c]);\
+    BETA_STEP(g)\
+    alpha_k = _mm256_load_si256(alphaPtr);\
+    alphaPtr--;\
+    bp = _mm256_add_epi16(bp, alpha_k);\
+    bn = _mm256_add_epi16(bn, alpha_k);\
+    bn = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bn);\
+    bp = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bp);\
+    output[0][k-d] = hMax0(bn) - hMax0(bp);\
+    output[1][k-d] = hMax1(bn) - hMax1(bp);
+
+  /* The tail does not require to load alpha or produce outputs. Only update 
+   * beta metrics accordingly */
+  for (k=end-1; k>=long_cb; k--) {
+    int16_t g0_1 = s->branch[2*NCB*k];
+    int16_t g1_1 = s->branch[2*NCB*k+1];
+    int16_t g0_2 = s->branch[2*NCB*k+6];
+    int16_t g1_2 = s->branch[2*NCB*k+6+1];
+    g = _mm256_set_epi16(g1_2, g0_2, g0_2, g1_2, g1_2, g0_2, g0_2, g1_2, g1_1, g0_1, g0_1, g1_1, g1_1, g0_1, g0_1, g1_1);
+    BETA_STEP(g);
+  }  
+  
+  /* We inline 2 trelis steps for each normalization */
+  __m256i norm;
+  for (; k >= 0; k-=8) {    
+    gv = _mm256_load_si256(gPtr);
+    gPtr--;
+    BETA_STEP_CNT(0,0);
+    BETA_STEP_CNT(1,1);
+    BETA_STEP_CNT(2,2);
+    BETA_STEP_CNT(3,3);
+    norm = _mm256_shuffle_epi8(beta_k, shuf_norm); 
+    beta_k = _mm256_sub_epi16(beta_k, norm);
+    gv = _mm256_load_si256(gPtr);
+    gPtr--;
+    BETA_STEP_CNT(0,4);
+    BETA_STEP_CNT(1,5);
+    BETA_STEP_CNT(2,6);
+    BETA_STEP_CNT(3,7);
+    norm = _mm256_shuffle_epi8(beta_k, shuf_norm); 
+    beta_k = _mm256_sub_epi16(beta_k, norm);
+  }  
+}
+
+/* Computes alpha metrics */
+void map_avx_alpha(map_gen_t * s, uint32_t long_cb)
+{
+  uint32_t k;
+  int16_t *alpha1 = s->alpha;
+  int16_t *alpha2 = &s->alpha[8];
+  uint32_t i;
+
+  alpha1[0] = 0; 
+  alpha2[0] = 0; 
+  for (i = 1; i < 8; i++) {
+    alpha1[i] = -INF;
+    alpha2[i] = -INF;
+  }
+    
+  /* Define the shuffle constant for the positive alpha */
+  __m256i shuf_ap = _mm256_set_epi8(
+
+    // 1st CB 
+    31, 30, // 7
+    25, 24, // 4
+    23, 22, // 3
+    17, 16, // 0
+    29, 28, // 6
+    27, 26, // 5
+    21, 20, // 2
+    19, 18, // 1
+
+    // 2nd CB
+    15, 14, // 7
+    9,  8,  // 4
+    7,  6,  // 3
+    1,  0,  // 0
+    13, 12, // 6
+    11, 10, // 5
+    5,  4,  // 2
+    3,  2   // 1
+  );
+
+  /* Define the shuffle constant for the negative alpha */
+  __m256i shuf_an = _mm256_set_epi8(
+
+    // 1nd CB 
+    29, 28, // 6
+    27, 26, // 5
+    21, 20, // 2
+    19, 18, // 1
+    31, 30, // 7
+    25, 24, // 4
+    23, 22, // 3
+    17, 16, // 0
+
+    // 2nd CB 
+    13, 12, // 6
+    11, 10, // 5
+    5,  4,  // 2
+    3,  2,  // 1
+    15, 14, // 7
+    9,  8,  // 4
+    7,  6,  // 3
+    1,  0   // 0
+  );
+  
+  /* Define shuffle for branch costs */
+  __m256i shuf_g[4];
+  shuf_g[0] = _mm256_set_epi8(3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,       
+                              3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2);
+  shuf_g[1] = _mm256_set_epi8(7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,    
+                              7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6);
+  shuf_g[2] = _mm256_set_epi8(11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,       
+                              11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10);
+  shuf_g[3] = _mm256_set_epi8(15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,
+                              15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14);
+
+  __m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
+  
+  __m256i* alphaPtr = (__m256i*) s->alpha;
+  alphaPtr++;
+
+  __m256i gv; 
+  __m256i *gPtr = (__m256i*) s->branch;
+  __m256i g, ap, an; 
+    
+  __m256i alpha_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
+  
+  /* This defines a alpha computation step: 
+   * Adds and substracts the branch metrics to the previous alpha step, 
+   * shuffles the states according to the trellis path and selects maximum state 
+   */
+#define ALPHA_STEP(c)  g = _mm256_shuffle_epi8(gv, shuf_g[c]); \
+  ap = _mm256_add_epi16(alpha_k, g);\
+  an = _mm256_sub_epi16(alpha_k, g);\
+  ap = _mm256_shuffle_epi8(ap, shuf_ap);\
+  an = _mm256_shuffle_epi8(an, shuf_an);\
+  alpha_k = _mm256_max_epi16(ap, an);\
+  _mm256_store_si256(alphaPtr, alpha_k);\
+  alphaPtr++;\
+
+
+  /* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */
+  __m256i norm;
+  for (k = 0; k < long_cb/8; k++) {
+    gv = _mm256_load_si256(gPtr);
+    
+    gPtr++;
+    ALPHA_STEP(0);
+    ALPHA_STEP(1);
+    ALPHA_STEP(2);
+    ALPHA_STEP(3);
+    norm = _mm256_shuffle_epi8(alpha_k, shuf_norm); 
+    alpha_k = _mm256_sub_epi16(alpha_k, norm);
+    gv = _mm256_load_si256(gPtr);
+    gPtr++;
+    ALPHA_STEP(0);
+    ALPHA_STEP(1);
+    ALPHA_STEP(2);
+    ALPHA_STEP(3);
+    norm = _mm256_shuffle_epi8(alpha_k, shuf_norm); 
+    alpha_k = _mm256_sub_epi16(alpha_k, norm);
+  }  
+}
+
+void map_sse_gamma_single(int16_t *output, int16_t *input, int16_t *app, int16_t *parity) 
+{
+  __m128i res00, res10, res01, res11, res0, res1; 
+  __m128i in, ap, pa, g1, g0;
+
+  __m128i *inPtr  = (__m128i*) input;
+  __m128i *appPtr = (__m128i*) app;
+  __m128i *paPtr  = (__m128i*) parity;
+  __m128i *resPtr = (__m128i*) output;
+  
+  __m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
+  __m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
+  __m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
+  __m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
+  
+  in = _mm_load_si128(inPtr);
+  inPtr++;
+  pa = _mm_load_si128(paPtr);
+  paPtr++;
+  
+  if (appPtr) {
+    ap = _mm_load_si128(appPtr);
+    appPtr++;
+    in = _mm_add_epi16(ap, in);
+  }
+  
+  g1 = _mm_add_epi16(in, pa);
+  g0 = _mm_sub_epi16(in, pa);
+
+  g1 = _mm_srai_epi16(g1, 1);
+  g0 = _mm_srai_epi16(g0, 1);
+  
+  res00 = _mm_shuffle_epi8(g0, res00_mask);
+  res10 = _mm_shuffle_epi8(g0, res10_mask);
+  res01 = _mm_shuffle_epi8(g1, res01_mask);
+  res11 = _mm_shuffle_epi8(g1, res11_mask);
+
+  res0  = _mm_or_si128(res00, res01);
+  res1  = _mm_or_si128(res10, res11);
+
+  _mm_store_si128(resPtr, res0);
+  resPtr++;
+  _mm_store_si128(resPtr, res1);    
+  resPtr++;
+}
+
+
+/* Compute branch metrics (gamma) */
+void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb) 
+{
+  __m128i res10, res20, res11, res21, res1, res2; 
+  __m256i in, ap, pa, g1, g0;
+
+  __m256i *inPtr  = (__m256i*) input;
+  __m256i *appPtr = (__m256i*) app;
+  __m256i *paPtr  = (__m256i*) parity;
+  __m128i *resPtr = (__m128i*) h->branch;
+  
+  if (cbidx) {
+    resPtr++;
+  }
+  
+  __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
+  __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
+
+  __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
+  __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
+
+  for (int i=0;i<long_cb/16;i++) {
+    in = _mm256_load_si256(inPtr);
+    inPtr++;
+    pa = _mm256_load_si256(paPtr);
+    paPtr++;
+    
+    if (appPtr) {
+      ap = _mm256_load_si256(appPtr);
+      appPtr++;
+      in = _mm256_add_epi16(ap, in);
+    }
+    
+    g0 = _mm256_sub_epi16(in, pa);
+    g1 = _mm256_add_epi16(in, pa);
+ 
+    g0 = _mm256_srai_epi16(g0, 1);
+    g1 = _mm256_srai_epi16(g1, 1);
+    
+    __m128i g0_t = _mm256_extractf128_si256(g0, 0);
+    __m128i g1_t = _mm256_extractf128_si256(g1, 0);
+    
+    res10 = _mm_shuffle_epi8(g0_t, res10_mask);
+    res11 = _mm_shuffle_epi8(g1_t, res11_mask);
+
+    res20 = _mm_shuffle_epi8(g0_t, res20_mask);
+    res21 = _mm_shuffle_epi8(g1_t, res21_mask);
+
+    res1  = _mm_or_si128(res10, res11);
+    res2  = _mm_or_si128(res20, res21);
+
+    _mm_store_si128(resPtr, res1);
+    resPtr++;
+    resPtr++;
+    _mm_store_si128(resPtr, res2);    
+    resPtr++;
+    resPtr++;          
+    
+    g0_t = _mm256_extractf128_si256(g0, 1);
+    g1_t = _mm256_extractf128_si256(g1, 1);
+    
+    res10 = _mm_shuffle_epi8(g0_t, res10_mask);
+    res11 = _mm_shuffle_epi8(g1_t, res11_mask);
+
+    res20 = _mm_shuffle_epi8(g0_t, res20_mask);
+    res21 = _mm_shuffle_epi8(g1_t, res21_mask);
+
+    res1  = _mm_or_si128(res10, res11);
+    res2  = _mm_or_si128(res20, res21);
+
+    _mm_store_si128(resPtr, res1);
+    resPtr++;
+    resPtr++;
+    _mm_store_si128(resPtr, res2);    
+    resPtr++;
+    resPtr++;          
+
+  }
+  
+  if (long_cb%16) {
+    map_sse_gamma_single((int16_t*) resPtr, (int16_t*) inPtr, (int16_t*) appPtr, (int16_t*) paPtr);
+  }
+    
+  for (int i=long_cb;i<long_cb+3;i++) {
+    h->branch[2*i*NCB+cbidx*6]   = (input[i] - parity[i])/2;
+    h->branch[2*i*NCB+cbidx*6+1] = (input[i] + parity[i])/2;
+  }
+}
+
+
+#endif
+
+
--- a/lib/src/phy/fec/turbodecoder_simd.c
+++ b/lib/src/phy/fec/turbodecoder_simd.c
@ -0,0 +1,540 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <math.h>
+
+#include "srslte/phy/fec/turbodecoder_simd.h"
+#include "srslte/phy/utils/vector.h"
+
+#include <inttypes.h>
+
+#define NUMSTATES       8
+#define NINPUTS         2
+#define TAIL            3
+#define TOTALTAIL       12
+
+#define INF 10000
+#define ZERO 0
+
+
+#ifdef LV_HAVE_SSE
+#include <smmintrin.h>
+
+// Define SSE/AVX implementations 
+void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb);
+void map_sse_alpha(map_gen_t * s, uint32_t long_cb);
+void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb);
+
+#ifdef LV_HAVE_AVX2
+void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t long_cb);
+void map_avx_alpha(map_gen_t * s, uint32_t long_cb);
+void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb);
+#endif
+
+
+void map_simd_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
+{
+  if (nof_cb == 1) {
+    map_sse_beta(s, output[0], long_cb);
+  } 
+#ifdef LV_HAVE_AVX2
+  else if (nof_cb == 2) {
+    map_avx_beta(s, output, long_cb);
+  }
+#endif
+}
+
+void map_simd_alpha(map_gen_t * s, uint32_t nof_cb, uint32_t long_cb)
+{
+  if (nof_cb == 1) {
+    map_sse_alpha(s, long_cb);
+  }
+#ifdef LV_HAVE_AVX2
+  else if (nof_cb == 2) {
+    map_avx_alpha(s, long_cb);
+  }
+#endif
+}
+void map_simd_gamma(map_gen_t * s, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t nof_cb, uint32_t long_cb) 
+{
+  if (nof_cb == 1) {
+    map_sse_gamma(s, input, app, parity, long_cb);
+  } 
+#ifdef LV_HAVE_AVX2
+  else if (nof_cb == 2) {
+    map_avx_gamma(s, input, app, parity, cbidx, long_cb);
+  }    
+#endif
+}
+
+/* Inititalizes constituent decoder object */
+int map_simd_init(map_gen_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
+{
+  bzero(h, sizeof(map_gen_t));
+
+  h->max_par_cb  = max_par_cb;
+  h->max_long_cb = max_long_cb;
+
+  h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb);
+  if (!h->alpha) {
+    perror("srslte_vec_malloc");
+    return -1;
+  }
+  h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb);
+  if (!h->branch) {
+    perror("srslte_vec_malloc");
+    return -1;
+  }
+  return 0;
+}
+
+void map_simd_free(map_gen_t * h)
+{
+  if (h->alpha) {
+    free(h->alpha);
+  }
+  if (h->branch) {
+    free(h->branch);
+  }
+  bzero(h, sizeof(map_gen_t));
+}
+
+/* Runs one instance of a decoder */
+void map_simd_dec(map_gen_t * h, int16_t * input[SRSLTE_TDEC_NPAR], int16_t *app[SRSLTE_TDEC_NPAR], int16_t * parity[SRSLTE_TDEC_NPAR], 
+                  int16_t *output[SRSLTE_TDEC_NPAR], uint32_t cb_mask, uint32_t long_cb)
+{
+  
+  uint32_t nof_cb = 1;
+  int16_t *outptr[SRSLTE_TDEC_NPAR];
+  
+  // Compute branch metrics
+  switch(cb_mask) {
+    case 1:
+      nof_cb = 1; 
+      outptr[0] = output[0];
+      map_simd_gamma(h, input[0], app?app[0]:NULL, parity[0], 0, 1, long_cb);    
+      break;
+    case 2:
+      nof_cb = 1; 
+      outptr[0] = output[1];
+      map_simd_gamma(h, input[1], app?app[1]:NULL, parity[1], 0, 1, long_cb);    
+      break;
+    case 3:
+      nof_cb = 2; 
+      for (int i=0;i<2;i++) {    
+        outptr[i] = output[i];
+        map_simd_gamma(h, input[i], app?app[i]:NULL, parity[i], i, 2, long_cb);    
+      }
+      break;
+  }
+
+  // Forward recursion
+  map_simd_alpha(h, nof_cb, long_cb);
+
+  // Backwards recursion + LLR computation
+  map_simd_beta(h, outptr, nof_cb, long_cb);
+}
+
+/* Initializes the turbo decoder object */
+int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
+{
+  int ret = -1;
+  bzero(h, sizeof(srslte_tdec_simd_t));
+  uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
+
+  h->max_long_cb = max_long_cb;
+  h->max_par_cb  = max_par_cb; 
+  
+  for (int i=0;i<h->max_par_cb;i++) {
+    h->app1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->app1[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->app2[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->app2[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->ext1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->ext1[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->ext2[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->ext2[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->syst[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->syst[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->parity0[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->parity0[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->parity1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->parity1[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    
+  }
+
+  if (map_simd_init(&h->dec, h->max_par_cb, h->max_long_cb)) {
+    goto clean_and_exit;
+  }
+
+  for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
+    if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
+      goto clean_and_exit;
+    }
+    srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
+  }
+  h->current_cbidx = -1; 
+  h->cb_mask = 0; 
+  ret = 0;
+clean_and_exit:if (ret == -1) {
+    srslte_tdec_simd_free(h);
+  }
+  return ret;
+}
+
+void srslte_tdec_simd_free(srslte_tdec_simd_t * h)
+{
+  for (int i=0;i<h->max_par_cb;i++) {
+    if (h->app1[i]) {
+      free(h->app1[i]);
+    }
+    if (h->app2[i]) {
+      free(h->app2[i]);
+    }
+    if (h->ext1[i]) {
+      free(h->ext1[i]);
+    }
+    if (h->ext2[i]) {
+      free(h->ext2[i]);
+    }
+    if (h->syst[i]) {
+      free(h->syst[i]);
+    }
+    if (h->parity0[i]) {
+      free(h->parity0[i]);
+    }
+    if (h->parity1[i]) {
+      free(h->parity1[i]);
+    }    
+  }
+
+  map_simd_free(&h->dec);
+
+  for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
+    srslte_tc_interl_free(&h->interleaver[i]);    
+  }
+
+  bzero(h, sizeof(srslte_tdec_simd_t));
+}
+
+/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into 
+ * 3 buffers ready to be used by compute_gamma() 
+ */
+void deinterleave_input_simd(srslte_tdec_simd_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb) {
+  uint32_t i;
+ 
+  __m128i *inputPtr = (__m128i*) input; 
+  __m128i in0, in1, in2;
+  __m128i s0, s1, s2, s;
+  __m128i p00, p01, p02, p0;
+  __m128i p10, p11, p12, p1;
+  
+  __m128i *sysPtr = (__m128i*) h->syst[cbidx]; 
+  __m128i *pa0Ptr = (__m128i*) h->parity0[cbidx]; 
+  __m128i *pa1Ptr = (__m128i*) h->parity1[cbidx]; 
+  
+  // pick bits 0, 3, 6 from 1st word
+  __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
+  // pick bits 1, 4, 7 from 2st word
+  __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
+  // pick bits 2, 5 from 3rd word
+  __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+
+  // pick bits 1, 4, 7 from 1st word
+  __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
+  // pick bits 2, 5, from 2st word
+  __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
+  // pick bits 0, 3, 6 from 3rd word
+  __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+  
+  // pick bits 2, 5 from 1st word
+  __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
+  // pick bits 0, 3, 6, from 2st word
+  __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
+  // pick bits 1, 4, 7 from 3rd word
+  __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+      
+  // Split systematic and parity bits
+  for (i = 0; i < long_cb/8; i++) {
+        
+    in0 = _mm_load_si128(inputPtr); inputPtr++; 
+    in1 = _mm_load_si128(inputPtr); inputPtr++;   
+    in2 = _mm_load_si128(inputPtr); inputPtr++;
+    
+    /* Deinterleave Systematic bits */
+    s0 = _mm_shuffle_epi8(in0, s0_mask);
+    s1 = _mm_shuffle_epi8(in1, s1_mask);
+    s2 = _mm_shuffle_epi8(in2, s2_mask);    
+    s = _mm_or_si128(s0, s1);
+    s = _mm_or_si128(s, s2);
+
+    _mm_store_si128(sysPtr, s);
+    sysPtr++;
+
+    /* Deinterleave parity 0 bits */
+    p00 = _mm_shuffle_epi8(in0, p00_mask);
+    p01 = _mm_shuffle_epi8(in1, p01_mask);
+    p02 = _mm_shuffle_epi8(in2, p02_mask);    
+    p0 = _mm_or_si128(p00, p01);
+    p0 = _mm_or_si128(p0, p02);
+    
+    _mm_store_si128(pa0Ptr, p0);
+    pa0Ptr++;
+
+    /* Deinterleave parity 1 bits */
+    p10 = _mm_shuffle_epi8(in0, p10_mask);
+    p11 = _mm_shuffle_epi8(in1, p11_mask);
+    p12 = _mm_shuffle_epi8(in2, p12_mask);    
+    p1 = _mm_or_si128(p10, p11);
+    p1 = _mm_or_si128(p1, p12);
+
+    _mm_store_si128(pa1Ptr, p1);
+    pa1Ptr++;    
+
+  }
+  
+  for (i = 0; i < 3; i++) {
+    h->syst[cbidx][i+long_cb]    = input[3*long_cb + 2*i];
+    h->parity0[cbidx][i+long_cb] = input[3*long_cb + 2*i + 1];
+  }
+  for (i = 0; i < 3; i++) {
+    h->app2[cbidx][i+long_cb]    = input[3*long_cb + 6 + 2*i];
+    h->parity1[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
+  }
+
+}
+
+/* Runs 1 turbo decoder iteration */
+void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_NPAR], uint32_t long_cb)
+{
+
+  int16_t *tmp_app[SRSLTE_TDEC_NPAR];
+
+  if (h->current_cbidx >= 0) {
+    uint16_t *inter   = h->interleaver[h->current_cbidx].forward;
+    uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
+    
+#if SRSLTE_TDEC_NPAR == 2
+    h->cb_mask = (input[0]?1:0) | (input[1]?2:0);
+#else
+    h->cb_mask = input[0]?1:0;
+#endif
+    
+    for (int i=0;i<h->max_par_cb;i++) {
+      if (h->n_iter[i] == 0 && input[i]) {
+        //printf("deinterleaveing %d\n",i);
+        deinterleave_input_simd(h, input[i], i, long_cb);        
+      }        
+    }
+    
+    // Add apriori information to decoder 1 
+    for (int i=0;i<h->max_par_cb;i++) {
+      if (h->n_iter[i] > 0 && input[i]) {
+        srslte_vec_sub_sss(h->app1[i], h->ext1[i], h->app1[i], long_cb);
+      }
+    }
+
+    // Run MAP DEC #1
+    for (int i=0;i<h->max_par_cb;i++) {
+      if (input[i]) {
+        tmp_app[i] = h->n_iter[i]?h->app1[i]:NULL;
+      } else {
+        tmp_app[i] = NULL; 
+      }
+    }
+    map_simd_dec(&h->dec, h->syst, tmp_app, h->parity0, h->ext1, h->cb_mask, long_cb);            
+    
+    // Convert aposteriori information into extrinsic information    
+    for (int i=0;i<h->max_par_cb;i++) {
+      if (h->n_iter[i] > 0 && input[i]) {
+        srslte_vec_sub_sss(h->ext1[i], h->app1[i], h->ext1[i], long_cb);
+      }
+    }
+    
+    // Interleave extrinsic output of DEC1 to form apriori info for decoder 2
+    for (int i=0;i<h->max_par_cb;i++) {
+      if (input[i]) {
+        srslte_vec_lut_sss(h->ext1[i], deinter, h->app2[i], long_cb);
+      }
+    }
+
+    // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits
+    map_simd_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, h->cb_mask, long_cb);
+
+    // Deinterleaved extrinsic bits become apriori info for decoder 1 
+    for (int i=0;i<h->max_par_cb;i++) {
+      if (input[i]) {
+        srslte_vec_lut_sss(h->ext2[i], inter, h->app1[i], long_cb);
+      }
+    }
+
+    for (int i=0;i<h->max_par_cb;i++) {
+      if (input[i]) {
+        h->n_iter[i]++;     
+      }
+    }
+  } else {
+    fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_reset() first\n");    
+  }
+}
+
+/* Resets the decoder and sets the codeblock length */
+int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb)
+{
+  if (long_cb > h->max_long_cb) {
+    fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
+            h->max_long_cb);
+    return -1;
+  }
+  for (int i=0;i<h->max_par_cb;i++) {
+    h->n_iter[i] = 0;     
+  }
+  h->cb_mask = 0; 
+  h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
+  if (h->current_cbidx < 0) {
+    fprintf(stderr, "Invalid CB length %d\n", long_cb);
+    return -1; 
+  }
+  return 0;
+}
+
+int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h, uint32_t cb_idx)
+{
+  h->n_iter[cb_idx] = 0; 
+  return 0;
+}
+
+int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h, uint32_t cb_idx)
+{
+  return h->n_iter[cb_idx];
+}
+
+void tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
+{
+  __m128i zero     = _mm_set1_epi16(0);
+  __m128i lsb_mask = _mm_set1_epi16(1);
+  
+  __m128i *appPtr = (__m128i*) h->app1[cbidx];
+  __m128i *outPtr = (__m128i*) output;
+  __m128i ap, out, out0, out1; 
+    
+  for (uint32_t i = 0; i < long_cb/16; i++) {
+    ap   = _mm_load_si128(appPtr); appPtr++;    
+    out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
+    ap   = _mm_load_si128(appPtr); appPtr++;
+    out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
+    
+    out  = _mm_packs_epi16(out0, out1);
+    _mm_store_si128(outPtr, out);
+    outPtr++;
+  }
+  if (long_cb%16) {
+    for (int i=0;i<8;i++) {
+      output[long_cb-8+i] = h->app1[cbidx][long_cb-8+i]>0?1:0;
+    }
+  }
+}
+
+void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t long_cb)
+{
+  for (int i=0;i<h->max_par_cb;i++) {    
+    tdec_simd_decision(h, output[i], i, long_cb);
+  }
+}
+
+void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
+{
+  uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
+  
+  // long_cb is always byte aligned
+  for (uint32_t i = 0; i < long_cb/8; i++) {
+    uint8_t out0 = h->app1[cbidx][8*i+0]>0?mask[0]:0;
+    uint8_t out1 = h->app1[cbidx][8*i+1]>0?mask[1]:0;
+    uint8_t out2 = h->app1[cbidx][8*i+2]>0?mask[2]:0;
+    uint8_t out3 = h->app1[cbidx][8*i+3]>0?mask[3]:0;
+    uint8_t out4 = h->app1[cbidx][8*i+4]>0?mask[4]:0;
+    uint8_t out5 = h->app1[cbidx][8*i+5]>0?mask[5]:0;
+    uint8_t out6 = h->app1[cbidx][8*i+6]>0?mask[6]:0;
+    uint8_t out7 = h->app1[cbidx][8*i+7]>0?mask[7]:0;
+    
+    output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
+  }
+}
+
+void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t long_cb)
+{
+  for (int i=0;i<h->max_par_cb;i++) {
+    srslte_tdec_simd_decision_byte_cb(h, output[i], i, long_cb);
+  }
+}
+
+
+/* Runs nof_iterations iterations and decides the output bits */
+int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_NPAR], uint8_t *output[SRSLTE_TDEC_NPAR],
+                            uint32_t nof_iterations, uint32_t long_cb)
+{
+  if (srslte_tdec_simd_reset(h, long_cb)) {
+    return SRSLTE_ERROR; 
+  }
+
+  do {
+    srslte_tdec_simd_iteration(h, input, long_cb);
+  } while (h->n_iter[0] < nof_iterations);
+
+  srslte_tdec_simd_decision_byte(h, output, long_cb);
+  
+  return SRSLTE_SUCCESS;
+}
+
+#endif
+
+
--- a/lib/src/phy/fec/turbodecoder_simd_inter.c
+++ b/lib/src/phy/fec/turbodecoder_simd_inter.c
@ -0,0 +1,299 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <math.h>
+
+#include "srslte/phy/fec/turbodecoder_simd_inter.h"
+#include "srslte/phy/utils/vector.h"
+
+#define TOTALTAIL       12
+
+#ifdef LV_HAVE_SSE
+#include <smmintrin.h>
+
+void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb);
+void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb);
+void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb); 
+void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb);
+
+
+static void map_sse_inter_dec(srslte_tdec_simd_inter_t * h, int16_t * input, int16_t * parity, int16_t * output,
+                 uint32_t long_cb)
+{
+  map_see_inter_alpha(h, input, parity, long_cb);
+  map_sse_inter_beta(h, input, parity, output, long_cb);
+}
+
+/************************************************
+ *
+ *  TURBO DECODER INTERFACE
+ *
+ ************************************************/
+int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
+{
+  int ret = -1;
+  bzero(h, sizeof(srslte_tdec_simd_inter_t));
+  uint32_t len = max_long_cb + 12;
+
+  h->max_long_cb = max_long_cb;
+  h->max_par_cb  = max_par_cb;
+
+  h->llr1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
+  if (!h->llr1) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+  h->llr2 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
+  if (!h->llr2) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+  h->w = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
+  if (!h->w) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+  h->syst0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
+  if (!h->syst0) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+  h->syst1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
+  if (!h->syst1) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+  h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
+  if (!h->parity0) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+  h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
+  if (!h->parity1) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+  h->alpha = srslte_vec_malloc(sizeof(int16_t) * 8*(len+12) * h->max_par_cb);
+  if (!h->alpha) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+
+  for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
+    if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
+      goto clean_and_exit;
+    }
+    srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
+  }
+  h->current_cbidx = -1; 
+  ret = 0;
+clean_and_exit:if (ret == -1) {
+    srslte_tdec_simd_inter_free(h);
+  }
+  return ret;
+}
+
+void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h)
+{
+  if (h->llr1) {
+    free(h->llr1);
+  }
+  if (h->llr2) {
+    free(h->llr2);
+  }
+  if (h->w) {
+    free(h->w);
+  }
+  if (h->syst0) {
+    free(h->syst0);
+  }
+  if (h->syst1) {
+    free(h->syst1);
+  }
+  if (h->parity0) {
+    free(h->parity0);
+  }
+  if (h->parity1) {
+    free(h->parity1);
+  }
+  if (h->alpha) {
+    free(h->alpha);
+  }
+
+  for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
+    srslte_tc_interl_free(&h->interleaver[i]);    
+  }
+
+  bzero(h, sizeof(srslte_tdec_simd_inter_t));
+}
+
+
+/* Deinterleave for inter-frame parallelization */
+void extract_input(srslte_tdec_simd_inter_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb) 
+{  
+  for (int i=0;i<long_cb;i++) {
+    h->syst0[h->max_par_cb*i+cbidx]    = input[3*i+0];
+    h->parity0[h->max_par_cb*i+cbidx] = input[3*i+1];
+    h->parity1[h->max_par_cb*i+cbidx] = input[3*i+2];
+  }
+  for (int i = long_cb; i < long_cb + 3; i++) {
+    h->syst0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)];
+    h->syst1[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)];
+    h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 1];
+    h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 2];
+  }
+}
+
+void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h, int16_t *input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
+{
+
+  if (h->current_cbidx >= 0) {
+
+    uint16_t *inter = h->interleaver[h->current_cbidx].forward;
+    uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
+    
+    // Prepare systematic and parity bits for MAP DEC #1
+    for (int i=0;i<nof_cb;i++) {
+      if (h->n_iter[i] == 0) {
+        extract_input(h, input[i], i, long_cb);
+      }
+      srslte_vec_sum_sss(h->syst0, h->w, h->syst0, long_cb*h->max_par_cb);      
+    }
+    
+    // Run MAP DEC #1
+    map_sse_inter_dec(h, h->syst0, h->parity0, h->llr1, long_cb);
+
+    // Prepare systematic and parity bits for MAP DEC #1
+    sse_inter_extract_syst1(h, inter, long_cb);
+
+    // Run MAP DEC #2
+    map_sse_inter_dec(h, h->syst1, h->parity1, h->llr2, long_cb);
+    
+    // Update a-priori LLR from the last iteration
+    sse_inter_update_w(h, deinter, long_cb);
+    
+  } else {
+    fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_inter_reset() first\n");    
+  }
+}
+
+int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h, uint32_t cb_idx)
+{
+  for (int i=0;i<h->current_long_cb;i++) {
+    h->w[h->max_par_cb*i+cb_idx] = 0; 
+  }
+  return 0;
+}
+
+int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h, uint32_t long_cb)
+{
+  if (long_cb > h->max_long_cb) {
+    fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
+            h->max_long_cb);
+    return -1;
+  }
+  h->current_long_cb = long_cb; 
+  h->current_cbidx   = srslte_cbsegm_cbindex(long_cb);
+  if (h->current_cbidx < 0) {
+    fprintf(stderr, "Invalid CB length %d\n", long_cb);
+    return -1; 
+  }
+  memset(h->w, 0, sizeof(int16_t) * long_cb * h->max_par_cb);
+  return 0; 
+}
+
+void srslte_tdec_simd_inter_decision_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb)
+{
+  uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
+  uint32_t i;
+  for (i = 0; i < long_cb; i++) {
+    output[i] = (h->llr2[h->max_par_cb*deinter[i]+cb_idx] > 0) ? 1 : 0;    
+  }
+}
+
+void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
+{
+  for (int i=0;i<nof_cb;i++) {
+    srslte_tdec_simd_inter_decision_cb(h, output[i], i, long_cb);
+  }
+}
+
+void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb)
+{
+  uint32_t i;
+  uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
+  uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
+  
+#define indexOf_cb(idx, cb) (h->max_par_cb*(deinter[8*i+idx])+cb)
+  
+  // long_cb is always byte aligned
+  for (i = 0; i < long_cb/8; i++) {
+    uint8_t out0 = h->llr2[indexOf_cb(0, cb_idx)]>0?mask[0]:0;
+    uint8_t out1 = h->llr2[indexOf_cb(1, cb_idx)]>0?mask[1]:0;
+    uint8_t out2 = h->llr2[indexOf_cb(2, cb_idx)]>0?mask[2]:0;
+    uint8_t out3 = h->llr2[indexOf_cb(3, cb_idx)]>0?mask[3]:0;
+    uint8_t out4 = h->llr2[indexOf_cb(4, cb_idx)]>0?mask[4]:0;
+    uint8_t out5 = h->llr2[indexOf_cb(5, cb_idx)]>0?mask[5]:0;
+    uint8_t out6 = h->llr2[indexOf_cb(6, cb_idx)]>0?mask[6]:0;
+    uint8_t out7 = h->llr2[indexOf_cb(7, cb_idx)]>0?mask[7]:0;
+    
+    output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; 
+  }
+}
+
+void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
+{
+  for (int i=0;i<nof_cb;i++) {
+    srslte_tdec_simd_inter_decision_byte_cb(h, output[i], i, long_cb);
+  }
+}
+
+int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h, 
+                                  int16_t *input[SRSLTE_TDEC_NPAR], uint8_t *output[SRSLTE_TDEC_NPAR],
+                                  uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb)
+{
+  uint32_t iter = 0;
+
+  if (srslte_tdec_simd_inter_reset(h, long_cb)) {
+    return SRSLTE_ERROR; 
+  }
+
+  do {
+    srslte_tdec_simd_inter_iteration(h, input, nof_cb, long_cb);
+    iter++;
+  } while (iter < nof_iterations);
+
+  srslte_tdec_simd_inter_decision_byte(h, output, nof_cb, long_cb);
+  
+  return SRSLTE_SUCCESS;
+}
+
+#endif
--- a/lib/src/phy/fec/turbodecoder_sse.c
+++ b/lib/src/phy/fec/turbodecoder_sse.c
@ -31,7 +31,7 @@
 #include <strings.h>
 #include <math.h>

-#include "srslte/phy/fec/turbodecoder_sse.h"
+#include "srslte/phy/fec/turbodecoder_simd.h"
 #include "srslte/phy/utils/vector.h"

 #include <inttypes.h>
@ -52,16 +52,30 @@

 #ifdef LV_HAVE_SSE

+/*
+static void print_128i(__m128i x) {
+  int16_t *s = (int16_t*) &x; 
+  printf("[%d", s[0]);
+  for (int i=1;i<8;i++) {
+    printf(",%d", s[i]);
+  }
+  printf("]\n");
+}
+*/
+//#define use_beta_transposed_max
+
+#ifndef use_beta_transposed_max
+
 /* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */
 static inline int16_t hMax(__m128i buffer)
 {
-  __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer);
+  __m128i tmp1 = _mm_sub_epi16(_mm_set1_epi16(0x7FFF), buffer);
  __m128i tmp3 = _mm_minpos_epu16(tmp1);
  return (int16_t)(_mm_cvtsi128_si32(tmp3));
 }

 /* Computes beta values */
-void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
+void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
 {
  int k;
  uint32_t end = long_cb + 3;
@ -126,15 +140,15 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
    alpha_k = _mm_load_si128(alphaPtr);\
    alphaPtr--;\
    bp = _mm_add_epi16(bp, alpha_k);\
-    bn = _mm_add_epi16(bn, alpha_k); output[k-d] = hMax(bn) - hMax(bp);
-
+    bn = _mm_add_epi16(bn, alpha_k);\
+    output[k-d] = hMax(bn)-hMax(bp);
+    
  /* The tail does not require to load alpha or produce outputs. Only update 
   * beta metrics accordingly */
  for (k=end-1; k>=long_cb; k--) {
    int16_t g0 = s->branch[2*k];
    int16_t g1 = s->branch[2*k+1];
    g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
-  
    BETA_STEP(g);
  }  
  
@ -143,6 +157,7 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
  for (; k >= 0; k-=8) {    
    gv = _mm_load_si128(gPtr);
    gPtr--;
+
    BETA_STEP_CNT(0,0);
    BETA_STEP_CNT(1,1);
    BETA_STEP_CNT(2,2);
@ -154,14 +169,17 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
    BETA_STEP_CNT(0,4);
    BETA_STEP_CNT(1,5);
    BETA_STEP_CNT(2,6);
-    BETA_STEP_CNT(3,7);
+    BETA_STEP_CNT(3,7);    
+        
    norm = _mm_shuffle_epi8(beta_k, shuf_norm); 
    beta_k = _mm_sub_epi16(beta_k, norm);
  }  
 }

+#endif
+
 /* Computes alpha metrics */
-void map_gen_alpha(map_gen_t * s, uint32_t long_cb)
+void map_sse_alpha(map_gen_t * s, uint32_t long_cb)
 {
  uint32_t k;
  int16_t *alpha = s->alpha;
@ -250,9 +268,9 @@ void map_gen_alpha(map_gen_t * s, uint32_t long_cb)
 }

 /* Compute branch metrics (gamma) */
-void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) 
+void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) 
 {
-  __m128i res10, res20, res11, res21, res1, res2; 
+  __m128i res00, res10, res01, res11, res0, res1; 
  __m128i in, ap, pa, g1, g0;

  __m128i *inPtr  = (__m128i*) input;
@ -260,10 +278,10 @@ void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
  __m128i *paPtr  = (__m128i*) parity;
  __m128i *resPtr = (__m128i*) h->branch;
  
-  __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
-  __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
-  __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
-  __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
+  __m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
+  __m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
+  __m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
+  __m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
  
  for (int i=0;i<long_cb/8;i++) {
    in = _mm_load_si128(inPtr);
@ -283,17 +301,17 @@ void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
    g1 = _mm_srai_epi16(g1, 1);
    g0 = _mm_srai_epi16(g0, 1);
    
+    res00 = _mm_shuffle_epi8(g0, res00_mask);
    res10 = _mm_shuffle_epi8(g0, res10_mask);
-    res20 = _mm_shuffle_epi8(g0, res20_mask);
+    res01 = _mm_shuffle_epi8(g1, res01_mask);
    res11 = _mm_shuffle_epi8(g1, res11_mask);
-    res21 = _mm_shuffle_epi8(g1, res21_mask);

+    res0  = _mm_or_si128(res00, res01);
    res1  = _mm_or_si128(res10, res11);
-    res2  = _mm_or_si128(res20, res21);

-    _mm_store_si128(resPtr, res1);
+    _mm_store_si128(resPtr, res0);
    resPtr++;
-    _mm_store_si128(resPtr, res2);    
+    _mm_store_si128(resPtr, res1);    
    resPtr++;
  }

@ -303,356 +321,177 @@ void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
  }
 }

-/* Inititalizes constituent decoder object */
-int map_gen_init(map_gen_t * h, int max_long_cb)
-{
-  bzero(h, sizeof(map_gen_t));
-  h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
-  if (!h->alpha) {
-    perror("srslte_vec_malloc");
-    return -1;
-  }
-  h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
-  if (!h->branch) {
-    perror("srslte_vec_malloc");
-    return -1;
-  }
-  h->max_long_cb = max_long_cb;
-  return 0;
-}

-void map_gen_free(map_gen_t * h)
-{
-  if (h->alpha) {
-    free(h->alpha);
-  }
-  if (h->branch) {
-    free(h->branch);
-  }
-  bzero(h, sizeof(map_gen_t));
-}

-/* Runs one instance of a decoder */
-void map_gen_dec(map_gen_t * h, int16_t * input, int16_t *app, int16_t * parity, int16_t * output,
-                 uint32_t long_cb)
-{
- 
-  // Compute branch metrics
-  map_gen_gamma(h, input, app, parity, long_cb);

-  // Forward recursion
-  map_gen_alpha(h, long_cb);

-  // Backwards recursion + LLR computation
-  map_gen_beta(h, output, long_cb);
-  
-}

-/* Initializes the turbo decoder object */
-int srslte_tdec_sse_init(srslte_tdec_sse_t * h, uint32_t max_long_cb)
-{
-  int ret = -1;
-  bzero(h, sizeof(srslte_tdec_sse_t));
-  uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
-
-  h->max_long_cb = max_long_cb;
-
-  h->app1 = srslte_vec_malloc(sizeof(int16_t) * len);
-  if (!h->app1) {
-    perror("srslte_vec_malloc");
-    goto clean_and_exit;
-  }
-  h->app2 = srslte_vec_malloc(sizeof(int16_t) * len);
-  if (!h->app2) {
-    perror("srslte_vec_malloc");
-    goto clean_and_exit;
-  }
-  h->ext1 = srslte_vec_malloc(sizeof(int16_t) * len);
-  if (!h->ext1) {
-    perror("srslte_vec_malloc");
-    goto clean_and_exit;
-  }
-  h->ext2 = srslte_vec_malloc(sizeof(int16_t) * len);
-  if (!h->ext2) {
-    perror("srslte_vec_malloc");
-    goto clean_and_exit;
-  }
-  h->syst = srslte_vec_malloc(sizeof(int16_t) * len);
-  if (!h->syst) {
-    perror("srslte_vec_malloc");
-    goto clean_and_exit;
-  }
-  h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len);
-  if (!h->parity0) {
-    perror("srslte_vec_malloc");
-    goto clean_and_exit;
-  }
-  h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len);
-  if (!h->parity1) {
-    perror("srslte_vec_malloc");
-    goto clean_and_exit;
-  }
-
-  if (map_gen_init(&h->dec, h->max_long_cb)) {
-    goto clean_and_exit;
-  }
-
-  for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
-    if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
-      goto clean_and_exit;
-    }
-    srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
-  }
-  h->current_cbidx = -1; 
-  ret = 0;
-clean_and_exit:if (ret == -1) {
-    srslte_tdec_sse_free(h);
-  }
-  return ret;
-}
-
-void srslte_tdec_sse_free(srslte_tdec_sse_t * h)
-{
-  if (h->app1) {
-    free(h->app1);
-  }
-  if (h->app2) {
-    free(h->app2);
-  }
-  if (h->ext1) {
-    free(h->ext1);
-  }
-  if (h->ext2) {
-    free(h->ext2);
-  }
-  if (h->syst) {
-    free(h->syst);
-  }
-  if (h->parity0) {
-    free(h->parity0);
-  }
-  if (h->parity1) {
-    free(h->parity1);
-  }
-
-  map_gen_free(&h->dec);
-
-  for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
-    srslte_tc_interl_free(&h->interleaver[i]);    
-  }
-
-  bzero(h, sizeof(srslte_tdec_sse_t));
-}
-
-/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into 
- * 3 buffers ready to be used by compute_gamma() 
+/***********************
+ * 
+ * This is an attempt to parallelize the horizontal max 
+ * by doing a 8x8 tranpose of the vectors and computing max 
+ * in cascade. However since we need to store 16 registers 
+ * for the positive and negative values the performance is not very good
 */
-void deinterleave_input(srslte_tdec_sse_t *h, int16_t *input, uint32_t long_cb) {
-  uint32_t i;
+
+
+#ifdef use_beta_transposed_max
+
+static inline __m128i transposed_max(__m128i a, __m128i b, __m128i c, __m128i d, 
+                                     __m128i e, __m128i f, __m128i g, __m128i h)
+{
+  // Transpose 8 vectors 
+  __m128i t0 = _mm_unpacklo_epi16(a, b);
+  __m128i t1 = _mm_unpacklo_epi16(c, d);
+  __m128i t2 = _mm_unpacklo_epi16(e, f);
+  __m128i t3 = _mm_unpacklo_epi16(g, h);
+  __m128i t4 = _mm_unpackhi_epi16(a, b);
+  __m128i t5 = _mm_unpackhi_epi16(c, d);
+  __m128i t6 = _mm_unpackhi_epi16(e, f);
+  __m128i t7 = _mm_unpackhi_epi16(g, h);
+
+  __m128i s0 = _mm_unpacklo_epi32(t0, t1);
+  __m128i s1 = _mm_unpackhi_epi32(t0, t1);
+  __m128i s2 = _mm_unpacklo_epi32(t2, t3);
+  __m128i s3 = _mm_unpackhi_epi32(t2, t3);
+  __m128i s4 = _mm_unpacklo_epi32(t4, t5);
+  __m128i s5 = _mm_unpackhi_epi32(t4, t5);
+  __m128i s6 = _mm_unpacklo_epi32(t6, t7);
+  __m128i s7 = _mm_unpackhi_epi32(t6, t7);
+
+  __m128i x0 = _mm_unpacklo_epi64(s0, s2);
+  __m128i x1 = _mm_unpackhi_epi64(s0, s2);
+  __m128i x2 = _mm_unpacklo_epi64(s1, s3);
+  __m128i x3 = _mm_unpackhi_epi64(s1, s3);
+  __m128i x4 = _mm_unpacklo_epi64(s4, s6);
+  __m128i x5 = _mm_unpackhi_epi64(s4, s6);
+  __m128i x6 = _mm_unpacklo_epi64(s5, s7);
+  __m128i x7 = _mm_unpackhi_epi64(s5, s7);
+
+  // Cascade max on the transposed vector 
+  __m128i res = _mm_max_epi16(x0,
+                _mm_max_epi16(x1,
+                _mm_max_epi16(x2,
+                _mm_max_epi16(x3,
+                _mm_max_epi16(x4,
+                _mm_max_epi16(x5,
+                _mm_max_epi16(x6,
+                              x7)))))));
+                              
+  return res;   
+}
+
+void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
+{
+  int k;
+  uint32_t end = long_cb + 3;
+  const __m128i *alphaPtr = (const __m128i*) s->alpha;
 
-  __m128i *inputPtr = (__m128i*) input; 
-  __m128i in0, in1, in2;
-  __m128i s0, s1, s2, s;
-  __m128i p00, p01, p02, p0;
-  __m128i p10, p11, p12, p1;
+  __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
+  __m128i g, alpha_k; 
+  __m128i bn, bn_0, bn_1, bn_2, bn_3, bn_4, bn_5, bn_6, bn_7;
+  __m128i bp, bp_0, bp_1, bp_2, bp_3, bp_4, bp_5, bp_6, bp_7;
  
-  __m128i *sysPtr = (__m128i*) h->syst; 
-  __m128i *pa0Ptr = (__m128i*) h->parity0; 
-  __m128i *pa1Ptr = (__m128i*) h->parity1; 
-  
-  // pick bits 0, 3, 6 from 1st word
-  __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
-  // pick bits 1, 4, 7 from 2st word
-  __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
-  // pick bits 2, 5 from 3rd word
-  __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+  /* Define the shuffle constant for the positive beta */
+  __m128i shuf_bp = _mm_set_epi8(
+    15, 14, // 7
+    7,  6,  // 3
+    5,  4,  // 2
+    13, 12, // 6
+    11, 10, // 5
+    3,  2,  // 1
+    1,  0,  // 0
+    9,  8   // 4
+  );

-  // pick bits 1, 4, 7 from 1st word
-  __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
-  // pick bits 2, 5, from 2st word
-  __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
-  // pick bits 0, 3, 6 from 3rd word
-  __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+  /* Define the shuffle constant for the negative beta */
+  __m128i shuf_bn = _mm_set_epi8(
+    7,   6, // 3
+    15, 14, // 7
+    13, 12, // 6
+    5,  4,  // 2
+    3,  2,  // 1
+    11, 10, // 5
+    9,  8,  // 4
+    1,  0   // 0
+  );
+ 
+  alphaPtr += long_cb-1;
+
+  /* Define shuffle for branch costs */
+  __m128i shuf_g[4];  
+  shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
+  shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
+  shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
+  shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
+  __m128i gv;
+  int16_t *b = &s->branch[2*long_cb-8];
+  __m128i *gPtr = (__m128i*) b;
+  /* Define shuffle for beta normalization */
+  __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
  
-  // pick bits 2, 5 from 1st word
-  __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
-  // pick bits 0, 3, 6, from 2st word
-  __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
-  // pick bits 1, 4, 7 from 3rd word
-  __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-      
-  // Split systematic and parity bits
-  for (i = 0; i < long_cb/8; i++) {
-        
-    in0 = _mm_load_si128(inputPtr); inputPtr++; 
-    in1 = _mm_load_si128(inputPtr); inputPtr++;   
-    in2 = _mm_load_si128(inputPtr); inputPtr++;
+  /* This defines a beta computation step: 
+   * Adds and substracts the branch metrics to the previous beta step, 
+   * shuffles the states according to the trellis path and selects maximum state 
+   */
+#define BETA_STEP(g)     bp = _mm_add_epi16(beta_k, g);\
+    bn = _mm_sub_epi16(beta_k, g);\
+    bp = _mm_shuffle_epi8(bp, shuf_bp);\
+    bn = _mm_shuffle_epi8(bn, shuf_bn);\
+    beta_k = _mm_max_epi16(bp, bn);    
+
+    /* Loads the alpha metrics from memory and adds them to the temporal bn and bp 
+     * metrics. 
+     */
+#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\
+    BETA_STEP(g)\
+    alpha_k = _mm_load_si128(alphaPtr);\
+    alphaPtr--;\
+    bp_##d = _mm_add_epi16(bp, alpha_k);\
+    bn_##d = _mm_add_epi16(bn, alpha_k);\
+
+  /* The tail does not require to load alpha or produce outputs. Only update 
+   * beta metrics accordingly */
+  for (k=end-1; k>=long_cb; k--) {
+    int16_t g0 = s->branch[2*k];
+    int16_t g1 = s->branch[2*k+1];
+    g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
+    BETA_STEP(g);
+  }  
+  
+  /* We inline 2 trelis steps for each normalization */
+  __m128i norm;
+  __m128i *outPtr = (__m128i*) &output[long_cb-8];
+  for (; k >= 0; k-=8) {    
+    gv = _mm_load_si128(gPtr);
+    gPtr--;
    
-    /* Deinterleave Systematic bits */
-    s0 = _mm_shuffle_epi8(in0, s0_mask);
-    s1 = _mm_shuffle_epi8(in1, s1_mask);
-    s2 = _mm_shuffle_epi8(in2, s2_mask);    
-    s = _mm_or_si128(s0, s1);
-    s = _mm_or_si128(s, s2);
-
-    _mm_store_si128(sysPtr, s);
-    sysPtr++;
-
-    /* Deinterleave parity 0 bits */
-    p00 = _mm_shuffle_epi8(in0, p00_mask);
-    p01 = _mm_shuffle_epi8(in1, p01_mask);
-    p02 = _mm_shuffle_epi8(in2, p02_mask);    
-    p0 = _mm_or_si128(p00, p01);
-    p0 = _mm_or_si128(p0, p02);
+    BETA_STEP_CNT(0,0);
+    BETA_STEP_CNT(1,1);
+    BETA_STEP_CNT(2,2);
+    BETA_STEP_CNT(3,3);
+    norm = _mm_shuffle_epi8(beta_k, shuf_norm); 
+    beta_k = _mm_sub_epi16(beta_k, norm);
+    gv = _mm_load_si128(gPtr);
+    gPtr--;
+    BETA_STEP_CNT(0,4);
+    BETA_STEP_CNT(1,5);
+    BETA_STEP_CNT(2,6);
+    BETA_STEP_CNT(3,7);    
+    norm = _mm_shuffle_epi8(beta_k, shuf_norm); 
+    beta_k = _mm_sub_epi16(beta_k, norm);
    
-    _mm_store_si128(pa0Ptr, p0);
-    pa0Ptr++;
-
-    /* Deinterleave parity 1 bits */
-    p10 = _mm_shuffle_epi8(in0, p10_mask);
-    p11 = _mm_shuffle_epi8(in1, p11_mask);
-    p12 = _mm_shuffle_epi8(in2, p12_mask);    
-    p1 = _mm_or_si128(p10, p11);
-    p1 = _mm_or_si128(p1, p12);
-
-    _mm_store_si128(pa1Ptr, p1);
-    pa1Ptr++;    
-
-  }
-  
-  for (i = 0; i < 3; i++) {
-    h->syst[i+long_cb]    = input[3*long_cb + 2*i];
-    h->parity0[i+long_cb] = input[3*long_cb + 2*i + 1];
-  }
-  for (i = 0; i < 3; i++) {
-    h->app2[i+long_cb]    = input[3*long_cb + 6 + 2*i];
-    h->parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
-  }
-
+    __m128i bn_transp = transposed_max(bn_7, bn_6, bn_5, bn_4, bn_3, bn_2, bn_1, bn_0);
+    __m128i bp_transp = transposed_max(bp_7, bp_6, bp_5, bp_4, bp_3, bp_2, bp_1, bp_0);
+    __m128i outval = _mm_sub_epi16(bp_transp,bn_transp);
+    _mm_store_si128(outPtr, outval);
+    outPtr--;    
+  }  
 }
+#endif

-/* Runs 1 turbo decoder iteration */
-void srslte_tdec_sse_iteration(srslte_tdec_sse_t * h, int16_t * input, uint32_t long_cb)
-{

-  if (h->current_cbidx >= 0) {
-    uint16_t *inter   = h->interleaver[h->current_cbidx].forward;
-    uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
-    
-    if (h->n_iter == 0) {
-      deinterleave_input(h, input, long_cb);
-    }
-    
-    // Add apriori information to decoder 1 
-    if (h->n_iter > 0) {
-      srslte_vec_sub_sss(h->app1, h->ext1, h->app1, long_cb);
-    }
-        
-    // Run MAP DEC #1
-    if (h->n_iter == 0) {
-      map_gen_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, long_cb);            
-    } else {
-      map_gen_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, long_cb);      
-    }

-    // Convert aposteriori information into extrinsic information    
-    if (h->n_iter > 0) {
-      srslte_vec_sub_sss(h->ext1, h->app1, h->ext1, long_cb);
-    }
-    
-    // Interleave extrinsic output of DEC1 to form apriori info for decoder 2
-    srslte_vec_lut_sss(h->ext1, deinter, h->app2, long_cb);
-
-    // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits
-    map_gen_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, long_cb);
-
-    // Deinterleaved extrinsic bits become apriori info for decoder 1 
-    srslte_vec_lut_sss(h->ext2, inter, h->app1, long_cb);
-    
-    h->n_iter++;
-  } else {
-    fprintf(stderr, "Error CB index not set (call srslte_tdec_sse_reset() first\n");    
-  }
-}
-
-/* Resets the decoder and sets the codeblock length */
-int srslte_tdec_sse_reset(srslte_tdec_sse_t * h, uint32_t long_cb)
-{
-  if (long_cb > h->max_long_cb) {
-    fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
-            h->max_long_cb);
-    return -1;
-  }
-  h->n_iter = 0; 
-  h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
-  if (h->current_cbidx < 0) {
-    fprintf(stderr, "Invalid CB length %d\n", long_cb);
-    return -1; 
-  }
-  return 0;
-}
-
-void srslte_tdec_sse_decision(srslte_tdec_sse_t * h, uint8_t *output, uint32_t long_cb)
-{
-  __m128i zero     = _mm_set1_epi16(0);
-  __m128i lsb_mask = _mm_set1_epi16(1);
-  
-  __m128i *appPtr = (__m128i*) h->app1;
-  __m128i *outPtr = (__m128i*) output;
-  __m128i ap, out, out0, out1; 
-    
-  for (uint32_t i = 0; i < long_cb/16; i++) {
-    ap   = _mm_load_si128(appPtr); appPtr++;    
-    out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
-    ap   = _mm_load_si128(appPtr); appPtr++;
-    out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
-    
-    out  = _mm_packs_epi16(out0, out1);
-    _mm_store_si128(outPtr, out);
-    outPtr++;
-  }
-  if (long_cb%16) {
-    for (int i=0;i<8;i++) {
-      output[long_cb-8+i] = h->app1[long_cb-8+i]>0?1:0;
-    }
-  }
-}
-
-void srslte_tdec_sse_decision_byte(srslte_tdec_sse_t * h, uint8_t *output, uint32_t long_cb)
-{
-  uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
-  
-  // long_cb is always byte aligned
-  for (uint32_t i = 0; i < long_cb/8; i++) {
-    uint8_t out0 = h->app1[8*i+0]>0?mask[0]:0;
-    uint8_t out1 = h->app1[8*i+1]>0?mask[1]:0;
-    uint8_t out2 = h->app1[8*i+2]>0?mask[2]:0;
-    uint8_t out3 = h->app1[8*i+3]>0?mask[3]:0;
-    uint8_t out4 = h->app1[8*i+4]>0?mask[4]:0;
-    uint8_t out5 = h->app1[8*i+5]>0?mask[5]:0;
-    uint8_t out6 = h->app1[8*i+6]>0?mask[6]:0;
-    uint8_t out7 = h->app1[8*i+7]>0?mask[7]:0;
-    
-    output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; 
-  }
-}
-
-/* Runs nof_iterations iterations and decides the output bits */
-int srslte_tdec_sse_run_all(srslte_tdec_sse_t * h, int16_t * input, uint8_t *output,
-                  uint32_t nof_iterations, uint32_t long_cb)
-{
-  if (srslte_tdec_sse_reset(h, long_cb)) {
-    return SRSLTE_ERROR; 
-  }
-
-  do {
-    srslte_tdec_sse_iteration(h, input, long_cb);
-  } while (h->n_iter < nof_iterations);
-
-  srslte_tdec_sse_decision_byte(h, output, long_cb);
-  
-  return SRSLTE_SUCCESS;
-}

 #endif

--- a/lib/src/phy/fec/turbodecoder_sse_inter.c
+++ b/lib/src/phy/fec/turbodecoder_sse_inter.c
@ -0,0 +1,198 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <math.h>
+
+#include "srslte/phy/fec/turbodecoder_simd_inter.h"
+#include "srslte/phy/utils/vector.h"
+
+
+#define NCB 8
+
+#define INF 10000
+
+#ifdef LV_HAVE_SSE
+#include <smmintrin.h>
+
+void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb) 
+{
+  __m128i *llr1Ptr  = (__m128i*) h->llr1; 
+  __m128i *wPtr     = (__m128i*) h->w; 
+  __m128i *syst1Ptr = (__m128i*) h->syst1; 
+  
+  for (int i = 0; i < long_cb; i++) {    
+    __m128i llr1 = _mm_load_si128(&llr1Ptr[inter[i]]);
+    __m128i w    = _mm_load_si128(&wPtr[inter[i]]);
+    _mm_store_si128(syst1Ptr++, _mm_sub_epi16(llr1, w));
+  }
+}
+
+void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb) 
+{
+  __m128i *llr1Ptr  = (__m128i*) h->llr1; 
+  __m128i *llr2Ptr  = (__m128i*) h->llr2; 
+  __m128i *wPtr     = (__m128i*) h->w; 
+  __m128i *syst1Ptr = (__m128i*) h->syst1; 
+  
+  for (int i = 0; i < long_cb; i++) {    
+    __m128i llr1 = _mm_load_si128(llr1Ptr++);
+    __m128i w    = _mm_load_si128(wPtr++);
+    __m128i llr2 = _mm_load_si128(&llr2Ptr[deinter[i]]);
+    
+    _mm_store_si128(syst1Ptr++, _mm_add_epi16(w, _mm_sub_epi16(llr2, llr1)));
+  }
+}
+
+/* Computes beta values */
+void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb)
+{
+  __m128i m_b[8], new[8], old[8], max1[8], max0[8];
+  __m128i x, y, xy;
+  __m128i m1, m0;
+  uint32_t end = long_cb + 3;
+  uint32_t i;
+
+  __m128i *inputPtr  = (__m128i*) input;
+  __m128i *parityPtr = (__m128i*) parity;
+  __m128i *outputPtr = (__m128i*) output;
+  __m128i *alphaPtr  = (__m128i*) s->alpha;
+
+  for (int k = end - 1; k >= 0; k--) {
+    x = _mm_load_si128(inputPtr++);
+    y = _mm_load_si128(parityPtr++);
+
+    xy = _mm_add_epi16(x,y);
+
+    m_b[0] = _mm_add_epi16(old[4], xy);
+    m_b[1] = old[4];
+    m_b[2] = _mm_add_epi16(old[5], y);
+    m_b[3] = _mm_add_epi16(old[5], x);
+    m_b[4] = _mm_add_epi16(old[6], x);
+    m_b[5] = _mm_add_epi16(old[6], y);
+    m_b[6] = old[7];
+    m_b[7] = _mm_add_epi16(old[7], xy);
+
+    new[0] = old[0];
+    new[1] = _mm_add_epi16(old[0], xy);
+    new[2] = _mm_add_epi16(old[1], x);
+    new[3] = _mm_add_epi16(old[1], y);
+    new[4] = _mm_add_epi16(old[2], y);
+    new[5] = _mm_add_epi16(old[2], x);
+    new[6] = _mm_add_epi16(old[3], xy);
+    new[7] = old[3];
+
+    for (i = 0; i < 8; i++) {
+      __m128i alpha = _mm_load_si128(alphaPtr++);
+      max0[i] = _mm_add_epi16(alpha, m_b[i]);
+      max1[i] = _mm_add_epi16(alpha, new[i]);
+    }
+
+    m1 = _mm_max_epi16(max1[0], max1[1]);
+    m0 = _mm_max_epi16(max0[0], max0[1]);
+
+    for (i = 2; i < 8; i++) {
+      m1 = _mm_max_epi16(m1, max1[i]);
+      m0 = _mm_max_epi16(m0, max0[i]);      
+    }
+
+    for (i = 0; i < 8; i++) {
+      new[i] = _mm_max_epi16(m_b[i], new[i]);
+      old[i] = new[i];
+    }
+
+    __m128i out = _mm_sub_epi16(m1, m0);
+    _mm_store_si128(outputPtr++, out);
+
+    // normalize 
+    if ((k%4)==0) {
+      for (int i=1;i<8;i++) {
+        _mm_sub_epi16(old[i], old[0]);
+      }
+    }
+  }
+}
+
+/* Computes alpha metrics */
+void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb)
+{
+  __m128i m_b[8], new[8], old[8];
+  __m128i x, y, xy;
+  uint32_t k;
+  
+  __m128i *inputPtr  = (__m128i*) input;
+  __m128i *parityPtr = (__m128i*) parity;
+  __m128i *alphaPtr  = (__m128i*) s->alpha;
+  
+  old[0] = _mm_set1_epi16(0);
+  for (int i = 1; i < 8; i++) {
+    old[i] = _mm_set1_epi16(-INF);
+  }
+
+  for (k = 0; k < long_cb; k++) {
+    x = _mm_load_si128(inputPtr++);
+    y = _mm_load_si128(parityPtr++);
+
+    xy = _mm_add_epi16(x,y);
+
+    m_b[0] = old[0];
+    m_b[1] = _mm_add_epi16(old[3], y);
+    m_b[2] = _mm_add_epi16(old[4], y);
+    m_b[3] = old[7];
+    m_b[4] = old[1];
+    m_b[5] = _mm_add_epi16(old[2], y);
+    m_b[6] = _mm_add_epi16(old[5], y);
+    m_b[7] = old[6];
+
+    new[0] = _mm_add_epi16(old[1], xy);
+    new[1] = _mm_add_epi16(old[2], x);
+    new[2] = _mm_add_epi16(old[5], x);
+    new[3] = _mm_add_epi16(old[6], xy);
+    new[4] = _mm_add_epi16(old[0], xy);
+    new[5] = _mm_add_epi16(old[3], x);
+    new[6] = _mm_add_epi16(old[4], x);
+    new[7] = _mm_add_epi16(old[7], xy);    
+    
+    for (int i = 0; i < 8; i++) {
+      new[i] = _mm_max_epi16(m_b[i], new[i]);
+      old[i] = new[i];
+      _mm_store_si128(alphaPtr++, old[i]);
+    }
+
+    // normalize 
+    if ((k%4)==0) {
+      for (int i=1;i<8;i++) {
+        _mm_sub_epi16(old[i], old[0]);
+      }
+    }
+  }
+}
+
+#endif
--- a/lib/src/phy/fec/viterbi.c
+++ b/lib/src/phy/fec/viterbi.c
@ -122,7 +122,7 @@ void free37_sse(void *o) {



-#ifdef LV_HAVE_AVX
+#ifdef LV_HAVE_AVX2
 int decode37_avx2(void *o, uint8_t *symbols, uint8_t *data, uint32_t frame_length) {
  srslte_viterbi_t *q = o;

@ -333,7 +333,7 @@ int init37_neon(srslte_viterbi_t *q, int poly[3], uint32_t framebits, bool tail_
 #endif


-#ifdef LV_HAVE_AVX
+#ifdef LV_HAVE_AVX2
 int init37_avx2(srslte_viterbi_t *q, int poly[3], uint32_t framebits, bool tail_biting) {
  q->K = 7;
  q->R = 3;
@ -383,7 +383,7 @@ int srslte_viterbi_init(srslte_viterbi_t *q, srslte_viterbi_type_t type, int pol
  switch (type) {
  case SRSLTE_VITERBI_37:
 #ifdef LV_HAVE_SSE
-    #ifdef LV_HAVE_AVX 
+    #ifdef LV_HAVE_AVX2
        return init37_avx2(q, poly, max_frame_length, tail_bitting);
    #else
        return init37_sse(q, poly, max_frame_length, tail_bitting);
@ -408,7 +408,7 @@ int srslte_viterbi_init_sse(srslte_viterbi_t *q, srslte_viterbi_type_t type, int
 }
 #endif

-#ifdef LV_HAVE_AVX
+#ifdef LV_HAVE_AVX2
 int srslte_viterbi_init_avx2(srslte_viterbi_t *q, srslte_viterbi_type_t type, int poly[3], uint32_t max_frame_length, bool tail_bitting) 
 {
  return init37_avx2(q, poly, max_frame_length, tail_bitting);      
--- a/lib/src/phy/fec/viterbi37_avx2.c
+++ b/lib/src/phy/fec/viterbi37_avx2.c
@ -14,7 +14,7 @@

 //#define DEBUG

-#ifdef LV_HAVE_SSE
+#ifdef LV_HAVE_AVX2

 #include <emmintrin.h>
 #include <tmmintrin.h>
--- a/lib/src/phy/io/filesink.c
+++ b/lib/src/phy/io/filesink.c
@ -52,7 +52,7 @@ void srslte_filesink_free(srslte_filesink_t *q) {
 }

 int srslte_filesink_write(srslte_filesink_t *q, void *buffer, int nsamples) {
-  int i;
+  int i = 0;
  float *fbuf = (float*) buffer;
  _Complex float *cbuf = (_Complex float*) buffer;
  _Complex short *sbuf = (_Complex short*) buffer;
--- a/lib/src/phy/phch/pbch.c
+++ b/lib/src/phy/phch/pbch.c
@ -111,6 +111,10 @@ int srslte_pbch_cp(cf_t *input, cf_t *output, srslte_cell_t cell, bool put) {
 * Returns the number of symbols written to slot1_data
 *
 * 36.211 10.3 section 6.6.4
+ *
+ * @param[in] pbch PBCH complex symbols to place in slot1_data
+ * @param[out] slot1_data Complex symbol buffer for slot1
+ * @param[in] cell Cell configuration
 */
 int srslte_pbch_put(cf_t *pbch, cf_t *slot1_data, srslte_cell_t cell) {
  return srslte_pbch_cp(pbch, slot1_data, cell, true);
@ -122,6 +126,10 @@ int srslte_pbch_put(cf_t *pbch, cf_t *slot1_data, srslte_cell_t cell) {
 * Returns the number of symbols written to pbch
 *
 * 36.211 10.3 section 6.6.4
+ *
+ * @param[in] slot1_data Complex symbols for slot1
+ * @param[out] pbch Extracted complex PBCH symbols
+ * @param[in] cell Cell configuration
 */
 int srslte_pbch_get(cf_t *slot1_data, cf_t *pbch, srslte_cell_t cell) {
  return srslte_pbch_cp(slot1_data, pbch, cell, false);
@ -244,8 +252,12 @@ void srslte_pbch_free(srslte_pbch_t *q) {
 }


-/** Unpacks MIB from PBCH message.
- * msg buffer must be 24 byte length at least
+/**
+ * Unpacks MIB from PBCH message.
+ *
+ * @param[in] msg PBCH in an unpacked bit array of size 24
+ * @param[out] sfn System frame number
+ * @param[out] cell MIB information about PHICH and system bandwidth will be saved here
 */
 void srslte_pbch_mib_unpack(uint8_t *msg, srslte_cell_t *cell, uint32_t *sfn) {
  int phich_res;
@ -289,8 +301,12 @@ void srslte_pbch_mib_unpack(uint8_t *msg, srslte_cell_t *cell, uint32_t *sfn) {
  }
 }

-/** Unpacks MIB from PBCH message.
- * msg buffer must be 24 byte length at least
+/**
+ * Packs MIB to PBCH message.
+ *
+ * @param[out] payload Output unpacked bit array of size 24
+ * @param[in] sfn System frame number
+ * @param[in] cell Cell configuration to be encoded in MIB
 */
 void srslte_pbch_mib_pack(srslte_cell_t *cell, uint32_t sfn, uint8_t *payload) {
  int bw, phich_res = 0;
--- a/lib/src/phy/phch/pdsch.c
+++ b/lib/src/phy/phch/pdsch.c
@ -450,11 +450,11 @@ int srslte_pdsch_decode_multi(srslte_pdsch_t *q,
    
    if (SRSLTE_VERBOSE_ISDEBUG()) {
      DEBUG("SAVED FILE subframe.dat: received subframe symbols\n",0);
-      srslte_vec_save_file("subframe.dat", sf_symbols, SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t));
+      srslte_vec_save_file("subframe.dat", sf_symbols[0], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t));
      DEBUG("SAVED FILE hest0.dat and hest1.dat: channel estimates for port 0 and port 1\n",0);
-      srslte_vec_save_file("hest0.dat", ce[0], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t));
+      srslte_vec_save_file("hest0.dat", ce[0][0], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t));
      if (q->cell.nof_ports > 1) {
-        srslte_vec_save_file("hest1.dat", ce[1], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t));
+        srslte_vec_save_file("hest1.dat", ce[1][0], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t));
      }
      DEBUG("SAVED FILE pdsch_symbols.dat: symbols after equalization\n",0);
      srslte_vec_save_file("pdsch_symbols.dat", q->d, cfg->nbits.nof_re*sizeof(cf_t));
--- a/lib/src/phy/phch/ra.c
+++ b/lib/src/phy/phch/ra.c
@ -295,6 +295,11 @@ uint32_t srslte_ra_dl_grant_nof_re(srslte_ra_dl_grant_t *grant, srslte_cell_t ce
  return nof_re; 
 }

+/** Compute PRB allocation for Downlink as defined in 7.1.6 of 36.213
+ * Decode dci->type?_alloc to grant
+ * This function only reads dci->type?_alloc and dci->alloc_type fields.
+ * This function only writes grant->prb_idx and grant->nof_prb.
+ */
 /** Compute PRB allocation for Downlink as defined in 7.1.6 of 36.213 */
 int srslte_ra_dl_dci_to_grant_prb_allocation(srslte_ra_dl_dci_t *dci, srslte_ra_dl_grant_t *grant, uint32_t nof_prb) {
  int i, j;
@ -427,7 +432,7 @@ int srslte_ra_dl_dci_to_grant_prb_allocation(srslte_ra_dl_dci_t *dci, srslte_ra_
  return SRSLTE_SUCCESS;
 }

-static int dl_fill_ra_mcs(srslte_ra_mcs_t *mcs, uint32_t nprb) {
+int dl_fill_ra_mcs(srslte_ra_mcs_t *mcs, uint32_t nprb) {
  uint32_t i_tbs = 0; 
  int tbs = -1; 
  if (mcs->idx < 10) {
@ -461,7 +466,12 @@ static int dl_fill_ra_mcs(srslte_ra_mcs_t *mcs, uint32_t nprb) {
  return tbs; 
 }

-/* Modulation order and transport block size determination 7.1.7 in 36.213 */
+/* Modulation order and transport block size determination 7.1.7 in 36.213
+ * This looks at DCI type, type of RNTI and reads fields dci->type?_alloc, dci->mcs_idx,
+ * dci->dci_is_1a and dci->dci_is_1c
+ * Reads global variable last_dl_tbs if mcs>=29
+ * Writes global variable last_dl_tbs if mcs<29
+ * */
 static int dl_dci_to_grant_mcs(srslte_ra_dl_dci_t *dci, srslte_ra_dl_grant_t *grant, bool crc_is_crnti) {
  uint32_t n_prb=0;
  int tbs = -1; 
--- a/lib/src/phy/phch/regs.c
+++ b/lib/src/phy/phch/regs.c
@ -89,7 +89,7 @@ int regs_pdcch_init(srslte_regs_t *h) {
  bzero(&h->pdcch, sizeof(srslte_regs_ch_t));

  for (cfi=0;cfi<3;cfi++) {
-    if (h->cell.nof_prb < 10) {
+    if (h->cell.nof_prb <= 10) {
      nof_ctrl_symbols = cfi+2;
    } else {
      nof_ctrl_symbols = cfi+1;
@ -673,7 +673,7 @@ void srslte_regs_free(srslte_regs_t *h) {
 int srslte_regs_set_cfi(srslte_regs_t *h, uint32_t cfi) {  
  if (cfi > 0 && cfi <= 3) {
    if (h->phich_len == SRSLTE_PHICH_EXT &&
-        ((h->cell.nof_prb < 10 && cfi < 2) || (h->cell.nof_prb >= 10 && cfi < 3))) {
+        ((h->cell.nof_prb <= 10 && cfi < 2) || (h->cell.nof_prb >= 10 && cfi < 3))) {
      fprintf(stderr, "PHICH length is extended. The number of control symbols should be at least 3.\n");
      return SRSLTE_ERROR_INVALID_INPUTS;
    } else {
@ -705,7 +705,7 @@ int srslte_regs_init(srslte_regs_t *h, srslte_cell_t cell) {
    bzero(h, sizeof(srslte_regs_t));
    ret = SRSLTE_ERROR;
    
-    max_ctrl_symbols = cell.nof_prb<10?4:3;
+    max_ctrl_symbols = cell.nof_prb<=10?4:3;
    vo = cell.id % 3;
    h->cell = cell;
    h->max_ctrl_symbols = max_ctrl_symbols;
--- a/lib/src/phy/phch/sch.c
+++ b/lib/src/phy/phch/sch.c
@ -311,102 +311,91 @@ static int encode_tb(srslte_sch_t *q,
  return encode_tb_off(q, soft_buffer, cb_segm, Qm, rv, nof_e_bits, data, e_bits, 0);
 }

-  
-
-
-/* Decode a transport block according to 36.212 5.3.2
- *
- */
-static int decode_tb(srslte_sch_t *q, 
+bool decode_tb_cb(srslte_sch_t *q, 
                     srslte_softbuffer_rx_t *softbuffer, srslte_cbsegm_t *cb_segm, 
                     uint32_t Qm, uint32_t rv, uint32_t nof_e_bits, 
-                     int16_t *e_bits, uint8_t *data) 
+                     int16_t *e_bits, uint8_t *data,
+                     uint32_t cb_size_group) 
 {
-  uint8_t parity[3] = {0, 0, 0};
-  uint32_t par_rx, par_tx;
-  uint32_t i;
-  uint32_t cb_len, rp, wp, rlen, n_e;
+
+  bool cb_map[SRSLTE_MAX_CODEBLOCKS];
+    
+  uint32_t cb_idx[SRSLTE_TDEC_NPAR];
+  int16_t *decoder_input[SRSLTE_TDEC_NPAR];
  
-  if (q            != NULL && 
-      data         != NULL &&       
-      softbuffer   != NULL &&
-      e_bits       != NULL &&
-      cb_segm      != NULL)
-  {
+  uint32_t nof_cb     = cb_size_group?cb_segm->C2:cb_segm->C1;
+  uint32_t first_cb   = cb_size_group?cb_segm->C1:0;
+  uint32_t cb_len     = cb_size_group?cb_segm->K2:cb_segm->K1;
+  uint32_t cb_len_idx = cb_size_group?cb_segm->K2_idx:cb_segm->K1_idx;

-    if (cb_segm->tbs == 0 || cb_segm->C == 0) {
-      return SRSLTE_SUCCESS;
-    }
+  uint32_t rlen       = cb_segm->C==1?cb_len:(cb_len-24);
+  uint32_t Gp         = nof_e_bits / Qm;
+  uint32_t gamma      = cb_segm->C>0?Gp%cb_segm->C:Gp;
+  uint32_t n_e        = Qm * (Gp/cb_segm->C);
+  
+  if (nof_cb > SRSLTE_MAX_CODEBLOCKS) {
+    fprintf(stderr, "Error SRSLTE_MAX_CODEBLOCKS=%d\n", SRSLTE_MAX_CODEBLOCKS);
+    return false; 
+  }
+  
+  for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
+    cb_idx[i]        = i+first_cb; 
+    decoder_input[i] = false; 
+  }
+  
+  for (int i=0;i<nof_cb;i++) {
+    cb_map[i] = false; 
+  }
    
-    rp = 0;
-    rp = 0;
-    wp = 0;
-    uint32_t Gp = nof_e_bits / Qm;
-    uint32_t gamma=Gp;
-
-    if (cb_segm->F) {
-      fprintf(stderr, "Error filler bits are not supported. Use standard TBS\n");
-      return SRSLTE_ERROR;       
-    }
-
-    if (cb_segm->C > softbuffer->max_cb) {
-      fprintf(stderr, "Error number of CB (%d) exceeds soft buffer size (%d CBs)\n", cb_segm->C, softbuffer->max_cb);
-      return -1; 
-    }
-    
-    if (cb_segm->C>0) {
-      gamma = Gp%cb_segm->C;
-    }
-    
-    bool early_stop = true;
-    for (i = 0; i < cb_segm->C && early_stop; i++) {
-
-      /* Get read/write lengths */
-      uint32_t cblen_idx; 
-      if (i < cb_segm->C2) {
-        cb_len = cb_segm->K2;
-        cblen_idx = cb_segm->K2_idx;
-      } else {
-        cb_len = cb_segm->K1;
-        cblen_idx = cb_segm->K1_idx;
-      }
+  srslte_tdec_reset(&q->decoder, cb_len);
+  
+  uint32_t remaining_cb = nof_cb; 
+  
+  while(remaining_cb>0) {
+        
+    // Unratematch the codeblocks left to decode 
+    for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
      
-      if (cb_segm->C == 1) {
-        rlen = cb_len;
-      } else {
-        rlen = cb_len - 24;
+      if (!decoder_input[i] && remaining_cb > 0) {        
+        // Find an unprocessed CB 
+        cb_idx[i]=first_cb;
+        while(cb_idx[i]<first_cb+nof_cb-1 && cb_map[cb_idx[i]]) {
+          cb_idx[i]++;
+        }
+        if (cb_map[cb_idx[i]] == false) {
+          cb_map[cb_idx[i]] = true; 
+          
+          uint32_t rp   = cb_idx[i]*n_e;  
+          uint32_t n_e2 = n_e;
+          
+          if (cb_idx[i] > cb_segm->C - gamma) {
+            n_e2 = n_e+Qm;
+            rp   = (cb_segm->C - gamma)*n_e + (cb_idx[i]-(cb_segm->C - gamma))*n_e2;
+          }
+                
+          INFO("CB %d: rp=%d, n_e=%d, i=%d\n", cb_idx[i], rp, n_e2, i);
+          if (srslte_rm_turbo_rx_lut(&e_bits[rp], softbuffer->buffer_f[cb_idx[i]], n_e2, cb_len_idx, rv)) {
+            fprintf(stderr, "Error in rate matching\n");
+            return SRSLTE_ERROR;
+          }
+
+          decoder_input[i] = softbuffer->buffer_f[cb_idx[i]];          
+        }
      }
+    }
+        
+    // Run 1 iteration for up to TDEC_NPAR codeblocks 
+    srslte_tdec_iteration_par(&q->decoder, decoder_input, cb_len);

-      if (i <= cb_segm->C - gamma - 1) {
-        n_e = Qm * (Gp/cb_segm->C);
-      } else {
-        n_e = Qm * ((uint32_t) ceilf((float) Gp/cb_segm->C));
-      }
+    q->nof_iterations = srslte_tdec_get_nof_iterations_cb(&q->decoder, 0); 
+    
+    // Decide output bits and compute CRC 
+    for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
+      if (decoder_input[i]) {        
+        srslte_tdec_decision_byte_par_cb(&q->decoder, q->cb_in, i, cb_len);

-      /* Rate Unmatching */
-      if (srslte_rm_turbo_rx_lut(&e_bits[rp], softbuffer->buffer_f[i], n_e, cblen_idx, rv)) {
-        fprintf(stderr, "Error in rate matching\n");
-        return SRSLTE_ERROR;
-      }
-
-      if (SRSLTE_VERBOSE_ISDEBUG()) {
-        char tmpstr[64]; 
-        snprintf(tmpstr,64,"rmout_%d.dat",i);
-        DEBUG("SAVED FILE %s: Encoded turbo code block %d\n", tmpstr, i);
-        srslte_vec_save_file(tmpstr, softbuffer->buffer_f[i], (3*cb_len+12)*sizeof(int16_t));
-      }
-
-      /* Turbo Decoding with CRC-based early stopping */
-      q->nof_iterations = 0; 
-      uint32_t len_crc; 
-      srslte_crc_t *crc_ptr; 
-      early_stop = false; 
-
-      srslte_tdec_reset(&q->decoder, cb_len);
-            
-      do {
-        srslte_tdec_iteration(&q->decoder, softbuffer->buffer_f[i], cb_len); 
-        q->nof_iterations++;
+        uint32_t len_crc; 
+        srslte_crc_t *crc_ptr; 
        
        if (cb_segm->C > 1) {
          len_crc = cb_len; 
@ -416,65 +405,110 @@ static int decode_tb(srslte_sch_t *q,
          crc_ptr = &q->crc_tb; 
        }

-        srslte_tdec_decision_byte(&q->decoder, q->cb_in, cb_len);
-                 
-        /* Check Codeblock CRC and stop early if correct */
+        // CRC is OK
        if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) {
-          early_stop = true;           
+
+          memcpy(&data[(cb_idx[i]*rlen)/8], q->cb_in, rlen/8 * sizeof(uint8_t));
+          
+          // Reset number of iterations for that CB in the decoder 
+          srslte_tdec_reset_cb(&q->decoder, i);
+          remaining_cb--;        
+          decoder_input[i] = NULL; 
+          cb_idx[i] = 0; 
+          
+        // CRC is error and exceeded maximum iterations for this CB. 
+        // Early stop the whole transport block.
+        } else if (srslte_tdec_get_nof_iterations_cb(&q->decoder, i) >= q->max_iterations) {
+          INFO("CB %d: Error. CB is erroneous. remaining_cb=%d, i=%d, first_cb=%d, nof_cb=%d\n", 
+                cb_idx[i], remaining_cb, i, first_cb, nof_cb);          
+          return false; 
        }
-       
-      } while (q->nof_iterations < q->max_iterations && !early_stop);
-      q->average_nof_iterations = SRSLTE_VEC_EMA((float) q->nof_iterations, q->average_nof_iterations, 0.2);
-
-      INFO("CB#%d: cb_len: %d, rlen: %d, wp: %d, rp: %d, E: %d, n_iters=%d\n", i,
-          cb_len, rlen, wp, rp, n_e, q->nof_iterations);
-      
-            
-      // If CB CRC is not correct, early_stop will be false and wont continue with rest of CBs
-
-      /* Copy data to another buffer, removing the Codeblock CRC */
-      if (i < cb_segm->C - 1) {
-        memcpy(&data[wp/8], q->cb_in, rlen/8 * sizeof(uint8_t));
-      } else {        
-        /* Append Transport Block parity bits to the last CB */
-        memcpy(&data[wp/8], q->cb_in, (rlen - 24)/8 * sizeof(uint8_t));
-        memcpy(parity, &q->cb_in[(rlen - 24)/8], 3 * sizeof(uint8_t));
      }
-      
-      if (SRSLTE_VERBOSE_ISDEBUG()) {
-        early_stop = true; 
-      }
-      
-      /* Set read/write pointers */
-      wp += rlen;
-      rp += n_e;
+    }    
+  }
+  
+  return true;  
+}
+
+/**
+ * Decode a transport block according to 36.212 5.3.2
+ *
+ * @param[in] q
+ * @param[inout] softbuffer Initialized softbuffer
+ * @param[in] cb_segm Code block segmentation parameters
+ * @param[in] e_bits Input transport block
+ * @param[in] Qm Modulation type
+ * @param[in] rv Redundancy Version. Indicates which part of FEC bits is in input buffer
+ * @param[out] softbuffer Initialized output softbuffer
+ * @param[out] data Decoded transport block
+ * @return negative if error in parameters or CRC error in decoding
+ */
+static int decode_tb(srslte_sch_t *q, 
+                     srslte_softbuffer_rx_t *softbuffer, srslte_cbsegm_t *cb_segm, 
+                     uint32_t Qm, uint32_t rv, uint32_t nof_e_bits, 
+                     int16_t *e_bits, uint8_t *data) 
+{
+  
+  if (q            != NULL && 
+      data         != NULL &&       
+      softbuffer   != NULL &&
+      e_bits       != NULL &&
+      cb_segm      != NULL)
+  {
+    
+    if (cb_segm->tbs == 0 || cb_segm->C == 0) {
+      return SRSLTE_SUCCESS;
    }
    
-    if (!early_stop) {
-      INFO("CB %d failed. TB is erroneous.\n",i-1);
-      return SRSLTE_ERROR; 
-    } else {
-      INFO("END CB#%d: wp: %d, rp: %d\n", i, wp, rp);
+    if (cb_segm->F) {
+      fprintf(stderr, "Error filler bits are not supported. Use standard TBS\n");
+      return SRSLTE_ERROR;       
+    }

+    if (cb_segm->C > softbuffer->max_cb) {
+      fprintf(stderr, "Error number of CB (%d) exceeds soft buffer size (%d CBs)\n", cb_segm->C, softbuffer->max_cb);
+      return SRSLTE_ERROR;
+    }
+        
+    bool crc_ok = true; 
+    
+    uint32_t nof_cb_groups = cb_segm->C2>0?2:1; 
+    
+    data[cb_segm->tbs/8+0] = 0; 
+    data[cb_segm->tbs/8+1] = 0; 
+    data[cb_segm->tbs/8+2] = 0; 
+    
+    // Process Codeblocks in groups of equal CB size to parallelize according to SRSLTE_TDEC_NPAR
+    for (uint32_t i=0;i<nof_cb_groups && crc_ok;i++) {
+      crc_ok = decode_tb_cb(q, softbuffer, cb_segm, Qm, rv, nof_e_bits, e_bits, data, i);            
+    }
+    
+    if (crc_ok) {
+
+      uint32_t par_rx = 0, par_tx = 0;
+  
      // Compute transport block CRC
      par_rx = srslte_crc_checksum_byte(&q->crc_tb, data, cb_segm->tbs);

      // check parity bits
-      par_tx = ((uint32_t) parity[0])<<16 | ((uint32_t) parity[1])<<8 | ((uint32_t) parity[2]);
-      
+      par_tx = ((uint32_t) data[cb_segm->tbs/8+0])<<16  | 
+               ((uint32_t) data[cb_segm->tbs/8+1])<<8   | 
+               ((uint32_t) data[cb_segm->tbs/8+2]);
+
      if (!par_rx) {
-        INFO("Warning: Received all-zero transport block\n\n", 0);      
+        INFO("Warning: Received all-zero transport block\n\n",0);      
      }

      if (par_rx == par_tx) {
-        INFO("TB decoded OK\n",i);
+        INFO("TB decoded OK\n",0);
        return SRSLTE_SUCCESS;
      } else {
        INFO("Error in TB parity: par_tx=0x%x, par_rx=0x%x\n", par_tx, par_rx);
        return SRSLTE_ERROR;
      }
-      
-    }
+    } else {
+      return SRSLTE_ERROR; 
+    }        
  } else {
    return SRSLTE_ERROR_INVALID_INPUTS;
  }
@ -489,6 +523,16 @@ int srslte_dlsch_decode(srslte_sch_t *q, srslte_pdsch_cfg_t *cfg, srslte_softbuf
                   e_bits, data);
 }

+/**
+ * Encode transport block. Segments into code blocks, adds channel coding, and does rate matching.
+ *
+ * @param[in] q Initialized 
+ * @param[in] cfg Encoding parameters
+ * @param[inout] softbuffer Initialized softbuffer
+ * @param[in] data Byte array of data. Size is implicit in cfg->cb_segm
+ * @param e_bits
+ * @return Error code
+ */
 int srslte_dlsch_encode(srslte_sch_t *q, srslte_pdsch_cfg_t *cfg, srslte_softbuffer_tx_t *softbuffer,
                        uint8_t *data, uint8_t *e_bits) 
 {
--- a/lib/src/phy/phch/test/pdsch_test.c
+++ b/lib/src/phy/phch/test/pdsch_test.c
@ -171,7 +171,7 @@ int main(int argc, char **argv) {
    }
  }
  
-  data = srslte_vec_malloc(sizeof(uint8_t) * grant.mcs.tbs/8);
+  data = srslte_vec_malloc(sizeof(uint8_t) * (grant.mcs.tbs/8)+24);
  if (!data) {
    perror("srslte_vec_malloc");
    goto quit;
--- a/lib/src/phy/phch/test/pusch_test.c
+++ b/lib/src/phy/phch/test/pusch_test.c
@ -188,7 +188,7 @@ int main(int argc, char **argv) {
    exit(-1);
  }
  
-  data = srslte_vec_malloc(sizeof(uint8_t) * cfg.grant.mcs.tbs);
+  data = srslte_vec_malloc(sizeof(uint8_t) * (cfg.grant.mcs.tbs+24));
  if (!data) {
    perror("malloc");
    exit(-1);
@ -202,10 +202,12 @@ int main(int argc, char **argv) {
    fprintf(stderr, "Error initiating soft buffer\n");
    goto quit;
  }
+  srslte_softbuffer_tx_reset(&softbuffer_tx);
  if (srslte_softbuffer_rx_init(&softbuffer_rx, 100)) {
    fprintf(stderr, "Error initiating soft buffer\n");
    goto quit;
  }
+  srslte_softbuffer_rx_reset(&softbuffer_rx);
  
  uint32_t ntrials = 100; 

--- a/lib/src/phy/utils/bit.c
+++ b/lib/src/phy/utils/bit.c
@ -209,6 +209,15 @@ bitarray_copy(const unsigned char *src_org, int src_offset, int src_len,
    }
 }

+/**
+ * Copy bits from src to dst, with offsets and length in bits
+ *
+ * @param[out] dst Output array
+ * @param[in] src Input array
+ * @param dst_offset Output array write offset in bits
+ * @param src_offset Input array read offset in bits
+ * @param nof_bits Number of bits to copy
+ */
 void srslte_bit_copy(uint8_t *dst, uint32_t dst_offset, uint8_t *src, uint32_t src_offset, uint32_t nof_bits)
 {
  static const uint8_t mask_dst[] =
@ -247,6 +256,13 @@ void srslte_bit_unpack_l(uint64_t value, uint8_t **bits, int nof_bits)
  *bits += nof_bits;
 }

+/**
+ * Unpacks nof_bits from LSBs of value in MSB order to *bits. Advances pointer past unpacked bits.
+ *
+ * @param[in] value nof_bits lowest order bits will be unpacked in MSB order
+ * @param[in] nof_bits Number of bits to unpack
+ * @param[out] bits Points to buffer pointer. The buffer pointer will be advanced by nof_bits
+ */
 void srslte_bit_unpack(uint32_t value, uint8_t **bits, int nof_bits)
 {
    int i;
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@ -102,13 +102,13 @@ void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
    z[i] = x[i]-y[i];
  }
 #else
-  srslte_vec_sub_fff_simd(x, y, z, len);
+  srslte_vec_sub_fff_sse(x, y, z, len);
 #endif
 }

 void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
-#ifdef LV_HAVE_AVX
-  srslte_vec_sub_sss_avx(x, y, z, len);
+#ifdef LV_HAVE_AVX2
+  srslte_vec_sub_sss_avx2(x, y, z, len);
 #else
 #ifdef LV_HAVE_SSE
  srslte_vec_sub_sss_sse(x, y, z, len);
@ -134,13 +134,13 @@ void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len) {
    z[i] = x[i]+y[i];
  }
 #else
-  srslte_vec_sum_fff_simd(x, y, z, len);
+  srslte_vec_sum_fff_sse(x, y, z, len);
 #endif
 }

 void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len) {
-#ifdef LV_HAVE_AVX
-  srslte_vec_sum_sss_avx(x, y, z, len);
+#ifdef LV_HAVE_AVX2
+  srslte_vec_sum_sss_avx2(x, y, z, len);
 #else
 #ifdef LV_HAVE_SSE
  srslte_vec_sum_sss_sse(x, y, z, len);
@ -199,7 +199,7 @@ void srslte_vec_sc_prod_fff(float *x, float h, float *z, uint32_t len) {
    z[i] = x[i]*h;
  }
 #else
-  srslte_vec_sc_prod_fff_simd(x, h, z, len);
+  srslte_vec_sc_prod_fff_sse(x, h, z, len);
 #endif
 }

@ -211,8 +211,8 @@ void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len) {
 }

 void srslte_vec_sc_div2_sss(short *x, int n_rightshift, short *z, uint32_t len) {
-#ifdef LV_HAVE_AVX
-  srslte_vec_sc_div2_sss_avx(x, n_rightshift, z, len);
+#ifdef LV_HAVE_AVX2
+  srslte_vec_sc_div2_sss_avx2(x, n_rightshift, z, len);
 #else
 #ifdef LV_HAVE_SSE
  srslte_vec_sc_div2_sss_sse(x, n_rightshift, z, len);
@ -258,7 +258,7 @@ void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
    z[i] = x[i]*h;
  }
 #else
-  srslte_vec_sc_prod_ccc_simd(x,h,z,len);
+  srslte_vec_sc_prod_ccc_sse(x,h,z,len);
 #endif
 }

@ -335,7 +335,7 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
  }
 }

-/* Note: We align memory to 32 bytes (for AVX compatibility) 
+/* Note: We align memory to 32 bytes (for AVX2 compatibility) 
 * because in some cases volk can incorrectly detect the architecture. 
 * This could be inefficient for SSE or non-SIMD platforms but shouldn't 
 * be a huge problem. 
@ -354,7 +354,7 @@ void *srslte_vec_realloc(void *ptr, uint32_t old_size, uint32_t new_size) {
  return realloc(ptr, new_size);
 #else
  void *new_ptr;
-  if (posix_memalign(&new_ptr,64,new_size)) {
+  if (posix_memalign(&new_ptr,256,new_size)) {
    return NULL;
  } else {
    memcpy(new_ptr, ptr, old_size);
@ -501,8 +501,8 @@ void srslte_vec_prod_fff(float *x, float *y, float *z, uint32_t len) {

 // Scrambling Short
 void srslte_vec_prod_sss(short *x, short *y, short *z, uint32_t len) {
-#ifdef LV_HAVE_AVX
-  srslte_vec_prod_sss_avx(x,y,z,len);
+#ifdef LV_HAVE_AVX2
+  srslte_vec_prod_sss_avx2(x,y,z,len);
 #else
 #ifdef LV_HAVE_SSE
  srslte_vec_prod_sss_sse(x,y,z,len);
@ -523,7 +523,7 @@ void srslte_vec_prod_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
    z[i] = x[i]*y[i];
  }
 #else
-  srslte_vec_prod_ccc_simd(x,y,z,len);
+  srslte_vec_prod_ccc_sse(x,y,z,len);
 #endif
 }

@ -535,7 +535,7 @@ void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
    z[i] = x[i]*conjf(y[i]);
  }
 #else
-  srslte_vec_prod_conj_ccc_simd(x,y,z,len);
+  srslte_vec_prod_conj_ccc_sse(x,y,z,len);
 #endif
 }

@ -588,7 +588,7 @@ cf_t srslte_vec_dot_prod_ccc(cf_t *x, cf_t *y, uint32_t len) {
  }
  return res;
 #else
-  return srslte_vec_dot_prod_ccc_simd(x, y, len); 
+  return srslte_vec_dot_prod_ccc_sse(x, y, len);
 #endif
 }

@ -612,7 +612,7 @@ cf_t srslte_vec_dot_prod_conj_ccc(cf_t *x, cf_t *y, uint32_t len) {
  }
  return res;
 #else
-  return srslte_vec_dot_prod_conj_ccc_simd(x, y, len); 
+  return srslte_vec_dot_prod_conj_ccc_sse(x, y, len);
 #endif
  
 }
@ -628,8 +628,8 @@ float srslte_vec_dot_prod_fff(float *x, float *y, uint32_t len) {
 }

 int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len) {
-#ifdef LV_HAVE_AVX
-  return srslte_vec_dot_prod_sss_avx(x, y, len);
+#ifdef LV_HAVE_AVX2
+  return srslte_vec_dot_prod_sss_avx2(x, y, len);
 #else
 #ifdef LV_HAVE_SSE
  return srslte_vec_dot_prod_sss_sse(x, y, len);
@ -664,7 +664,7 @@ void srslte_vec_abs_square_cf(cf_t *x, float *abs_square, uint32_t len) {
    abs_square[i] = crealf(x[i])*crealf(x[i])+cimagf(x[i])*cimagf(x[i]);
  }
 #else
-  srslte_vec_abs_square_cf_simd(x,abs_square,len);
+  srslte_vec_abs_square_cf_sse(x,abs_square,len);
 #endif
 }

@ -677,11 +677,17 @@ void srslte_vec_arg_cf(cf_t *x, float *arg, uint32_t len) {
 }

 uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_FUNCTION
-  uint16_t target=0;
+
+  // This is to solve an issue with incorrect type of 1st parameter in version 1.2 of volk
+#ifdef HAVE_VOLK_MAX_FUNCTION_32
+  uint32_t target=0;
+  volk_32f_index_max_32u(&target,x,len);
+  return target;
+#else
+#ifdef HAVE_VOLK_MAX_FUNCTION_16
+  uint32_t target=0;
  volk_32f_index_max_16u(&target,x,len);
  return target;
-
 #else
  uint32_t i;
  float m=-FLT_MAX;
@ -694,6 +700,7 @@ uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
  }
  return p;
 #endif
+#endif
 }

 int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len) {
@ -732,11 +739,15 @@ void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len) {

 // CP autocorr
 uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION
-  uint16_t target=0;
+#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_32
+  uint32_t target=0;
+  volk_32fc_index_max_32u(&target,x,len);
+  return target;
+#else
+#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_16
+  uint32_t target=0;
  volk_32fc_index_max_16u(&target,x,len);
  return target;
-
 #else
  uint32_t i;
  float m=-FLT_MAX;
@ -751,6 +762,7 @@ uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
  }
  return p;
 #endif
+#endif
 }

 void srslte_vec_quant_fuc(float *in, uint8_t *out, float gain, float offset, float clip, uint32_t len) {
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@ -87,10 +87,10 @@ int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len)
 }


-int srslte_vec_dot_prod_sss_avx(short *x, short *y, uint32_t len)
+int srslte_vec_dot_prod_sss_avx2(short *x, short *y, uint32_t len)
 {
  int result = 0; 
-#ifdef LV_HAVE_AVX
+#ifdef LV_HAVE_AVX2
  unsigned int number = 0;
  const unsigned int points = len / 16;

@ -110,7 +110,7 @@ int srslte_vec_dot_prod_sss_avx(short *x, short *y, uint32_t len)
    yPtr ++;
  }
  
-  short dotProdVector[16];
+  __attribute__ ((aligned (256))) short dotProdVector[16];
  _mm256_store_si256((__m256i*) dotProdVector, dotProdVal);
  for (int i=0;i<16;i++) {
    result += dotProdVector[i]; 
@ -160,9 +160,9 @@ void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len)

 }

-void srslte_vec_sum_sss_avx(short *x, short *y, short *z, uint32_t len)
+void srslte_vec_sum_sss_avx2(short *x, short *y, short *z, uint32_t len)
 {
-#ifdef LV_HAVE_SSE
+#ifdef LV_HAVE_AVX2
  unsigned int number = 0;
  const unsigned int points = len / 16;

@ -225,9 +225,9 @@ void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len)
 #endif
 }

-void srslte_vec_sub_sss_avx(short *x, short *y, short *z, uint32_t len)
+void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len)
 {
-#ifdef LV_HAVE_AVX
+#ifdef LV_HAVE_AVX2
  unsigned int number = 0;
  const unsigned int points = len / 16;

@ -292,9 +292,9 @@ void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len)
 #endif
 }

-void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len)
+void srslte_vec_prod_sss_avx2(short *x, short *y, short *z, uint32_t len)
 {
-#ifdef LV_HAVE_SSE
+#ifdef LV_HAVE_AVX2
  unsigned int number = 0;
  const unsigned int points = len / 16;

@ -359,9 +359,9 @@ void srslte_vec_sc_div2_sss_sse(short *x, int k, short *z, uint32_t len)
 #endif
 }

-void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len)
+void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
 {
-#ifdef LV_HAVE_AVX
+#ifdef LV_HAVE_AVX2
  unsigned int number = 0;
  const unsigned int points = len / 16;

@ -394,7 +394,11 @@ void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len)
 /* No improvement with AVX */
 void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len)
 {
-#ifndef DEBUG_MODE
+#ifdef DEBUG_MODE
+  for (int i=0;i<len;i++) {
+    y[lut[i]] = x[i];
+  }
+#else
 #ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 8;
@ -466,7 +470,7 @@ void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len)


 // for enb no-volk
-void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
+void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 4;
@ -497,7 +501,7 @@ void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
 #endif
 }

-void srslte_vec_sub_fff_simd(float *x, float *y, float *z, uint32_t len) {
+void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 4;
@ -550,7 +554,7 @@ static inline __m128 _mm_complexmulconj_ps(__m128 x, __m128 y) {
 }
 #endif

-cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, uint32_t len)
+cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len)
 {
  cf_t result = 0; 
 #ifdef LV_HAVE_SSE
@ -591,8 +595,7 @@ cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, uint32_t len)
  return result; 
 }

-
-cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, uint32_t len)
+cf_t srslte_vec_dot_prod_conj_ccc_sse(cf_t *x, cf_t *y, uint32_t len)
 {
  cf_t result = 0; 
 #ifdef LV_HAVE_SSE
@ -632,7 +635,8 @@ cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, uint32_t len)
 #endif
  return result; 
 }
-void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) 
+
+void srslte_vec_prod_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
 {
 #ifdef LV_HAVE_SSE
  unsigned int number = 0;
@ -662,7 +666,7 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
 }


-void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
+void srslte_vec_prod_conj_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int halfPoints = len / 2;
@ -690,7 +694,7 @@ void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
 #endif
 }

-void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
+void srslte_vec_sc_prod_ccc_sse(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int halfPoints = len / 2;
@ -724,7 +728,7 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
 }


-void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len) {
+void srslte_vec_sc_prod_cfc_sse(cf_t *x, float h, cf_t *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int halfPoints = len / 2;
@ -756,7 +760,7 @@ void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len) {



-void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, uint32_t len) {
+void srslte_vec_sc_prod_fff_sse(float *x, float h, float *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int quarterPoints = len / 4;
@ -786,8 +790,7 @@ void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, uint32_t len) {
 #endif
 }

-
-void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, uint32_t len) {
+void srslte_vec_abs_square_cf_sse(cf_t *x, float *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int quarterPoints = len / 4;
--- a/pbch_capture.png
+++ b/pbch_capture.png
--- a/srsenb/drb.conf.example
+++ b/srsenb/drb.conf.example
@ -28,25 +28,22 @@ qci_config = (
 {
  qci=9;
  pdcp_config = {
-    discard_timer = -1;                 
-    status_report_required = false;
+    discard_timer = 100;                
+    pdcp_sn_size = 12;                  
  }
  rlc_config = {
-    ul_am = {
-      t_poll_retx = 200; 
-      poll_pdu    = 16; 
-      poll_byte   = -1; 
-      max_retx_thresh = 8;
+    ul_um = {
+      sn_field_length = 10; 
    };
-    dl_am = {
-      t_reordering      = 80;
-      t_status_prohibit = 35; 
+    dl_um = {
+      sn_field_length = 10; 
+      t_reordering    = 80;             
    };
  };
  logical_channel_config = {
-    priority = 3; 
-    prioritized_bit_rate  = 8; 
-    bucket_size_duration  = 50; 
+    priority = 11; 
+    prioritized_bit_rate   = -1; 
+    bucket_size_duration  = 100; 
    log_chan_group = 3; 
  };
 }
--- a/srsenb/enb.conf.example
+++ b/srsenb/enb.conf.example
@ -143,7 +143,7 @@ nof_ctrl_symbols = 2
 #pregenerate_signals  = false
 #tx_amplitude         = 0.8
 #link_failure_nof_err = 50
-#rrc_inactivity_timer = 5000
+#rrc_inactivity_timer = 30000
 #max_prach_offset_us  = 30

 #####################################################################
--- a/srsenb/src/main.cc
+++ b/srsenb/src/main.cc
@ -176,7 +176,7 @@ void parse_args(all_args_t *args, int argc, char* argv[]) {
        "Chooses the coefficients for the 3-tap channel estimator centered filter.")

    ("expert.rrc_inactivity_timer",
-        bpo::value<uint32_t>(&args->expert.rrc_inactivity_timer)->default_value(5000),
+        bpo::value<uint32_t>(&args->expert.rrc_inactivity_timer)->default_value(30000),
        "Inactivity timer in ms")


--- a/srsenb/src/upper/rrc.cc
+++ b/srsenb/src/upper/rrc.cc
@ -995,6 +995,7 @@ bool rrc::ue::release_erabs()
 void rrc::ue::notify_s1ap_ue_ctxt_setup_complete()
 {
  LIBLTE_S1AP_MESSAGE_INITIALCONTEXTSETUPRESPONSE_STRUCT res;
+  res.ext = false;
  res.E_RABSetupListCtxtSURes.len = 0;
  res.E_RABFailedToSetupListCtxtSURes.len = 0;

--- a/srsue/src/CMakeLists.txt
+++ b/srsue/src/CMakeLists.txt
@ -46,7 +46,7 @@ endif (RPATH)
 ########################################################################
 if (NOT ${BUILDUE_CMD} STREQUAL "")
  message(STATUS "Added custom post-build-UE command: ${BUILDUE_CMD}")
-  add_custom_command(TARGET ue POST_BUILD COMMAND ${BUILDUE_CMD})
+  add_custom_command(TARGET srsue POST_BUILD COMMAND ${BUILDUE_CMD})
 else(NOT ${BUILDUE_CMD} STREQUAL "")
  message(STATUS "No post-build-UE command defined")
 endif (NOT ${BUILDUE_CMD} STREQUAL "")
--- a/srsue/src/main.cc
+++ b/srsue/src/main.cc
@ -49,284 +49,289 @@ namespace bpo = boost::program_options;
 ***********************************************************************/
 string config_file;

-void parse_args(all_args_t *args, int argc, char* argv[]) {
+void parse_args(all_args_t *args, int argc, char *argv[]) {

-    // Command line only options
-    bpo::options_description general("General options");
-    general.add_options()
-        ("help,h", "Produce help message")
-        ("version,v", "Print version information and exit")
-        ;
+  // Command line only options
+  bpo::options_description general("General options");

-    // Command line or config file options
-    bpo::options_description common("Configuration options");
-    common.add_options()
-        ("rf.dl_freq",        bpo::value<float>(&args->rf.dl_freq)->default_value(2680000000),  "Downlink centre frequency")
-        ("rf.ul_freq",        bpo::value<float>(&args->rf.ul_freq)->default_value(2560000000),  "Uplink centre frequency")
-        ("rf.rx_gain",        bpo::value<float>(&args->rf.rx_gain)->default_value(-1),          "Front-end receiver gain")
-        ("rf.tx_gain",        bpo::value<float>(&args->rf.tx_gain)->default_value(-1),          "Front-end transmitter gain")
-        ("rf.nof_rx_ant",     bpo::value<uint32_t>(&args->rf.nof_rx_ant)->default_value(1),         "Number of RX antennas")
+  general.add_options()
+    ("help,h", "Produce help message")
+    ("version,v", "Print version information and exit");

-        ("rf.device_name",       bpo::value<string>(&args->rf.device_name)->default_value("auto"),    "Front-end device name")
-        ("rf.device_args",       bpo::value<string>(&args->rf.device_args)->default_value("auto"),    "Front-end device arguments")
-        ("rf.time_adv_nsamples", bpo::value<string>(&args->rf.time_adv_nsamples)->default_value("auto"),    "Transmission time advance")
-        ("rf.burst_preamble_us", bpo::value<string>(&args->rf.burst_preamble)->default_value("auto"), "Transmission time advance")
+  // Command line or config file options
+  bpo::options_description common("Configuration options");
+  common.add_options()
+    ("rf.dl_freq", bpo::value<float>(&args->rf.dl_freq)->default_value(2680000000), "Downlink centre frequency")
+    ("rf.ul_freq", bpo::value<float>(&args->rf.ul_freq)->default_value(2560000000), "Uplink centre frequency")
+    ("rf.rx_gain", bpo::value<float>(&args->rf.rx_gain)->default_value(-1), "Front-end receiver gain")
+    ("rf.tx_gain", bpo::value<float>(&args->rf.tx_gain)->default_value(-1), "Front-end transmitter gain")
+    ("rf.nof_rx_ant", bpo::value<uint32_t>(&args->rf.nof_rx_ant)->default_value(1), "Number of RX antennas")

-        ("pcap.enable",       bpo::value<bool>(&args->pcap.enable)->default_value(false),           "Enable MAC packet captures for wireshark")
-        ("pcap.filename",     bpo::value<string>(&args->pcap.filename)->default_value("ue.pcap"),   "MAC layer capture filename")
+    ("rf.device_name", bpo::value<string>(&args->rf.device_name)->default_value("auto"), "Front-end device name")
+    ("rf.device_args", bpo::value<string>(&args->rf.device_args)->default_value("auto"), "Front-end device arguments")
+    ("rf.time_adv_nsamples", bpo::value<string>(&args->rf.time_adv_nsamples)->default_value("auto"),
+     "Transmission time advance")
+    ("rf.burst_preamble_us", bpo::value<string>(&args->rf.burst_preamble)->default_value("auto"),
+     "Transmission time advance")

-        ("trace.enable",      bpo::value<bool>(&args->trace.enable)->default_value(false),                  "Enable PHY and radio timing traces")
-        ("trace.phy_filename",bpo::value<string>(&args->trace.phy_filename)->default_value("ue.phy_trace"), "PHY timing traces filename")
-        ("trace.radio_filename",bpo::value<string>(&args->trace.radio_filename)->default_value("ue.radio_trace"), "Radio timing traces filename")
+    ("pcap.enable", bpo::value<bool>(&args->pcap.enable)->default_value(false),
+     "Enable MAC packet captures for wireshark")
+    ("pcap.filename", bpo::value<string>(&args->pcap.filename)->default_value("ue.pcap"), "MAC layer capture filename")

-        ("gui.enable",        bpo::value<bool>(&args->gui.enable)->default_value(false),                  "Enable GUI plots")
-        
-        ("log.phy_level",     bpo::value<string>(&args->log.phy_level),   "PHY log level")
-        ("log.phy_hex_limit", bpo::value<int>(&args->log.phy_hex_limit),  "PHY log hex dump limit")
-        ("log.mac_level",     bpo::value<string>(&args->log.mac_level),   "MAC log level")
-        ("log.mac_hex_limit", bpo::value<int>(&args->log.mac_hex_limit),  "MAC log hex dump limit")
-        ("log.rlc_level",     bpo::value<string>(&args->log.rlc_level),   "RLC log level")
-        ("log.rlc_hex_limit", bpo::value<int>(&args->log.rlc_hex_limit),  "RLC log hex dump limit")
-        ("log.pdcp_level",    bpo::value<string>(&args->log.pdcp_level),  "PDCP log level")
-        ("log.pdcp_hex_limit",bpo::value<int>(&args->log.pdcp_hex_limit), "PDCP log hex dump limit")
-        ("log.rrc_level",     bpo::value<string>(&args->log.rrc_level),   "RRC log level")
-        ("log.rrc_hex_limit", bpo::value<int>(&args->log.rrc_hex_limit),  "RRC log hex dump limit")
-        ("log.gw_level",      bpo::value<string>(&args->log.gw_level),    "GW log level")
-        ("log.gw_hex_limit",  bpo::value<int>(&args->log.gw_hex_limit),   "GW log hex dump limit")
-        ("log.nas_level",     bpo::value<string>(&args->log.nas_level),   "NAS log level")
-        ("log.nas_hex_limit", bpo::value<int>(&args->log.nas_hex_limit),  "NAS log hex dump limit")
-        ("log.usim_level",    bpo::value<string>(&args->log.usim_level),  "USIM log level")
-        ("log.usim_hex_limit",bpo::value<int>(&args->log.usim_hex_limit), "USIM log hex dump limit")
+    ("trace.enable", bpo::value<bool>(&args->trace.enable)->default_value(false), "Enable PHY and radio timing traces")
+    ("trace.phy_filename", bpo::value<string>(&args->trace.phy_filename)->default_value("ue.phy_trace"),
+     "PHY timing traces filename")
+    ("trace.radio_filename", bpo::value<string>(&args->trace.radio_filename)->default_value("ue.radio_trace"),
+     "Radio timing traces filename")

-        ("log.all_level",     bpo::value<string>(&args->log.all_level)->default_value("info"),   "ALL log level")
-        ("log.all_hex_limit", bpo::value<int>(&args->log.all_hex_limit)->default_value(32),  "ALL log hex dump limit")
+    ("gui.enable", bpo::value<bool>(&args->gui.enable)->default_value(false), "Enable GUI plots")

-        ("log.filename",      bpo::value<string>(&args->log.filename)->default_value("/tmp/ue.log"),"Log filename")
+    ("log.phy_level", bpo::value<string>(&args->log.phy_level), "PHY log level")
+    ("log.phy_hex_limit", bpo::value<int>(&args->log.phy_hex_limit), "PHY log hex dump limit")
+    ("log.mac_level", bpo::value<string>(&args->log.mac_level), "MAC log level")
+    ("log.mac_hex_limit", bpo::value<int>(&args->log.mac_hex_limit), "MAC log hex dump limit")
+    ("log.rlc_level", bpo::value<string>(&args->log.rlc_level), "RLC log level")
+    ("log.rlc_hex_limit", bpo::value<int>(&args->log.rlc_hex_limit), "RLC log hex dump limit")
+    ("log.pdcp_level", bpo::value<string>(&args->log.pdcp_level), "PDCP log level")
+    ("log.pdcp_hex_limit", bpo::value<int>(&args->log.pdcp_hex_limit), "PDCP log hex dump limit")
+    ("log.rrc_level", bpo::value<string>(&args->log.rrc_level), "RRC log level")
+    ("log.rrc_hex_limit", bpo::value<int>(&args->log.rrc_hex_limit), "RRC log hex dump limit")
+    ("log.gw_level", bpo::value<string>(&args->log.gw_level), "GW log level")
+    ("log.gw_hex_limit", bpo::value<int>(&args->log.gw_hex_limit), "GW log hex dump limit")
+    ("log.nas_level", bpo::value<string>(&args->log.nas_level), "NAS log level")
+    ("log.nas_hex_limit", bpo::value<int>(&args->log.nas_hex_limit), "NAS log hex dump limit")
+    ("log.usim_level", bpo::value<string>(&args->log.usim_level), "USIM log level")
+    ("log.usim_hex_limit", bpo::value<int>(&args->log.usim_hex_limit), "USIM log hex dump limit")

-        ("usim.algo",         bpo::value<string>(&args->usim.algo),        "USIM authentication algorithm")
-        ("usim.op",           bpo::value<string>(&args->usim.op),          "USIM operator variant")
-        ("usim.amf",          bpo::value<string>(&args->usim.amf),         "USIM authentication management field")
-        ("usim.imsi",         bpo::value<string>(&args->usim.imsi),        "USIM IMSI")
-        ("usim.imei",         bpo::value<string>(&args->usim.imei),        "USIM IMEI")
-        ("usim.k",            bpo::value<string>(&args->usim.k),           "USIM K")
-        
-        
-        /* Expert section */
-        ("expert.phy.worker_cpu_mask",
-            bpo::value<int>(&args->expert.phy.worker_cpu_mask)->default_value(-1),
-            "cpu bit mask (eg 255 = 1111 1111)")        
-        
-        ("expert.phy.sync_cpu_affinity",
-            bpo::value<int>(&args->expert.phy.sync_cpu_affinity)->default_value(-1),
-            "index of the core used by the sync thread")
-    
-        ("expert.ue_category",
-            bpo::value<int>(&args->expert.ue_cateogry)->default_value(4), 
-            "UE Category (1 to 5)")
+    ("log.all_level", bpo::value<string>(&args->log.all_level)->default_value("info"), "ALL log level")
+    ("log.all_hex_limit", bpo::value<int>(&args->log.all_hex_limit)->default_value(32), "ALL log hex dump limit")

-        ("expert.metrics_period_secs",
-            bpo::value<float>(&args->expert.metrics_period_secs)->default_value(1.0), 
-            "Periodicity for metrics in seconds")
+    ("log.filename", bpo::value<string>(&args->log.filename)->default_value("/tmp/ue.log"), "Log filename")

-        ("expert.pregenerate_signals",
-            bpo::value<bool>(&args->expert.pregenerate_signals)->default_value(false), 
-            "Pregenerate uplink signals after attach. Improves CPU performance.")
-        
-        ("expert.rssi_sensor_enabled", 
-            bpo::value<bool>(&args->expert.phy.rssi_sensor_enabled)->default_value(true),  
-            "Enable or disable RF frontend RSSI sensor. In some USRP devices can cause segmentation fault")
-        
-        ("expert.prach_gain", 
-            bpo::value<float>(&args->expert.phy.prach_gain)->default_value(-1.0),  
-            "Disable PRACH power control")
-        
-        ("expert.cqi_max",         
-            bpo::value<int>(&args->expert.phy.cqi_max)->default_value(15), 
-            "Upper bound on the maximum CQI to be reported. Default 15.")
-        
-        ("expert.cqi_fixed",         
-            bpo::value<int>(&args->expert.phy.cqi_fixed)->default_value(-1), 
-            "Fixes the reported CQI to a constant value. Default disabled.")
-        
-        ("expert.snr_ema_coeff",         
-            bpo::value<float>(&args->expert.phy.snr_ema_coeff)->default_value(0.1), 
-            "Sets the SNR exponential moving average coefficient (Default 0.1)")
-        
-        ("expert.snr_estim_alg",         
-            bpo::value<string>(&args->expert.phy.snr_estim_alg)->default_value("refs"), 
-            "Sets the noise estimation algorithm. (Default refs)")
-        
-        ("expert.pdsch_max_its",         
-            bpo::value<int>(&args->expert.phy.pdsch_max_its)->default_value(4), 
-            "Maximum number of turbo decoder iterations")
+    ("usim.algo", bpo::value<string>(&args->usim.algo), "USIM authentication algorithm")
+    ("usim.op", bpo::value<string>(&args->usim.op), "USIM operator variant")
+    ("usim.amf", bpo::value<string>(&args->usim.amf), "USIM authentication management field")
+    ("usim.imsi", bpo::value<string>(&args->usim.imsi), "USIM IMSI")
+    ("usim.imei", bpo::value<string>(&args->usim.imei), "USIM IMEI")
+    ("usim.k", bpo::value<string>(&args->usim.k), "USIM K")

-        ("expert.attach_enable_64qam",      
-            bpo::value<bool>(&args->expert.phy.attach_enable_64qam)->default_value(false), 
-            "PUSCH 64QAM modulation before attachment")
-        
-        ("expert.nof_phy_threads",    
-            bpo::value<int>(&args->expert.phy.nof_phy_threads)->default_value(2), 
-            "Number of PHY threads")
-        
-        ("expert.equalizer_mode",    
-            bpo::value<string>(&args->expert.phy.equalizer_mode)->default_value("mmse"), 
-            "Equalizer mode")
-     
-        ("expert.cfo_integer_enabled",    
-            bpo::value<bool>(&args->expert.phy.cfo_integer_enabled)->default_value(false), 
-            "Enables integer CFO estimation and correction.")
-        
-        ("expert.cfo_correct_tol_hz",    
-            bpo::value<float>(&args->expert.phy.cfo_correct_tol_hz)->default_value(50.0), 
-            "Tolerance (in Hz) for digial CFO compensation.")
-        
-        ("expert.time_correct_period",    
-            bpo::value<int>(&args->expert.phy.time_correct_period)->default_value(5), 
-            "Period for sampling time offset correction.")
-        
-        ("expert.sfo_correct_disable",    
-            bpo::value<bool>(&args->expert.phy.sfo_correct_disable)->default_value(false), 
-            "Disables phase correction before channel estimation.")
-        
-        ("expert.sss_algorithm",    
-            bpo::value<string>(&args->expert.phy.sss_algorithm)->default_value("full"), 
-            "Selects the SSS estimation algorithm.")

-        ("expert.estimator_fil_w",    
-            bpo::value<float>(&args->expert.phy.estimator_fil_w)->default_value(0.1), 
-            "Chooses the coefficients for the 3-tap channel estimator centered filter.")
-        
-        
-        ("rf_calibration.tx_corr_dc_gain",  bpo::value<float>(&args->rf_cal.tx_corr_dc_gain)->default_value(0.0),  "TX DC offset gain correction")
-        ("rf_calibration.tx_corr_dc_phase", bpo::value<float>(&args->rf_cal.tx_corr_dc_phase)->default_value(0.0), "TX DC offset phase correction")
-        ("rf_calibration.tx_corr_iq_i",     bpo::value<float>(&args->rf_cal.tx_corr_iq_i)->default_value(0.0),     "TX IQ imbalance inphase correction")
-        ("rf_calibration.tx_corr_iq_q",     bpo::value<float>(&args->rf_cal.tx_corr_iq_q)->default_value(0.0),     "TX IQ imbalance quadrature correction")
-        
-    ;
-    
-    // Positional options - config file location
-    bpo::options_description position("Positional options");
-    position.add_options()
-    ("config_file", bpo::value< string >(&config_file), "UE configuration file")
-    ;
-    bpo::positional_options_description p;
-    p.add("config_file", -1);
+    /* Expert section */
+    ("expert.phy.worker_cpu_mask",
+     bpo::value<int>(&args->expert.phy.worker_cpu_mask)->default_value(-1),
+     "cpu bit mask (eg 255 = 1111 1111)")

-    // these options are allowed on the command line
-    bpo::options_description cmdline_options;
-    cmdline_options.add(common).add(position).add(general);
+    ("expert.phy.sync_cpu_affinity",
+     bpo::value<int>(&args->expert.phy.sync_cpu_affinity)->default_value(-1),
+     "index of the core used by the sync thread")

-    // parse the command line and store result in vm
-    bpo::variables_map vm;
-    bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), vm);
+    ("expert.ue_category",
+     bpo::value<int>(&args->expert.ue_cateogry)->default_value(4),
+     "UE Category (1 to 5)")
+
+    ("expert.metrics_period_secs",
+     bpo::value<float>(&args->expert.metrics_period_secs)->default_value(1.0),
+     "Periodicity for metrics in seconds")
+
+    ("expert.pregenerate_signals",
+     bpo::value<bool>(&args->expert.pregenerate_signals)->default_value(false),
+     "Pregenerate uplink signals after attach. Improves CPU performance.")
+
+    ("expert.rssi_sensor_enabled",
+     bpo::value<bool>(&args->expert.phy.rssi_sensor_enabled)->default_value(true),
+     "Enable or disable RF frontend RSSI sensor. In some USRP devices can cause segmentation fault")
+
+    ("expert.prach_gain",
+     bpo::value<float>(&args->expert.phy.prach_gain)->default_value(-1.0),
+     "Disable PRACH power control")
+
+    ("expert.cqi_max",
+     bpo::value<int>(&args->expert.phy.cqi_max)->default_value(15),
+     "Upper bound on the maximum CQI to be reported. Default 15.")
+
+    ("expert.cqi_fixed",
+     bpo::value<int>(&args->expert.phy.cqi_fixed)->default_value(-1),
+     "Fixes the reported CQI to a constant value. Default disabled.")
+
+    ("expert.snr_ema_coeff",
+     bpo::value<float>(&args->expert.phy.snr_ema_coeff)->default_value(0.1),
+     "Sets the SNR exponential moving average coefficient (Default 0.1)")
+
+    ("expert.snr_estim_alg",
+     bpo::value<string>(&args->expert.phy.snr_estim_alg)->default_value("refs"),
+     "Sets the noise estimation algorithm. (Default refs)")
+
+    ("expert.pdsch_max_its",
+     bpo::value<int>(&args->expert.phy.pdsch_max_its)->default_value(4),
+     "Maximum number of turbo decoder iterations")
+
+    ("expert.attach_enable_64qam",
+     bpo::value<bool>(&args->expert.phy.attach_enable_64qam)->default_value(false),
+     "PUSCH 64QAM modulation before attachment")
+
+    ("expert.nof_phy_threads",
+     bpo::value<int>(&args->expert.phy.nof_phy_threads)->default_value(2),
+     "Number of PHY threads")
+
+    ("expert.equalizer_mode",
+     bpo::value<string>(&args->expert.phy.equalizer_mode)->default_value("mmse"),
+     "Equalizer mode")
+
+    ("expert.cfo_integer_enabled",
+     bpo::value<bool>(&args->expert.phy.cfo_integer_enabled)->default_value(false),
+     "Enables integer CFO estimation and correction.")
+
+    ("expert.cfo_correct_tol_hz",
+     bpo::value<float>(&args->expert.phy.cfo_correct_tol_hz)->default_value(50.0),
+     "Tolerance (in Hz) for digial CFO compensation.")
+
+    ("expert.time_correct_period",
+     bpo::value<int>(&args->expert.phy.time_correct_period)->default_value(5),
+     "Period for sampling time offset correction.")
+
+    ("expert.sfo_correct_disable",
+     bpo::value<bool>(&args->expert.phy.sfo_correct_disable)->default_value(false),
+     "Disables phase correction before channel estimation.")
+
+    ("expert.sss_algorithm",
+     bpo::value<string>(&args->expert.phy.sss_algorithm)->default_value("full"),
+     "Selects the SSS estimation algorithm.")
+
+    ("expert.estimator_fil_w",
+     bpo::value<float>(&args->expert.phy.estimator_fil_w)->default_value(0.1),
+     "Chooses the coefficients for the 3-tap channel estimator centered filter.")
+
+
+    ("rf_calibration.tx_corr_dc_gain", bpo::value<float>(&args->rf_cal.tx_corr_dc_gain)->default_value(0.0),
+     "TX DC offset gain correction")
+    ("rf_calibration.tx_corr_dc_phase", bpo::value<float>(&args->rf_cal.tx_corr_dc_phase)->default_value(0.0),
+     "TX DC offset phase correction")
+    ("rf_calibration.tx_corr_iq_i", bpo::value<float>(&args->rf_cal.tx_corr_iq_i)->default_value(0.0),
+     "TX IQ imbalance inphase correction")
+    ("rf_calibration.tx_corr_iq_q", bpo::value<float>(&args->rf_cal.tx_corr_iq_q)->default_value(0.0),
+     "TX IQ imbalance quadrature correction");
+
+  // Positional options - config file location
+  bpo::options_description position("Positional options");
+  position.add_options()
+    ("config_file", bpo::value<string>(&config_file), "UE configuration file");
+  bpo::positional_options_description p;
+  p.add("config_file", -1);
+
+  // these options are allowed on the command line
+  bpo::options_description cmdline_options;
+  cmdline_options.add(common).add(position).add(general);
+
+  // parse the command line and store result in vm
+  bpo::variables_map vm;
+  bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), vm);
+  bpo::notify(vm);
+
+  // help option was given - print usage and exit
+  if (vm.count("help")) {
+    cout << "Usage: " << argv[0] << " [OPTIONS] config_file" << endl << endl;
+    cout << common << endl << general << endl;
+    exit(0);
+  }
+
+  // print version number and exit
+  if (vm.count("version")) {
+    cout << "Version " <<
+         srslte_get_version_major() << "." <<
+         srslte_get_version_minor() << "." <<
+         srslte_get_version_patch() << endl;
+    exit(0);
+  }
+
+  // no config file given - print usage and exit
+  if (!vm.count("config_file")) {
+    cout << "Error: Configuration file not provided" << endl;
+    cout << "Usage: " << argv[0] << " [OPTIONS] config_file" << endl << endl;
+    exit(0);
+  } else {
+    cout << "Reading configuration file " << config_file << "..." << endl;
+    ifstream conf(config_file.c_str(), ios::in);
+    if (conf.fail()) {
+      cout << "Failed to read configuration file " << config_file << " - exiting" << endl;
+      exit(1);
+    }
+    bpo::store(bpo::parse_config_file(conf, common), vm);
    bpo::notify(vm);
+  }

-    // help option was given - print usage and exit
-    if (vm.count("help")) {
-        cout << "Usage: " << argv[0] << " [OPTIONS] config_file" << endl << endl;
-        cout << common << endl << general << endl;
-        exit(0);
+  // Apply all_level to any unset layers
+  if (vm.count("log.all_level")) {
+    if (!vm.count("log.phy_level")) {
+      args->log.phy_level = args->log.all_level;
    }
+    if (!vm.count("log.mac_level")) {
+      args->log.mac_level = args->log.all_level;
+    }
+    if (!vm.count("log.rlc_level")) {
+      args->log.rlc_level = args->log.all_level;
+    }
+    if (!vm.count("log.pdcp_level")) {
+      args->log.pdcp_level = args->log.all_level;
+    }
+    if (!vm.count("log.rrc_level")) {
+      args->log.rrc_level = args->log.all_level;
+    }
+    if (!vm.count("log.nas_level")) {
+      args->log.nas_level = args->log.all_level;
+    }
+    if (!vm.count("log.gw_level")) {
+      args->log.gw_level = args->log.all_level;
+    }
+    if (!vm.count("log.usim_level")) {
+      args->log.usim_level = args->log.all_level;
+    }
+  }

-    // print version number and exit
-    if (vm.count("version")) {
-        cout << "Version " <<
-                srslte_get_version_major() << "." <<
-                srslte_get_version_minor() << "." <<
-                srslte_get_version_patch() << endl;
-        exit(0);
-    }

-    // no config file given - print usage and exit
-    if (!vm.count("config_file")) {
-        cout << "Error: Configuration file not provided" << endl;
-        cout << "Usage: " << argv[0] << " [OPTIONS] config_file" << endl << endl;
-        exit(0);
-    } else {
-        cout << "Reading configuration file " << config_file << "..." << endl;
-        ifstream conf(config_file.c_str(), ios::in);
-        if(conf.fail()) {
-          cout << "Failed to read configuration file " << config_file << " - exiting" << endl;
-          exit(1);
-        }
-        bpo::store(bpo::parse_config_file(conf, common), vm);
-        bpo::notify(vm);
+  // Apply all_hex_limit to any unset layers
+  if (vm.count("log.all_hex_limit")) {
+    if (!vm.count("log.phy_hex_limit")) {
+      args->log.phy_hex_limit = args->log.all_hex_limit;
    }
-
-    // Apply all_level to any unset layers
-    if (vm.count("log.all_level")) {
-      if(!vm.count("log.phy_level")) {
-        args->log.phy_level = args->log.all_level;
-      }
-      if(!vm.count("log.mac_level")) {
-        args->log.mac_level = args->log.all_level;
-      }
-      if(!vm.count("log.rlc_level")) {
-        args->log.rlc_level = args->log.all_level;
-      }
-      if(!vm.count("log.pdcp_level")) {
-        args->log.pdcp_level = args->log.all_level;
-      }
-      if(!vm.count("log.rrc_level")) {
-        args->log.rrc_level = args->log.all_level;
-      }
-      if(!vm.count("log.nas_level")) {
-        args->log.nas_level = args->log.all_level;
-      }
-      if(!vm.count("log.gw_level")) {
-        args->log.gw_level = args->log.all_level;
-      }
-      if(!vm.count("log.usim_level")) {
-        args->log.usim_level = args->log.all_level;
-      }
+    if (!vm.count("log.mac_hex_limit")) {
+      args->log.mac_hex_limit = args->log.all_hex_limit;
    }
-
-    // Apply all_hex_limit to any unset layers
-    if (vm.count("log.all_hex_limit")) {
-      if(!vm.count("log.phy_hex_limit")) {
-        args->log.phy_hex_limit = args->log.all_hex_limit;
-      }
-      if(!vm.count("log.mac_hex_limit")) {
-        args->log.mac_hex_limit = args->log.all_hex_limit;
-      }
-      if(!vm.count("log.rlc_hex_limit")) {
-        args->log.rlc_hex_limit = args->log.all_hex_limit;
-      }
-      if(!vm.count("log.pdcp_hex_limit")) {
-        args->log.pdcp_hex_limit = args->log.all_hex_limit;
-      }
-      if(!vm.count("log.rrc_hex_limit")) {
-        args->log.rrc_hex_limit = args->log.all_hex_limit;
-      }
-      if(!vm.count("log.nas_hex_limit")) {
-        args->log.nas_hex_limit = args->log.all_hex_limit;
-      }
-      if(!vm.count("log.gw_hex_limit")) {
-        args->log.gw_hex_limit = args->log.all_hex_limit;
-      }
-      if(!vm.count("log.usim_hex_limit")) {
-        args->log.usim_hex_limit = args->log.all_hex_limit;
-      }
+    if (!vm.count("log.rlc_hex_limit")) {
+      args->log.rlc_hex_limit = args->log.all_hex_limit;
    }
+    if (!vm.count("log.pdcp_hex_limit")) {
+      args->log.pdcp_hex_limit = args->log.all_hex_limit;
+    }
+    if (!vm.count("log.rrc_hex_limit")) {
+      args->log.rrc_hex_limit = args->log.all_hex_limit;
+    }
+    if (!vm.count("log.nas_hex_limit")) {
+      args->log.nas_hex_limit = args->log.all_hex_limit;
+    }
+    if (!vm.count("log.gw_hex_limit")) {
+      args->log.gw_hex_limit = args->log.all_hex_limit;
+    }
+    if (!vm.count("log.usim_hex_limit")) {
+      args->log.usim_hex_limit = args->log.all_hex_limit;
+    }
+  }
 }

-static bool running    = true;
+static bool running = true;
 static bool do_metrics = false;

-void sig_int_handler(int signo)
-{
+void sig_int_handler(int signo) {
  running = false;
 }

-void *input_loop(void *m)
-{
-  metrics_stdout *metrics = (metrics_stdout*)m;
+void *input_loop(void *m) {
+  metrics_stdout *metrics = (metrics_stdout *) m;
  char key;
-  while(running) {
+  while (running) {
    cin >> key;
-    if('t' == key) {
+    if ('t' == key) {
      do_metrics = !do_metrics;
-      if(do_metrics) {
+      if (do_metrics) {
        cout << "Enter t to stop trace." << endl;
      } else {
        cout << "Enter t to restart trace." << endl;
@ -337,17 +342,16 @@ void *input_loop(void *m)
  return NULL;
 }

-int main(int argc, char *argv[])
-{
+int main(int argc, char *argv[]) {
  signal(SIGINT, sig_int_handler);
-  all_args_t     args;
+  all_args_t args;
  metrics_stdout metrics;
-  ue            *ue = ue::get_instance();
+  ue *ue = ue::get_instance();

  cout << "---  Software Radio Systems LTE UE  ---" << endl << endl;

  parse_args(&args, argc, argv);
-  if(!ue->init(&args)) {
+  if (!ue->init(&args)) {
    exit(1);
  }
  metrics.init(ue, args.expert.metrics_period_secs);
@ -355,17 +359,17 @@ int main(int argc, char *argv[])
  pthread_t input;
  pthread_create(&input, NULL, &input_loop, &metrics);

-  bool plot_started         = false; 
-  bool signals_pregenerated = false; 
-  while(running) {
+  bool plot_started = false;
+  bool signals_pregenerated = false;
+  while (running) {
    if (ue->is_attached()) {
      if (!signals_pregenerated && args.expert.pregenerate_signals) {
        ue->pregenerate_signals(true);
-        signals_pregenerated = true; 
+        signals_pregenerated = true;
      }
      if (!plot_started && args.gui.enable) {
        ue->start_plot();
-        plot_started = true; 
+        plot_started = true;
      }
    }
    sleep(1);