Conditional SSE compilation

This commit is contained in:
ismagom 2015-10-16 11:05:13 +02:00
parent 438a5aa240
commit 6c194dc078
26 changed files with 435 additions and 191 deletions

View File

@ -83,8 +83,18 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
#Any additional flags for CXX
ENDIF(CMAKE_COMPILER_IS_GNUCXX)
FIND_PACKAGE(SSE)
IF(CMAKE_COMPILER_IS_GNUCC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g -mfpmath=sse -mavx -O3")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g -march=native -O3")
IF(AVX_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpmath=sse -mavx -DLV_HAVE_AVX -DLV_HAVE_SSE")
ELSEIF(SSE4_2_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpmath=sse -msse_4.2 -DLV_HAVE_SSE")
ELSEIF(SSE4_1_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpmath=sse -msse_4.1 -DLV_HAVE_SSE")
ENDIF(AVX_FOUND)
# IF(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -Wno-error=implicit-function-declaration -Wno-error=unused-but-set-variable")
# ENDIF(${CMAKE_BUILD_TYPE} STREQUAL "Debug")

View File

@ -47,6 +47,7 @@ CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_subtract_32f HAVE_VOLK_SUB_FLOAT_FUNCTION
CHECK_FUNCTION_EXISTS_MATH(volk_32fc_x2_square_dist_32f HAVE_VOLK_SQUARE_DIST_FUNCTION)
CHECK_FUNCTION_EXISTS_MATH(volk_32fc_deinterleave_real_32f HAVE_VOLK_DEINTERLEAVE_FUNCTION)
CHECK_FUNCTION_EXISTS_MATH(volk_32fc_index_max_16u HAVE_VOLK_MAX_ABS_FUNCTION)
CHECK_FUNCTION_EXISTS_MATH(volk_16i_s32f_convert_32f HAVE_VOLK_CONVERT_IF_FUNCTION)
INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
@ -54,6 +55,7 @@ MARK_AS_ADVANCED(VOLK_LIBRARIES VOLK_INCLUDE_DIRS VOLK_DEFINITIONS)
IF(VOLK_FOUND)
SET(CMAKE_REQUIRED_LIBRARIES ${VOLK_LIBRARIES} m)
CHECK_FUNCTION_EXISTS_MATH(volk_16i_s32f_convert_32f HAVE_VOLK_CONVERT_IF_FUNCTION)
CHECK_FUNCTION_EXISTS_MATH(volk_32f_index_max_16u HAVE_VOLK_MAX_FUNCTION)
CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_max_32f HAVE_VOLK_MAX_VEC_FUNCTION)
CHECK_FUNCTION_EXISTS_MATH(volk_32f_accumulator_s32f HAVE_VOLK_ACC_FUNCTION)

View File

@ -85,8 +85,7 @@ SRSLTE_API int srslte_precoding_type(srslte_precoding_t *q,
/* Estimates the vector "x" based on the received signal "y" and the channel estimates "h"
*/
SRSLTE_API int srslte_predecoding_single(srslte_precoding_t *q,
cf_t *y,
SRSLTE_API int srslte_predecoding_single(cf_t *y,
cf_t *h,
cf_t *x,
int nof_symbols,

View File

@ -40,8 +40,7 @@
#include "srslte/common/phy_common.h"
#include "srslte/fec/rm_turbo.h"
#include "srslte/fec/turbocoder.h"
#include "srslte/fec/turbodecoder_gen.h"
#include "srslte/fec/turbodecoder_sse.h"
#include "srslte/fec/turbodecoder.h"
#include "srslte/fec/crc.h"
#include "srslte/phch/pdsch_cfg.h"
#include "srslte/phch/pusch_cfg.h"
@ -66,12 +65,12 @@ typedef struct SRSLTE_API {
uint8_t *parity_bits;
void *e;
uint8_t *temp_g_bits;
uint32_t *ul_interleaver;
uint16_t *ul_interleaver;
srslte_uci_bit_t ack_ri_bits[12*288];
uint32_t nof_ri_ack_bits;
srslte_tcod_t encoder;
srslte_tdec_sse_t decoder;
srslte_tdec_t decoder;
srslte_crc_t crc_tb;
srslte_crc_t crc_cb;

View File

@ -63,8 +63,7 @@
#include "srslte/fec/crc.h"
#include "srslte/fec/tc_interl.h"
#include "srslte/fec/turbocoder.h"
#include "srslte/fec/turbodecoder_sse.h"
#include "srslte/fec/turbodecoder_gen.h"
#include "srslte/fec/turbodecoder.h"
#include "srslte/fec/cbsegm.h"
#include "srslte/fec/rm_conv.h"
#include "srslte/fec/rm_turbo.h"

View File

@ -109,7 +109,7 @@ SRSLTE_API void srslte_vec_sc_div2_sss(short *x, int pow2_div, short *z, uint32_
SRSLTE_API void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len);
SRSLTE_API void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len);
SRSLTE_API void srslte_vec_convert_if(int16_t *x, float *z, float scale, uint32_t len);
SRSLTE_API void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len);
SRSLTE_API void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len);

View File

@ -102,24 +102,24 @@ int main(int argc, char **argv) {
num_re = 2 * cell.nof_prb * SRSLTE_NRE * SRSLTE_CP_NSYMB(cell.cp);
input = malloc(num_re * sizeof(cf_t));
input = srslte_vec_malloc(num_re * sizeof(cf_t));
if (!input) {
perror("malloc");
perror("srslte_vec_malloc");
goto do_exit;
}
output = malloc(num_re * sizeof(cf_t));
output = srslte_vec_malloc(num_re * sizeof(cf_t));
if (!output) {
perror("malloc");
perror("srslte_vec_malloc");
goto do_exit;
}
h = malloc(num_re * sizeof(cf_t));
h = srslte_vec_malloc(num_re * sizeof(cf_t));
if (!h) {
perror("malloc");
perror("srslte_vec_malloc");
goto do_exit;
}
ce = malloc(num_re * sizeof(cf_t));
ce = srslte_vec_malloc(num_re * sizeof(cf_t));
if (!ce) {
perror("malloc");
perror("srslte_vec_malloc");
goto do_exit;
}
@ -173,7 +173,7 @@ int main(int argc, char **argv) {
gettimeofday(&t[1], NULL);
for (int j=0;j<100;j++) {
srslte_predecoding_single(&cheq, input, ce, output, num_re, 0);
srslte_predecoding_single(input, ce, output, num_re, 0);
}
gettimeofday(&t[2], NULL);
get_time_interval(t);
@ -188,7 +188,7 @@ int main(int argc, char **argv) {
gettimeofday(&t[1], NULL);
for (int j=0;j<100;j++) {
srslte_predecoding_single(&cheq, input, ce, output, num_re, srslte_chest_dl_get_noise_estimate(&est));
srslte_predecoding_single(input, ce, output, num_re, srslte_chest_dl_get_noise_estimate(&est));
}
gettimeofday(&t[2], NULL);
get_time_interval(t);

View File

@ -189,7 +189,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
}
if (cell.nof_ports == 1) {
srslte_predecoding_single(&cheq, input_signal, ce[0], output_signal2, nof_re, srslte_chest_dl_get_noise_estimate(&chest));
srslte_predecoding_single(input_signal, ce[0], output_signal2, nof_re, srslte_chest_dl_get_noise_estimate(&chest));
} else {
srslte_predecoding_diversity(&cheq, input_signal, ce, output_signal, cell.nof_ports, nof_re, srslte_chest_dl_get_noise_estimate(&chest));
srslte_layerdemap_diversity(output_signal, output_signal2, cell.nof_ports, nof_re/cell.nof_ports);

View File

@ -181,7 +181,7 @@ void srslte_ofdm_rx_sf(srslte_ofdm_t *q, cf_t *input, cf_t *output) {
srslte_vec_prod_ccc(input, q->shift_buffer, input, 2*q->slot_sz);
}
for (n=0;n<2;n++) {
srslte_ofdm_rx_slot_zerocopy(q, &input[n*q->slot_sz], &output[n*q->nof_re*q->nof_symbols]);
srslte_ofdm_rx_slot(q, &input[n*q->slot_sz], &output[n*q->nof_re*q->nof_symbols]);
}
}

View File

@ -37,14 +37,16 @@
#include "srslte/utils/vector.h"
#include "srslte/fec/cbsegm.h"
#define HAVE_SIMD
#ifdef HAVE_SIMD
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
#include <tmmintrin.h>
int srslte_rm_turbo_rx_lut_simd(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
#include <pmmintrin.h>
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
#endif
#ifdef LV_HAVE_AVX
#include <immintrin.h>
int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
#endif
#define NCOLS 32
@ -286,29 +288,32 @@ int srslte_rm_turbo_tx_lut(uint8_t *w_buff, uint8_t *systematic, uint8_t *parity
int srslte_rm_turbo_rx_lut(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
#ifndef HAVE_SIMD
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
for (int i=0;i<in_len;i++) {
//printf("i=%d=%d goes to %d\n", i%out_len, input[i], deinter[i%out_len]);
output[deinter[i%out_len]] += input[i];
#ifdef LV_HAVE_AVX
return srslte_rm_turbo_rx_lut_avx(input, output, in_len, cb_idx, rv_idx);
#else
#ifdef LV_HAVE_SSE
return srslte_rm_turbo_rx_lut_sse(input, output, in_len, cb_idx, rv_idx);
#else
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
for (int i=0;i<in_len;i++) {
//printf("i=%d=%d goes to %d\n", i%out_len, input[i], deinter[i%out_len]);
output[deinter[i%out_len]] += input[i];
}
return 0;
} else {
printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
return SRSLTE_ERROR_INVALID_INPUTS;
}
return 0;
} else {
printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
return SRSLTE_ERROR_INVALID_INPUTS;
}
#else
return srslte_rm_turbo_rx_lut_simd(input, output, in_len, cb_idx, rv_idx);
#endif
#endif
}
#ifdef HAVE_SIMD
#ifdef LV_HAVE_SSE
int srslte_rm_turbo_rx_lut_simd(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
@ -381,7 +386,116 @@ int srslte_rm_turbo_rx_lut_simd(int16_t *input, int16_t *output, uint32_t in_len
#endif
#ifdef LV_HAVE_AVX
#define SAVE_OUTPUT(j) x = (int16_t) _mm256_extract_epi16(xVal, j);\
l = (uint16_t) _mm256_extract_epi16(lutVal, j);\
output[l] += x;
int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
const __m256i* xPtr = (const __m256i*) input;
const __m256i* lutPtr = (const __m256i*) deinter;
__m256i xVal, lutVal;
int16_t x;
uint16_t l;
/* Simplify load if we do not need to wrap (ie high rates) */
if (in_len <= out_len) {
for (int i=0;i<in_len/16;i++) {
xVal = _mm256_loadu_si256(xPtr);
lutVal = _mm256_loadu_si256(lutPtr);
SAVE_OUTPUT(0);
SAVE_OUTPUT(1);
SAVE_OUTPUT(2);
SAVE_OUTPUT(3);
SAVE_OUTPUT(4);
SAVE_OUTPUT(5);
SAVE_OUTPUT(6);
SAVE_OUTPUT(7);
SAVE_OUTPUT(8);
SAVE_OUTPUT(9);
SAVE_OUTPUT(10);
SAVE_OUTPUT(11);
SAVE_OUTPUT(12);
SAVE_OUTPUT(13);
SAVE_OUTPUT(14);
SAVE_OUTPUT(15);
xPtr ++;
lutPtr ++;
}
for (int i=16*(in_len/16);i<in_len;i++) {
output[deinter[i%out_len]] += input[i];
}
} else {
int intCnt = 16;
int inputCnt = 0;
int nwrapps = 0;
while(inputCnt < in_len - 16) {
xVal = _mm256_loadu_si256(xPtr);
lutVal = _mm256_loadu_si256(lutPtr);
SAVE_OUTPUT(0);
SAVE_OUTPUT(1);
SAVE_OUTPUT(2);
SAVE_OUTPUT(3);
SAVE_OUTPUT(4);
SAVE_OUTPUT(5);
SAVE_OUTPUT(6);
SAVE_OUTPUT(7);
SAVE_OUTPUT(8);
SAVE_OUTPUT(9);
SAVE_OUTPUT(10);
SAVE_OUTPUT(11);
SAVE_OUTPUT(12);
SAVE_OUTPUT(13);
SAVE_OUTPUT(14);
SAVE_OUTPUT(15);
xPtr++;
lutPtr++;
intCnt += 16;
inputCnt += 16;
if (intCnt >= out_len && inputCnt < in_len - 16) {
/* Copy last elements */
if ((out_len%16) == 12) {
for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) {
output[deinter[j%out_len]] += input[j];
inputCnt++;
}
} else {
for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) {
output[deinter[j%out_len]] += input[j];
inputCnt++;
}
}
/* And wrap pointers */
nwrapps++;
intCnt = 16;
xPtr = (const __m256i*) &input[nwrapps*out_len];
lutPtr = (const __m256i*) deinter;
}
}
for (int i=inputCnt;i<in_len;i++) {
output[deinter[i%out_len]] += input[i];
}
}
return 0;
} else {
printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
return SRSLTE_ERROR_INVALID_INPUTS;
}
}
#endif

View File

@ -391,7 +391,7 @@ int srslte_tdec_gen_run_all(srslte_tdec_gen_t * h, float * input, uint8_t *outpu
iter++;
} while (iter < nof_iterations);
srslte_tdec_gen_decision(h, output, long_cb);
srslte_tdec_gen_decision_byte(h, output, long_cb);
return SRSLTE_SUCCESS;
}

View File

@ -37,8 +37,11 @@
#include <inttypes.h>
#ifdef LV_HAVE_SSE
#include <emmintrin.h>
#include <immintrin.h>
#include <nmmintrin.h>
#endif
#define NUMSTATES 8
#define NINPUTS 2
@ -55,11 +58,13 @@
*
************************************************/
#ifdef LV_HAVE_SSE
static inline int16_t hMax(__m128i buffer)
{
__m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer);
__m128i tmp3 = _mm_minpos_epu16(tmp1);
return (int16_t)(_mm_cvtsi128_si32(tmp3));
__m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer);
__m128i tmp3 = _mm_minpos_epu16(tmp1);
return (int16_t)(_mm_cvtsi128_si32(tmp3));
}
void srslte_map_gen_beta(srslte_map_gen_t * s, int16_t * output, uint32_t long_cb)
@ -626,3 +631,7 @@ int srslte_tdec_sse_run_all(srslte_tdec_sse_t * h, int16_t * input, uint8_t *out
return SRSLTE_SUCCESS;
}
#endif

View File

@ -118,12 +118,10 @@ int main(int argc, char **argv) {
float var[SNR_POINTS];
uint32_t snr_points;
uint32_t errors;
uint32_t errors_gen;
uint32_t coded_length;
struct timeval tdata[3];
float mean_usec, mean_usec_gen;
srslte_tdec_sse_t tdec;
srslte_tdec_gen_t tdec_gen;
float mean_usec;
srslte_tdec_t tdec;
srslte_tcod_t tcod;
parse_args(argc, argv);
@ -189,12 +187,7 @@ int main(int argc, char **argv) {
exit(-1);
}
if (srslte_tdec_sse_init(&tdec, frame_length)) {
fprintf(stderr, "Error initiating Turbo decoder\n");
exit(-1);
}
if (srslte_tdec_gen_init(&tdec_gen, frame_length)) {
if (srslte_tdec_init(&tdec, frame_length)) {
fprintf(stderr, "Error initiating Turbo decoder\n");
exit(-1);
}
@ -216,9 +209,7 @@ int main(int argc, char **argv) {
for (i = 0; i < snr_points; i++) {
mean_usec = 0;
mean_usec_gen = 0;
errors = 0;
errors_gen = 0;
frame_cnt = 0;
while (frame_cnt < nof_frames) {
/* generate data_tx */
@ -249,8 +240,7 @@ int main(int argc, char **argv) {
llr_s[j] = (int16_t) (100*llr[j]);
}
/* decoder */
srslte_tdec_sse_reset(&tdec, frame_length);
srslte_tdec_gen_reset(&tdec_gen, frame_length);
srslte_tdec_reset(&tdec, frame_length);
uint32_t t;
if (nof_iterations == -1) {
@ -261,7 +251,7 @@ int main(int argc, char **argv) {
gettimeofday(&tdata[1], NULL);
for (int k=0;k<nof_repetitions;k++) {
srslte_tdec_sse_run_all(&tdec, llr_s, data_rx_bytes, t, frame_length);
srslte_tdec_run_all(&tdec, llr_s, data_rx_bytes, t, frame_length);
}
gettimeofday(&tdata[2], NULL);
get_time_interval(tdata);
@ -270,24 +260,11 @@ int main(int argc, char **argv) {
srslte_bit_unpack_vector(data_rx_bytes, data_rx, frame_length);
errors += srslte_bit_diff(data_tx, data_rx, frame_length);
gettimeofday(&tdata[1], NULL);
for (int k=0;k<nof_repetitions;k++) {
srslte_tdec_gen_run_all(&tdec_gen, llr, data_rx, t, frame_length);
}
gettimeofday(&tdata[2], NULL);
get_time_interval(tdata);
mean_usec_gen = (float) mean_usec_gen * 0.9 + (float) (tdata[0].tv_usec/nof_repetitions) * 0.1;
/* check errors */
errors_gen += srslte_bit_diff(data_tx, data_rx, frame_length);
frame_cnt++;
printf("Eb/No: %2.2f %10d/%d ", SNR_MIN + i * ebno_inc, frame_cnt, nof_frames);
printf("BER: %.2e ", (float) errors / (frame_cnt * frame_length));
printf("BER_gen: %.2e ", (float) errors_gen / (frame_cnt * frame_length));
printf("%3.1f Mbps (%6.2f usec) -- gen: ", (float) frame_length / mean_usec, mean_usec);
printf("%3.1f Mbps (%6.2f usec)", (float) frame_length / mean_usec_gen, mean_usec_gen);
printf("%3.1f Mbps (%6.2f usec)", (float) frame_length / mean_usec, mean_usec);
printf("\r");
}
@ -297,10 +274,7 @@ int main(int argc, char **argv) {
printf("\n");
if (snr_points == 1) {
if (errors) {
printf("%d Errors in SSE\n", errors);
}
if (errors_gen) {
printf("%d Errors in GEN\n", errors_gen);
printf("%d Errors\n", errors);
}
}
@ -311,8 +285,7 @@ int main(int argc, char **argv) {
free(llr_c);
free(data_rx);
srslte_tdec_sse_free(&tdec);
srslte_tdec_gen_free(&tdec_gen);
srslte_tdec_free(&tdec);
srslte_tcod_free(&tcod);
printf("\n");

View File

@ -35,6 +35,18 @@
#include "srslte/mimo/precoding.h"
#include "srslte/utils/vector.h"
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
#include <pmmintrin.h>
int srslte_predecoding_single_sse(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate);
#endif
#ifdef LV_HAVE_AVX
#include <immintrin.h>
int srslte_predecoding_single_avx(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate);
#endif
/************************************************
*
@ -117,23 +129,138 @@ void srslte_precoding_free(srslte_precoding_t *q) {
bzero(q, sizeof(srslte_precoding_t));
}
/* ZF/MMSE SISO equalizer x=y(h'h+no)^(-1)h' (ZF if n0=0.0)*/
int srslte_predecoding_single(srslte_precoding_t *q, cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
if (nof_symbols <= q->max_frame_len) {
// h'h
srslte_vec_abs_square_cf(h, q->y_mod, nof_symbols);
if (noise_estimate > 0.0) {
// (h'h + n0)
srslte_vec_sc_add_fff(q->y_mod, noise_estimate, q->y_mod, nof_symbols);
#ifdef LV_HAVE_SSE
#define PROD(a,b) _mm_addsub_ps(_mm_mul_ps(a,_mm_moveldup_ps(b)),_mm_mul_ps(_mm_shuffle_ps(a,a,0xB1),_mm_movehdup_ps(b)))
int srslte_predecoding_single_sse(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
float *xPtr = (float*) x;
const float *hPtr = (const float*) h;
const float *yPtr = (const float*) y;
__m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
__m128 noise = _mm_set1_ps(noise_estimate);
__m128 h1Val, h2Val, y1Val, y2Val, h12square, h1square, h2square, h1conj, h2conj, x1Val, x2Val;
for (int i=0;i<nof_symbols/4;i++) {
y1Val = _mm_load_ps(yPtr); yPtr+=4;
y2Val = _mm_load_ps(yPtr); yPtr+=4;
h1Val = _mm_load_ps(hPtr); hPtr+=4;
h2Val = _mm_load_ps(hPtr); hPtr+=4;
h12square = _mm_hadd_ps(_mm_mul_ps(h1Val, h1Val), _mm_mul_ps(h2Val, h2Val));
if (noise_estimate > 0) {
h12square = _mm_add_ps(h12square, noise);
}
// y*h'
srslte_vec_prod_conj_ccc(y, h, x, nof_symbols);
// divide by (h'h+no)
srslte_vec_div_cfc(x,q->y_mod,x,q->z_real,q->z_imag, nof_symbols);
return nof_symbols;
} else {
return SRSLTE_ERROR;
h1square = _mm_shuffle_ps(h12square, h12square, _MM_SHUFFLE(1, 1, 0, 0));
h2square = _mm_shuffle_ps(h12square, h12square, _MM_SHUFFLE(3, 3, 2, 2));
/* Conjugate channel */
h1conj = _mm_xor_ps(h1Val, conjugator);
h2conj = _mm_xor_ps(h2Val, conjugator);
/* Complex product */
x1Val = PROD(y1Val, h1conj);
x2Val = PROD(y2Val, h2conj);
x1Val = _mm_div_ps(x1Val, h1square);
x2Val = _mm_div_ps(x2Val, h2square);
_mm_store_ps(xPtr, x1Val); xPtr+=4;
_mm_store_ps(xPtr, x2Val); xPtr+=4;
}
for (int i=8*(nof_symbols/8);i<nof_symbols;i++) {
x[i] = y[i]*conj(h[i])/(conj(h[i])*h[i]+noise_estimate);
}
return nof_symbols;
}
#endif
#ifdef LV_HAVE_AVX
#define PROD_AVX(a,b) _mm256_addsub_ps(_mm256_mul_ps(a,_mm256_moveldup_ps(b)),_mm256_mul_ps(_mm256_shuffle_ps(a,a,0xB1),_mm256_movehdup_ps(b)))
int srslte_predecoding_single_avx(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
float *xPtr = (float*) x;
const float *hPtr = (const float*) h;
const float *yPtr = (const float*) y;
__m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
__m256 noise = _mm256_set1_ps(noise_estimate);
__m256 h1Val, h2Val, y1Val, y2Val, h12square, h1square, h2square, h1_p, h2_p, h1conj, h2conj, x1Val, x2Val;
for (int i=0;i<nof_symbols/8;i++) {
y1Val = _mm256_load_ps(yPtr); yPtr+=8;
y2Val = _mm256_load_ps(yPtr); yPtr+=8;
h1Val = _mm256_load_ps(hPtr); hPtr+=8;
h2Val = _mm256_load_ps(hPtr); hPtr+=8;
__m256 t1 = _mm256_mul_ps(h1Val, h1Val);
__m256 t2 = _mm256_mul_ps(h2Val, h2Val);
h12square = _mm256_hadd_ps(_mm256_permute2f128_ps(t1, t2, 0x20), _mm256_permute2f128_ps(t1, t2, 0x31));
if (noise_estimate > 0) {
h12square = _mm256_add_ps(h12square, noise);
}
h1_p = _mm256_permute_ps(h12square, _MM_SHUFFLE(1, 1, 0, 0));
h2_p = _mm256_permute_ps(h12square, _MM_SHUFFLE(3, 3, 2, 2));
h1square = _mm256_permute2f128_ps(h1_p, h2_p, 2<<4);
h2square = _mm256_permute2f128_ps(h1_p, h2_p, 3<<4 | 1);
/* Conjugate channel */
h1conj = _mm256_xor_ps(h1Val, conjugator);
h2conj = _mm256_xor_ps(h2Val, conjugator);
/* Complex product */
x1Val = PROD_AVX(y1Val, h1conj);
x2Val = PROD_AVX(y2Val, h2conj);
x1Val = _mm256_div_ps(x1Val, h1square);
x2Val = _mm256_div_ps(x2Val, h2square);
_mm256_store_ps(xPtr, x1Val); xPtr+=8;
_mm256_store_ps(xPtr, x2Val); xPtr+=8;
}
for (int i=16*(nof_symbols/16);i<nof_symbols;i++) {
x[i] = y[i]*conj(h[i])/(conj(h[i])*h[i]+noise_estimate);
}
return nof_symbols;
}
#endif
int srslte_predecoding_single_gen(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
for (int i=0;i<nof_symbols;i++) {
x[i] = y[i]*conj(h[i])/(conj(h[i])*h[i]+noise_estimate);
}
return nof_symbols;
}
/* ZF/MMSE SISO equalizer x=y(h'h+no)^(-1)h' (ZF if n0=0.0)*/
int srslte_predecoding_single(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
#ifdef LV_HAVE_AVX
if (nof_symbols > 32) {
return srslte_predecoding_single_avx(y, h, x, nof_symbols, noise_estimate);
} else {
return srslte_predecoding_single_gen(y, h, x, nof_symbols, noise_estimate);
}
#else
#ifdef LV_HAVE_SSE
if (nof_symbols > 32) {
return srslte_predecoding_single_sse(y, h, x, nof_symbols, noise_estimate);
} else {
return srslte_predecoding_single_gen(y, h, x, nof_symbols, noise_estimate);
}
#else
return srslte_predecoding_single_gen(y, h, x, nof_symbols, noise_estimate);
#endif
#endif
}
/* ZF/MMSE STBC equalizer x=y(H'H+n0·I)^(-1)H' (ZF is n0=0.0)
@ -257,7 +384,7 @@ int srslte_predecoding_type(srslte_precoding_t *q, cf_t *y, cf_t *h[SRSLTE_MAX_P
switch (type) {
case SRSLTE_MIMO_TYPE_SINGLE_ANTENNA:
if (nof_ports == 1 && nof_layers == 1) {
return srslte_predecoding_single(q, y, h[0], x[0], nof_symbols, noise_estimate);
return srslte_predecoding_single(y, h[0], x[0], nof_symbols, noise_estimate);
} else {
fprintf(stderr,
"Number of ports and layers must be 1 for transmission on single antenna ports\n");

View File

@ -102,7 +102,7 @@ int main(int argc, char **argv) {
perror("srslte_vec_malloc");
exit(-1);
}
xr[i] = calloc(1,sizeof(cf_t) * nof_symbols);
xr[i] = srslte_vec_malloc(sizeof(cf_t) * nof_symbols);
if (!xr[i]) {
perror("srslte_vec_malloc");
exit(-1);
@ -186,7 +186,6 @@ int main(int argc, char **argv) {
mse = 0;
for (i = 0; i < nof_layers; i++) {
for (j = 0; j < nof_symbols; j++) {
printf("%f - %f\n", crealf(xr[i][j]), crealf(x[i][j]));
mse += cabsf(xr[i][j] - x[i][j]);
}
}

View File

@ -33,16 +33,16 @@
#include "srslte/utils/bit.h"
#include "srslte/modem/demod_soft.h"
#define HAVE_SIMD
// AVX implementation not useful for integers. Wait for AVX2
#ifdef HAVE_SIMD
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
#include <pmmintrin.h>
#include <tmmintrin.h>
void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols);
#endif
//#define SCALE_DEMOD16QAM
#define SCALE_SHORT_CONV_QPSK 100
#define SCALE_SHORT_CONV_QAM16 400
#define SCALE_SHORT_CONV_QAM64 700
@ -72,48 +72,17 @@ void demod_16qam_lte(const cf_t *symbols, float *llr, int nsymbols) {
float yre = crealf(symbols[i]);
float yim = cimagf(symbols[i]);
#ifdef SCALE_DEMOD16QAM
llr[4*i+2] = (fabsf(yre)-2/sqrt(10))*sqrt(10);
llr[4*i+3] = (fabsf(yim)-2/sqrt(10))*sqrt(10);
if (llr[4*i+2] > 0) {
llr[4*i+0] = -yre/(3/sqrt(10));
} else {
llr[4*i+0] = -yre/(1/sqrt(10));
}
if (llr[4*i+3] > 0) {
llr[4*i+1] = -yim/(3/sqrt(10));
} else {
llr[4*i+1] = -yim/(1/sqrt(10));
}
#else
llr[4*i+0] = -yre;
llr[4*i+1] = -yim;
llr[4*i+2] = fabsf(yre)-2/sqrt(10);
llr[4*i+3] = fabsf(yim)-2/sqrt(10);
#endif
}
}
void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
#ifndef HAVE_SIMD
for (int i=0;i<nsymbols;i++) {
short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));
llr[4*i+0] = -yre;
llr[4*i+1] = -yim;
llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
}
#else
#ifdef LV_HAVE_SSE
float *symbolsPtr = (float*) symbols;
void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
float *symbolsPtr = (float*) symbols;
__m128i *resultPtr = (__m128i*) llr;
__m128 symbol1, symbol2;
__m128i symbol_i1, symbol_i2, symbol_i, symbol_abs;
@ -148,6 +117,22 @@ void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));
llr[4*i+0] = -yre;
llr[4*i+1] = -yim;
llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
}
}
#endif
void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
#ifdef LV_HAVE_SSE
demod_16qam_lte_s_sse(symbols, llr, nsymbols);
#else
for (int i=0;i<nsymbols;i++) {
short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));
llr[4*i+0] = -yre;
llr[4*i+1] = -yim;
llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
@ -172,21 +157,10 @@ void demod_64qam_lte(const cf_t *symbols, float *llr, int nsymbols)
}
void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols)
{
#ifndef HAVE_SIMD
for (int i=0;i<nsymbols;i++) {
float yre = (short) (SCALE_SHORT_CONV_QAM64*crealf(symbols[i]));
float yim = (short) (SCALE_SHORT_CONV*cimagf(symbols[i]));
#ifdef LV_HAVE_SSE
llr[6*i+0] = -yre;
llr[6*i+1] = -yim;
llr[6*i+2] = abs(yre)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
llr[6*i+3] = abs(yim)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);
llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);
}
#else
void demod_64qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols)
{
float *symbolsPtr = (float*) symbols;
__m128i *resultPtr = (__m128i*) llr;
__m128 symbol1, symbol2;
@ -239,6 +213,26 @@ void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols)
float yre = (short) (SCALE_SHORT_CONV_QAM64*crealf(symbols[i]));
float yim = (short) (SCALE_SHORT_CONV_QAM64*cimagf(symbols[i]));
llr[6*i+0] = -yre;
llr[6*i+1] = -yim;
llr[6*i+2] = abs(yre)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
llr[6*i+3] = abs(yim)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);
llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);
}
}
#endif
void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols)
{
#ifdef LV_HAVE_SSE
demod_64qam_lte_s_sse(symbols, llr, nsymbols);
#else
for (int i=0;i<nsymbols;i++) {
float yre = (short) (SCALE_SHORT_CONV_QAM64*crealf(symbols[i]));
float yim = (short) (SCALE_SHORT_CONV_QAM64*cimagf(symbols[i]));
llr[6*i+0] = -yre;
llr[6*i+1] = -yim;
llr[6*i+2] = abs(yre)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);

View File

@ -470,8 +470,7 @@ int srslte_pbch_decode(srslte_pbch_t *q, cf_t *slot1_symbols, cf_t *ce_slot1[SRS
/* in control channels, only diversity is supported */
if (nant == 1) {
/* no need for layer demapping */
srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d,
q->nof_symbols, noise_estimate);
srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, q->nof_symbols, noise_estimate);
} else {
srslte_predecoding_diversity(&q->precoding, q->symbols[0], q->ce, x, nant,
q->nof_symbols, noise_estimate);

View File

@ -193,8 +193,7 @@ int srslte_pcfich_decode(srslte_pcfich_t *q, cf_t *slot_symbols, cf_t *ce[SRSLTE
/* in control channels, only diversity is supported */
if (q->cell.nof_ports == 1) {
/* no need for layer demapping */
srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d,
q->nof_symbols, noise_estimate);
srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, q->nof_symbols, noise_estimate);
} else {
srslte_predecoding_diversity(&q->precoding, q->symbols[0], ce_precoding, x,
q->cell.nof_ports, q->nof_symbols, noise_estimate);

View File

@ -408,7 +408,7 @@ int srslte_pdcch_extract_llr(srslte_pdcch_t *q, cf_t *sf_symbols, cf_t *ce[SRSLT
/* in control channels, only diversity is supported */
if (q->cell.nof_ports == 1) {
/* no need for layer demapping */
srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d, nof_symbols, noise_estimate);
srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, nof_symbols, noise_estimate);
} else {
srslte_predecoding_diversity(&q->precoding, q->symbols[0], q->ce, x, q->cell.nof_ports, nof_symbols, noise_estimate);
srslte_layerdemap_diversity(x, q->d, q->cell.nof_ports, nof_symbols / q->cell.nof_ports);

View File

@ -404,8 +404,7 @@ int srslte_pdsch_decode_rnti(srslte_pdsch_t *q,
/* TODO: only diversity is supported */
if (q->cell.nof_ports == 1) {
/* no need for layer demapping */
srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d,
cfg->nbits.nof_re, noise_estimate);
srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, cfg->nbits.nof_re, noise_estimate);
} else {
srslte_predecoding_diversity(&q->precoding, q->symbols[0], q->ce, x, q->cell.nof_ports,
cfg->nbits.nof_re, noise_estimate);

View File

@ -216,8 +216,7 @@ int srslte_phich_decode(srslte_phich_t *q, cf_t *slot_symbols, cf_t *ce[SRSLTE_M
/* in control channels, only diversity is supported */
if (q->cell.nof_ports == 1) {
/* no need for layer demapping */
srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d0,
SRSLTE_PHICH_MAX_NSYMB, noise_estimate);
srslte_predecoding_single(q->symbols[0], q->ce[0], q->d0, SRSLTE_PHICH_MAX_NSYMB, noise_estimate);
} else {
srslte_predecoding_diversity(&q->precoding, q->symbols[0], ce_precoding, x,
q->cell.nof_ports, SRSLTE_PHICH_MAX_NSYMB, noise_estimate);

View File

@ -438,8 +438,7 @@ int srslte_pusch_decode(srslte_pusch_t *q,
return SRSLTE_ERROR;
}
srslte_predecoding_single(&q->equalizer, q->d, q->ce, q->z,
cfg->nbits.nof_re, noise_estimate);
srslte_predecoding_single(q->d, q->ce, q->z, cfg->nbits.nof_re, noise_estimate);
srslte_dft_predecoding(&q->dft_precoding, q->z, q->d, cfg->grant.L_prb, cfg->nbits.nof_symb);

View File

@ -111,7 +111,7 @@ int srslte_sch_init(srslte_sch_t *q) {
fprintf(stderr, "Error initiating Turbo Coder\n");
goto clean;
}
if (srslte_tdec_sse_init(&q->decoder, SRSLTE_TCOD_MAX_LEN_CB)) {
if (srslte_tdec_init(&q->decoder, SRSLTE_TCOD_MAX_LEN_CB)) {
fprintf(stderr, "Error initiating Turbo Decoder\n");
goto clean;
}
@ -133,7 +133,7 @@ int srslte_sch_init(srslte_sch_t *q) {
goto clean;
}
bzero(q->temp_g_bits, SRSLTE_MAX_PRB*12*12*12);
q->ul_interleaver = srslte_vec_malloc(sizeof(uint32_t)*SRSLTE_MAX_PRB*12*12*12);
q->ul_interleaver = srslte_vec_malloc(sizeof(uint16_t)*SRSLTE_MAX_PRB*12*12*12);
if (!q->ul_interleaver) {
goto clean;
}
@ -163,7 +163,7 @@ void srslte_sch_free(srslte_sch_t *q) {
if (q->ul_interleaver) {
free(q->ul_interleaver);
}
srslte_tdec_sse_free(&q->decoder);
srslte_tdec_free(&q->decoder);
srslte_tcod_free(&q->encoder);
srslte_uci_cqi_free(&q->uci_cqi);
bzero(q, sizeof(srslte_sch_t));
@ -413,10 +413,10 @@ static int decode_tb(srslte_sch_t *q,
srslte_crc_t *crc_ptr;
early_stop = false;
srslte_tdec_sse_reset(&q->decoder, cb_len);
srslte_tdec_reset(&q->decoder, cb_len);
do {
srslte_tdec_sse_iteration(&q->decoder, softbuffer->buffer_f[i], cb_len);
srslte_tdec_iteration(&q->decoder, softbuffer->buffer_f[i], cb_len);
q->nof_iterations++;
if (cb_segm->C > 1) {
@ -427,10 +427,10 @@ static int decode_tb(srslte_sch_t *q,
crc_ptr = &q->crc_tb;
}
srslte_tdec_sse_decision_byte(&q->decoder, q->cb_in, cb_len);
srslte_tdec_decision_byte(&q->decoder, q->cb_in, cb_len);
if (i == 9) {
srslte_tdec_sse_decision(&q->decoder, q->temp_data, cb_len);
srslte_tdec_decision(&q->decoder, q->temp_data, cb_len);
}
/* Check Codeblock CRC and stop early if incorrect */
if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) {
@ -525,7 +525,7 @@ int srslte_ulsch_decode(srslte_sch_t *q, srslte_pusch_cfg_t *cfg, srslte_softbuf
/* UL-SCH channel interleaver according to 5.2.2.8 of 36.212 */
void ulsch_interleave(uint8_t *g_bits, uint32_t Qm, uint32_t H_prime_total,
uint32_t N_pusch_symbs, uint8_t *q_bits, srslte_uci_bit_t *ri_bits, uint32_t nof_ri_bits,
uint32_t *interleaver_buffer, uint8_t *temp_buffer, uint32_t buffer_sz)
uint16_t *interleaver_buffer, uint8_t *temp_buffer, uint32_t buffer_sz)
{
uint32_t rows = H_prime_total/N_pusch_symbs;

View File

@ -147,24 +147,24 @@ int main(int argc, char **argv) {
/* init memory */
for (i=0;i<cell.nof_ports;i++) {
ce[i] = malloc(sizeof(cf_t) * SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp));
ce[i] = srslte_vec_malloc(sizeof(cf_t) * SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp));
if (!ce[i]) {
perror("malloc");
perror("srslte_vec_malloc");
goto quit;
}
for (j=0;j<SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp);j++) {
ce[i][j] = 1;
}
slot_symbols[i] = calloc(sizeof(cf_t) , SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp));
slot_symbols[i] = srslte_vec_malloc(sizeof(cf_t)*SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp));
if (!slot_symbols[i]) {
perror("malloc");
perror("srslte_vec_malloc");
goto quit;
}
}
data = srslte_vec_malloc(sizeof(uint8_t) * grant.mcs.tbs/8);
if (!data) {
perror("malloc");
perror("srslte_vec_malloc");
goto quit;
}

View File

@ -240,6 +240,17 @@ void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
#endif
}
void srslte_vec_convert_if(int16_t *x, float *z, float scale, uint32_t len) {
#ifndef HAVE_VOLK_CONVERT_IF_FUNCTION
int i;
for (i=0;i<len;i++) {
z[i] = ((float) x[i])*scale;
}
#else
volk_16i_s32f_convert_32f(z,x,scale,len);
#endif
}
void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len) {
#ifndef HAVE_VECTOR_SIMD
int i;
@ -303,7 +314,12 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
void *srslte_vec_malloc(uint32_t size) {
#ifndef HAVE_VOLK
return malloc(size);
void *ptr;
if (posix_memalign(&ptr,32,size)) {
return NULL;
} else {
return ptr;
}
#else
void *ptr;
if (posix_memalign(&ptr,volk_get_alignment(),size)) {

View File

@ -37,18 +37,15 @@
#include <inttypes.h>
#include <stdio.h>
#include <xmmintrin.h>
#ifdef LV_HAVE_SSE
#include <emmintrin.h>
#include <nmmintrin.h>
#endif
void print128_num(__m128i var)
{
int16_t *val = (int16_t*) &var;//can also use uint32_t instead of 16_t
printf("Numerical: %d %d %d %d %d %d %d %d \n",
val[0], val[1], val[2], val[3], val[4], val[5],
val[6], val[7]);
}
void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len)
{
#ifdef LV_HAVE_SSE
unsigned int number = 0;
const unsigned int points = len / 8;
@ -75,10 +72,13 @@ void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len)
for(;number < len; number++){
z[number] = x[number] + y[number];
}
#endif
}
void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len)
{
#ifdef LV_HAVE_SSE
unsigned int number = 0;
const unsigned int points = len / 8;
@ -105,10 +105,12 @@ void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len)
for(;number < len; number++){
z[number] = x[number] - y[number];
}
#endif
}
void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len)
{
#ifdef LV_HAVE_SSE
unsigned int number = 0;
const unsigned int points = len / 8;
@ -135,10 +137,12 @@ void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len)
for(;number < len; number++){
z[number] = x[number] * y[number];
}
#endif
}
void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
{
#ifdef LV_HAVE_SSE
unsigned int number = 0;
const unsigned int points = len / 8;
@ -163,10 +167,13 @@ void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
for(;number < len; number++){
z[number] = x[number] / divn;
}
#endif
}
/* No improvement with AVX */
void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t len)
{
#ifdef LV_HAVE_SSE
unsigned int number = 0;
const unsigned int points = len / 8;
@ -192,12 +199,13 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l
for(;number < len; number++){
y[lut[number]] = x[number];
}
#endif
}
/* Modified from volk_32f_s32f_convert_16i_a_sse2. Removed clipping */
/* Modified from volk_32f_s32f_convert_16i_a_simd2. Removed clipping */
void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len)
{
#ifdef LV_HAVE_SSE
unsigned int number = 0;
const unsigned int eighthPoints = len / 8;
@ -230,5 +238,5 @@ void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len)
for(; number < len; number++){
z[number] = (int16_t) (x[number] * scale);
}
#endif
}