diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c index eafe7b21..862a2e73 100644 --- a/Transceiver52M/x86/convert.c +++ b/Transceiver52M/x86/convert.c @@ -176,26 +176,34 @@ static void convert_si16_ps(float *out, const short *in, int len) void convert_float_short(short *out, const float *in, float scale, int len) { + void (*conv_func)(short *, const float *, float, int); + #ifdef HAVE_SSE3 if (!(len % 16)) - _sse_convert_scale_ps_si16_16n(out, in, scale, len); + conv_func = _sse_convert_scale_ps_si16_16n; else if (!(len % 8)) - _sse_convert_scale_ps_si16_8n(out, in, scale, len); + conv_func = _sse_convert_scale_ps_si16_8n; else - _sse_convert_scale_ps_si16(out, in, scale, len); + conv_func = _sse_convert_scale_ps_si16; #else - convert_scale_ps_si16(out, in, scale, len); + conv_func = convert_scale_ps_si16; #endif + + conv_func(out, in, scale, len); } void convert_short_float(float *out, const short *in, int len) { + void (*conv_func) (float *, const short *, int); + #ifdef HAVE_SSE4_1 if (!(len % 16)) - _sse_convert_si16_ps_16n(out, in, len); + conv_func = _sse_convert_si16_ps_16n; else - _sse_convert_si16_ps(out, in, len); + conv_func = _sse_convert_si16_ps; #else - convert_si16_ps(out, in, len); + conv_func = convert_si16_ps; #endif + + conv_func(out, in, len); } diff --git a/Transceiver52M/x86/convolve.c b/Transceiver52M/x86/convolve.c index 04923bcd..e2a1dea7 100644 --- a/Transceiver52M/x86/convolve.c +++ b/Transceiver52M/x86/convolve.c @@ -47,13 +47,21 @@ int bounds_check(int x_len, int h_len, int y_len, #include /* 4-tap SSE complex-real convolution */ -static void sse_conv_real4(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real4(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* NOTE: The parameter list of this function has to match the parameter + * list of _base_convolve_real() in 
convolve_base.c. This specific + * implementation ignores some of the parameters of + * _base_convolve_real(), which are: x_len, y_len, offset, step */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + const float *_x = &x[2 * (-(h_len - 1) + start)]; + /* Load (aligned) filter taps */ m0 = _mm_load_ps(&h[0]); m1 = _mm_load_ps(&h[4]); @@ -61,8 +69,8 @@ static void sse_conv_real4(const float *restrict x, for (int i = 0; i < len; i++) { /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -81,13 +89,18 @@ static void sse_conv_real4(const float *restrict x, } /* 8-tap SSE complex-real convolution */ -static void sse_conv_real8(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real8(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9; + const float *_x = &x[2 * (-(h_len - 1) + start)]; + /* Load (aligned) filter taps */ m0 = _mm_load_ps(&h[0]); m1 = _mm_load_ps(&h[4]); @@ -99,10 +112,10 @@ static void sse_conv_real8(const float *restrict x, for (int i = 0; i < len; i++) { /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); - m2 = _mm_loadu_ps(&x[2 * i + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 12]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 12]); m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -128,14 +141,19 @@ static void sse_conv_real8(const float *restrict x, } /* 12-tap SSE complex-real convolution */ -static 
void sse_conv_real12(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real12(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; __m128 m8, m9, m10, m11, m12, m13, m14; + const float *_x = &x[2 * (-(h_len - 1) + start)]; + /* Load (aligned) filter taps */ m0 = _mm_load_ps(&h[0]); m1 = _mm_load_ps(&h[4]); @@ -150,18 +168,18 @@ static void sse_conv_real12(const float *restrict x, for (int i = 0; i < len; i++) { /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); - m2 = _mm_loadu_ps(&x[2 * i + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 12]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 12]); m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); - m0 = _mm_loadu_ps(&x[2 * i + 16]); - m1 = _mm_loadu_ps(&x[2 * i + 20]); + m0 = _mm_loadu_ps(&_x[2 * i + 16]); + m1 = _mm_loadu_ps(&_x[2 * i + 20]); m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -190,14 +208,19 @@ static void sse_conv_real12(const float *restrict x, } /* 16-tap SSE complex-real convolution */ -static void sse_conv_real16(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real16(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; __m128 m8, m9, m10, m11, m12, m13, m14, m15; + const float *_x = &x[2 * (-(h_len - 1) + start)]; + /* Load 
(aligned) filter taps */ m0 = _mm_load_ps(&h[0]); m1 = _mm_load_ps(&h[4]); @@ -216,20 +239,20 @@ static void sse_conv_real16(const float *restrict x, for (int i = 0; i < len; i++) { /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); - m2 = _mm_loadu_ps(&x[2 * i + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 12]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 12]); m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); - m0 = _mm_loadu_ps(&x[2 * i + 16]); - m1 = _mm_loadu_ps(&x[2 * i + 20]); - m2 = _mm_loadu_ps(&x[2 * i + 24]); - m3 = _mm_loadu_ps(&x[2 * i + 28]); + m0 = _mm_loadu_ps(&_x[2 * i + 16]); + m1 = _mm_loadu_ps(&_x[2 * i + 20]); + m2 = _mm_loadu_ps(&_x[2 * i + 24]); + m3 = _mm_loadu_ps(&_x[2 * i + 28]); m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -265,14 +288,19 @@ static void sse_conv_real16(const float *restrict x, } /* 20-tap SSE complex-real convolution */ -static void sse_conv_real20(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real20(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; __m128 m8, m9, m11, m12, m13, m14, m15; + const float *_x = &x[2 * (-(h_len - 1) + start)]; + /* Load (aligned) filter taps */ m0 = _mm_load_ps(&h[0]); m1 = _mm_load_ps(&h[4]); @@ -293,12 +321,12 @@ static void sse_conv_real20(const float *restrict x, for (int i = 0; i < len; i++) { /* Multiply-accumulate first 12 taps */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); - 
m2 = _mm_loadu_ps(&x[2 * i + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 12]); - m4 = _mm_loadu_ps(&x[2 * i + 16]); - m5 = _mm_loadu_ps(&x[2 * i + 20]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 12]); + m4 = _mm_loadu_ps(&_x[2 * i + 16]); + m5 = _mm_loadu_ps(&_x[2 * i + 20]); m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -320,10 +348,10 @@ static void sse_conv_real20(const float *restrict x, m9 = _mm_add_ps(m1, m7); /* Multiply-accumulate last 8 taps */ - m0 = _mm_loadu_ps(&x[2 * i + 24]); - m1 = _mm_loadu_ps(&x[2 * i + 28]); - m2 = _mm_loadu_ps(&x[2 * i + 32]); - m3 = _mm_loadu_ps(&x[2 * i + 36]); + m0 = _mm_loadu_ps(&_x[2 * i + 24]); + m1 = _mm_loadu_ps(&_x[2 * i + 28]); + m2 = _mm_loadu_ps(&_x[2 * i + 32]); + m3 = _mm_loadu_ps(&_x[2 * i + 36]); m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -351,13 +379,18 @@ static void sse_conv_real20(const float *restrict x, } /* 4*N-tap SSE complex-real convolution */ -static void sse_conv_real4n(const float *x, - const float *h, - float *y, - int h_len, int len) +static void sse_conv_real4n(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m4, m5, m6, m7; + const float *_x = &x[2 * (-(h_len - 1) + start)]; + for (int i = 0; i < len; i++) { /* Zero */ m6 = _mm_setzero_ps(); @@ -370,8 +403,8 @@ static void sse_conv_real4n(const float *x, m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]); + m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]); m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, 
m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -394,13 +427,21 @@ static void sse_conv_real4n(const float *x, } /* 4*N-tap SSE complex-complex convolution */ -static void sse_conv_cmplx_4n(const float *x, - const float *h, - float *y, - int h_len, int len) +static void sse_conv_cmplx_4n(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* NOTE: The parameter list of this function has to match the parameter + * list of _base_convolve_complex() in convolve_base.c. This specific + * implementation ignores some of the parameters of + * _base_convolve_complex(), which are: x_len, y_len, offset, step. */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + const float *_x = &x[2 * (-(h_len - 1) + start)]; + for (int i = 0; i < len; i++) { /* Zero */ m6 = _mm_setzero_ps(); @@ -414,8 +455,8 @@ static void sse_conv_cmplx_4n(const float *x, m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]); + m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]); m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -445,14 +486,19 @@ static void sse_conv_cmplx_4n(const float *x, } /* 8*N-tap SSE complex-complex convolution */ -static void sse_conv_cmplx_8n(const float *x, - const float *h, - float *y, - int h_len, int len) +static void sse_conv_cmplx_8n(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_cmplx_4n() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; __m128 m8, m9, m10, m11, m12, m13, m14, m15; + const float *_x = &x[2 * (-(h_len - 1) + start)]; + for (int i = 0; i < len; i++) { /* Zero */ m12 = _mm_setzero_ps(); @@ -473,10 +519,10 @@ static void sse_conv_cmplx_8n(const float *x, m7 = _mm_shuffle_ps(m2, m3, 
_MM_SHUFFLE(1, 3, 1, 3)); /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]); - m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]); + m0 = _mm_loadu_ps(&_x[2 * i + 16 * n + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 16 * n + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 16 * n + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 16 * n + 12]); m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -522,14 +568,10 @@ static void sse_conv_cmplx_8n(const float *x, /* API: Aligned complex-real */ int convolve_real(const float *x, int x_len, const float *h, int h_len, - float *y, int y_len, - int start, int len, - int step, int offset) + float *y, int y_len, int start, int len, int step, int offset) { - void (*conv_func)(const float *, const float *, - float *, int) = NULL; - void (*conv_func_n)(const float *, const float *, - float *, int, int) = NULL; + void (*conv_func) (const float *, int, const float *, int, float *, int, + int, int, int, int) = (void *)_base_convolve_real; if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) return -1; @@ -556,22 +598,12 @@ int convolve_real(const float *x, int x_len, break; default: if (!(h_len % 4)) - conv_func_n = sse_conv_real4n; + conv_func = sse_conv_real4n; } } #endif - if (conv_func) { - conv_func(&x[2 * (-(h_len - 1) + start)], - h, y, len); - } else if (conv_func_n) { - conv_func_n(&x[2 * (-(h_len - 1) + start)], - h, y, h_len, len); - } else { - _base_convolve_real(x, x_len, - h, h_len, - y, y_len, - start, len, step, offset); - } + + conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset); return len; } @@ -580,11 +612,11 @@ int convolve_real(const float *x, int x_len, int convolve_complex(const float *x, int x_len, const float *h, int h_len, float *y, int y_len, - int start, int len, - int step, int offset) + int start, int len, int step, int offset) { - void 
(*conv_func)(const float *, const float *, - float *, int, int) = NULL; + void (*conv_func) (const float *, int, const float *, int, float *, int, + int, int, int, int) = + (void *)_base_convolve_complex; if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) return -1; @@ -599,15 +631,8 @@ int convolve_complex(const float *x, int x_len, conv_func = sse_conv_cmplx_4n; } #endif - if (conv_func) { - conv_func(&x[2 * (-(h_len - 1) + start)], - h, y, h_len, len); - } else { - _base_convolve_complex(x, x_len, - h, h_len, - y, y_len, - start, len, step, offset); - } + + conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset); return len; }