cosmetic: Make parameter lists uniform

The non-sse implementation and the sse implementation of the convert
and convolve functions have different parameter lists. This makes it
difficult to use function pointers in order to select the right
function depending on the SSE-Level and CPU.

This commit unifies the parameter lists in preparation for the
planned runtime CPU detection support.

Change-Id: Ice063b89791537c4b591751f12f5ef5c413a2d27
This commit is contained in:
Philipp Maier 2017-03-15 12:39:25 +01:00 committed by Tom Tsou
parent 78b5627fa1
commit 131f82bfac
2 changed files with 143 additions and 110 deletions

View File

@ -176,26 +176,34 @@ static void convert_si16_ps(float *out, const short *in, int len)
void convert_float_short(short *out, const float *in, float scale, int len)
{
void (*conv_func)(short *, const float *, float, int);
#ifdef HAVE_SSE3
if (!(len % 16))
_sse_convert_scale_ps_si16_16n(out, in, scale, len);
conv_func = _sse_convert_scale_ps_si16_16n;
else if (!(len % 8))
_sse_convert_scale_ps_si16_8n(out, in, scale, len);
conv_func = _sse_convert_scale_ps_si16_8n;
else
_sse_convert_scale_ps_si16(out, in, scale, len);
conv_func = _sse_convert_scale_ps_si16;
#else
convert_scale_ps_si16(out, in, scale, len);
conv_func = convert_scale_ps_si16;
#endif
conv_func(out, in, scale, len);
}
void convert_short_float(float *out, const short *in, int len)
{
void (*conv_func) (float *, const short *, int);
#ifdef HAVE_SSE4_1
if (!(len % 16))
_sse_convert_si16_ps_16n(out, in, len);
conv_func = _sse_convert_si16_ps_16n;
else
_sse_convert_si16_ps(out, in, len);
conv_func = _sse_convert_si16_ps;
#else
convert_si16_ps(out, in, len);
conv_func = convert_si16_ps;
#endif
conv_func(out, in, len);
}

View File

@ -47,13 +47,21 @@ int bounds_check(int x_len, int h_len, int y_len,
#include <pmmintrin.h>
/* 4-tap SSE complex-real convolution */
static void sse_conv_real4(const float *restrict x,
const float *restrict h,
float *restrict y,
int len)
static void sse_conv_real4(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
/* NOTE: The parameter list of this function has to match the parameter
* list of _base_convolve_real() in convolve_base.c. This specific
* implementation ignores some of the parameters of
* _base_convolve_real(), which are: x_len, y_len, offset, step. */
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
const float *_x = &x[2 * (-(h_len - 1) + start)];
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@ -61,8 +69,8 @@ static void sse_conv_real4(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m0 = _mm_loadu_ps(&_x[2 * i + 0]);
m1 = _mm_loadu_ps(&_x[2 * i + 4]);
m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@ -81,13 +89,18 @@ static void sse_conv_real4(const float *restrict x,
}
/* 8-tap SSE complex-real convolution */
static void sse_conv_real8(const float *restrict x,
const float *restrict h,
float *restrict y,
int len)
static void sse_conv_real8(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
/* See NOTE in sse_conv_real4() */
__m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
const float *_x = &x[2 * (-(h_len - 1) + start)];
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@ -99,10 +112,10 @@ static void sse_conv_real8(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 12]);
m0 = _mm_loadu_ps(&_x[2 * i + 0]);
m1 = _mm_loadu_ps(&_x[2 * i + 4]);
m2 = _mm_loadu_ps(&_x[2 * i + 8]);
m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@ -128,14 +141,19 @@ static void sse_conv_real8(const float *restrict x,
}
/* 12-tap SSE complex-real convolution */
static void sse_conv_real12(const float *restrict x,
const float *restrict h,
float *restrict y,
int len)
static void sse_conv_real12(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
/* See NOTE in sse_conv_real4() */
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14;
const float *_x = &x[2 * (-(h_len - 1) + start)];
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@ -150,18 +168,18 @@ static void sse_conv_real12(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 12]);
m0 = _mm_loadu_ps(&_x[2 * i + 0]);
m1 = _mm_loadu_ps(&_x[2 * i + 4]);
m2 = _mm_loadu_ps(&_x[2 * i + 8]);
m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
m0 = _mm_loadu_ps(&x[2 * i + 16]);
m1 = _mm_loadu_ps(&x[2 * i + 20]);
m0 = _mm_loadu_ps(&_x[2 * i + 16]);
m1 = _mm_loadu_ps(&_x[2 * i + 20]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@ -190,14 +208,19 @@ static void sse_conv_real12(const float *restrict x,
}
/* 16-tap SSE complex-real convolution */
static void sse_conv_real16(const float *restrict x,
const float *restrict h,
float *restrict y,
int len)
static void sse_conv_real16(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
/* See NOTE in sse_conv_real4() */
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14, m15;
const float *_x = &x[2 * (-(h_len - 1) + start)];
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@ -216,20 +239,20 @@ static void sse_conv_real16(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 12]);
m0 = _mm_loadu_ps(&_x[2 * i + 0]);
m1 = _mm_loadu_ps(&_x[2 * i + 4]);
m2 = _mm_loadu_ps(&_x[2 * i + 8]);
m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
m0 = _mm_loadu_ps(&x[2 * i + 16]);
m1 = _mm_loadu_ps(&x[2 * i + 20]);
m2 = _mm_loadu_ps(&x[2 * i + 24]);
m3 = _mm_loadu_ps(&x[2 * i + 28]);
m0 = _mm_loadu_ps(&_x[2 * i + 16]);
m1 = _mm_loadu_ps(&_x[2 * i + 20]);
m2 = _mm_loadu_ps(&_x[2 * i + 24]);
m3 = _mm_loadu_ps(&_x[2 * i + 28]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@ -265,14 +288,19 @@ static void sse_conv_real16(const float *restrict x,
}
/* 20-tap SSE complex-real convolution */
static void sse_conv_real20(const float *restrict x,
const float *restrict h,
float *restrict y,
int len)
static void sse_conv_real20(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
/* See NOTE in sse_conv_real4() */
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m11, m12, m13, m14, m15;
const float *_x = &x[2 * (-(h_len - 1) + start)];
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@ -293,12 +321,12 @@ static void sse_conv_real20(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Multiply-accumulate first 12 taps */
m0 = _mm_loadu_ps(&x[2 * i + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 12]);
m4 = _mm_loadu_ps(&x[2 * i + 16]);
m5 = _mm_loadu_ps(&x[2 * i + 20]);
m0 = _mm_loadu_ps(&_x[2 * i + 0]);
m1 = _mm_loadu_ps(&_x[2 * i + 4]);
m2 = _mm_loadu_ps(&_x[2 * i + 8]);
m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m4 = _mm_loadu_ps(&_x[2 * i + 16]);
m5 = _mm_loadu_ps(&_x[2 * i + 20]);
m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@ -320,10 +348,10 @@ static void sse_conv_real20(const float *restrict x,
m9 = _mm_add_ps(m1, m7);
/* Multiply-accumulate last 8 taps */
m0 = _mm_loadu_ps(&x[2 * i + 24]);
m1 = _mm_loadu_ps(&x[2 * i + 28]);
m2 = _mm_loadu_ps(&x[2 * i + 32]);
m3 = _mm_loadu_ps(&x[2 * i + 36]);
m0 = _mm_loadu_ps(&_x[2 * i + 24]);
m1 = _mm_loadu_ps(&_x[2 * i + 28]);
m2 = _mm_loadu_ps(&_x[2 * i + 32]);
m3 = _mm_loadu_ps(&_x[2 * i + 36]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@ -351,13 +379,18 @@ static void sse_conv_real20(const float *restrict x,
}
/* 4*N-tap SSE complex-real convolution */
static void sse_conv_real4n(const float *x,
const float *h,
float *y,
int h_len, int len)
static void sse_conv_real4n(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
/* See NOTE in sse_conv_real4() */
__m128 m0, m1, m2, m4, m5, m6, m7;
const float *_x = &x[2 * (-(h_len - 1) + start)];
for (int i = 0; i < len; i++) {
/* Zero */
m6 = _mm_setzero_ps();
@ -370,8 +403,8 @@ static void sse_conv_real4n(const float *x,
m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@ -394,13 +427,21 @@ static void sse_conv_real4n(const float *x,
}
/* 4*N-tap SSE complex-complex convolution */
static void sse_conv_cmplx_4n(const float *x,
const float *h,
float *y,
int h_len, int len)
static void sse_conv_cmplx_4n(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
/* NOTE: The parameter list of this function has to match the parameter
* list of _base_convolve_complex() in convolve_base.c. This specific
* implementation ignores some of the parameters of
* _base_convolve_complex(), which are: x_len, y_len, offset, step. */
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
const float *_x = &x[2 * (-(h_len - 1) + start)];
for (int i = 0; i < len; i++) {
/* Zero */
m6 = _mm_setzero_ps();
@ -414,8 +455,8 @@ static void sse_conv_cmplx_4n(const float *x,
m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@ -445,14 +486,19 @@ static void sse_conv_cmplx_4n(const float *x,
}
/* 8*N-tap SSE complex-complex convolution */
static void sse_conv_cmplx_8n(const float *x,
const float *h,
float *y,
int h_len, int len)
static void sse_conv_cmplx_8n(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
{
/* See NOTE in sse_conv_cmplx_4n() */
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14, m15;
const float *_x = &x[2 * (-(h_len - 1) + start)];
for (int i = 0; i < len; i++) {
/* Zero */
m12 = _mm_setzero_ps();
@ -473,10 +519,10 @@ static void sse_conv_cmplx_8n(const float *x,
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
/* Load (unaligned) input data */
m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]);
m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]);
m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]);
m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]);
m0 = _mm_loadu_ps(&_x[2 * i + 16 * n + 0]);
m1 = _mm_loadu_ps(&_x[2 * i + 16 * n + 4]);
m2 = _mm_loadu_ps(&_x[2 * i + 16 * n + 8]);
m3 = _mm_loadu_ps(&_x[2 * i + 16 * n + 12]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@ -522,14 +568,10 @@ static void sse_conv_cmplx_8n(const float *x,
/* API: Aligned complex-real */
int convolve_real(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
float *y, int y_len, int start, int len, int step, int offset)
{
void (*conv_func)(const float *, const float *,
float *, int) = NULL;
void (*conv_func_n)(const float *, const float *,
float *, int, int) = NULL;
void (*conv_func) (const float *, int, const float *, int, float *, int,
int, int, int, int) = (void *)_base_convolve_real;
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
@ -556,22 +598,12 @@ int convolve_real(const float *x, int x_len,
break;
default:
if (!(h_len % 4))
conv_func_n = sse_conv_real4n;
conv_func = sse_conv_real4n;
}
}
#endif
if (conv_func) {
conv_func(&x[2 * (-(h_len - 1) + start)],
h, y, len);
} else if (conv_func_n) {
conv_func_n(&x[2 * (-(h_len - 1) + start)],
h, y, h_len, len);
} else {
_base_convolve_real(x, x_len,
h, h_len,
y, y_len,
start, len, step, offset);
}
conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
return len;
}
@ -580,11 +612,11 @@ int convolve_real(const float *x, int x_len,
int convolve_complex(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
int start, int len,
int step, int offset)
int start, int len, int step, int offset)
{
void (*conv_func)(const float *, const float *,
float *, int, int) = NULL;
void (*conv_func) (const float *, int, const float *, int, float *, int,
int, int, int, int) =
(void *)_base_convolve_complex;
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
@ -599,15 +631,8 @@ int convolve_complex(const float *x, int x_len,
conv_func = sse_conv_cmplx_4n;
}
#endif
if (conv_func) {
conv_func(&x[2 * (-(h_len - 1) + start)],
h, y, h_len, len);
} else {
_base_convolve_complex(x, x_len,
h, h_len,
y, y_len,
start, len, step, offset);
}
conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
return len;
}