offer avx and sse to speed up float->uint8 for tx

This commit is contained in:
Hoernchen 2013-05-06 21:52:19 +02:00
parent 2d9e29ee46
commit 80b4ad2921
2 changed files with 132 additions and 29 deletions

View File

@ -43,6 +43,15 @@ include(GrVersion) #setup version info
######################################################################## ########################################################################
# Compiler specific setup # Compiler specific setup
######################################################################## ########################################################################
SET(USE_SIMD "no" CACHE STRING "Use SIMD instructions")
SET(USE_SIMD_VALUES "no" "SSE2" "AVX")
SET_PROPERTY(CACHE USE_SIMD PROPERTY STRINGS ${USE_SIMD_VALUES})
LIST(FIND USE_SIMD_VALUES ${USE_SIMD} USE_SIMD_INDEX)
IF(${USE_SIMD_INDEX} EQUAL -1)
message(FATAL_ERROR "Option ${USE_SIMD} not supported, valid entries are ${USE_SIMD_VALUES}")
ENDIF()
IF(CMAKE_COMPILER_IS_GNUCXX) IF(CMAKE_COMPILER_IS_GNUCXX)
ADD_DEFINITIONS(-Wall) ADD_DEFINITIONS(-Wall)
ADD_DEFINITIONS(-Wextra) ADD_DEFINITIONS(-Wextra)
@ -56,6 +65,23 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
ADD_DEFINITIONS(-fvisibility=hidden) ADD_DEFINITIONS(-fvisibility=hidden)
ADD_DEFINITIONS(-fvisibility-inlines-hidden) ADD_DEFINITIONS(-fvisibility-inlines-hidden)
ENDIF(NOT WIN32) ENDIF(NOT WIN32)
IF(USE_SIMD MATCHES SSE2)
ADD_DEFINITIONS(-msse2)
ADD_DEFINITIONS(-DUSE_SSE2)
ENDIF()
IF(USE_SIMD MATCHES AVX)
ADD_DEFINITIONS(-march=native)
ADD_DEFINITIONS(-DUSE_AVX)
ENDIF()
ELSEIF(MSVC)
IF(USE_SIMD MATCHES SSE2)
ADD_DEFINITIONS(/arch:SSE2)
ADD_DEFINITIONS(-DUSE_SSE2)
ENDIF()
IF(USE_SIMD MATCHES AVX)
ADD_DEFINITIONS(/arch:AVX)
ADD_DEFINITIONS(-DUSE_AVX)
ENDIF()
ENDIF(CMAKE_COMPILER_IS_GNUCXX) ENDIF(CMAKE_COMPILER_IS_GNUCXX)
######################################################################## ########################################################################

View File

@ -29,6 +29,12 @@
#include <stdexcept> #include <stdexcept>
#include <iostream> #include <iostream>
#include <algorithm>
#ifdef USE_AVX
#include <immintrin.h>
#elif USE_SSE2
#include <emmintrin.h>
#endif
#include <boost/assign.hpp> #include <boost/assign.hpp>
#include <boost/format.hpp> #include <boost/format.hpp>
@ -126,10 +132,10 @@ hackrf_sink_c_sptr make_hackrf_sink_c (const std::string & args)
* are connected to this block. In this case, we accept * are connected to this block. In this case, we accept
* only 0 input and 1 output. * only 0 input and 1 output.
*/ */
static const int MIN_IN = 1; // mininum number of input streams static const int MIN_IN = 1; // mininum number of input streams
static const int MAX_IN = 1; // maximum number of input streams static const int MAX_IN = 1; // maximum number of input streams
static const int MIN_OUT = 0; // minimum number of output streams static const int MIN_OUT = 0; // minimum number of output streams
static const int MAX_OUT = 0; // maximum number of output streams static const int MAX_OUT = 0; // maximum number of output streams
/* /*
* The private constructor * The private constructor
@ -320,9 +326,74 @@ bool hackrf_sink_c::stop()
return ! (bool) hackrf_is_streaming( _dev ); return ! (bool) hackrf_is_streaming( _dev );
} }
#ifdef USE_AVX
void convert_avx(const float* inbuf, unsigned char* outbuf,const unsigned int count)
{
__m256 mulme = _mm256_set_ps(127.0f, 127.0f, 127.0f, 127.0f, 127.0f, 127.0f, 127.0f, 127.0f);
__m128i addme = _mm_set_epi16(127, 127, 127, 127, 127, 127, 127, 127);
for(unsigned int i=0; i<count;i++){
__m256i itmp3 = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(&inbuf[i*16+0]), mulme));
__m256i itmp4 = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(&inbuf[i*16+8]), mulme));
__m128i a1 = _mm256_extractf128_si256(itmp3, 1);
__m128i a0 = _mm256_castsi256_si128(itmp3);
__m128i a3 = _mm256_extractf128_si256(itmp4, 1);
__m128i a2 = _mm256_castsi256_si128(itmp4);
__m128i outshorts1 = _mm_add_epi16(_mm_packs_epi32(a0, a1), addme);
__m128i outshorts2 = _mm_add_epi16(_mm_packs_epi32(a2, a3), addme);
__m128i outbytes = _mm_packus_epi16(outshorts1, outshorts2);
_mm_storeu_si128 ((__m128i*)&outbuf[i*16], outbytes);
}
}
#elif USE_SSE2
void convert_sse2(const float* inbuf, unsigned char* outbuf,const unsigned int count)
{
const register __m128 mulme = _mm_set_ps( 127.0f, 127.0f, 127.0f, 127.0f );
__m128i addme = _mm_set_epi16( 127, 127, 127, 127, 127, 127, 127, 127);
__m128 itmp1,itmp2,itmp3,itmp4;
__m128i otmp1,otmp2,otmp3,otmp4;
__m128i outshorts1,outshorts2;
__m128i outbytes;
for(unsigned int i=0; i<count;i++){
itmp1 = _mm_mul_ps(_mm_loadu_ps(&inbuf[i*16+0]), mulme);
itmp2 = _mm_mul_ps(_mm_loadu_ps(&inbuf[i*16+4]), mulme);
itmp3 = _mm_mul_ps(_mm_loadu_ps(&inbuf[i*16+8]), mulme);
itmp4 = _mm_mul_ps(_mm_loadu_ps(&inbuf[i*16+12]), mulme);
otmp1 = _mm_cvtps_epi32(itmp1);
otmp2 = _mm_cvtps_epi32(itmp2);
otmp3 = _mm_cvtps_epi32(itmp3);
otmp4 = _mm_cvtps_epi32(itmp4);
outshorts1 = _mm_add_epi16(_mm_packs_epi32(otmp1, otmp2), addme);
outshorts2 = _mm_add_epi16(_mm_packs_epi32(otmp3, otmp4), addme);
outbytes = _mm_packus_epi16(outshorts1, outshorts2);
_mm_storeu_si128 ((__m128i*)&outbuf[i*16], outbytes);
}
}
#endif
void convert_default(float* inbuf, unsigned char* outbuf,const unsigned int count)
{
for(unsigned int i=0; i<count;i++){
outbuf[i]= inbuf[i]*127+127;
}
}
int hackrf_sink_c::work( int noutput_items, int hackrf_sink_c::work( int noutput_items,
gr_vector_const_void_star &input_items, gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items ) gr_vector_void_star &output_items )
{ {
const gr_complex *in = (const gr_complex *) input_items[0]; const gr_complex *in = (const gr_complex *) input_items[0];
@ -334,34 +405,40 @@ int hackrf_sink_c::work( int noutput_items,
} }
unsigned char *buf = _buf + _buf_used; unsigned char *buf = _buf + _buf_used;
int items_consumed = 0;
unsigned int prev_buf_used = _buf_used; unsigned int prev_buf_used = _buf_used;
for (int i = 0; i < noutput_items; i++) { unsigned int remaining = (BUF_LEN-_buf_used)/2; //complex
if ( _buf_used + BYTES_PER_SAMPLE > BUF_LEN ) {
{
boost::mutex::scoped_lock lock( _buf_mutex );
if ( ! cb_push_back( &_cbuf, _buf ) ) { unsigned int count = std::min((unsigned int)noutput_items,remaining);
_buf_used = prev_buf_used; unsigned int sse_rem = count/8; // 8 complex = 16f==512bit for avx
items_consumed = 0; unsigned int nosse_rem = count%8; // remainder
std::cerr << "O" << std::flush;
break; #ifdef USE_AVX
} else { convert_avx((float*)in, buf, sse_rem);
// std::cerr << "." << std::flush; convert_default((float*)(in+sse_rem*8), buf+(sse_rem*8*2), nosse_rem*2);
} #elif USE_SSE2
convert_sse2((float*)in, buf, sse_rem);
convert_default((float*)(in+sse_rem*8), buf+(sse_rem*8*2), nosse_rem*2);
#else
convert_default((float*)in, buf, count*2);
#endif
_buf_used += (sse_rem*8+nosse_rem)*2;
int items_consumed = sse_rem*8+nosse_rem;
if(noutput_items >= remaining) {
{
boost::mutex::scoped_lock lock( _buf_mutex );
if ( ! cb_push_back( &_cbuf, _buf ) ) {
_buf_used = prev_buf_used;
items_consumed = 0;
std::cerr << "O" << std::flush;
} else {
// std::cerr << "." << std::flush;
_buf_used = 0;
} }
_buf_used = 0;
break;
} }
*buf++ = (in[i].real() + 1.0) * 127;
*buf++ = (in[i].imag() + 1.0) * 127;
_buf_used += BYTES_PER_SAMPLE;
items_consumed++;
} }
noutput_items = items_consumed; noutput_items = items_consumed;